In [1]:
import json
import os
import time
import pandas as pd
import sqlite3

In [11]:
from itertools import product, cycle

In [57]:
# Question templates asking for a company's recent stock data.
# Both `{company}` and `{duration}` are filled in with `str.format` by
# `generate_all_variations`. Two templates used to end in "{duration} ." (a stray
# space before the period), which leaked malformed punctuation into every
# generated question; they now end in "{duration}." like the rest.
stock_info_queries = [
    "Show me the last {duration} of stock data for {company}.",
    "I need stock performance for {company} over the past {duration}.",
    "Can you provide the stock details for {company} for the last {duration}?",
    "I'm looking for {company}'s stock information for the past {duration}.",
    "Display stock data for {company} from the last {duration}.",
    "What's the trading history of {company} for the last {duration}?",
    "Please summarize the stock performance of {company} during the previous {duration}.",
    "I'd appreciate stock market details for {company} over the last {duration}.",
    "What has been the stock trend for {company} in the past {duration}?",
    "Could you fetch the recent stock performance for {company} covering the past {duration}?"
]

# Paraphrases of "What are the potential risks associated with buying this company's stock?"
# Each template has a single `{company}` placeholder that `generate_all_variations`
# fills in with `str.format`.
stock_risk_queries = [
    "What are the risks of investing in {company}'s stock?",
    "Tell me about the potential dangers of buying {company} stock.",
    "What should I be wary of when purchasing stock in {company}?",
    "Could you explain the risks involved with {company} stocks?",
    "Identify the investment risks for {company}'s stock.",
    "What financial hazards come with {company}'s shares?",
    "Highlight the investment perils associated with {company}.",
    "Are there any significant risks in buying {company}'s stocks?",
    "Detail the stock purchase risks for {company}.",
    "What investment challenges might I face with {company}'s stocks?"
]

# Paraphrases of "What's the latest news related to this company or its stock?"
# Each template has a single `{company}` placeholder that `generate_all_variations`
# fills in with `str.format`.
latest_news_queries = [
    "Show me the most recent news about {company}.",
    "What's the latest on {company} or its stock?",
    "I want the newest news stories for {company}.",
    "Can you find the latest news articles about {company}?",
    "What are the recent developments with {company}?",
    "Find me the latest updates on {company}.",
    "What's new with {company}'s business or stock market performance?",
    "Give me the current headlines concerning {company}.",
    "Locate the most recent news reports on {company}.",
    "What are the latest happenings at {company}?"
]

# Paraphrases of "What are the ESG ratings for this company?"
# Each template has a single `{company}` placeholder that `generate_all_variations`
# fills in with `str.format`.
esg_ratings_queries = [
    "Tell me the ESG ratings for {company}.",
    "What is the ESG score of {company}?",
    "Can you provide ESG ratings for {company}?",
    "I need to know the ESG ratings of {company}.",
    "Show ESG scores for {company}.",
     "What's the latest on {company}'s ESG performance?",
    "How does {company} rank in terms of ESG criteria?",
    "Give me an update on the ESG standings of {company}.",
    "Where does {company} stand in environmental, social, and governance scores?",
    "What are the current environmental, social, and governance ratings for {company}?"
]

# Paraphrases of "What are the reviews of this company by its employees?"
# Each template has a single `{company}` placeholder that `generate_all_variations`
# fills in with `str.format`.
employee_reviews_queries = [
    "What do employees say about {company}?",
    "I'm looking for {company} employee reviews.",
    "Can you show me what {company}'s workers think of it?",
    "Tell me about employee feedback on {company}.",
    "I want to read reviews from {company}'s employees.",
    "How do {company}'s employees rate their workplace?",
    "What's the general sentiment of {company}'s staff about their jobs?",
    "Give an overview of {company}'s work environment as reviewed by employees.",
    "How satisfied are the employees working at {company}?",
    "What feedback do {company}'s employees have about their employer?"
]

# Paraphrases of "Who are the competitors of this company?"
# Each template has a single `{company}` placeholder that `generate_all_variations`
# fills in with `str.format`.
competitors_queries = [
    "Who competes with {company}?",
    "I need a list of {company}'s competitors.",
    "Can you tell me about {company}'s main rivals?",
    "Show me companies competing with {company}.",
    "What are the key competitors of {company}?",
    "List the businesses in direct competition with {company}.",
    "Who are {company}'s industry challengers?",
    "Detail the market competitors of {company}.",
    "Which companies vie closely with {company} in the market?",
    "Identify the direct and indirect competitors of {company}."
]

In [58]:
# Companies substituted for the `{company}` placeholder in every query template.
company_names = ["Apple", "Microsoft", "Google", "Amazon", "Tesla", "Netflix", "Pfizer", "Boeing"]
# Look-back codes substituted for `{duration}` in the stock-info templates and echoed
# verbatim in the `stock_info:<company>:<duration>` response string.
# NOTE(review): presumably d/m/y stand for days/months/years — confirm against
# whatever consumes these codes downstream.
durations = ["1d", "5d", "1m", "5m", "6m", "1y", "5y"]


In [4]:
# Directory where the generated JSON/CSV artifacts are written and later uploaded from.
# Defaults to the path that was originally hard-coded here; set the ISA_BASE_DIR
# environment variable to run the notebook on a machine with a different layout.
base_dir = os.environ.get(
    'ISA_BASE_DIR',
    '/content/drive/MyDrive/ISA_proj-20240208T060848Z-001/ISA_proj',
)

In [59]:
# Module-level accumulator: `generate_all_variations` appends every generated
# conversation dict to this list. Re-run this cell to reset it before regenerating,
# otherwise a second generation pass adds duplicates.
conversations = []

In [60]:
def generate_all_variations(company, duration):
    """Append one batch of templated conversations for a company/duration pair.

    Each conversation is a dict with the keys ``stock_info``, ``latest_news``,
    ``stock_risk``, ``esg_ratings``, ``employee_reviews`` and ``competitors`` (in
    that order); every value is a ``{"question": ..., "response": ...}`` dict.
    Questions are produced by formatting the next template of the matching
    module-level template list, and responses are the fixed tags
    ``"<step>:<company>"`` (``"stock_info:<company>:<duration>"`` for stock info).

    The number of conversations generated equals the length of the longest
    template list; shorter lists wrap around to their first template.

    Args:
        company: Name substituted for ``{company}`` in every template.
        duration: Value substituted for ``{duration}`` in the stock-info templates.

    Returns:
        None. The conversations are appended to the module-level ``conversations`` list.
    """
    company_only = {"company": company}

    # One row per conversation step, in the key order of the resulting dict:
    # (step key, endless template iterator, format arguments, canned response).
    step_plan = [
        ("stock_info", cycle(stock_info_queries),
         {"company": company, "duration": duration}, f"stock_info:{company}:{duration}"),
        ("latest_news", cycle(latest_news_queries), company_only, f"latest_news:{company}"),
        ("stock_risk", cycle(stock_risk_queries), company_only, f"stock_risk:{company}"),
        ("esg_ratings", cycle(esg_ratings_queries), company_only, f"esg_ratings:{company}"),
        ("employee_reviews", cycle(employee_reviews_queries), company_only,
         f"employee_reviews:{company}"),
        ("competitors", cycle(competitors_queries), company_only, f"competitors:{company}"),
    ]

    template_pools = (
        stock_info_queries,
        latest_news_queries,
        stock_risk_queries,
        esg_ratings_queries,
        employee_reviews_queries,
        competitors_queries,
    )
    # Generate as many conversations as the longest template list has entries.
    rounds = max(len(pool) for pool in template_pools)

    for _ in range(rounds):
        conversations.append({
            step: {"question": next(templates).format(**fields), "response": canned}
            for step, templates, fields, canned in step_plan
        })

In [61]:
# Build the full dataset: one batch of conversations per (company, duration) pair.
# `generate_all_variations` appends to the module-level `conversations` list, so the
# list is emptied first; without this, re-running the cell would add a second copy
# of every conversation. `clear()` keeps the same list object alive.
conversations.clear()
for company, duration in product(company_names, durations):
    generate_all_variations(company, duration)

In [62]:
# Persist the generated conversations as pretty-printed JSON under base_dir.
with open(f'{base_dir}/conversations.json', 'w') as fp:
    fp.write(json.dumps(conversations, indent=4))

In [63]:
# System prompt stored alongside every question/response row of the dataset.
# Written one sentence per element; the sentences are joined with single spaces.
system_prompt = " ".join((
    "As a highly intelligent assistant and successor of google gemma model, your primary goal is "
    "to provide accurate, relevant, and context-aware responses to user queries based on the "
    "provided information.",
    "Ensure your answers are factual, free from bias, and avoid promoting violence, hate speech, "
    "or any form of discrimination.",
    "Focus on assisting the user effectively and safely.",
))

In [50]:
# NOTE(review): this module-level list is not read by any code visible in this
# notebook; `get_df_for_conv_data` builds its own local list with the same name.
flattened_data = []

In [64]:
def get_df_for_conv_data(data):
    """Flatten a list of conversations into a one-row-per-step DataFrame.

    Args:
        data: Sequence of conversation dicts. Each maps a step name to a dict
            holding a "question" and a "response".

    Returns:
        A pandas DataFrame with the columns conv_id (position of the conversation
        in ``data``), step, system_prompt (the module-level prompt, repeated on
        every row), question and response.
    """
    rows = [
        {
            "conv_id": conv_id,
            "step": step,
            "system_prompt": system_prompt,
            "question": conv[step]["question"],
            "response": conv[step]["response"],
        }
        for conv_id, conv in enumerate(data)
        for step in conv
    ]
    return pd.DataFrame(rows)

In [65]:
import random

In [66]:
# Shuffle a *copy* of the conversations with a private, seeded RNG.
# The previous version reseeded the global RNG and shuffled `conversations` in place,
# so re-running this cell reshuffled an already-shuffled list and silently produced a
# different train/validation split. `random.Random(42)` yields the same permutation as
# `random.seed(42); random.shuffle(...)` on a first run, but leaves both the source
# list and the global RNG state untouched, making the cell idempotent.
shuffled_conversations = list(conversations)
random.Random(42).shuffle(shuffled_conversations)

# Calculate the split index (80% train / 20% validation, split by whole conversation)
split_index = int(0.8 * len(shuffled_conversations))

# Split the conversations
train_conversations = shuffled_conversations[:split_index]
validation_conversations = shuffled_conversations[split_index:]


In [67]:
# Flatten each split into a one-row-per-question DataFrame (train first, then validation).
train_df, validation_df = (
    get_df_for_conv_data(split)
    for split in (train_conversations, validation_conversations)
)

In [68]:
# Write both splits to CSV under base_dir, without the DataFrame index column.
for split_df, csv_name in ((train_df, 'train_df_gemma.csv'),
                           (validation_df, 'validation_df_gemma.csv')):
    split_df.to_csv(f'{base_dir}/{csv_name}', index=False)

In [23]:
!pip install huggingface_hub



In [24]:
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    To login, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Token: 
Add token as git credential? (Y/n) n
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [42]:
from huggingface_hub import HfApi

# Hub client; its `create_repo` method is used to create the dataset repository.
hf_api = HfApi()


In [26]:
# Create (or reuse) the public dataset repository on the Hugging Face Hub.
#
# The access token is read from the HF_TOKEN environment variable instead of being
# embedded in the notebook. When the variable is unset, `token=None` lets
# huggingface_hub fall back to the credentials saved by `huggingface-cli login`.
# SECURITY: a write-scoped token used to be hard-coded in this cell and must be
# treated as leaked — revoke it at https://huggingface.co/settings/tokens.
repo_url = hf_api.create_repo(
    repo_id="yatharth97/isa_gemma",
    token=os.environ.get("HF_TOKEN"),
    repo_type="dataset",
    private=False,  # Set `private=True` if you want it to be private
    exist_ok=True,  # do not fail when the cell is re-run after the repo exists
)
print("Repository URL:", repo_url)


Repository URL: https://huggingface.co/datasets/yatharth97/isa_gemma


In [69]:
from huggingface_hub import upload_file

repo_id = "yatharth97/isa_gemma"

# Read the access token from the environment rather than hard-coding it. When HF_TOKEN
# is unset, `token=None` makes huggingface_hub use the credentials saved by
# `huggingface-cli login`.
# SECURITY: the write-scoped token that used to be embedded in this cell must be
# treated as leaked — revoke it at https://huggingface.co/settings/tokens.
hf_token = os.environ.get("HF_TOKEN")

train_file_path = f'{base_dir}/train_df_gemma.csv'
train_path_in_repo = "train_df_gemma.csv"  # Name in the repository

validation_file_path = f'{base_dir}/validation_df_gemma.csv'
validation_path_in_repo = "validation_df_gemma.csv"  # Name in the repository

# Upload training data first, then validation data, with one shared call site.
for local_path, path_in_repo in (
    (train_file_path, train_path_in_repo),
    (validation_file_path, validation_path_in_repo),
):
    commit_info = upload_file(
        token=hf_token,
        path_or_fileobj=local_path,
        path_in_repo=path_in_repo,
        repo_id=repo_id,
        repo_type='dataset',
    )

# Display the result of the last (validation) upload, as the cell did before.
commit_info

CommitInfo(commit_url='https://huggingface.co/datasets/yatharth97/isa_gemma/commit/e96c1a5a284bd852d28406acbb2056e489c3e579', commit_message='Upload validation_df_gemma.csv with huggingface_hub', commit_description='', oid='e96c1a5a284bd852d28406acbb2056e489c3e579', pr_url=None, pr_revision=None, pr_num=None)