In [1]:
import json
import os
import time

In [2]:
!pip install -q -U google-generativeai

In [3]:
import pathlib
import textwrap

import google.generativeai as genai
import sqlite3
import pandas as pd

from IPython.display import display
from IPython.display import Markdown


def to_markdown(text):
  """Render `text` as a Markdown block quote.

  Bullet characters ('•') are rewritten as Markdown list markers and every
  line, blank ones included, is prefixed with '> '.
  """
  bulleted = text.replace('•', '  *')

  def _every_line(_line):
    # textwrap.indent skips whitespace-only lines unless the predicate says otherwise.
    return True

  quoted = textwrap.indent(bulleted, '> ', predicate=_every_line)
  return Markdown(quoted)

In [4]:
from google.colab import userdata

In [5]:
# Or use `os.getenv('GOOGLE_API_KEY')` to fetch an environment variable.
GOOGLE_API_KEY=userdata.get('GOOGLE_API_KEY')

genai.configure(api_key=GOOGLE_API_KEY)

In [6]:
model = genai.GenerativeModel('gemini-pro')

In [7]:
import random

In [None]:
base_dir = '/content/drive/MyDrive/PLP Code'
pdf_file_dir = f'{base_dir}/PDF_files'
pdf_files = os.listdir(pdf_file_dir)

In [None]:
company_names = [(file_name.split('_'))[0] for file_name in pdf_files]

In [None]:
# De-duplicate, then sort: iteration order of a set of strings can change from
# one interpreter run to the next, which would make the seeded shuffle that
# follows select different companies on every fresh kernel.
company_names = sorted(set(company_names))

In [None]:
len(company_names)

47

In [None]:
random.seed(42)
random.shuffle(company_names)
companies = company_names[:10]

In [None]:
# Metrics commonly reported in 10-K filings; change and update this list as and when needed.
metrics = [
    "Consolidated Operating Revenues",
    "Consolidated Operating Expenses",
    "Other Income (Expense), Net",
    "Interest Expense",
    "Consolidated Net Income",
    "Consolidated EBITDA",
    "Consolidated Adjusted EBITDA",
    "Total Operating Revenues",
    "Churn Rate",
    "Segment EBITDA",
    "Segment EBITDA Margin",
    "Segment Operating Income Margin",
    "Depreciation and Amortization Expense",
    "Selling, General and Administrative Expenses",
    "Change in Cash, Cash Equivalents, and Restricted Cash",
    "Free Cash Flow",
    "Income Before Income Tax Expense",
    "Net Income Attributable to Common Shareholders",
    "Basic Earnings Per Common Share",
    "Diluted Earnings Per Common Share",
    "Total Comprehensive Income",
    "Total Assets",
    "Total Liabilities",
    "Total Shareholders' Equity",
    "Cash and Cash Equivalents",
    "Contract Assets",
    "Contract Liabilities",
    "Operating Lease Cost",
    "Finance Lease Liabilities",
    "Operating Lease Payments",
    "Present Value of Lease Liabilities",
    "Long-Term Debt, Including Current Maturities",
    "Debt Maturing Within One Year",
    "Total Debt"
]


In [None]:
years = [2023, 2022] # Added year too as it can be a potential optional attribute for us

In [None]:
info_extraction_question_template = """
What is [metric] for the year [yearnum]
"""
# can ignore

In [None]:
document_uploading_question_template = """
Hey, I have just uploaded the 10k report for [company].
"""
# can ignore

In [None]:
# info_extraction_questions_paraphrased = [
#     "What is [metric] for the year [yearnum]",
#     "Can you tell me what the [metric] was in [yearnum]?",
#     "I'm looking to find out the [metric] for [yearnum], could you help with that?",
#     "What was the [metric] in the year [yearnum]?",
#     "Could you provide the [metric] figures for the year [yearnum]?",
#     "I need the data on [metric] for the year [yearnum], please.",
#     "What did the [metric] amount to in [yearnum]?",
#     "Please, can you extract the [metric] for [yearnum] from the report?",
#     "Do you have the [metric] numbers for [yearnum]?",
#     "I would like to know the [metric] for the year [yearnum], can you find that?",
#     "What's the figure for [metric] in the year [yearnum]?"
# ]

info_extraction_questions_paraphrased = [
    "What is [metric] for [company] in the year [yearnum]",
    "Can you tell me what the [metric] for [company] was in [yearnum]?",
    "I'm looking to find out the [metric] for [company] in [yearnum], could you help with that?",
]
# Depending on feasibility and GPU availability, we can use either all of the paraphrases above or only these three. The templates should be reusable for other financial reports; it is not yet clear whether they suit news articles.



In [None]:
# document_uploading_questions = [
#     "Hey, I have just uploaded the 10k report for [company].",
#     "Just uploaded the 10k report for [company]. Can you check it out?",
#     "Hey there, I've put up the 10k report for [company] in the system.",
#     "FYI, the 10k report for [company] is now uploaded.",
#     "I've successfully uploaded the annual 10k report for [company], just so you know.",
#     "Good news, the 10k report for [company] has been uploaded.",
#     "Hey, the 10k financial report for [company] is in. Take a look?",
#     "Just to let you know, I've put the 10k report for [company] up for review.",
#     "Heads up, the 10k report for [company] is uploaded and ready for analysis.",
#     "Hey, I’ve got the 10k report for [company] uploaded here.",
#     "Just dropped in the 10k report for [company]. Let me know what you think!"
# ]

document_uploading_questions = [
    "Hey, I have just uploaded the 10k report for [company] for [uyear].",
    "Just uploaded the 10k report for [company] for [uyear]. Can you check it out?",
    "Hey there, I've put up the 10k report for [company] for [uyear] in the system.",
]
# Depending on feasibility and GPU availability, we can use either all of the paraphrases above or only these three.
# For other financial reports, replace "10k" with the report type; for news, use something like "Following is a recent news article for [company]".


In [None]:
summary_questions = [
    "Summarize the 10k report for [company] for [year] for me.",
    "Could you condense the 10k report for [company] for [year] into its key highlights for me?",
    "What are the key takeaways from the 10k report for [company] for [year]?"
]


In [None]:
follow_up_questions = [
    "Does the Consolidated Operating Revenues figure indicate a healthy financial state for the company?",
    "Are the Consolidated Operating Expenses within a reasonable range compared to the industry average?",
    "Is the reported Other Income (Expense), Net considered a significant impact on the company's overall financials?",
    "How does the Interest Expense affect the company's debt management strategy?",
    "Would you say the Consolidated Net Income growth is sustainable?",
    "Is the Consolidated EBITDA margin in line with what we'd expect for a company of this scale?",
    "How favorable is the Consolidated Adjusted EBITDA when evaluating the company's operational efficiency?",
    "Do the Total Operating Revenues reflect a competitive position in the market?",
    "Is the Churn Rate indicative of customer satisfaction and loyalty?",
    "How does the Segment EBITDA compare to last year? Is this improvement or decline?",
    "Is the Segment EBITDA Margin showing an efficient operational model?",
    "What does the Segment Operating Income Margin say about the profitability of different business units?",
    "Are the levels of Depreciation and Amortization Expense appropriate for the company's asset base?",
    "Are Selling, General and Administrative Expenses being effectively controlled?",
    "Does the Change in Cash, Cash Equivalents, and Restricted Cash signify a strong liquidity position?",
    "Is the Free Cash Flow sufficient to support investment and debt repayment?",
    "What implications does the Income Before Income Tax Expense have for future tax liabilities?",
    "Is the Net Income Attributable to Common Shareholders reflective of a solid return on investment?",
    "Do the Basic and Diluted Earnings Per Common Share values meet shareholder expectations?",
    "Does the Total Comprehensive Income support a positive outlook for the company's financial health?",
    "Are Total Assets growing at a healthy rate?",
    "How does the ratio of Total Liabilities to Total Shareholders' Equity affect the company's leverage?",
    "Are the levels of Cash and Cash Equivalents adequate for operational needs?",
    "How do Contract Assets and Contract Liabilities balance out?",
    "Is the Operating Lease Cost manageable within the current revenue streams?",
    "How does the level of Finance Lease Liabilities impact financial flexibility?",
    "Are the Operating Lease Payments sustainable over the long term?",
    "What does the Present Value of Lease Liabilities tell us about the cost of financing leases?",
    "Is the amount of Long-Term Debt, Including Current Maturities, a concern for the company's financial stability?",
    "How urgent is the need to address Debt Maturing Within One Year?",
    "Is the Total Debt level manageable for the company's size and industry?"
]

# These questions are designed to provoke thought and analysis about the financial health of a company based on specific metrics.
# Each question can be tailored to reflect the metric's value and context once extracted.
# They are optional: the LLM answers them from its own knowledge, and they only serve to make the conversation more interactive.

In [None]:
metrics_to_questions = {
    "Consolidated Operating Revenues": "Does the Consolidated Operating Revenues figure indicate a healthy financial state for the company?",
    "Consolidated Operating Expenses": "Are the Consolidated Operating Expenses within a reasonable range compared to the industry average?",
    "Other Income (Expense), Net": "Is the reported Other Income (Expense), Net considered a significant impact on the company's overall financials?",
    "Interest Expense": "How does the Interest Expense affect the company's debt management strategy?",
    "Consolidated Net Income": "Would you say the Consolidated Net Income growth is sustainable?",
    "Consolidated EBITDA": "Is the Consolidated EBITDA margin in line with what we'd expect for a company of this scale?",
    "Consolidated Adjusted EBITDA": "How favorable is the Consolidated Adjusted EBITDA when evaluating the company's operational efficiency?",
    "Total Operating Revenues": "Do the Total Operating Revenues reflect a competitive position in the market?",
    "Churn Rate": "Is the Churn Rate indicative of customer satisfaction and loyalty?",
    "Segment EBITDA": "How does the Segment EBITDA compare to last year? Is this improvement or decline?",
    "Segment EBITDA Margin": "Is the Segment EBITDA Margin showing an efficient operational model?",
    "Segment Operating Income Margin": "What does the Segment Operating Income Margin say about the profitability of different business units?",
    "Depreciation and Amortization Expense": "Are the levels of Depreciation and Amortization Expense appropriate for the company's asset base?",
    "Selling, General and Administrative Expenses": "Are Selling, General and Administrative Expenses being effectively controlled?",
    "Change in Cash, Cash Equivalents, and Restricted Cash": "Does the Change in Cash, Cash Equivalents, and Restricted Cash signify a strong liquidity position?",
    "Free Cash Flow": "Is the Free Cash Flow sufficient to support investment and debt repayment?",
    "Income Before Income Tax Expense": "What implications does the Income Before Income Tax Expense have for future tax liabilities?",
    "Net Income Attributable to Common Shareholders": "Is the Net Income Attributable to Common Shareholders reflective of a solid return on investment?",
    "Basic Earnings Per Common Share": "Do the Basic and Diluted Earnings Per Common Share values meet shareholder expectations?",
    "Diluted Earnings Per Common Share": "Do the Basic and Diluted Earnings Per Common Share values meet shareholder expectations?",
    "Total Comprehensive Income": "Does the Total Comprehensive Income support a positive outlook for the company's financial health?",
    "Total Assets": "Are Total Assets growing at a healthy rate?",
    "Total Liabilities": "How does the ratio of Total Liabilities to Total Shareholders' Equity affect the company's leverage?",
    "Total Shareholders' Equity": "How does the ratio of Total Liabilities to Total Shareholders' Equity affect the company's leverage?",
    "Cash and Cash Equivalents": "Are the levels of Cash and Cash Equivalents adequate for operational needs?",
    "Contract Assets": "How do Contract Assets and Contract Liabilities balance out?",
    "Contract Liabilities": "How do Contract Assets and Contract Liabilities balance out?",
    "Operating Lease Cost": "Is the Operating Lease Cost manageable within the current revenue streams?",
    "Finance Lease Liabilities": "How does the level of Finance Lease Liabilities impact financial flexibility?",
    "Operating Lease Payments": "Are the Operating Lease Payments sustainable over the long term?",
    "Present Value of Lease Liabilities": "What does the Present Value of Lease Liabilities tell us about the cost of financing leases?",
    "Long-Term Debt, Including Current Maturities": "Is the amount of Long-Term Debt, Including Current Maturities, a concern for the company's financial stability?",
    "Debt Maturing Within One Year": "How urgent is the need to address Debt Maturing Within One Year?",
    "Total Debt": "Is the Total Debt level manageable for the company's size and industry?",
    "Change in Cash": "Does the Change in Cash indicate a strong liquidity position?",
    "Cash Equivalents": "Are Cash Equivalents considered a safe asset for the company?",
    "Restricted Cash": "What role does Restricted Cash play in the company's financial strategy?",
}

# This dictionary maps each metric to a specific follow-up question. It's structured to be directly usable in code for generating or analyzing specific questions based on extracted metrics.


In [None]:
def paraphrase_with_gemini(text, model, max_retries=3, backoff_base=2):
  """
  Asks the model for 5 to 10 paraphrases of a sentence.

  The request is retried with exponential backoff whenever it raises.

  Args:
      text (str): The sentence to paraphrase.
      model (object): Any object exposing `generate_content(prompt)` whose
          result has a `.text` attribute.
      max_retries (int): Maximum number of attempts before giving up.
      backoff_base (float): Base of the exponential wait; the pause after the
          k-th failed attempt is `backoff_base ** k` seconds.

  Returns:
      str: The raw response text exactly as returned by the model, or None
      if every attempt failed.
  """
  failures = 0
  while failures < max_retries:
    try:
      response = model.generate_content("Give me 5 to 10 paraphrased statements for this sentence: " + text)
      # Reading `.text` stays inside the try so a failure there is retried too.
      return response.text
    except Exception as ex:
      print(f"Error occurred during paraphrasing: {ex}")
      failures += 1
      if failures < max_retries:
        # Only wait when another attempt follows; sleeping after the last
        # failure would just delay the None result.
        time.sleep(backoff_base ** failures)

  print(f"Failed to paraphrase text after {failures} retries.")
  return None

In [None]:
metrics_to_questions_map = {}

In [None]:
# Load the cached follow-up questions; the context manager closes the file
# handle instead of leaving it open for the lifetime of the kernel.
with open(f'{base_dir}/followup_questions.json') as f:
    metrics_to_questions_map = json.load(f)

In [None]:
#

selected_metrics = [
    "Consolidated Operating Revenues",
    "Consolidated Net Income",
    "Total Operating Revenues",
    "Consolidated Operating Expenses",
    "Free Cash Flow",
    "Total Assets",
    "Total Liabilities",
    "Total Shareholders' Equity",
    "Interest Expense",
    "Consolidated EBITDA",
    "Net Income Attributable to Common Shareholders",
    "Basic Earnings Per Common Share",
    "Diluted Earnings Per Common Share",
    "Total Debt",
    "Cash and Cash Equivalents"
]
#selected a subset of metrics for easy processing

In [None]:
# Keep only the entries whose metric belongs to the selected subset.
metrics_to_questions_map = {
    metric_name: questions
    for metric_name, questions in metrics_to_questions_map.items()
    if metric_name in selected_metrics
}


In [None]:
def get_answers_to_followup_questions(text, model, max_retries=3, backoff_base=2):
  """
  Asks the model for a mock answer to a follow-up question.

  The prompt tells the model to assume real 10K data for the attribute(s)
  named in the question. The request is retried with exponential backoff
  whenever it raises.

  Args:
      text (str): The follow-up question to answer.
      model (object): Any object exposing `generate_content(prompt)` whose
          result has a `.text` attribute.
      max_retries (int): Maximum number of attempts before giving up.
      backoff_base (float): Base of the exponential wait; the pause after the
          k-th failed attempt is `backoff_base ** k` seconds.

  Returns:
      str: The raw response text exactly as returned by the model, or None
      if every attempt failed.
  """
  failures = 0
  while failures < max_retries:
    try:
      response = model.generate_content(
          "Give me a mock response to this question assuming real data for the attribute(s) mentioned from 10K report of any company you'd like:  " + text)
      # Reading `.text` stays inside the try so a failure there is retried too.
      return response.text
    except Exception as ex:
      print(f"Error occurred during fetching answer: {ex}")
      failures += 1
      if failures < max_retries:
        # Only wait when another attempt follows; sleeping after the last
        # failure would just delay the None result.
        time.sleep(backoff_base ** failures)

  print(f"Failed to obtain answer for the text after {failures} retries.")
  return None

In [None]:
## Uncomment the following to regenerate the paraphrased follow-up questions with Gemini (the result is what gets cached in followup_questions.json)
# for metric_name, question in metrics_to_questions.items():
#   res = paraphrase_with_gemini(question, model)
#   output = res.split('\n')
#   output.append(question)
#   metrics_to_questions_map[metric_name] = output
#   print(f'{metric_name} completed')
#   time.sleep(5)

Consolidated Operating Revenues completed
Consolidated Operating Expenses completed
Other Income (Expense), Net completed
Interest Expense completed
Consolidated Net Income completed
Consolidated EBITDA completed
Consolidated Adjusted EBITDA completed
Total Operating Revenues completed
Churn Rate completed
Segment EBITDA completed
Segment EBITDA Margin completed
Segment Operating Income Margin completed
Depreciation and Amortization Expense completed
Selling, General and Administrative Expenses completed
Change in Cash, Cash Equivalents, and Restricted Cash completed
Free Cash Flow completed
Income Before Income Tax Expense completed
Net Income Attributable to Common Shareholders completed
Basic Earnings Per Common Share completed
Diluted Earnings Per Common Share completed
Total Comprehensive Income completed
Total Assets completed
Total Liabilities completed
Total Shareholders' Equity completed
Cash and Cash Equivalents completed
Contract Assets completed
Contract Liabilities complet

In [None]:
# json.dump(metrics_to_questions_map, open(f'{base_dir}/followup_questions.json', 'w'), indent=4)

In [None]:
# Ask the model for one mock answer per metric and collect the successes.
metric_to_ans_map = {}
for metric_name, questions in metrics_to_questions_map.items():
  # Only the last question of each list is sent; presumably that is the
  # original (un-paraphrased) wording -- confirm against the cached JSON.
  res = get_answers_to_followup_questions(questions[-1], model)
  if res:
    metric_to_ans_map[metric_name] = res
    print(f'Answer fetch for {metric_name} completed')
  else:
    # Report a failure only when no answer came back.
    print(f'Answer fetch for {metric_name} failed')

Error occurred during fetching answer: HTTPConnectionPool(host='localhost', port=39999): Read timed out. (read timeout=60.0)


KeyboardInterrupt: 

In [None]:
metric_to_ans_map = {
    "Consolidated Operating Revenues": "The Consolidated Operating Revenues have shown a robust growth of 12% from the previous year, indicating a strong market demand for our products and services.",
    "Consolidated Operating Expenses": "Our Consolidated Operating Expenses are well within industry standards, demonstrating our commitment to efficiency and cost control.",
    "Other Income (Expense), Net": "The Other Income (Expense), Net has had a minimal impact, contributing to less than 2% of our overall financials, which is in line with expectations.",
    "Interest Expense": "Interest Expense has decreased by 5% year-over-year, reflecting our effective debt management strategy and improved credit terms.",
    "Consolidated Net Income": "The growth in Consolidated Net Income by 15% this year is promising but requires further investment in innovation to sustain long-term.",
    "Consolidated EBITDA": "With an EBITDA margin of 25%, we are performing well against industry benchmarks and showing strong profitability.",
    "Consolidated Adjusted EBITDA": "Our Adjusted EBITDA has improved by 10%, indicating enhanced operational efficiency and cost management.",
    "Total Operating Revenues": "Total Operating Revenues have increased by 8%, evidencing our competitive edge and increased market share.",
    "Churn Rate": "The Churn Rate of 4% is below industry average, reflecting high customer satisfaction and loyalty.",
    "Segment EBITDA": "Segment EBITDA has increased by 9% compared to last year, showing notable improvement in operational performance.",
    "Segment EBITDA Margin": "Our Segment EBITDA Margin has remained stable at 22%, demonstrating efficiency in operations.",
    "Segment Operating Income Margin": "The Segment Operating Income Margin of 18% indicates healthy profitability across our business units.",
    "Depreciation and Amortization Expense": "Depreciation and Amortization Expense is aligned with our asset base, ensuring our resources are accurately valued.",
    "Selling, General and Administrative Expenses": "We've managed to reduce our Selling, General, and Administrative Expenses by 5%, reflecting our focus on cost efficiency.",
    "Change in Cash, Cash Equivalents, and Restricted Cash": "The increase in Cash, Cash Equivalents, and Restricted Cash by 20% signifies a strong liquidity position, enabling future investments.",
    "Free Cash Flow": "Our Free Cash Flow has grown by 18%, providing ample capacity for strategic investments and debt repayment.",
    "Income Before Income Tax Expense": "Income Before Income Tax Expense has increased, indicating potential for higher future tax liabilities but also reflecting stronger pre-tax earnings.",
    "Net Income Attributable to Common Shareholders": "Net Income Attributable to Common Shareholders saw a 14% rise, underscoring a solid return on investment for our shareholders.",
    "Basic Earnings Per Common Share": "Basic Earnings Per Common Share have exceeded expectations, increasing by 0.08 points, reflecting our strong financial health.",
    "Diluted Earnings Per Common Share": "Diluted Earnings Per Common Share also increased, mirroring the positive trend seen in basic EPS and indicating robust profitability.",
    "Total Comprehensive Income": "Our Total Comprehensive Income supports a positive financial outlook, with a 20% increase from the previous year.",
    "Total Assets": "Total Assets have grown at a healthy rate of 10% year-over-year, indicating strategic investments and asset acquisition.",
    "Total Liabilities": "The ratio of Total Liabilities to Total Shareholders' Equity has slightly increased, signaling careful monitoring is needed to manage leverage effectively.",
    "Total Shareholders' Equity": "Total Shareholders' Equity has risen by 12%, reflecting our strong financial position and resilience.",
    "Cash and Cash Equivalents": "Cash and Cash Equivalents are at an all-time high, ensuring we have sufficient funds for operational needs and strategic initiatives.",
    "Contract Assets": "Contract Assets have balanced well with Contract Liabilities, showing our capability to manage contracts efficiently.",
    "Contract Liabilities": "Contract Liabilities are in good standing, complementing our Contract Assets and indicating healthy financial practices.",
    "Operating Lease Cost": "Operating Lease Cost is well-managed, accounting for only a small fraction of our revenue, which supports sustainable growth.",
    "Finance Lease Liabilities": "Finance Lease Liabilities are within manageable levels, allowing us financial flexibility and stability.",
    "Operating Lease Payments": "Operating Lease Payments are sustainable, with terms that align with our long-term financial planning.",
    "Present Value of Lease Liabilities": "The Present Value of Lease Liabilities provides insight into the cost-effective financing of our leases, aligning with our financial strategies.",
    "Long-Term Debt, Including Current Maturities": "Long-Term Debt, Including Current Maturities, is under control, demonstrating our solid financial structure and strategic planning.",
    "Debt Maturing Within One Year": "We are actively addressing Debt Maturing Within One Year, ensuring liquidity and financial stability.",
    "Total Debt": "Total Debt is at a manageable level for our company's size and industry, indicating prudent financial management.",
    "Change in Cash": "The Change in Cash reflects our strong liquidity position, empowering us to pursue growth opportunities confidently.",
    "Cash Equivalents": "Cash Equivalents are considered safe and liquid assets, reinforcing our company's financial security.",
    "Restricted Cash": "Restricted Cash plays a strategic role in our financial planning, ensuring we meet specific obligations and future investments."
}

# This dictionary provides a mocked but realistic response for each metric, as if extracted from a 10K report of a fictional company.
# These are the answers to all such mocked follow up questions

In [None]:
# Drop canned answers whose metric is not in the master `metrics` list.
metric_to_ans_map = {
    metric_name: answer
    for metric_name, answer in metric_to_ans_map.items()
    if metric_name in metrics
}

In [None]:
import re

def remove_leading_number_period(text):
    """Strip a leading list number such as "1. " from `text`.

    Only a run of digits at the very start of the string, followed by a
    period and any whitespace, is removed; any other text comes back unchanged.
    """
    return re.sub(r'^\d+\.\s*', '', text)

# Example usage
original_text = "1. This is a statement starting with a number."
cleaned_text = remove_leading_number_period(original_text)
print("Original:", original_text)
print("Cleaned:", cleaned_text)

## Gemini returns numbered responses such as "1. This is a statement starting with a number."; the function above strips that leading number because it is not part of the question.


Original: 1. This is a statement starting with a number.
Cleaned: This is a statement starting with a number.


In [None]:
from itertools import cycle

# Placeholder for the mocked-up API response function
def get_mocked_response(metric, company, year):
    """Return the canned follow-up answer for `metric`.

    `company` and `year` are accepted but not used; the answer is looked up
    in the notebook-level `metric_to_ans_map`, so a metric missing from that
    mapping raises KeyError.
    """
    canned_answer = metric_to_ans_map[metric]
    return canned_answer

def generate_conversations_with_responses(metrics_to_questions_map, years, info_extraction_questions_paraphrased, document_uploading_questions, summary_questions, company_names):
    """Build question/response records for every metric, year and company.

    For each (metric, year, company) combination one record is produced per
    info-extraction template. Every record holds four turns: a document
    upload, an information-extraction request, a follow-up question and a
    summary request.

    Args:
        metrics_to_questions_map (dict): Metric name -> list of follow-up
            question strings (leading "N. " numbering is stripped).
        years (list): Years substituted into the templates.
        info_extraction_questions_paraphrased (list[str]): Templates using
            the [metric], [company] and [yearnum] placeholders.
        document_uploading_questions (list[str]): Templates using the
            [company] and [uyear] placeholders.
        summary_questions (list[str]): Templates using the [company] and
            [year] placeholders.
        company_names (list[str]): Companies to generate conversations for.

    Returns:
        list[dict]: Records with the keys "document_upload",
        "info_extraction", "follow_up" and "summary", each mapping to a
        {"question": ..., "response": ...} dict.
    """
    conversations_with_responses = []

    for metric, follow_up_questions in metrics_to_questions_map.items():
        follow_up_questions = [remove_leading_number_period(q) for q in follow_up_questions]
        # These two iterators are created once per metric, so they keep
        # rotating across all years and companies of that metric.
        doc_questions_cyclic = cycle(document_uploading_questions)
        follow_up_questions_cyclic = cycle(follow_up_questions)

        for year in years:
            # Iterate over the companies passed in by the caller rather than
            # over a notebook-level global.
            for company in company_names:
                # Fill placeholders for each type of question
                info_questions_filled = [q.replace("[metric]", metric).replace("[yearnum]", str(year)).replace("[company]", company) for q in info_extraction_questions_paraphrased]
                info_questions_cyclic = cycle(info_questions_filled)
                summary_questions_filled = [q.replace("[company]", company).replace("[year]", str(year)) for q in summary_questions]
                summary_questions_cyclic = cycle(summary_questions_filled)

                for _ in range(len(info_extraction_questions_paraphrased)):
                    doc_upload_question = next(doc_questions_cyclic).replace("[company]", company).replace("[uyear]", str(year))
                    info_question = next(info_questions_cyclic)
                    follow_up_question = next(follow_up_questions_cyclic)
                    summary_question = next(summary_questions_cyclic)

                    # Prepare responses
                    doc_upload_response = f"document_upload:10K:{company}:{year}"
                    info_extraction_response = f"information_extraction:{metric}:{company}:{year}"
                    follow_up_response = get_mocked_response(metric, company, year)
                    summary_response = f"summarize:10K:{company}:{year}"

                    # Combine question and response into a single record
                    conversation_record = {
                        "document_upload": {"question": doc_upload_question, "response": doc_upload_response},
                        "info_extraction": {"question": info_question, "response": info_extraction_response},
                        "follow_up": {"question": follow_up_question, "response": follow_up_response},
                        "summary": {"question": summary_question, "response": summary_response}
                    }

                    conversations_with_responses.append(conversation_record)

    return conversations_with_responses

# The function above generates conversations and the corresponding responses. It is expected to work
# unchanged for both news and financial reports, so no updates should be needed here.

# Generate conversations for the 10 sampled `companies` (not the full `company_names` list)
conversations_with_responses = generate_conversations_with_responses(metrics_to_questions_map, years, info_extraction_questions_paraphrased, document_uploading_questions, summary_questions, companies)

# Saving the data
# Serialize first, then write the pretty-printed JSON in one go.
conversations_json = json.dumps(conversations_with_responses, indent=4)
with open(f'{base_dir}/conversations_with_responses.json', 'w') as f:
    f.write(conversations_json)

print("Conversations with responses saved successfully.")


Conversations with responses saved successfully.


In [None]:
# System prompt wrapped in <<SYS>> ... <</SYS>> tags; the fragments are joined with single spaces.
system_prompt = " ".join([
    "<<SYS>> As a highly intelligent assistant, your primary goal is to provide accurate,",
    "relevant, and context-aware responses to user queries based on the provided information.",
    "Ensure your answers are factual, free from bias, and avoid promoting violence, hate speech,",
    "or any form of discrimination. Focus on assisting the user effectively and safely. <</SYS>>",
])

In [None]:
import random

In [None]:
random.seed(42)  # Ensures reproducibility
# Shuffle a copy: shuffling the generated list in place would make this cell
# non-idempotent, because re-running it would reshuffle already-shuffled data
# and produce a different train/validation split.
shuffled_conversations = list(conversations_with_responses)
random.shuffle(shuffled_conversations)

# Calculate the split index (80% train / 20% validation)
split_index = int(0.8 * len(shuffled_conversations))

# Split the conversations
train_conversations = shuffled_conversations[:split_index]
validation_conversations = shuffled_conversations[split_index:]

# Function to format a single conversation
def format_conversation(conversation, system_prompt):
    """Turn one conversation record into instruction-formatted strings.

    Each value of `conversation` (a dict holding 'question' and 'response')
    becomes one "<s>[INST] ... [/INST] ... </s>" string with `system_prompt`
    placed before the question. Strings follow the record's key order.
    """
    return [
        f"<s>[INST] {system_prompt}\n\n{turn['question']} [/INST] {turn['response']} </s>"
        for turn in conversation.values()
    ]

1020

In [None]:
# Format and save the training data (one formatted turn per write, each followed by a newline)
train_data = []
for train_conversation in train_conversations:
    train_data.extend(format_conversation(train_conversation, system_prompt))
with open(f'{base_dir}/train_data_llama2.txt', 'w') as f:
    f.writelines(f"{item}\n" for item in train_data)

# Format and save the validation data
validation_data = []
for validation_conversation in validation_conversations:
    validation_data.extend(format_conversation(validation_conversation, system_prompt))
with open(f'{base_dir}/validation_data_llama2.txt', 'w') as f:
    f.writelines(f"{item}\n" for item in validation_data)

In [None]:
len(train_data)

6528

In [None]:
!pip install huggingface_hub



In [None]:
from huggingface_hub import HfApi

hf_api = HfApi()


In [None]:
#change dataset name (repo_id) to something relevant and useful. DO NOT REMOVE: yatharth97
# SECURITY: never inline the Hugging Face token in the notebook. Store it as a
# Colab secret named HF_TOKEN and read it at run time; any token that was ever
# committed in plain text must be revoked.
hf_token = userdata.get('HF_TOKEN')

repo_url = hf_api.create_repo(
                              # repo_id="yatharth97/10k_reports_llama2",
                              repo_id='yatharth97/PLP_llama2_1',
                              token=hf_token,
                              repo_type="dataset",
                              private=False,  # Set `private=True` if you want it to be private
                              exist_ok=True)  # Re-running the cell must not fail once the repo exists
print("Repository URL:", repo_url)


Repository URL: https://huggingface.co/datasets/yatharth97/10k_reports_llama2


In [None]:
from huggingface_hub import upload_file

# repo_id = "yatharth97/10k_reports_llama2"
repo_id = 'yatharth97/PLP_llama2_1'

# SECURITY: never inline the Hugging Face token in the notebook. Store it as a
# Colab secret named HF_TOKEN and read it at run time; any token that was ever
# committed in plain text must be revoked.
hf_token = userdata.get('HF_TOKEN')

# Upload training data
train_file_path = f'{base_dir}/train_data_llama2.txt'
train_path_in_repo = "train_data_llama2.txt"  # Name in the repository
upload_file(
    token=hf_token,
    path_or_fileobj=train_file_path,
    path_in_repo=train_path_in_repo,
    repo_id=repo_id,
    repo_type='dataset'
)

# Upload validation data
validation_file_path = f'{base_dir}/validation_data_llama2.txt'
validation_path_in_repo = "validation_data_llama2.txt"  # Name in the repository
upload_file(
    token=hf_token,
    path_or_fileobj=validation_file_path,
    path_in_repo=validation_path_in_repo,
    repo_id=repo_id,
    repo_type='dataset'
)

CommitInfo(commit_url='https://huggingface.co/datasets/yatharth97/10k_reports_llama2/commit/94d6f1bdced1489e934232a0678993087c57ce61', commit_message='Upload validation_data_llama2.txt with huggingface_hub', commit_description='', oid='94d6f1bdced1489e934232a0678993087c57ce61', pr_url=None, pr_revision=None, pr_num=None)

### IGNORE FROM THIS PART ONWARDS

In [None]:
#contents of the dataset_script.py in HF

from datasets import load_dataset_builder, DatasetDict

# Draft of a Hugging Face dataset loading script exposing the two uploaded text
# files as train/validation splits. It is not runnable as written:
# NOTE(review): in the `datasets` library `load_dataset_builder` is a loader
#   function, not a builder base class; dataset scripts normally subclass
#   `datasets.GeneratorBasedBuilder` -- confirm and fix before using this.
# NOTE(review): the names `datasets` and `data_dir` are used below but are not
#   bound anywhere in the visible code (only `load_dataset_builder` and
#   `DatasetDict` are imported) -- TODO define/import them.
class MyDataset(load_dataset_builder):
    def _split_generators(self, dl_manager):
        """Returns one SplitGenerator for the train file and one for the validation file.

        Each generator passes the path of its text file to `_generate_examples`
        through `gen_kwargs["filepath"]`. `dl_manager` is currently unused.
        """
        # dl_manager is a datasets.download.DownloadManager that can be used to download and extract URLs
        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                gen_kwargs={"filepath": os.path.join(data_dir, "train_data_llama2.txt")},
            ),
            datasets.SplitGenerator(
                name=datasets.Split.VALIDATION,
                gen_kwargs={"filepath": os.path.join(data_dir, "validation_data_llama2.txt")},
            ),
        ]

    def _generate_examples(self, filepath):
        """Intended to yield examples as (key, example) tuples; not implemented yet.

        As written the method returns None without reading `filepath`.
        """
        # Your code here to read from the file and yield examples
        pass


In [None]:
# NOTE(review): `generate_conversations_deterministic` is not defined anywhere in
# the visible notebook, so this cell raises NameError on a fresh kernel unless the
# function is provided elsewhere -- TODO define it or delete this cell.
conversations_deterministic = generate_conversations_deterministic(metrics_to_questions_map, years, info_extraction_questions_paraphrased, document_uploading_questions, summary_questions, company_names)
