In [1]:
#install packages for Google Colab
!pip install openai
!pip install chromadb
!pip install tiktoken
!pip install langchain
!pip install langchain-community
!pip install gradio

Collecting tiktoken
  Using cached tiktoken-0.9.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Using cached tiktoken-0.9.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
Installing collected packages: tiktoken
Successfully installed tiktoken-0.9.0
Collecting langchain-community
  Downloading langchain_community-0.3.19-py3-none-any.whl.metadata (2.4 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain-community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain-community)
  Downloading pydantic_settings-2.8.1-py3-none-any.whl.metadata (3.5 kB)
Collecting httpx-sse<1.0.0,>=0.4.0 (from langchain-community)
  Downloading httpx_sse-0.4.0-py3-none-any.whl.metadata (9.0 kB)
Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading marshmallow-3.26.1-py3-none-any.whl.metadata (7.3 kB)
Collecting typing-ins

In [2]:
#import packages
from openai import OpenAI
import chromadb
import numpy as np
import pandas as pd
import os
from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [3]:
# Load the .env file that holds OPENAI_API_KEY (the variable the OpenAI client reads)
from dotenv import load_dotenv
load_dotenv()

True

In [4]:
# Initialize OpenAI API
client = OpenAI()

In [5]:
# set embedding model name
embedding_model_name = "text-embedding-ada-002"
#embedding_model_name = "text-embedding-3-large"  # tried this model but did not find any improvement

In [6]:
# Embed a piece of text with the OpenAI model selected in `embedding_model_name`
def generate_embedding(text):
    """Return the embedding vector for ``text``.

    Sends ``text`` to the embeddings endpoint of the module-level ``client``,
    using the model named by ``embedding_model_name``, and returns the
    ``embedding`` attribute of the first item in the response's ``data``.
    """
    embedding_request = {"model": embedding_model_name, "input": text}
    first_item = client.embeddings.create(**embedding_request).data[0]
    return first_item.embedding

In [7]:
# Split text into overlapping chunks.
# A first attempt used chunk_size=1000 with chunk_overlap=200;
# the defaults were then changed to 3000 characters with a 500 character overlap.
def chunk_text(text, chunk_size=3000, chunk_overlap=500):
    """Split ``text`` with a ``RecursiveCharacterTextSplitter``.

    Args:
        text: The text to split.
        chunk_size: Passed to the splitter as ``chunk_size``.
        chunk_overlap: Passed to the splitter as ``chunk_overlap``.

    Returns:
        Whatever the splitter's ``split_text`` returns for ``text``.
    """
    splitter_options = dict(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    return RecursiveCharacterTextSplitter(**splitter_options).split_text(text)

In [8]:
# Store embeddings and metadata
def store_embeddings(df):
    """Chunk, embed and store every transcript in ``df`` in the module-level ``collection``.

    Nothing is stored when the collection already holds at least one record.

    Args:
        df: DataFrame with the columns ``Transcript``, ``Ticker``, ``Year``,
            ``Quarter``, ``Industry`` and ``Date``.

    Returns:
        None. A status line is printed in both the "skipped" and the "stored" case.
    """
    # count() returns just the number of records; the previous get() call fetched
    # the whole collection only to test whether it was empty.
    # NOTE(review): this check is all-or-nothing - if an earlier run stopped part-way,
    # the remaining transcripts are never added. Confirm whether that is acceptable.
    if collection.count() > 0:
        print("Vector database already exists. Skipping embedding storage.")
        return

    stored_chunks = 0
    for _, row in df.iterrows():
        text_chunks = chunk_text(row['Transcript'])

        # The metadata is the same for every chunk of one transcript, so build it once per row
        row_metadata = {
            "ticker": row['Ticker'],
            "year": row['Year'],
            "quarter": row['Quarter'],
            "industry": row['Industry'],
            "date": row['Date']
        }

        for i, chunk in enumerate(text_chunks):
            collection.add(
                documents=[chunk],
                embeddings=[generate_embedding(chunk)],
                # Copy so that every stored record gets its own dict
                metadatas=[dict(row_metadata)],
                # Unique id per chunk: <ticker>_<year>_<quarter>_<chunk index>
                ids=[f"{row['Ticker']}_{row['Year']}_{row['Quarter']}_{i}"]
            )
            stored_chunks += 1

    # Report the chunks actually stored instead of splitting every transcript a second time
    print(f"Stored {stored_chunks} document chunks with metadata")


In [9]:
# Create a vector embedding for the question and query the vector database using the embedding and metadata search criteria
def query_rag(query, ticker, year, quarter, top_n=3, model="gpt-4", max_tokens=300):
    """Answer ``query`` from the stored chunks of one earnings call.

    The question is embedded with ``generate_embedding``, the module-level
    ``collection`` is queried for the ``top_n`` nearest chunks whose metadata
    matches ``ticker``, ``year`` and ``quarter``, and the chunks are passed to a
    chat model together with the question.

    Args:
        query: The user's question.
        ticker: Value the ``ticker`` metadata field must equal.
        year: Value the ``year`` metadata field must equal.
        quarter: Value the ``quarter`` metadata field must equal.
        top_n: Number of chunks to retrieve and place in the prompt.
        model: Chat model used to write the answer (default unchanged: "gpt-4").
        max_tokens: Token limit passed to the chat completion (default unchanged: 300).

    Returns:
        dict with the keys ``answer`` (str), ``metadata`` (metadata of the first
        retrieved chunk, or ``{}``) and ``document_chunks`` (list of chunk texts).
        When nothing is retrieved, ``answer`` is "No relevant documents found."
        and no chat completion is requested.
    """
    query_embedding = generate_embedding(query)
    results = collection.query(
        query_embeddings=[query_embedding],
        n_results=top_n,
        where={"$and": [{"ticker": ticker}, {"year": year}, {"quarter": quarter}]}  # Metadata filtering
    )

    # Tolerate a missing key, None, or an empty outer list instead of raising on `[0]`
    doc_batches = results.get("documents") or [[]]
    top_docs = (doc_batches[0] or [])[:top_n]
    if not top_docs:
        return {"answer": "No relevant documents found.", "metadata": {}, "document_chunks": []}

    meta_batches = results.get("metadatas") or [[]]
    top_metas = (meta_batches[0] or [])[:top_n]

    context = "\n\n".join(f'Document Chunk {i}: "{doc}"' for i, doc in enumerate(top_docs, start=1))

    # Create a prompt with the user question and document chunks and pass it to the chat model.
    # The leading whitespace inside the string is part of the prompt and is kept unchanged.
    prompt = f"""
        Given the following extracted document chunks:
        {context}

        Provide a concise answer to the query: "{query}"
        """

    response = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": "You are a financial research assistant."},
            {"role": "user", "content": prompt}
        ],
        max_tokens=max_tokens
    )

    return {
        # message.content can be None; treat that as an empty answer rather than crashing on .strip()
        "answer": (response.choices[0].message.content or "").strip(),
        "metadata": (top_metas[0] if top_metas else None) or {},
        "document_chunks": top_docs
    }


In [10]:
# Function to evaluate faithfulness and relevance
def evaluate_response(answer, query, context, model="gpt-4-turbo", max_tokens=300):
    """Ask a judge LLM to rate ``answer`` for faithfulness and relevance.

    Args:
        answer: The generated answer to evaluate.
        query: The user question the answer responds to.
        context: The document text the answer was generated from.
        model: Chat model used as the judge (default unchanged: "gpt-4-turbo").
        max_tokens: Token limit passed to the chat completion (default unchanged: 300).

    Returns:
        The judge's reply as a stripped string; an empty string when the reply
        has no text content.
    """
    # Create a prompt to evaluate the faithfulness and the relevance of the answer - LLM as a Judge
    evaluation_prompt = f"""
    Given the user query: "{query}"
    And the following document context:
    {context}

    Evaluate the answer: "{answer}"

    1. Faithfulness: Does the answer correctly reflect the facts presented in the document context? (Score: 1-5)
    2. Relevance: How well does the answer address the user's query? If the LLM is unable to answer the question as the information is not present in the document chunks, the relevance should be 1 (Score: 1-5)

    Provide an overall quality score from 1 to 5 based on these two criteria.
    """

    # The judge defaults to GPT-4 Turbo (the earlier comment said "GPT-3-Turbo", which did not match the code)
    eval_response = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": "You are an AI assistant that evaluates text responses."},
            {"role": "user", "content": evaluation_prompt}
        ],
        max_tokens=max_tokens
    )

    # message.content can be None; return an empty string instead of crashing on .strip()
    return (eval_response.choices[0].message.content or "").strip()


In [11]:
# Google Colab - Mount google drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [12]:
#Create DB Path for Vector Database - Path is different depending on the embedding model
DB_PATH = f"/content/drive/My Drive/Earning_Calls/embeddings/{embedding_model_name}/chroma_db"

In [13]:
# Create directories if do not exist already
os.makedirs(DB_PATH, exist_ok=True)

In [14]:
# Create ChromaDB client
chroma_client = chromadb.PersistentClient(path=DB_PATH)

In [15]:
# Create or Get ChromaDB collection
collection = chroma_client.get_or_create_collection("earnings_calls")

In [16]:
# Load in the CSV with the Transcripts
transcripts_df = pd.read_csv("Earnings_Call_Transcripts_Formatted_2024.csv")

In [17]:
# Look at the information in the dataframe
# There are no null values
transcripts_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 219 entries, 0 to 218
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Ticker      219 non-null    object
 1   Industry    219 non-null    object
 2   Year        219 non-null    int64 
 3   Quarter     219 non-null    int64 
 4   Date        219 non-null    object
 5   Transcript  219 non-null    object
dtypes: int64(2), object(4)
memory usage: 10.4+ KB


In [18]:
# Examine the first five rows
transcripts_df.head()

Unnamed: 0,Ticker,Industry,Year,Quarter,Date,Transcript
0,AAPL,AI-focused Tech,2024,1,2024-02-01,"[Operator] Good day, and welcome to the Apple ..."
1,AAPL,AI-focused Tech,2024,2,2024-05-02,"[Suhasini Chandramouli] Good Afternoon, and we..."
2,AAPL,AI-focused Tech,2024,3,2024-08-01,[Suhasini Chandramouli] Good afternoon and wel...
3,AAPL,AI-focused Tech,2024,4,2024-10-31,"[Suhasini Chandramouli] Good afternoon, and we..."
4,ABBV,Health Care,2024,1,2024-04-26,[Operator] Good morning and thank you for stan...


In [19]:
# Keep only the IBM, MSFT and GOOGL rows: fewer transcripts means faster embedding creation and lower OpenAI cost
transcripts_df = transcripts_df.loc[lambda frame: frame['Ticker'].isin(['IBM', 'MSFT', 'GOOGL'])]

In [20]:
# Check all rows are there for the three tickers as expected
transcripts_df.head(15)

Unnamed: 0,Ticker,Industry,Year,Quarter,Date,Transcript
83,GOOGL,AI-focused Tech,2024,1,2024-04-25,"[Operator] Welcome, everyone. Thank you for st..."
84,GOOGL,AI-focused Tech,2024,2,2024-07-23,"[Operator] Welcome, everyone. Thank you for st..."
85,GOOGL,AI-focused Tech,2024,3,2024-10-29,"[Operator] Welcome, everyone. Thank you for st..."
86,GOOGL,AI-focused Tech,2024,4,2025-02-04,"[Operator] Welcome, everyone. Thank you for st..."
95,IBM,AI-focused Tech,2024,1,2024-04-24,"[Operator] Welcome, and thank you for standing..."
96,IBM,AI-focused Tech,2024,2,2024-07-24,[Operator] Welcome and thank you for standing-...
97,IBM,AI-focused Tech,2024,3,2024-10-23,"[Operator] Welcome, and thank you for standing..."
98,IBM,AI-focused Tech,2024,4,2025-01-29,"[Operator] Welcome, and thank you for standing..."
135,MSFT,AI-focused Tech,2024,1,2023-10-24,"[Operator] Greetings, and welcome to the Micro..."
136,MSFT,AI-focused Tech,2024,2,2024-01-30,"[Operator] Greetings, and welcome to the Micro..."


In [21]:
# Check different values of industry in dataframe
transcripts_df['Industry'].value_counts()

Unnamed: 0_level_0,count
Industry,Unnamed: 1_level_1
AI-focused Tech,12


In [22]:
# Distinct industries in the dataframe, in alphabetical order
industries_list = transcripts_df['Industry'].unique().tolist()
industries_list.sort()

In [23]:
# Display list of industries
industries_list

['AI-focused Tech']

In [24]:
# Check different values of ticker in the dataframe
transcripts_df['Ticker'].value_counts()

Unnamed: 0_level_0,count
Ticker,Unnamed: 1_level_1
GOOGL,4
IBM,4
MSFT,4


In [25]:
# Distinct tickers of the 'AI-focused Tech' industry, in alphabetical order
tickers_list = transcripts_df.loc[transcripts_df['Industry'] == 'AI-focused Tech', 'Ticker'].unique().tolist()
tickers_list.sort()

In [26]:
# display list of tickers
tickers_list

['GOOGL', 'IBM', 'MSFT']

In [27]:
# Distinct years in the dataframe, in ascending order
year_list = transcripts_df['Year'].unique().tolist()
year_list.sort()

In [28]:
# Display list of years
year_list

[2024]

In [29]:
# Distinct quarters in the dataframe, in ascending order
quarter_list = transcripts_df['Quarter'].unique().tolist()
quarter_list.sort()

In [30]:
# Display list of quarters
quarter_list

[1, 2, 3, 4]

In [31]:
# Retrieve an example transcript from the dataframe and print
transcript = transcripts_df.iloc[0]["Transcript"]
print(transcript)

[Operator] Welcome, everyone. Thank you for standing by for the Alphabet First Quarter 2024 Earnings Conference Call. [Operator Instructions]   I would now like to hand the conference over to your speaker today, Jim Friedland, Director of Investor Relations. Please go ahead.  [James Friedland] Thank you. Good afternoon, everyone, and welcome to Alphabet's First Quarter 2024 Earnings Conference Call. With us today are Sundar Pichai, Philipp Schindler, and Ruth Porat.   Now I'll quickly cover the safe harbor. Some of the statements that we make today regarding our business, operations, and financial performance may be considered forward-looking. Such statements are based on current expectations and assumptions that are subject to a number of risks and uncertainties. Actual results could differ materially. Please refer to our Forms 10-K and 10-Q, including the risk factors. We undertake no obligation to update any forward-looking statement.   During this call, we will present both GAAP an

In [32]:
# Store embeddings with metadata
store_embeddings(transcripts_df)


Vector database already exists. Skipping embedding storage.


In [33]:
# Manually ask a query
# Ask user for search criteria
user_query = input("Enter your question: ").strip()
# Tickers are stored in upper case (e.g. 'GOOGL') and the metadata filter compares for equality,
# so normalise what the user typed; otherwise 'googl' or 'GOOGL ' finds no documents.
user_ticker = input("Enter company ticker: ").strip().upper()
user_year = int(input("Enter year: "))
user_quarter = int(input("Enter quarter: "))
print("\n")

# print out the answer and metadata and document chunks
answer = query_rag(user_query, user_ticker, user_year, user_quarter)
print("Answer:", answer["answer"], "\n")
print("Metadata:", answer["metadata"], "\n")
print("Document Chunks:", '\n\n\n'.join(answer["document_chunks"]))


Enter your question: Who is on the call?
Enter company ticker: GOOGL
Enter year: 2024
Enter quarter: 3


Answer: The individuals present on the call are Jim Friedland, Senior Director of Investor Relations; Sundar Pichai; Philipp Schindler; Anat Ashkenazi; Ross Sandler; Justin Post; Stephen Ju; and the unnamed Operator. 

Metadata: {'date': '2024-10-29', 'industry': 'AI-focused Tech', 'quarter': 3, 'ticker': 'GOOGL', 'year': 2024} 

Document Chunks: [Operator] Welcome, everyone. Thank you for standing by for the Alphabet Third Quarter 2024 Earnings Conference Call. At this time, all participants are in a listen-only mode. After the speaker presentation, there will be a question-and-answer session. [Operator Instructions] I would now like to hand the conference over to your speaker today, Jim Friedland, Senior Director of Investor Relations. Please go ahead. [Jim Friedland] Thank you. Good afternoon, everyone, and welcome to Alphabet's third quarter 2024 earnings conference call. With u

In [34]:
# Function to take user selection of industry, ticker, year and quarter and user question and retrieve answer via RAG
def query_and_display(user_industry, user_ticker, user_year, user_quarter, user_query):
    """Run the RAG query for the UI and return the four values it displays.

    Args:
        user_industry: Selected industry (not used for the query itself).
        user_ticker: Selected ticker.
        user_year: Selected year; converted with ``int``.
        user_quarter: Selected quarter; converted with ``int``.
        user_query: The question to answer.

    Returns:
        Tuple ``(answer, document_chunks_text, transcript_date, quality_score)``.
        When a selection or the question is missing, the first element is a hint
        for the user and the others are empty strings. When no chunks are
        retrieved, the date is an empty string and the answer is not evaluated.
    """
    # An unselected dropdown arrives as None; int(None) would raise, so ask the user instead
    selection_missing = any(value in (None, "") for value in (user_ticker, user_year, user_quarter))
    if selection_missing or not (user_query or "").strip():
        return "Please select a ticker, year and quarter, and enter a question.", "", "", ""

    answer = query_rag(user_query, user_ticker, int(user_year), int(user_quarter))
    chunks_text = '\n\n\n'.join(answer["document_chunks"])

    # Nothing retrieved: metadata is empty (no "date" key) and there is nothing to judge,
    # so skip the evaluation LLM call and return an empty date.
    if not answer["document_chunks"]:
        return answer["answer"], chunks_text, "", "Not evaluated: no transcript extracts were retrieved."

    # Using the user question, answer and document chunks, evaluate the answer for faithfulness and relevance
    quality_score = evaluate_response(answer["answer"], user_query, chunks_text)

    # return the values to be displayed in the UI
    return answer["answer"], chunks_text, answer["metadata"].get("date", ""), quality_score


In [35]:
# import gradio for UI
import gradio as gr

In [36]:
# Refresh the ticker choices every time the user picks a different industry
def update_ticker_dropdown(selected_industry):
    """Return a Gradio update whose choices are the sorted tickers of ``selected_industry``."""
    industry_mask = transcripts_df['Industry'] == selected_industry
    tickers_in_industry = transcripts_df.loc[industry_mask, 'Ticker'].unique().tolist()
    tickers_in_industry.sort()

    # Hand the new choice list to the ticker dropdown
    return gr.update(choices=tickers_in_industry)

In [37]:
# Frequently Asked Earnings Call Questions
faq_questions = [
    "What was the company's total revenue?",
    "What was the company's net income?",
    "What was the gross profit for the quarter?",
    "What were the total expenses reported?",
    "Did the company mention any major investments?",
    "What was the reported earnings per share (EPS)?",
    "Did the company discuss any cost reductions?",
    "What were the main reasons for revenue increase or decrease?",
    "Did the company announce any new products or services?",
    "Were any specific market trends mentioned in the call?",
    "Did the company provide any financial projections for the next quarter?",
    "Was there any mention of changes in leadership or management?",
    "Did the company talk about customer growth or decline?",
    "Were any major business partnerships or deals discussed?",
    "Did the company mention any changes in pricing or costs?",
]

In [38]:
# Enhanced Gradio UI
# Builds the Blocks app `app`: custom CSS, industry/ticker/year/quarter dropdowns, a question box
# with an FAQ picker, an "Analyze" button and four read-only result boxes, then launches it.
with gr.Blocks(css="""
    .gradio-container {
        max-width: 1000px;
        margin: auto;
        padding: 30px;
        border-radius: 8px;
        background: #ffffff;
        box-shadow: 0px 4px 10px rgba(0, 0, 0, 0.1);
    }
    .gr-textbox textarea, .gr-dropdown select {
        width: 100%;
        height: 180px;  /* Further increased height for answers */
        background: #f8f9fa;
        color: #2c3e50;
        border: 1px solid #ccc;
        padding: 15px;
        border-radius: 5px;
        font-size: 16px;
    }
    .gr-button-primary {
        background: linear-gradient(135deg, #004085, #007BFF);
        color: white;
        border-radius: 5px;
        padding: 16px;
        font-weight: bold;
        width: 100%;
        font-size: 18px;
        transition: background 0.3s ease-in-out;
    }
    .gr-button-primary:hover {
        background: #002752;
    }
""") as app:

    # Page title and subtitle
    gr.Markdown("# Earnings Call Analyzer")
    gr.Markdown("### Gain insights into financial performance with AI-driven analysis.")

    # First row: industry and ticker selection, populated from the lists built earlier in the notebook
    with gr.Row():
        industry_dropdown = gr.Dropdown(choices=industries_list, label="Select Industry", interactive=True)
        ticker_dropdown = gr.Dropdown(choices=tickers_list, label="Select Ticker", interactive=True)

    # Changing the industry calls update_ticker_dropdown and applies its result to the ticker dropdown
    industry_dropdown.change(update_ticker_dropdown, inputs=industry_dropdown, outputs=ticker_dropdown)

    # Second row: year and quarter selection
    with gr.Row():
        year_dropdown = gr.Dropdown(choices=year_list, label="Select Year", interactive=True)
        quarter_dropdown = gr.Dropdown(choices=quarter_list, label="Select Quarter", interactive=True)

    # Free-text question; picking an FAQ entry copies it into the question textbox
    user_query_textbox = gr.Textbox(lines=3, label="Ask a question", interactive=True)
    faq_dropdown = gr.Dropdown(choices=faq_questions, label="Frequently Asked Questions", interactive=True)
    faq_dropdown.change(lambda q: gr.update(value=q), inputs=faq_dropdown, outputs=user_query_textbox)

    submit_btn = gr.Button("Analyze", variant="primary")

    # Read-only output boxes, one per value returned by query_and_display
    with gr.Row():
        answer_textbox = gr.Textbox(label="Analysis Result", interactive=False, lines=8)
    with gr.Row():
        doc_chunks_textbox = gr.Textbox(label="Transcript Extracts", interactive=False, lines=8)
    with gr.Row():
        doc_date_textbox = gr.Textbox(label="Transcript Date", interactive=False)
    with gr.Row():
        quality_score_textbox = gr.Textbox(label="Quality Score", interactive=False)

    # The order of `inputs` matches query_and_display's parameters and the order of `outputs`
    # matches the four values it returns
    submit_btn.click(
        query_and_display,
        inputs=[industry_dropdown, ticker_dropdown, year_dropdown, quarter_dropdown, user_query_textbox],
        outputs=[answer_textbox, doc_chunks_textbox, doc_date_textbox, quality_score_textbox]
    )

# NOTE(review): the recorded output below says Colab needs debug=True in launch() to show errors
# in the notebook, and that share=True is set automatically there - confirm whether debug=True is wanted.
app.launch(show_error=True)

Running Gradio in a Colab notebook requires sharing enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://9d0d3675d79d7891c7.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


