# NVIDIA Earnings Transcript Scraper (Q1–Q4 FY2025)

This notebook scrapes earnings call transcripts for NVIDIA (Q1–Q4 FY2025) from Fool.com, cleans the raw text to remove disclaimers and noise, and saves each cleaned transcript to a separate file.

**Steps Covered:**
1. Scrape earnings call transcripts using Selenium
2. Clean the scraped text
3. Save a single cleaned `.txt` file for each quarter

In [11]:
# Import libraries
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
import pandas as pd
import matplotlib.pyplot as plt
import time  
import os
import re
import sys
from openai import OpenAI
import json
import io


# Step 1: Scrape Transcript from Fool.com

This function uses Selenium to open the transcript webpage, accept cookies, and extract all paragraph text from the main article body.


In [19]:
# Define scraping function
def scrape_clean_transcript(url):
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
    driver.get(url)
    time.sleep(5)

    # Accept cookie banner if present
    try:
        driver.find_element(By.XPATH,'/html/body/div[13]/div[2]/div/div/div[2]/div/div/button[2]').click()
        time.sleep(1)
    except:
        pass

    # Extract earning call content
    try:
        article_body = driver.find_element(By.XPATH, "/html/body/div[9]/div[3]/div[2]/section[2]/div/div[2]/div[1]/div[1]")
        paragraphs = article_body.find_elements(By.XPATH, "//p")
        raw_text = "\n".join([p.text for p in paragraphs if p.text.strip()])
    except Exception as e:
        print(f"Error extracting transcript from {url}: {e}")
        raw_text = ""

    driver.quit()
    return raw_text


# Step 2: Clean the Transcript Text

This function removes empty lines and disclaimer/legal language. It also collapses multiple line breaks into a single one.


In [13]:
def clean_transcript(text):
    lines = text.strip().split("\n")

    # Remove  "header" lines 
    cleaned_lines = []
    for line in lines:
        if (
            any(kw in line.lower() for kw in ["image source", "nvidia", "earnings call", "et"]) and
            len(cleaned_lines) < 5
        ):
            continue
        cleaned_lines.append(line.strip())

    text = "\n".join(cleaned_lines).strip()

    # Truncate footer section
    footer_start = text.find("More NVDA analysis")
    if footer_start != -1:
        text = text[:footer_start].strip()

    # Clean up legal terms & short lines
    final_lines = []
    for line in text.split("\n"):
        if len(line.strip()) < 5:
            continue
        if any(term in line.lower() for term in ["forward-looking", "safe harbor", "disclaimer"]):
            continue
        final_lines.append(line.strip())

    cleaned_text = "\n".join(final_lines)
    return re.sub(r'\n+', '\n', cleaned_text)



# Step 3: Save Cleaned Transcript to File

This function saves the cleaned transcript to a `.txt` file inside the `clean_transcripts/` directory.


In [14]:
# Save clean trasncripts as txt files
def save_transcript(text, filename):
    os.makedirs("clean_transcripts", exist_ok=True)
    path = os.path.join("clean_transcripts", filename)
    with open(path, "w", encoding="utf-8") as f:
        f.write(text)
    print(f"Saved: {path}")

# Step 4: Process All Four Quarters (Q1–Q4 FY2025)

We define a dictionary of transcript URLs and filenames. The script loops through them, scrapes the data, cleans it, and saves the result.


In [15]:
transcripts = {
    "NVDA_q1_2025.txt": "https://www.fool.com/earnings/call-transcripts/2024/05/29/nvidia-nvda-q1-2025-earnings-call-transcript/",
    "NVDA_q2_2025.txt": "https://www.fool.com/earnings/call-transcripts/2024/08/28/nvidia-nvda-q2-2025-earnings-call-transcript/",
    "NVDA_q3_2025.txt": "https://www.fool.com/earnings/call-transcripts/2024/11/20/nvidia-nvda-q3-2025-earnings-call-transcript/",
    "NVDA_q4_2025.txt": "https://www.fool.com/earnings/call-transcripts/2025/02/26/nvidia-nvda-q4-2025-earnings-call-transcript/"
}

# Process each transcript URL
for filename, url in transcripts.items():
    print(f"\nProcessing {filename} ...")
    raw = scrape_clean_transcript(url)
    clean = clean_transcript(raw)
    save_transcript(clean, filename)


Processing NVDA_q1_2025.txt ...


KeyboardInterrupt: 

# Step 5: Preprocess the Cleaned Transcripts

In [None]:
# Define the input transcript files and associated quarters
transcript_files = {
    'Q1_2025': 'clean_transcripts/NVDA_q1_2025.txt',
    'Q2_2025': 'clean_transcripts/NVDA_q2_2025.txt',
    'Q3_2025': 'clean_transcripts/NVDA_q3_2025.txt',
    'Q4_2025': 'clean_transcripts/NVDA_q4_2025.txt',
}

records = []

# Improved regex pattern for speaker lines
speaker_pattern = re.compile(
    r'^(?P<speaker>([A-Z][a-zA-Z.,\'-]+\s){1,3}[A-Z][a-zA-Z.,\'-]+)\s--\s(?P<title>[A-Za-z][^:\n]+)$'
)

for quarter, path in transcript_files.items():
    with open(path, 'r', encoding='utf-8') as f:
        current_speaker = None
        current_title = None
        buffer = []

        for raw_line in f:
            line = raw_line.strip()

            # Remove notes like [Operator Instructions]
            line = re.sub(r'\[.*?\]', '', line).strip()
            if not line:
                continue

            match = speaker_pattern.match(line)
            if match:
                # Save previous speaker block
                if current_speaker and buffer:
                    content = ' '.join(buffer).strip()
                    if len(content) > 30:
                        records.append({
                            'quarter': quarter,
                            'speaker': current_speaker,
                            'title': current_title,
                            'content': content
                        })

                # Start new block
                current_speaker = match.group('speaker').strip()
                current_title = match.group('title').strip()
                buffer = []

            else:
                # If line is misparsed (e.g. a regular sentence starting with "And I..."), add to buffer
                buffer.append(line)

        # If there is content in buffer with current speaker > 30 characters then add to final record
        if current_speaker and buffer:
            content = ' '.join(buffer).strip()
            if len(content) > 30:
                records.append({
                    'quarter': quarter,
                    'speaker': current_speaker,
                    'title': current_title,
                    'content': content
                })

# Save to DataFrame
df = pd.DataFrame(records)

# Forward-fill missing speakers and titles
df['speaker'].ffill(inplace=True)
df['title'].ffill(inplace=True)

# Save to CSV
output_path = 'clean_transcripts/NVIDIA_all_quarters_speaker_blocks.csv'
df.to_csv(output_path, index=False)
print(f"Saved {len(df)} speaker blocks to {output_path}")

df['speaker'].ffill(inplace=True)
if 'title' in df.columns:
    df['title'].ffill(inplace=True)

# Save cleaned speaker blocks
output_path = 'clean_transcripts/NVIDIA_all_quarters_speaker_blocks.csv'
df.to_csv(output_path, index=False)

print(f"Saved {len(df)} speaker blocks to {output_path}")


Saved 103 speaker blocks to clean_transcripts/NVIDIA_all_quarters_speaker_blocks.csv
Saved 103 speaker blocks to clean_transcripts/NVIDIA_all_quarters_speaker_blocks.csv


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['speaker'].ffill(inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['title'].ffill(inplace=True)


In [None]:
# Load the uploaded CSV file containing all quarters' speaker blocks
df_speaker_blocks = pd.read_csv("clean_transcripts/NVIDIA_all_quarters_speaker_blocks.csv")

# Display the first few rows to confirm structure
df_speaker_blocks.head()

Unnamed: 0,quarter,speaker,title,content
0,Q1_2025,Simona Jankowski,"Vice President, Investor Relations",They just revealed what they believe are the 1...
1,Q1_2025,Colette Kress,"Executive Vice President, Chief Financial Officer","Thanks, Simona. Q1 was another record quarter...."
2,Q1_2025,Jensen Huang,President and Chief Operating Officer,"Thanks, Colette. The industry is going through..."
3,Q1_2025,Simona Jankowski,"Vice President, Investor Relations","Thank you, Jensen. We will now open the call f..."
4,Q1_2025,Stacy Rasgon,AllianceBernstein -- Analyst,"Hi, guys. Thanks for taking my questions. My f..."


# Step 6: Speaker-level Sentiment Analysis

In [None]:
from transformers import BertTokenizer, BertForSequenceClassification, pipeline
import pandas as pd

# Load your speaker blocks file (all quarters)
df = pd.read_csv("clean_transcripts/NVIDIA_all_quarters_speaker_blocks.csv")

# FinBERT model setup
model_name = "ProsusAI/finbert"
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(model_name)
finbert = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)

import textwrap

def get_sentiment_chunks(text, max_chunk_words=200):
    if not isinstance(text, str) or not text.strip():
        return [("EMPTY", 0.0)]

    # Split text into words, then wrap into word chunks
    words = text.split()
    chunks = [' '.join(words[i:i + max_chunk_words]) for i in range(0, len(words), max_chunk_words)]

    results = []
    for chunk in chunks:
        try:
            result = finbert(chunk)[0]  # Let pipeline handle tokenization
            results.append((result["label"], result["score"]))
        except:
            results.append(("ERROR", 0.0))

    return results

# Apply FinBERT to each content (in chunks)
df["sentiment_chunks"] = df["content"].apply(get_sentiment_chunks)

# Aggregation function (majority vote + max score for top label)
def aggregate_sentiment(chunk_list):
    if not chunk_list or chunk_list == [("EMPTY", 0.0)]:
        return "EMPTY", 0.0

    from collections import Counter
    label_counts = Counter([label for label, _ in chunk_list])
    top_label = label_counts.most_common(1)[0][0]
    max_conf = max(score for label, score in chunk_list if label == top_label)
    return top_label, max_conf

# Apply aggregation
df[["sentiment", "confidence"]] = df["sentiment_chunks"].apply(lambda x: pd.Series(aggregate_sentiment(x)))


def map_five_categories(row):
    label = row['sentiment'].lower()
    score = row['confidence']

    if label == 'positive':
        return 'Strong Positive' if score > 0.85 else 'Slightly Positive'
    elif label == 'negative':
        return 'Strong Negative' if score > 0.85 else 'Sli Negative'
    elif label == 'neutral':
        return 'Neutral'
    else:
        return 'Uncertain'

df['sentiment_category'] = df.apply(map_five_categories, axis=1)

#  Re-order columns
df = df[['quarter', 'speaker', 'title', 'content', 'sentiment', 'confidence', 'sentiment_category']]

# Save output
df.to_csv("clean_transcripts/NVIDIA_finbert_output.csv", index=False)


Device set to use mps:0


# Step 7: using LLMs to get sentiment score for each block

## API call

In [None]:
api_key = 'sk-58c0df73519c42debe27d41e164d455a'
base_url = "https://api.deepseek.com"
client = OpenAI(api_key=api_key, base_url="https://api.deepseek.com")
seed = 42

## Getting the sentiment score for each blocks of text using an LLM.

In [None]:
# Build a payload of {id, text} objects
payload = (
    df[["content"]]
    .reset_index()                             # bring the DataFrame index into a column
    .rename(columns={"index": "id", "content": "text"})
    .to_dict(orient="records")
)

# Craft your messages
system_prompt = """
You are a sentiment-analysis engine.
I will give you a JSON array of objects like:
  [ { "id": 0, "text": "…"}, { "id": 1, "text": "…"}, … ]

For each text, return a probability distribution over the five sentiment classes:
Strong Negative, Slightly Negative, Neutral, Slightly Positive, Strong Positive.
Label your columns:

id,
LLM_sentiment, 
LLM_pct_strong_positive,
LLM_pct_slightly_positive,
LLM_pct_neutral,
LLM_pct_slightly_negative,
LLM_pct_strong_negative

—with each LLM_pct_* a float from 0.0–1.0 summing to 1.0, one row per paragraph,
no markdown fences or commentary, pure CSV.
-for sentiment, choose one of the five classes. 
"""

resp = client.chat.completions.create(
    model="deepseek-chat",
    messages=[
        {"role": "system", "content": system_prompt},
        {"role": "user",   "content": json.dumps(payload, indent=2)}
    ],
    temperature=0.0,
    top_p=1.0
)


## Merging with the original df

In [None]:
msg = resp.choices[0].message
csv_str = msg.content
# Convert CSV string to DataFrame
sent_df = pd.read_csv(io.StringIO(csv_str))
df = df.reset_index(drop=False).rename(columns={'index':'id'})
df_with_sentiment = df.merge(
    sent_df,            # contains id, sentiment, pct_*
    on='id',            # join key
    how='left'          # keep every original row
)

df_with_sentiment.drop(columns=['id'], inplace=True)  # drop the id column

df_with_sentiment

Unnamed: 0,quarter,speaker,title,content,sentiment,confidence,sentiment_category,role_category,LLM_sentiment,LLM_pct_strong_positive,LLM_pct_slightly_positive,LLM_pct_neutral,LLM_pct_slightly_negative,LLM_pct_strong_negative
0,Q1_2025,Simona Jankowski,"Vice President, Investor Relations",They just revealed what they believe are the 1...,neutral,0.949520,Neutral,Other,Neutral,0.1,0.20,0.60,0.1,0.0
1,Q1_2025,Colette Kress,"Executive Vice President, Chief Financial Officer","Thanks, Simona. Q1 was another record quarter....",positive,0.952649,Strong Positive,CFO,Strong Positive,0.7,0.20,0.10,0.0,0.0
2,Q1_2025,Jensen Huang,President and Chief Operating Officer,"Thanks, Colette. The industry is going through...",neutral,0.758598,Neutral,Other,Strong Positive,0.8,0.15,0.05,0.0,0.0
3,Q1_2025,Simona Jankowski,"Vice President, Investor Relations","Thank you, Jensen. We will now open the call f...",neutral,0.931907,Neutral,Other,Neutral,0.0,0.10,0.80,0.1,0.0
4,Q1_2025,Stacy Rasgon,AllianceBernstein -- Analyst,"Hi, guys. Thanks for taking my questions. My f...",neutral,0.893501,Neutral,Analyst,Neutral,0.1,0.20,0.60,0.1,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
98,Q4_2025,Jensen Huang,President and Chief Executive Officer,"Yeah. I appreciate it. First of all, people ar...",neutral,0.899786,Neutral,CEO,Neutral,0.1,0.20,0.60,0.1,0.0
99,Q4_2025,Atif Malik,Analyst,Hi. Thank you for taking my question. I have a...,neutral,0.807783,Neutral,Analyst,Neutral,0.1,0.20,0.60,0.1,0.0
100,Q4_2025,Colette M. Kress,"Chief Financial Officer, Executive Vice President",Yeah. Thanks for the question. Our gross margi...,neutral,0.770526,Neutral,CFO,Neutral,0.1,0.20,0.60,0.1,0.0
101,Q4_2025,Colette M. Kress,"Chief Financial Officer, Executive Vice President",We are going to open up to Jensen. A couple of...,neutral,0.855628,Neutral,CFO,Neutral,0.1,0.20,0.60,0.1,0.0


In [21]:
#save df_with_sentiment to csv
output_path = 'clean_transcripts/NVIDIA_finbert_deepseek_output.csv'
df_with_sentiment.to_csv(output_path, index=False)
print(f"Saved sentiment output to {output_path}")

NameError: name 'df_with_sentiment' is not defined

In [None]:
import pandas as pd

def clean_mf_junk_in_df(df, text_col="content"):
    df = df.copy()
    df[text_col] = df[text_col].astype(str).map(_demojibake_fix)
    mask_junk = df[text_col].str.contains(_MF_JUNK_RE, na=False)
    df = df.loc[~mask_junk].copy()
    df[text_col] = df[text_col].str.replace(r"\s+", " ", regex=True).str.strip()
    df = df[df[text_col].str.len() > 3].copy()
    return df.reset_index(drop=True)

# One-off cleanup of a saved file:
path = "clean_transcripts/NVIDIA_finbert_deepseek_output.csv"  # change as needed
df = pd.read_csv(path)
df_clean = clean_mf_junk_in_df(df, text_col="content")
out_path = path.replace(".csv", "_clean.csv")
df_clean.to_csv(out_path, index=False)
print(f"Removed {len(df)-len(df_clean)} junk rows. Saved: {out_path}")


In [22]:
NVIDIA_final = pd.read_csv("NVIDIA_full.csv")

# Select only the required columns
NVIDIA_final = NVIDIA_final[[
    "quarter",
    "content",
    "sentiment_category",
    "role_category",
    "LLM_sentiment",
    "Manual Annotation"
]]

# Rename columns
NVIDIA_final = NVIDIA_final.rename(columns={
    "title": "role_category",
    "sentiment_category": "FinBERT_sentiment",
    "Manual Annotations": "manual_annotation"
})

# Rearrange columns so role_category comes before FinBERT_sentiment
NVIDIA_final = NVIDIA_final[[
    "quarter",
    "content",
    "role_category",
    "FinBERT_sentiment",
    "LLM_sentiment",
    "manual_annotation"
]]

# Check the updated dataframe
print(NVIDIA_final)

FileNotFoundError: [Errno 2] No such file or directory: 'NVIDIA_full.csv'