In [337]:
import json
import re
import pdfplumber
import pandas as pd
import os
import spacy

# Step 1: convert pdf to dataframe (sentence level)

In [None]:
# Text tokenization with spaCy
nlp = spacy.load("en_core_web_sm")

def split_sentences(text: str) -> list[str]:
    # strip to avoid leading/trailing blanks
    doc = nlp(text.strip())
    return [sent.text for sent in doc.sents]

In [339]:
# Define 3 speaker patterns
speaker_patterns = [
    # 1. "Name, Title"
    re.compile(
        r'^(?=[A-Z][A-Za-z0-9 ]{3,20},\s*[A-Z][A-Za-z ]{1,20}$)'
        r'(?P<speaker>[A-Z][A-Za-z0-9 ]+,\s*[A-Z][A-Za-z ]+)\s*$',
        re.MULTILINE
    ),

    # 2. "Name:"
    re.compile(
        r'^(?=[A-Z][A-Za-z0-9 ]{1,20}:)'
        r'(?P<speaker>(?:[A-Z]{2,}|[A-Z][a-z]+)'
        r'(?:\s+(?:[A-Z]{2,}|[A-Z][a-z]+)){0,4}):',
        re.MULTILINE
    ),

    # 3. "Name, Title, Subtitle"
    re.compile(
        r'^(?P<speaker>'                                
        r'[A-Z][A-Za-z0-9]+(?:\s+[A-Z][A-Za-z0-9]+)*'   
        r'(?:'                                          
          r',\s*[A-Z][A-Za-z]+(?:\s+[A-Za-z]+)*'        
        r'){2,}'                                        
        r')\s*$',                                       
        re.MULTILINE
    ),
]

In [340]:
def find_speaker_matches(text):
    matches = []
    for pat in speaker_patterns:
        matches.extend(pat.finditer(text))
    return sorted(matches, key=lambda m: m.start())

In [341]:
def extract_quarter(filename: str) -> str:
    m = re.search(r'-Q([1-4])-(\d{4})-', filename)
    if m:
        quarter, year = m.group(1), m.group(2)
        return f"{year}Q{quarter}"
    else:
        return "Unknown"

In [342]:
def extract_name(speaker: str) -> str:
    base = speaker.strip().rstrip(':').strip()
    return base.split(',', 1)[0].strip()

In [343]:
def extract_position(name: str) -> str:
    position_map = {
        "Mark Zuckerberg": "CEO",
        "Susan Li": "CFO",
        "Kenneth Dorell": "Investor Relations Director",
        "Operator": "Operator",
    }
    return position_map.get(name, "Analyst")

In [344]:
# Remove page numbers from text
def remove_page_numbers(page_text):
    lines = page_text.splitlines()
    cleaned_lines = []
    for line in lines:
        if not re.match(r'^\s*(Page\s*)?\d+\s*$', line):
            cleaned_lines.append(line)
    return "\n".join(cleaned_lines)

In [345]:
def extract_transcript(pdf_path):
    output = []
    with pdfplumber.open(pdf_path) as pdf:
        # Remove page numbers from each page
        text = "\n".join(
            remove_page_numbers(page.extract_text() or "") for page in pdf.pages
        )

    matches = find_speaker_matches(text)
    for idx, m in enumerate(matches):
        raw_speaker = m.group('speaker').strip()
        start = m.end()
        end = matches[idx + 1].start() if idx + 1 < len(matches) else len(text)
        block_text = text[start:end].replace("\n", " ").strip()
        sentences = [s for s in split_sentences(block_text) if s and len(s.split()) > 7]
        output.append({
            "raw_speaker": raw_speaker,
            "sentences": sentences,
        })

    return output

In [346]:
def process_folder(folder_path, output_json_path):
    all_records = []
    for filename in os.listdir(folder_path):
        if not filename.lower().endswith(".pdf"):
            continue
        pdf_path = os.path.join(folder_path, filename)
        print(f"Processing {filename}...")

        quarter = extract_quarter(filename)
        for block in extract_transcript(pdf_path):
            raw = block["raw_speaker"]
            name = extract_name(raw)
            pos  = extract_position(name)
            all_records.append({
                "filename": filename,
                "quarter": quarter,
                "speaker": name,       
                "position": pos,
                "sentences": block["sentences"],
            })

    print(f"Saving {len(all_records)} records to {output_json_path}...")
    with open(output_json_path, "w", encoding="utf-8") as f:
        json.dump(all_records, f, indent=2, ensure_ascii=False)
    print("Done.")



In [347]:
if __name__ == "__main__":
    folder = "Earnings Call Transcript"
    output_file = "Earnings Call Transcript.json"
    process_folder(folder, output_file)

Processing META-Q1-2024-Earnings-Call-Transcript.pdf...
Processing META-Q1-2025-Earnings-Call-Transcript.pdf...
Processing META-Q2-2024-Earnings-Call-Transcript.pdf...
Processing META-Q3-2024-Earnings-Call-Transcript.pdf...
Processing META-Q4-2024-Earnings-Call-Transcript.pdf...
Saving 176 records to Earnings Call Transcript.json...
Done.


In [348]:
df = pd.read_json("Earnings Call Transcript.json")
df = df.explode("sentences").rename(columns={"sentences": "sentence"})

In [349]:
df

Unnamed: 0,filename,quarter,speaker,position,sentence
0,META-Q1-2024-Earnings-Call-Transcript.pdf,2024Q1,Ken Dorell,Analyst,Good afternoon and welcome to Meta Platforms f...
0,META-Q1-2024-Earnings-Call-Transcript.pdf,2024Q1,Ken Dorell,Analyst,Joining me today to discuss our results are Ma...
0,META-Q1-2024-Earnings-Call-Transcript.pdf,2024Q1,Ken Dorell,Analyst,"Before we get started, I would like to take th..."
0,META-Q1-2024-Earnings-Call-Transcript.pdf,2024Q1,Ken Dorell,Analyst,Actual results may differ materially from thos...
0,META-Q1-2024-Earnings-Call-Transcript.pdf,2024Q1,Ken Dorell,Analyst,Factors that could cause these results to diff...
...,...,...,...,...,...
173,META-Q4-2024-Earnings-Call-Transcript.pdf,2024Q4,Mark Zuckerberg,CEO,So the actual business opportunity for Meta AI...
173,META-Q4-2024-Earnings-Call-Transcript.pdf,2024Q4,Mark Zuckerberg,CEO,And I think that’s an important thing for us t...
173,META-Q4-2024-Earnings-Call-Transcript.pdf,2024Q4,Mark Zuckerberg,CEO,"But nonetheless, we’ve run a process like this..."
174,META-Q4-2024-Earnings-Call-Transcript.pdf,2024Q4,Kenneth Dorell,Investor Relations Director,And we look forward to speaking with you again...


Each sentence is assinged a unique ID to maintain traceability

In [350]:
df['id'] = range(1, len(df) + 1)
df.drop('filename', axis=1, inplace=True)

In [351]:
cols = ['id'] + [c for c in df.columns if c != 'id']
df = df[cols]

In [352]:
df

Unnamed: 0,id,quarter,speaker,position,sentence
0,1,2024Q1,Ken Dorell,Analyst,Good afternoon and welcome to Meta Platforms f...
0,2,2024Q1,Ken Dorell,Analyst,Joining me today to discuss our results are Ma...
0,3,2024Q1,Ken Dorell,Analyst,"Before we get started, I would like to take th..."
0,4,2024Q1,Ken Dorell,Analyst,Actual results may differ materially from thos...
0,5,2024Q1,Ken Dorell,Analyst,Factors that could cause these results to diff...
...,...,...,...,...,...
173,1913,2024Q4,Mark Zuckerberg,CEO,So the actual business opportunity for Meta AI...
173,1914,2024Q4,Mark Zuckerberg,CEO,And I think that’s an important thing for us t...
173,1915,2024Q4,Mark Zuckerberg,CEO,"But nonetheless, we’ve run a process like this..."
174,1916,2024Q4,Kenneth Dorell,Investor Relations Director,And we look forward to speaking with you again...


In [353]:
df = df.dropna()

In [354]:
df

Unnamed: 0,id,quarter,speaker,position,sentence
0,1,2024Q1,Ken Dorell,Analyst,Good afternoon and welcome to Meta Platforms f...
0,2,2024Q1,Ken Dorell,Analyst,Joining me today to discuss our results are Ma...
0,3,2024Q1,Ken Dorell,Analyst,"Before we get started, I would like to take th..."
0,4,2024Q1,Ken Dorell,Analyst,Actual results may differ materially from thos...
0,5,2024Q1,Ken Dorell,Analyst,Factors that could cause these results to diff...
...,...,...,...,...,...
173,1912,2024Q4,Mark Zuckerberg,CEO,"This year, the improvements to the business ar..."
173,1913,2024Q4,Mark Zuckerberg,CEO,So the actual business opportunity for Meta AI...
173,1914,2024Q4,Mark Zuckerberg,CEO,And I think that’s an important thing for us t...
173,1915,2024Q4,Mark Zuckerberg,CEO,"But nonetheless, we’ve run a process like this..."


- Every earnings call is comprised of 2 parts: a prepared statement by the CEO and CFO, and a Q&A session with analysts. The prepared statement is usually structured and follows a script, while the Q&A session is more dynamic and can cover a wide range of topics.
- In Meta's earnings call, the Q&A session starts with a specific marker: "With that, Krista, let’s open up the call for questions." This marker is used to identify the beginning of the Q&A section.

In [355]:
qna_marker = ["With that, Krista, let’s open up the call for questions."]
mask = df['sentence'].apply(lambda s: any(marker in s for marker in qna_marker))
qna_sentences = df.loc[mask,['quarter', 'speaker', 'sentence']]
print(qna_sentences)

    quarter   speaker                                           sentence
2    2024Q1  Susan Li  With that, Krista, let’s open up the call for ...
37   2025Q1  Susan Li  With that, Krista, let’s open up the call for ...
73   2024Q2  Susan Li  With that, Krista, let’s open up the call for ...
109  2024Q3  Susan Li  With that, Krista, let’s open up the call for ...
143  2024Q4  Susan Li  With that, Krista, let’s open up the call for ...


We then extract the ID of the sentences that act as a marker for the Q&A section in each earnings call

In [356]:
qna_id = qna_sentences.index

We create a new column call 'section', and assign the value 'Q&A' to the sentences that are part of the Q&A section, which are those sentences that come after the marker 

In [357]:
# Initialize all as Presentation first
df['section'] = 'Presentation'

# Dictionary mapping quarters to Q&A start indices
qna_start_indices = {
    '2024Q1': qna_id[0],
    '2025Q1': qna_id[1],
    '2024Q2': qna_id[2],
    '2024Q3': qna_id[3],
    '2024Q4': qna_id[4],
}

for quarter, start_idx in qna_start_indices.items():
    mask = (df['quarter'] == quarter) & (df.index > start_idx)
    df.loc[mask, 'section'] = 'Q&A'


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['section'] = 'Presentation'


In [358]:
df1 = df[df['section'] == 'Q&A'][['quarter','speaker','position','sentence']]
df1[df1['quarter']  == '2025Q1']

Unnamed: 0,quarter,speaker,position,sentence
38,2025Q1,Operator,Operator,We will now open the lines for a question and ...
38,2025Q1,Operator,Operator,"To ask a question, please press star one on yo..."
38,2025Q1,Operator,Operator,"To withdraw your question, again press star one."
38,2025Q1,Operator,Operator,Please pick up your handset before asking your...
38,2025Q1,Operator,Operator,"If you are streaming today’s call, please mute..."
...,...,...,...,...
69,2025Q1,Mark Zuckerberg,CEO,But then as a bunch of the products start to h...
69,2025Q1,Mark Zuckerberg,CEO,"And then at some point, just like the other pr..."
69,2025Q1,Mark Zuckerberg,CEO,But -- that’s kind of where we’re at on it.
69,2025Q1,Mark Zuckerberg,CEO,We’re definitely focused on doing the work mor...


In [359]:
df.to_csv('output.csv', index=False, encoding='utf-8-sig')

# Step 2: Text Pre-Processing

This is necessary step for FinBERT and DeepSeek 

In [None]:
import numpy as np
import re
import nltk
import string
import spacy
import unicodedata

In [None]:
# Import text-extracted dataset
df = pd.read_csv("output.csv")
df

In [None]:
# Create two copies of the dataframe for FinBERT and LLM processing
df_finbert = df
df_llm = df

For all models, we performed the following **data preprocessing** steps: 
- All text was converted to lowercase, except for App names, such as Facebook, Instagram, Threads...
- Irrelevant characters were removed ('--')
- Short sentences were removed to provide meaningful insights (less than 8 words), which was done in the text extraction step
- All text was then normalized to its original form

**Large Language Models**

In [None]:
nlp = spacy.load("en_core_web_sm")

def data_preprocess_pipeline_llm(text):
    # Normalize Unicode characters
    text = unicodedata.normalize('NFKC', text)

    # Replace multiple dashes with a space
    text = re.sub(r'--+', ' ', text)

    # Run spaCy NLP pipeline
    doc = nlp(text)
        
    # Keep all tokens except spaces
    tokens = [token.lemma_ for token in doc if not token.is_space]

    return ' '.join(tokens)

In [None]:
# Apply data-preprocessing pipeline
df_llm['sentence'] = df_llm['sentence'].apply(data_preprocess_pipeline_llm)

In [None]:
# Extract Q&A section only
df1 = df_llm[df_llm['section'] == 'Q&A']

In [None]:
df1.to_csv("df1.csv")

**FinBERT**

For FinBERT model, we performed:
-  Stopword removal.
- Remove commas: In long and complex sentences, different parts of the sentence may be separated by commas.In some cases, these commas are followed by the main verb or conjunction. We remove these commas to simply the sentence structure and improve the accuracy of the model, especially when the conjunction is the starting point of a contrasting argument.
- Sentiment focus: Complex sentences often key words such as 'but','while', 'partially' ... In these sentences, there are mixed sentiment. However, it is clear that the main sentiment lies in the second part of the sentence that follows these key words. Therefore, we apply the sentiment focus to shift the emphasis of a sentence based on these keywords. 

In [None]:
nlp = spacy.load("en_core_web_sm")

def data_preprocess_pipeline_finbert(text):
    text = unicodedata.normalize('NFKC', text) 
    # Remove special characters '--' in the earnings call transcripts
    text = re.sub(r'--+', ' ', text)
    doc = nlp(text)
    tokens = []
    # Remove stop words
    for token in doc:
        if not token.is_stop:
            tokens.append(token.text)
            
    # Next, we will perform text normalization and lowercase conversion only for words that are not named entities
    # Name-entity recognition 
    for token in doc:
        if token.ent_type_:
            tokens.append(token.text)
        else:
            # Text normalization (lemmatization) 
            lemmatization = token.lemma_
            # Convert to lowercase
            tokens.append(lemmatization.lower())
    
    return ' '.join(tokens)

In [None]:
# Apply data-preprocessing pipeline
df_finbert['sentence'] = df_finbert['sentence'].apply(data_preprocess_pipeline_finbert)

In [None]:
# Remove commas
def remove_commas_spacy(text):
    if not isinstance(text, str):
        return text
    text = unicodedata.normalize('NFKC', text)
    doc = nlp(text)

    filtered_tokens = []
    for token in doc:
        # Skip commas that are punctuation
        if token.text == ',' and token.dep_ == "punct":
            continue
        filtered_tokens.append(token)

    # Rebuild text with spaces between tokens except when punctuation directly follows a word
    new_text = ""
    for i, token in enumerate(filtered_tokens):
        new_text += token.text
        # Add a space if:
        # - this token is not the last token
        # - and the next token is NOT punctuation (so words get separated)
        if i < len(filtered_tokens) - 1:
            next_token = filtered_tokens[i+1]
            if not next_token.is_punct:
                new_text += " "

    return new_text

In [None]:
# Apply removing commas
from tqdm.auto import tqdm
tqdm.pandas()
df_finbert["sentence"] = df_finbert["sentence"].progress_apply(remove_commas_spacy)

In [None]:
def sentiment_focus(text):
    doc = nlp(text)
    focus = ""
    focus_changed = 1
    # For sentences containing 'but', we focus on the part after 'but'
    for token in doc[:-1]:
      if token.lower_ == "but":
          focus = doc[token.i + 1:]
          return str(focus).strip(),focus_changed
        
    # For sentences containing 'partially offset', we focus on the part before 'partially'
    for token in doc:
      if token.lower_ == "partially":
           focus = doc[:token.i].text
           return str(focus).strip(), focus_changed

    if doc[0].lower_ == "while":
      try:
        comma_index_back1 = [token2.i for token2 in doc if token2.text == ','][0]
      except IndexError:
        return str(doc).strip(),focus_changed
      focus = doc[comma_index_back1+1:].text
      return str(focus).strip(),focus_changed

    focus_changed = 0
    return str(doc).strip(),focus_changed

In [None]:
# Apply sentiment focus
df_finbert[['sentence_simple', 'focus_changed']] = df_finbert['sentence'].progress_apply(sentiment_focus).apply(pd.Series)

df_finbert['focus_ornot'] = df_finbert['focus_changed'].apply(lambda x: 1 if x else 0)

df_finbert.drop('focus_changed', axis=1, inplace=True)

In [None]:
# Filter to Q&A section only
df_finbert = df_finbert[df_finbert['section'] == 'Q&A']

In [None]:
df_finbert.to_csv("df_finbert.csv")