This is a complete NLP pipeline for extracting sentiment from Meta's earnings calls with LLMs, without the need for a human-labeled dataset.

**1. Data Extraction**

In [None]:
import json
import re
import pdfplumber
import pandas as pd
import os
import spacy

In [None]:
# Sentence segmentation backed by spaCy's small English pipeline
nlp = spacy.load("en_core_web_sm")

def split_sentences(text: str) -> list[str]:
    """Split ``text`` into sentences with spaCy.

    Surrounding whitespace is stripped before parsing so the first and last
    sentences do not carry leading/trailing blanks.

    Returns:
        The text of each sentence span, in document order.
    """
    return [span.text for span in nlp(text.strip()).sents]

In [None]:
# Define 3 speaker patterns
# Each pattern is anchored at a line start (re.MULTILINE) and exposes the matched
# header through the named group "speaker".
# NOTE(review): in patterns 1 and 3 `\s` also matches newlines, so a header may
# span a line break (e.g. name on one line, title on the next) — confirm intended.
speaker_patterns = [
    # 1. "Name, Title"
    #    Lookahead bounds the lengths: name = capital + 3-20 letters/digits/spaces,
    #    title = capital + 1-20 letters/spaces, and the title must end the line.
    #    The consuming part then captures "Name, Title" (exactly one comma).
    re.compile(
        r'^(?=[A-Z][A-Za-z0-9 ]{3,20},\s*[A-Z][A-Za-z ]{1,20}$)'
        r'(?P<speaker>[A-Z][A-Za-z0-9 ]+,\s*[A-Z][A-Za-z ]+)\s*$',
        re.MULTILINE
    ),

    # 2. "Name:"
    #    Lookahead requires a colon within 2-21 characters of the line start.
    #    The name is 1-5 words, each either ALL-CAPS (2+ letters) or Capitalized;
    #    the colon is consumed but is not part of the "speaker" group.
    #    No end-of-line anchor: text may follow the colon on the same line.
    re.compile(
        r'^(?=[A-Z][A-Za-z0-9 ]{1,20}:)'
        r'(?P<speaker>(?:[A-Z]{2,}|[A-Z][a-z]+)'
        r'(?:\s+(?:[A-Z]{2,}|[A-Z][a-z]+)){0,4}):',
        re.MULTILINE
    ),

    # 3. "Name, Title, Subtitle"
    #    A name followed by two or more comma-separated segments, ending the line.
    re.compile(
        r'^(?P<speaker>'                                # open the "speaker" group
        r'[A-Z][A-Za-z0-9]+(?:\s+[A-Z][A-Za-z0-9]+)*'   # name: one or more capitalized words
        r'(?:'                                          # start of a ", Segment" group
          r',\s*[A-Z][A-Za-z]+(?:\s+[A-Za-z]+)*'        # comma, capitalized word, optional further words
        r'){2,}'                                        # at least two such segments
        r')\s*$',                                       # close group; optional whitespace to line end
        re.MULTILINE
    ),
]

In [None]:
def find_speaker_matches(text):
    """Collect the matches of every speaker pattern in ``text``.

    Returns:
        A list of ``re.Match`` objects from all entries of ``speaker_patterns``,
        ordered by the position where each match starts.
    """
    found = [hit for pattern in speaker_patterns for hit in pattern.finditer(text)]
    # list.sort is stable, so ties keep the pattern order they were collected in
    found.sort(key=lambda hit: hit.start())
    return found

In [None]:
def extract_quarter(filename: str) -> str:
    """Derive a ``"<year>Q<quarter>"`` label from a transcript file name.

    The file name must contain ``-Q<1-4>-<4-digit year>-``; otherwise
    ``"Unknown"`` is returned.
    """
    found = re.search(r'-Q([1-4])-(\d{4})-', filename)
    if found is None:
        return "Unknown"
    quarter, year = found.groups()
    return f"{year}Q{quarter}"

In [None]:
def extract_name(speaker: str) -> str:
    """Return the bare name from a raw speaker header.

    Trailing colons and surrounding whitespace are removed, then everything
    from the first comma onward (title, subtitle) is discarded.
    """
    cleaned = speaker.strip().rstrip(':').strip()
    name_part, _, _ = cleaned.partition(',')
    return name_part.strip()

In [None]:
def extract_position(name: str) -> str:
    """Look up the role for a speaker name.

    Known names map to their role; any other name is treated as an
    ``"Analyst"``.
    """
    known_roles = {
        "Mark Zuckerberg": "CEO",
        "Susan Li": "CFO",
        "Kenneth Dorell": "Investor Relations Director",
        "Operator": "Operator",
    }
    try:
        return known_roles[name]
    except KeyError:
        return "Analyst"

In [None]:
# Strip standalone page-number lines from a page of text
def remove_page_numbers(page_text):
    """Return ``page_text`` without lines that contain only a page number.

    A line is dropped when it consists solely of digits, optionally preceded
    by the word ``Page``, with optional surrounding whitespace. The remaining
    lines are re-joined with ``"\\n"``.
    """
    page_number_line = re.compile(r'^\s*(Page\s*)?\d+\s*$')
    kept = [ln for ln in page_text.splitlines() if page_number_line.match(ln) is None]
    return "\n".join(kept)

In [None]:
def extract_transcript(pdf_path):
    """Split a transcript PDF into per-speaker blocks of sentences.

    The text of all pages (page-number lines removed) is joined, speaker
    headers are located, and the text between one header and the next is
    attributed to that speaker. Only sentences with more than 7
    whitespace-separated words are kept.

    Returns:
        A list of dicts with keys ``"raw_speaker"`` (the matched header text)
        and ``"sentences"`` (list of sentence strings), in document order.
    """
    with pdfplumber.open(pdf_path) as pdf:
        # extract_text() may return None for a page; treat that as empty
        page_texts = [remove_page_numbers(page.extract_text() or "") for page in pdf.pages]
    text = "\n".join(page_texts)

    matches = find_speaker_matches(text)
    # A block ends where the next header starts; the last one runs to the end of the text
    block_ends = [nxt.start() for nxt in matches[1:]] + [len(text)]

    output = []
    for header, block_end in zip(matches, block_ends):
        block_text = text[header.end():block_end].replace("\n", " ").strip()
        kept = [s for s in split_sentences(block_text) if len(s.split()) > 7]
        output.append({
            "raw_speaker": header.group('speaker').strip(),
            "sentences": kept,
        })
    return output

In [None]:
def process_folder(folder_path, output_json_path):
    """Extract every PDF transcript in a folder and save the records as JSON.

    Each ``.pdf`` file (extension matched case-insensitively) in
    ``folder_path`` is parsed with ``extract_transcript``. One record is
    collected per speaker block, holding the file name, the quarter parsed
    from the file name, the speaker's name and position, and the sentences.

    Files are visited in sorted name order: ``os.listdir`` returns entries in
    arbitrary order, so sorting keeps the record order reproducible across
    runs and machines.

    Args:
        folder_path: Directory containing the transcript PDFs.
        output_json_path: Path of the UTF-8 JSON file to write.
    """
    all_records = []
    for filename in sorted(os.listdir(folder_path)):
        if not filename.lower().endswith(".pdf"):
            continue
        pdf_path = os.path.join(folder_path, filename)
        print(f"Processing {filename}...")

        quarter = extract_quarter(filename)
        for block in extract_transcript(pdf_path):
            name = extract_name(block["raw_speaker"])
            all_records.append({
                "filename": filename,
                "quarter": quarter,
                "speaker": name,
                "position": extract_position(name),
                "sentences": block["sentences"],
            })

    print(f"Saving {len(all_records)} records to {output_json_path}...")
    with open(output_json_path, "w", encoding="utf-8") as f:
        # ensure_ascii=False keeps non-ASCII characters readable in the file
        json.dump(all_records, f, indent=2, ensure_ascii=False)
    print("Done.")

In [None]:
# Entry point: parse every transcript PDF in the folder and write the combined JSON
if __name__ == "__main__":
    folder = "Earnings Call Transcript"            # directory scanned for PDF files
    output_file = "Earnings Call Transcript.json"  # JSON file the records are written to
    process_folder(folder, output_file)

In [None]:
# Load the extracted records and expand the "sentences" list so that each row
# holds one sentence (the column is renamed to "sentence").
# NOTE: explode() repeats the original row label for every sentence of a record,
# so the index is not unique per sentence after this cell.
df = pd.read_json("Earnings Call Transcript.json")
df = df.explode("sentences").rename(columns={"sentences": "sentence"})

In [None]:
# Give every sentence row a 1-based running id, then drop the source file name column
df['id'] = range(1, len(df) + 1)
df.drop('filename', axis=1, inplace=True)

In [None]:
# Move 'id' to the first column; the other columns keep their relative order
cols = ['id'] + [c for c in df.columns if c != 'id']
df = df[cols]

In [None]:
# Drop rows with any missing value (explode() yields NaN for records whose sentence list is empty)
df = df.dropna()

In [None]:
# Sentence that hands the call over for questions; it is used to locate where
# the Q&A section starts. The apostrophe is listed in its typographic (U+2019)
# and ASCII forms so the marker matches correctly decoded text. The last entry
# is the mis-decoded (UTF-8 read as cp1252) form from the original list, kept
# so anything that matched before still matches.
qna_marker = [
    "With that, Krista, let\u2019s open up the call for questions.",
    "With that, Krista, let's open up the call for questions.",
    "With that, Krista, letâ€™s open up the call for questions.",
]
# True for rows whose sentence contains any of the marker strings
mask = df['sentence'].apply(lambda s: any(marker in s for marker in qna_marker))
qna_sentences = df.loc[mask,['quarter', 'speaker', 'sentence']]
print(qna_sentences)

In [None]:
# Row labels of the marker sentences found above; used as the Q&A start positions
qna_id = qna_sentences.index

In [None]:
# Initialize all as Presentation first
df['section'] = 'Presentation'

# Dictionary mapping quarters to Q&A start indices.
# Built from the quarter recorded on each marker row rather than by pairing
# quarters with marker positions by hand: positional pairing mislabels sections
# when the transcripts are processed in a different order and raises IndexError
# when fewer markers are found than expected.
qna_start_indices = {}
for start_idx, quarter in zip(qna_sentences.index, qna_sentences['quarter']):
    # keep the first marker if a quarter has more than one
    qna_start_indices.setdefault(quarter, start_idx)

# Quarters without a marker stay entirely 'Presentation'; report them
missing_quarters = sorted(set(df['quarter']) - set(qna_start_indices), key=str)
if missing_quarters:
    print(f"No Q&A marker found for: {missing_quarters}")

for quarter, start_idx in qna_start_indices.items():
    # Rows of the same quarter whose label comes after the marker's label
    mask = (df['quarter'] == quarter) & (df.index > start_idx)
    df.loc[mask, 'section'] = 'Q&A'

In [None]:
# Inspect the Q&A rows (selected columns only) and display those of 2025Q1
df1 = df[df['section'] == 'Q&A'][['quarter','speaker','position','sentence']]
df1[df1['quarter']  == '2025Q1']

In [None]:
# Persist the labelled sentences without the index; 'utf-8-sig' writes a UTF-8 byte-order mark
df.to_csv('output.csv', index=False, encoding='utf-8-sig')

**2. Data Preprocessing**

In [None]:
import numpy as np
import re
import nltk
import string
import spacy
import unicodedata

In [None]:
# Import text-extracted dataset
# Reading the CSV back gives the frame a fresh default index; the bare `df`
# on the last line displays it as the cell output.
df = pd.read_csv("output.csv")
df

In [None]:
nlp = spacy.load("en_core_web_sm")

def data_preprocess_pipeline_llm(text):
    """Normalize a sentence and return its lemmas joined by single spaces.

    Steps: NFKC Unicode normalization, every run of two or more dashes
    replaced by one space, spaCy parsing, then the lemma of every token that
    is not pure whitespace (punctuation and stop words are kept).
    """
    normalized = unicodedata.normalize('NFKC', text)
    dashes_collapsed = re.sub(r'--+', ' ', normalized)
    lemmas = (tok.lemma_ for tok in nlp(dashes_collapsed) if not tok.is_space)
    return ' '.join(lemmas)

In [None]:
# Apply data-preprocessing pipeline
# Overwrites the 'sentence' column in place with the normalized, lemmatized text
df['sentence'] = df['sentence'].apply(data_preprocess_pipeline_llm)

In [None]:
# Extract Q&A section only
# Keeps every column; rows are those labelled 'Q&A' in the 'section' column
df1 = df[df['section'] == 'Q&A']

In [None]:
# Save the Q&A subset; the index is written too (to_csv default), as a first column without a header
df1.to_csv("df1.csv")

**3. Sentiment Classification**

In [None]:
# Drop the index column that appears when df1 is re-read from "df1.csv".
# drop(..., inplace=True) returns None, so its result must not be assigned back;
# errors='ignore' keeps the cell working when the column is absent because df1
# is still the in-memory frame.
df1 = df1.drop(columns=['Unnamed: 0'], errors='ignore')
# Exclude the operator and the investor-relations host from the analysis
df = df1[~df1['speaker'].isin(['Operator', 'Kenneth Dorell'])]
df

In [None]:
# Keep only the columns needed for classification and display the result
data = df[['id','quarter','sentence']]
data

In [None]:
# Rows from 2024Q1-2024Q3 form train_data; display its (rows, columns) shape
train_data = data[data['quarter'].isin(['2024Q1', '2024Q2', '2024Q3'])]
train_data.shape

In [None]:
# Every row whose index label is not in train_data becomes test_data; display its shape
test_data = data.drop(train_data.index)
test_data.shape

**ZERO-SHOT PROMPT**

In [None]:
# Empty placeholder column; it is sent to the model, which is asked to fill it in
test_data['pred_label'] = ''

In [None]:
# Install the OpenAI client package that is imported in the following cell.
# NOTE(review): this is a notebook command, not Python — presumably it relies on
# IPython automagic; `%pip install openai` would be the explicit form.
pip install openai

In [None]:
from openai import OpenAI
import json

In [None]:
import os

# SECURITY: the API key is read from the environment instead of being stored in
# the notebook. A key that was committed with the source should be treated as
# compromised and rotated.
api_key = os.environ.get("DEEPSEEK_API_KEY")
if not api_key:
    raise RuntimeError("Set the DEEPSEEK_API_KEY environment variable before running this cell.")

# OpenAI-compatible client pointed at the DeepSeek endpoint
client = OpenAI(api_key=api_key, base_url="https://api.deepseek.com")

In [None]:
# Cut test_data into consecutive positional slices of at most batch_size rows
batch_size = 10
batches = [
    test_data[offset:offset + batch_size]
    for offset in range(0, len(test_data), batch_size)
]

In [None]:
def get_completion(batch,current_batch,total_batch, model='deepseek-chat'):
    """Ask the chat model to label one batch of sentences (zero-shot).

    The batch's ``sentence`` and ``pred_label`` columns are serialized as a
    JSON list of records and embedded in the prompt; the model is asked to
    return the same JSON with ``pred_label`` set to -1, 0 or 1.

    Args:
        batch: DataFrame with at least ``sentence`` and ``pred_label`` columns.
        current_batch: Zero-based position of this batch (for progress output).
        total_batch: Total number of batches (for progress output).
        model: Model name passed to the chat-completions endpoint.

    Returns:
        The raw text content of the first completion choice. It is not parsed
        or validated here.
    """
    print(f"Processing batch {current_batch+1} of {total_batch}")
    
    json_data = batch[['sentence','pred_label']].to_json(orient='records')
    
    zero_shot_prompt = f""" You are an advanced sentiment analysis assistant. 
    Your task is to classify sentiment and give sentiment score for each sentence extracted from an earnings call transcript as -1 for negative sentiment, 0 for neutral sentiment, 1 for positive sentiment. 
    The sentiment score should be an integer. 
    The purpose is to extract the trading sentiment so as to have an edge in after-market trading. 
    The sentences are provided between three backticks below.
    Return **only** a valid JSON code as output - which is provided between three backticks.
    Update the predicted sentiment score under the 'pred_label' in the JSON code.
    Do not make any changes to the JSON format.
    
    ```
    {json_data}
    ```
    """
    print(zero_shot_prompt)
    # Single user turn; built once and passed to the request below
    messages = [{"role": "user", "content": zero_shot_prompt}]
    response = client.chat.completions.create(
        model=model,
        messages=messages,
        stream=False
    )
    return response.choices[0].message.content
    

In [None]:
# Send the batches one at a time; results are appended as they arrive, so
# responses already received remain available if a later request fails
batch_count = len(batches)
responses = []

for batch_no, batch in enumerate(batches):
    responses.append(get_completion(batch, batch_no, batch_count))

In [None]:
import json
import re

# Parse each model response into a DataFrame and stack them in batch order.
# The parsed payload gets its own name (`records`) so the notebook-level `data`
# DataFrame is not overwritten, and the frames are concatenated once at the end
# instead of re-copying the accumulated frame on every iteration.
parsed_frames = []

for response in responses:
    # Remove a leading ``` or ```json fence and a trailing ``` fence, if present
    cleaned = re.sub(r"^```(json)?|```$", "", response.strip()).strip()
    records = json.loads(cleaned)
    parsed_frames.append(pd.DataFrame(records))

# pd.concat raises on an empty list, so fall back to an empty frame
df_total0 = pd.concat(parsed_frames, ignore_index=True) if parsed_frames else pd.DataFrame()

print(df_total0)

In [None]:
# Copy the predictions back by position (.values drops the index, so no label alignment).
# Assumes the model returned exactly one record per input sentence, in the same order — TODO confirm.
test_data['pred_label'] = df_total0['pred_label'].values

In [None]:
# Display the test set together with the predicted labels
test_data