In [None]:
!pip install -q transformers accelerate bitsandbytes torch pandas tqdm pyarrow

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.1/76.1 MB[0m [31m25.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m43.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m78.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m43.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m10.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m38.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [18]:
import pandas as pd
import numpy as np
import torch
from time import sleep
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, pipeline
from google.colab import drive

In [None]:
drive.mount('/content/drive')
DATA_DIR = "/content/drive/My Drive/DL Proj/data/"

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
from huggingface_hub import login
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

# Llama 3.2 Model Setup

In [23]:
# Load the model and tokenizer with quantization
print("Loading Llama 3.2 3B Instruct model...")
model_id = "meta-llama/Llama-3.2-3B-Instruct"

pipe = pipeline(
    "text-generation",
    model=model_id,
    torch_dtype=torch.float16,
    device_map="auto",
)

print("Model loaded successfully!")

Loading Llama 3.2 3B Instruct model...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Device set to use cuda:0


Model loaded successfully!


# Helper Functions

In [20]:
def get_llama_response(text):
    """Generate a response from Llama 3.2 model using the official pattern"""
    messages = [
        {"role": "system", "content": "You are an expert in stock analysis focusing on earning call analysis. Your task is to determine if the sentiment of earnings call transcripts is positive, negative, or neutral. Respond with exactly one word."},
        {"role": "user", "content": f"Analyze the sentiment of this earnings call transcript: {text}"}
    ]

    # Format messages in Llama chat format
    formatted_prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

    # Generate response
    inputs = tokenizer(formatted_prompt, return_tensors="pt").to(model.device)
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=20,
            do_sample=False,
        )

    # Extract response
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    assistant_part = response.split("<assistant>")[-1].strip()

    return assistant_part

In [None]:
def count_tokens(text):
    """Count the number of tokens in a text"""
    return len(tokenizer.encode(text))

# Sentiment Analysis on Earnings Call Transcripts

In [None]:
# Create or load results dataframe
try:
    llama_results = pd.read_csv(DATA_DIR + "llama_3_2_results.csv")
    exist_transcript_id = llama_results["transcript_id"].tolist()
    row = len(llama_results)
    print(f"Loaded existing results with {row} entries")
except FileNotFoundError:
    llama_results = pd.DataFrame(columns=["transcript_id", "sentiment"])
    exist_transcript_id = []
    row = 0
    print("Creating new results dataframe")

Creating new results dataframe


In [None]:
# Process transcripts batch by batch
question = "Following is the one text, please only give one word output as positive, negative or neutral: "

# Silence specific warnings
import warnings
warnings.filterwarnings("ignore", message=".*do_sample.*")
warnings.filterwarnings("ignore", message=".*temperature.*")
warnings.filterwarnings("ignore", message=".*top_p.*")

for i in ([0, 1]):
    data = pd.read_parquet(DATA_DIR + f'batch_{i}_transcriptcomponent.parquet')
    transcript_id_list = sorted(list(data["transcriptid"].unique()))

    for idx, transcript_id in enumerate(transcript_id_list):
        try:
            if transcript_id in exist_transcript_id:
                continue

            id_data = data[data["transcriptid"] == transcript_id]
            id_data = id_data.sort_values(by="componentorder")
            text = id_data["componenttext"].tolist()
            text = " ".join(text).replace("\"", "\'")

            # Count tokens for reporting
            token_count = len(pipe.tokenizer.encode(text))

            # Simplified system prompt focused on single-word response
            messages = [
                {"role": "system", "content": "You are a sentiment analysis expert. Respond with exactly ONE WORD: 'positive', 'negative', or 'neutral'."},
                {"role": "user", "content": f"Analyze the sentiment of this earnings call transcript: {text}"}
            ]

            # Generate response
            output = pipe(
                messages,
                max_new_tokens=10,  # Reduced for faster, more focused responses
                do_sample=False,    # Deterministic output for consistency
            )

            # Get the full response as a string
            full_response = output[0]["generated_text"] if isinstance(output[0]["generated_text"], str) else str(output[0]["generated_text"])

            # Extract sentiment using pattern matching
            import re
            # Look for positive, negative, or neutral anywhere in the response
            sentiment_match = re.search(r'\b(positive|negative|neutral)\b', full_response.lower())

            if sentiment_match:
                res = sentiment_match.group(0)
                llama_results.loc[row] = [transcript_id, res]
                print(f"batch {i}: {round(idx/len(transcript_id_list),3)} has finished, curr message has {token_count} tokens with answer: {res}")
            else:
                # If no exact match, try to find the closest match
                if "pos" in full_response.lower():
                    res = "positive"
                elif "neg" in full_response.lower():
                    res = "negative"
                elif "neut" in full_response.lower():
                    res = "neutral"
                else:
                    res = np.nan

                llama_results.loc[row] = [transcript_id, res]
                if res is not np.nan:
                    print(f"batch {i}: {round(idx/len(transcript_id_list),3)} has finished, curr message has {token_count} tokens with answer: {res} (extracted)")
                else:
                    print(f"Could not extract sentiment from response for transcript {transcript_id}: {full_response[:50]}...")

            # Save results periodically
            if idx % 5 == 0:
                llama_results.to_csv(DATA_DIR + "llama_3_2_results.csv", index=False)

            sleep(1)
            row += 1

        except Exception as e:
            print(f"Error processing transcript {transcript_id}: {str(e)}")
            llama_results.loc[row] = [transcript_id, np.nan]
            row += 1

    # Save results after processing each batch
    llama_results.to_csv(DATA_DIR + "llama_3_2_results.csv", index=False)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
batch 0: 0.877 has finished, curr message has 6724 tokens with answer: positive
batch 0: 0.877 has finished, curr message has 11138 tokens with answer: positive
batch 0: 0.877 has finished, curr message has 11379 tokens with answer: positive
batch 0: 0.877 has finished, curr message has 9585 tokens with answer: positive
batch 0: 0.878 has finished, curr message has 9613 tokens with answer: positive
batch 0: 0.878 has finished, curr message has 10932 tokens with answer: positive
batch 0: 0.878 has finished, curr message has 11027 tokens with answer: positive
batch 0: 0.878 has finished, curr message has 16259 tokens with answer: positive
batch 0: 0.878 has finished, curr message has 13451 tokens with answer: positive
batch 0: 0.879 has finished, curr message has 16395 tokens with answer: positive
batch 0: 0.879 has finished, curr message has 13479 tokens with answer: positive
batch 0: 0.879 has finished, curr message has 1

# Results Analysis

In [None]:
# Display results
print("Results:")
llama_results.head()

In [None]:
print("\nSentiment distribution:")
llama_results["sentiment"].value_counts(dropna=False)