# Import necessary packages

In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from transformers import pipeline
import openai
from sklearn.metrics import classification_report
from tqdm import tqdm
from langchain_ollama.llms import OllamaLLM
from langchain_google_vertexai import ChatVertexAI
from langchain.chains import LLMChain
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI
from langchain_core.messages.ai import AIMessage
from langchain_google_vertexai import ChatVertexAI
import plotly.express as px
import plotly.graph_objects as go
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
import neattext.functions as nfx
import os
from typing import List, Dict, Any, Optional
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from sklearn.metrics import f1_score

  from .autonotebook import tqdm as notebook_tqdm


# Load MELD dataset

In [2]:
df_train = pd.read_csv("datasets/meld/train_sent_emo.csv")
df_test = pd.read_csv("datasets/meld/test_sent_emo.csv")

In [3]:
example_text = "I thought things would get better with time, but it just keeps hurting more."
emotion_labels = df_train["Emotion"].unique().tolist()
emotion_labels

['neutral', 'surprise', 'fear', 'sadness', 'joy', 'disgust', 'anger']

In [4]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9989 entries, 0 to 9988
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Sr No.        9989 non-null   int64 
 1   Utterance     9989 non-null   object
 2   Speaker       9989 non-null   object
 3   Emotion       9989 non-null   object
 4   Sentiment     9989 non-null   object
 5   Dialogue_ID   9989 non-null   int64 
 6   Utterance_ID  9989 non-null   int64 
 7   Season        9989 non-null   int64 
 8   Episode       9989 non-null   int64 
 9   StartTime     9989 non-null   object
 10  EndTime       9989 non-null   object
dtypes: int64(5), object(6)
memory usage: 858.6+ KB


In [5]:
df_train.head()

Unnamed: 0,Sr No.,Utterance,Speaker,Emotion,Sentiment,Dialogue_ID,Utterance_ID,Season,Episode,StartTime,EndTime
0,1,also I was the point person on my company’s tr...,Chandler,neutral,neutral,0,0,8,21,"00:16:16,059","00:16:21,731"
1,2,You must’ve had your hands full.,The Interviewer,neutral,neutral,0,1,8,21,"00:16:21,940","00:16:23,442"
2,3,That I did. That I did.,Chandler,neutral,neutral,0,2,8,21,"00:16:23,442","00:16:26,389"
3,4,So let’s talk a little bit about your duties.,The Interviewer,neutral,neutral,0,3,8,21,"00:16:26,820","00:16:29,572"
4,5,My duties? All right.,Chandler,surprise,positive,0,4,8,21,"00:16:34,452","00:16:40,917"


### 📊 Emotion Distribution in the MELD Training Set

Before training any models, I explore the class distribution of emotions in the MELD training dataset. 
    
To visualize this, I create a **pie chart** showing the relative frequency of each emotion label, with percentage values displayed in the legend. This provides an intuitive overview of the emotional makeup of the dataset and helps inform decisions regarding model evaluation and potential rebalancing strategies.


In [6]:
df_train["Emotion"].value_counts()

Emotion
neutral     4710
joy         1743
surprise    1205
anger       1109
sadness      683
disgust      271
fear         268
Name: count, dtype: int64

In [7]:
# Get counts and percentages
df_value_counts = df_train["Emotion"].value_counts()
labels = list(df_value_counts.keys())
sizes = df_value_counts.tolist()

# Calculate formatted labels: "emotion (xx.xx%)"
total = sum(sizes)
legend_labels = [f"{label} ({(count/total)*100:.2f}%)" for label, count in zip(labels, sizes)]

# Create pie chart
fig = go.Figure(
    data=[go.Pie(
        labels=legend_labels,     # legend-style labels
        values=sizes,
        textinfo='none',          # remove text from inside the pie
        hole=0.3
    )]
)

# Update layout: legend in top-right
fig.update_layout(
    title="Emotion Distribution in MELD Dataset",
    width=600,
    height=450,
    margin=dict(l=20, r=20, t=50, b=20),
    title_x=0.5,
    legend=dict(
        orientation="v",
        x=1.02,
        y=1,
        xanchor="left",
        yanchor="top",
        font=dict(size=12)
    )
)

fig.show()


**To ensure faster evaluation and reduce API costs during large language model inference, I limit the MELD test set to the first 500 examples.**

In [8]:
test_size = 500
print(f"test size is reduced from {len(df_test)} to {test_size}")
df_test = df_test[:test_size]

test size is reduced from 2610 to 500


# Evaluation

In [9]:
model_scores = {} # save weighted f1 scores for each model

## 🧪 Supervised Emotion Classification with Traditional ML Models

In this section, I apply three traditional supervised learning models—**Logistic Regression**, **Support Vector Classifier (SVC)**, and **Random Forest Classifier**—to the MELD dataset for text-based emotion classification.

First, I preprocess the training utterances by removing user handles and stopwords. Then, I use the `CountVectorizer` to convert the cleaned text into numerical feature vectors. These vectors serve as input to the classifiers.

Each model is wrapped in a `scikit-learn` pipeline for streamlined preprocessing and training. After training, I evaluate the models on the test split of the MELD dataset using standard classification metrics (precision, recall, and F1-score).


Preprocess the utterances and assign the cleaned utterances and corresponding emotions to x_train and y_train, respectively, for training.

In [10]:
x_train, y_train = df_train["Utterance"].apply(nfx.remove_userhandles).apply(nfx.remove_stopwords), df_train['Emotion']

### Logistic Regression

In [11]:
pipe_lr = Pipeline(steps=[('cv', CountVectorizer()), ('lr', LogisticRegression(max_iter=100))])
pipe_lr.fit(x_train, y_train)
print("Logistic regrestion model is ready to use!")

Logistic regrestion model is ready to use!


In [12]:
lr_predictions = pipe_lr.predict(df_test["Utterance"].values)

In [13]:
print("Evaluation results on the MELD dataset using Logistic Regression:\n")
print(classification_report(df_test["Emotion"], lr_predictions, zero_division=0))
model_scores["Logistic Regression"] = f1_score(df_test["Emotion"], lr_predictions, average='weighted', zero_division=0)

Evaluation results on the MELD dataset using Logistic Regression:

              precision    recall  f1-score   support

       anger       0.35      0.26      0.30        62
     disgust       0.33      0.09      0.14        11
        fear       0.00      0.00      0.00        10
         joy       0.28      0.18      0.22        91
     neutral       0.54      0.64      0.59       238
     sadness       0.31      0.32      0.31        38
    surprise       0.27      0.38      0.31        50

    accuracy                           0.43       500
   macro avg       0.30      0.27      0.27       500
weighted avg       0.41      0.43      0.41       500



### SVC

In [14]:
pipe_svc = Pipeline(steps=[('cv', CountVectorizer()), ('svc', SVC(kernel='rbf', C=10))])
pipe_svc.fit(x_train, y_train)
print("SVC model is ready to use!")

SVC model is ready to use!


In [15]:
svc_predictions = pipe_svc.predict(df_test["Utterance"].values)

In [16]:
print("Evaluation results on the MELD dataset using SVC:\n")
print(classification_report(df_test["Emotion"], svc_predictions, zero_division=0))
model_scores["SVC"] = f1_score(df_test["Emotion"], svc_predictions, average='weighted', zero_division=0)

Evaluation results on the MELD dataset using SVC:

              precision    recall  f1-score   support

       anger       0.18      0.06      0.10        62
     disgust       0.33      0.09      0.14        11
        fear       0.00      0.00      0.00        10
         joy       0.37      0.18      0.24        91
     neutral       0.52      0.79      0.62       238
     sadness       0.22      0.05      0.09        38
    surprise       0.46      0.52      0.49        50

    accuracy                           0.47       500
   macro avg       0.30      0.24      0.24       500
weighted avg       0.41      0.47      0.41       500



### Random Forest Classifier

In [17]:
pipe_rf = Pipeline(steps=[('cv', CountVectorizer()), ('rf', RandomForestClassifier(n_estimators=10))])
pipe_rf.fit(x_train, y_train)
print("Random Forest Classifier model is ready to use!")

Random Forest Classifier model is ready to use!


In [18]:
rf_predictions = pipe_rf.predict(df_test["Utterance"].values)

In [19]:
print("Evaluation results on the MELD dataset using Random Forest Classifier:\n")
print(classification_report(df_test["Emotion"], rf_predictions, zero_division=0))
model_scores["Random Forest Classifier"] = f1_score(df_test["Emotion"], rf_predictions, average='weighted', zero_division=0)

Evaluation results on the MELD dataset using Random Forest Classifier:

              precision    recall  f1-score   support

       anger       0.19      0.11      0.14        62
     disgust       0.00      0.00      0.00        11
        fear       0.00      0.00      0.00        10
         joy       0.23      0.12      0.16        91
     neutral       0.53      0.70      0.60       238
     sadness       0.13      0.05      0.08        38
    surprise       0.32      0.50      0.39        50

    accuracy                           0.42       500
   macro avg       0.20      0.21      0.20       500
weighted avg       0.36      0.42      0.38       500



In [20]:
weighted_f1 = f1_score(df_test["Emotion"], rf_predictions, average='weighted', zero_division=0)
print(f"Weighted F1-score: {weighted_f1:.4f}")

Weighted F1-score: 0.3785


## 🔍 Zero-Shot Emotion Classification with `facebook/bart-large-mnli`
To evaluate an alternative approach without supervised training, I use the [`facebook/bart-large-mnli`](https://huggingface.co/facebook/bart-large-mnli) model from Hugging Face for **zero-shot classification**. This model allows emotion classification by comparing each utterance directly against a set of candidate labels, without requiring task-specific fine-tuning.

For each utterance, the model ranks the candidate emotions based on how well they match the text. The top-ranked label is selected as the predicted emotion.

In [21]:
zero_shot_classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

Device set to use mps:0


In [22]:
result = zero_shot_classifier(example_text, candidate_labels=emotion_labels)
result["labels"][0]

'surprise'

In [23]:
zero_shot_predictions = []
for text in tqdm(df_test["Utterance"]):
    output = zero_shot_classifier(text, candidate_labels=emotion_labels)
    zero_shot_predictions.append(output["labels"][0])

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 500/500 [01:19<00:00,  6.28it/s]


In [24]:
print("Evaluation results on the MELD dataset using facebook/bart-large-mnli (zero-shot classification):\n")
print(classification_report(df_test["Emotion"], zero_shot_predictions, zero_division=0))
model_scores["facebook/bart-large-mnli"] = f1_score(df_test["Emotion"], zero_shot_predictions, average='weighted', zero_division=0)

Evaluation results on the MELD dataset using facebook/bart-large-mnli (zero-shot classification):

              precision    recall  f1-score   support

       anger       0.55      0.10      0.16        62
     disgust       0.21      0.27      0.24        11
        fear       0.25      0.10      0.14        10
         joy       0.82      0.35      0.49        91
     neutral       0.70      0.14      0.23       238
     sadness       0.75      0.16      0.26        38
    surprise       0.13      0.96      0.22        50

    accuracy                           0.26       500
   macro avg       0.49      0.30      0.25       500
weighted avg       0.63      0.26      0.27       500



## 🤖 Emotion Classification with `emotion-english-distilroberta-base`

In this step, I evaluate a transformer-based model fine-tuned specifically for emotion classification: [`j-hartmann/emotion-english-distilroberta-base`](https://huggingface.co/j-hartmann/emotion-english-distilroberta-base). This model is built on top of DistilRoBERTa and has been trained on multiple English emotion datasets.

Using the Hugging Face `text-classification` pipeline, I apply the model to each utterance in the dataset and extract the top predicted emotion label. I then compare the predicted labels to the ground-truth annotations using standard classification metrics.

In [25]:
distilroberta_classifier = pipeline("text-classification", model="j-hartmann/emotion-english-distilroberta-base")

Device set to use mps:0


In [26]:
result = distilroberta_classifier(example_text)
result

[{'label': 'sadness', 'score': 0.8036125302314758}]

In [27]:
predictions_distilroberta = []
for text in tqdm(df_test["Utterance"]):
    output = distilroberta_classifier(text, top_k=1)
    predictions_distilroberta.append(output[0]["label"])

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 500/500 [00:06<00:00, 80.77it/s]


In [28]:
print("Evaluation results on the MELD dataset using j-hartmann/emotion-english-distilroberta-base (text classification):\n")
print(classification_report(df_test["Emotion"], predictions_distilroberta, zero_division=0))
model_scores["j-hartmann/emotion-english-distilroberta-base"] = f1_score(df_test["Emotion"], predictions_distilroberta, average='weighted', zero_division=0)

Evaluation results on the MELD dataset using j-hartmann/emotion-english-distilroberta-base (text classification):

              precision    recall  f1-score   support

       anger       0.29      0.32      0.31        62
     disgust       0.06      0.27      0.10        11
        fear       0.00      0.00      0.00        10
         joy       0.59      0.43      0.50        91
     neutral       0.75      0.55      0.64       238
     sadness       0.50      0.29      0.37        38
    surprise       0.33      0.74      0.46        50

    accuracy                           0.48       500
   macro avg       0.36      0.37      0.34       500
weighted avg       0.57      0.48      0.51       500



## 🤖 Emotion Classification using Large Language Models and Prompt Engineering

In this section, I evaluate the emotion classification capabilities of several large language models (LLMs) using **prompt-based inference**, without task-specific fine-tuning. This approach leverages the few-shot and instruction-following capabilities of modern LLMs for **in-context learning**.

#### 💡 Models Used

I tested the following LLMs across various prompting strategies:

* **`llama3.1:8b`** via **Ollama**
* **`gpt-4.1-nano`** via **OpenAI API**
* **`gemini-2.5-flash-lite`** via **Vertex AI**

These models were evaluated using a consistent framework to ensure comparability.

#### 🧠 Prompting Strategies

To understand how different types of prompts affect performance, I evaluated each model with the following five prompt types:

1. **Zero-shot prompt**: The model receives only the target utterance and is expected to infer the emotion label from a predefined list, with no prior examples.
2. **One-shot prompt**: A single labeled example is provided before the target utterance.
3. **Few-shot prompt**: Multiple labeled examples (covering all emotion classes) are provided to guide the model's prediction.
4. **Contextual prompt**: The model receives the target utterance along with the preceding utterances from the same conversation to account for dialogue context.
5. **Retrieval-Augmented Contextual prompt**: In addition to the conversation history, the model is shown one **relevant example** from the training dataset, retrieved via semantic similarity using a **vector store (FAISS)**. This combines **retrieval-augmented generation (RAG)** principles with conversational context.

#### 🧰 Framework Components

To support this evaluation, I implemented several utility classes:

* **`MELDDataManager`**: Organizes utterances by dialogue and retrieves conversation history given a dialogue and utterance ID.
* **`DemonstrationRetriever`**: Builds a semantic vector store using sentence embeddings (`all-MiniLM-L6-v2`) and retrieves the most similar utterance from the training set for use in retrieval-augmented prompts.
* **`PromptGenerator`**: Generates different prompt templates in a modular way, including support for history and demonstrations.
* **`evaluate_emotion_classification`**: A unified evaluation function that constructs the prompt, queries the model, extracts predicted labels, and reports performance metrics.

#### 📊 Evaluation

Each model was tested on the same MELD test split with all five prompt types. The outputs were evaluated using **precision, recall, and F1-score**, and the predicted labels were extracted using a rule-based parser that identifies the first valid emotion label in the model output.

In [29]:
class MELDDataManager:
    def __init__(self, dataframe: pd.DataFrame):
        self.df = dataframe.copy()
        self.dialogues: Dict[int, List[Dict[str, Any]]] = {}
        for dialogue_id, group in self.df.groupby('Dialogue_ID'):
            self.dialogues[dialogue_id] = group.sort_values(by='Utterance_ID').to_dict('records')

    def get_conversation_history(self, dialogue_id: int, utterance_id: int, window_size: int = 9) -> List[Dict[str, Any]]:
        if dialogue_id not in self.dialogues: return []
        conversation = self.dialogues[dialogue_id]
        target_idx = next((i for i, u in enumerate(conversation) if u['Utterance_ID'] == utterance_id), -1)
        if target_idx == -1: return []
        start_idx = max(0, target_idx - window_size)
        return conversation[start_idx:target_idx]

In [30]:
class DemonstrationRetriever:
    """
    Handles creating an in-memory vector store and retrieving demonstrations.
    This version does NOT save the vector store to disk.
    """
    def __init__(self, train_df: pd.DataFrame, embedding_model_name: str = 'all-MiniLM-L6-v2'):
        """
        Initializes the retriever by creating an in-memory FAISS vector store from train_df.
        """
        print("Initializing DemonstrationRetriever and creating in-memory vector store...")
        self.embeddings = HuggingFaceEmbeddings(model_name=embedding_model_name)
        documents = train_df['Utterance'].tolist()
        metadatas = train_df[['Emotion']].to_dict('records')
        self.vector_store = FAISS.from_texts(documents, self.embeddings, metadatas=metadatas)
        print("In-memory vector store created successfully.")

    def retrieve_demonstration(self, query_utterance: str) -> str:
        results = self.vector_store.similarity_search(query_utterance, k=1)
        if not results: return "No demonstration found."
        retrieved_doc = results[0]
        return f"Utterance: \"{retrieved_doc.page_content}\"\nEmotion: {retrieved_doc.metadata.get('Emotion', 'N/A')}"


In [31]:
class PromptGenerator:
    """
    A toolkit for generating various ChatPromptTemplates for emotion recognition.
    """
    MELD_EMOTION_LABELS = ["anger", "disgust", "fear", "joy", "neutral", "sadness", "surprise"]

    def _format_history(self, history: List[Dict[str, Any]]) -> str:
        if not history: return "This is the first utterance in the conversation."
        return "\n".join([f"{u['Speaker']}: \"{u['Utterance']}\"" for u in history])

    def _format_target_utterance(self, target_utterance: Dict[str, Any]) -> str:
        return f"{target_utterance['Speaker']}: \"{target_utterance['Utterance']}\""

    def get_zero_shot_template(self) -> ChatPromptTemplate:
        """Returns a basic zero-shot classification prompt."""
        emotion_options = ", ".join(self.MELD_EMOTION_LABELS)
        template = ChatPromptTemplate.from_messages([
            ("system", f"You are a classification assistant. Only output one of the following emotion labels: {emotion_options}. No explanation."),
            ("user", "Utterance: {utterance}\nEmotion:")
        ])
        template.name = "zero_shot_prompt"
        return template

    def get_one_shot_template(self) -> ChatPromptTemplate:
        """Returns a one-shot classification prompt with a single example."""
        emotion_options = ", ".join(self.MELD_EMOTION_LABELS)
        template = ChatPromptTemplate.from_messages([
            ("system", f"You are an assistant that labels user utterances with one of the following emotions: {emotion_options}. Please give only the label!"),
            ("user", "Utterance: That’s fine. I’m okay with it. → Emotion: neutral\nUtterance: {utterance} → Emotion:")
        ])
        template.name = "one_shot_prompt"
        return template

    def get_few_shot_template(self) -> ChatPromptTemplate:
        """Returns a few-shot classification prompt with multiple examples."""
        emotion_options = ", ".join(self.MELD_EMOTION_LABELS)
        template = ChatPromptTemplate.from_messages([
            ("system", f"You are an assistant that labels user utterances with one of the following emotions: {emotion_options}. Please give only the label!"),
            ("user", """Examples:

Utterance: I can’t believe this happened to me.
Emotion: sadness

Utterance: Wow, that’s amazing!
Emotion: joy

Utterance: Why would you do that?!
Emotion: anger

Utterance: That’s fine. I’m okay with it.
Emotion: neutral

Utterance: Oh my God, I didn’t expect that!
Emotion: surprise

Utterance: Yuck, that’s gross.
Emotion: disgust

Utterance: I’m really scared.
Emotion: fear

Now classify:
Utterance: "{utterance}"
Emotion:""")
        ])
        template.name = "few_shot_prompt"
        return template

    def get_contextual_template(self) -> ChatPromptTemplate:
        """Returns an InstructERC-style prompt with history but no demonstration."""
        emotion_options = ", ".join(self.MELD_EMOTION_LABELS)
        system_template = ("You are an expert in conversational analysis. Your task is to identify the emotion of the "
                           "TARGET UTTERANCE based on the CONVERSATION HISTORY. "
                           f"You must only output one of the following emotion labels: <{emotion_options}>. Do not provide any explanation.")
        user_template = ("### CONVERSATION HISTORY ###\n{history}\n\n"
                         "### TARGET UTTERANCE ###\n{target_utterance_with_speaker}\n\n"
                         "Emotion:")
        template = ChatPromptTemplate.from_messages([("system", system_template), ("user", user_template)])
        template.name = "contextual_prompt"
        return template

    def get_retrieval_augmented_contextual_template(self) -> ChatPromptTemplate:
        """Returns the full InstructERC-style prompt with history and a demonstration."""
        emotion_options = ", ".join(self.MELD_EMOTION_LABELS)
        system_template = ("You are an expert in conversational analysis. Your task is to identify the emotion of the "
                           "TARGET UTTERANCE based on the provided DEMONSTRATION example and the CONVERSATION HISTORY. "
                           f"You must only output one of the following emotion labels: <{emotion_options}>. Do not provide any explanation.")
        user_template = ("### DEMONSTRATION ###\n{demonstration}\n\n"
                         "### CONVERSATION HISTORY ###\n{history}\n\n"
                         "### TARGET UTTERANCE ###\n{target_utterance_with_speaker}\n\n"
                         "Emotion:")
        template = ChatPromptTemplate.from_messages([("system", system_template), ("user", user_template)])
        template.name = "retrieval_augmented_contextual_prompt"
        return template

In [32]:
def extract_emotion_from_output(output_text: str, valid_labels: List[str]) -> Optional[str]:
    output_text = output_text.lower()
    first_found = None
    first_index = len(output_text) + 1
    for label in valid_labels:
        try:
            idx = output_text.find(label)
            if idx != -1 and idx < first_index:
                first_index = idx
                first_found = label
        except AttributeError:
            return "neutral"
    return first_found if first_found else "neutral"

def print_classification_report(labels: List[str], predictions: List[str], llm_name: str, prompt_name: str):
    print(f"\n📌 Evaluating with model: {llm_name}")
    print(f"📄 Prompt Template Type: {prompt_name}\n")
    print(classification_report(labels, predictions, zero_division=0))
    
def evaluate_emotion_classification(df: pd.DataFrame, llm, prompt: ChatPromptTemplate, data_manager: MELDDataManager, retriever: Optional[DemonstrationRetriever] = None, limit: Optional[int] = None, verbose: bool = False, print_report: bool = False):
    if verbose:
        print(f"\n📌 Evaluating with model: {llm.model}")
        print(f"📄 Prompt Template Type: {prompt.name}")
    results = []
    df_subset = df.head(limit) if limit else df
    for idx, row in tqdm(df_subset.iterrows(), total=len(df_subset), desc=f"Evaluating {prompt.name} using {llm.name}"):
        input_dict = {}
        if prompt.name in ["zero_shot_prompt", "one_shot_prompt", "few_shot_prompt"]:
            input_dict = {"utterance": row['Utterance']}
        elif prompt.name in ["contextual_prompt", "retrieval_augmented_contextual_prompt"]:
            history_list = data_manager.get_conversation_history(row['Dialogue_ID'], row['Utterance_ID'])
            input_dict["history"] = prompt_generator._format_history(history_list)
            input_dict["target_utterance_with_speaker"] = prompt_generator._format_target_utterance(row)
            if prompt.name == "retrieval_augmented_contextual_prompt":
                input_dict["demonstration"] = retriever.retrieve_demonstration(row['Utterance'])
        prompt_input = prompt.invoke(input_dict)
        raw_output = llm.invoke(prompt_input)
        if isinstance(raw_output, AIMessage):
            raw_output = raw_output.content
        predicted_emotion = extract_emotion_from_output(raw_output, PromptGenerator.MELD_EMOTION_LABELS)
        results.append(predicted_emotion)
    
    model_scores[f"{llm.name} with {prompt.name}"] = f1_score(df_test["Emotion"], results, average='weighted', zero_division=0)
    
    if print_report:
        print_classification_report(df_subset["Emotion"].tolist(), results, llm.name, prompt.name)
    return results

In [33]:
retriever = DemonstrationRetriever(train_df=df_train)
data_manager = MELDDataManager(df_test)
prompt_generator = PromptGenerator()

Initializing DemonstrationRetriever and creating in-memory vector store...
In-memory vector store created successfully.


### LLM: "llama3.1:8b"

In [34]:
llama3_8b = OllamaLLM(model="llama3.1:8b")
llama3_8b.name = "llama3.1:8b"

In [35]:
_ = evaluate_emotion_classification(
    df=df_test, llm=llama3_8b, prompt=prompt_generator.get_zero_shot_template(),
    data_manager=data_manager, print_report=True)

Evaluating zero_shot_prompt using llama3.1:8b: 100%|███████████████████████████████████████████████████████████████████████████████████████████████| 500/500 [02:15<00:00,  3.68it/s]


📌 Evaluating with model: llama3.1:8b
📄 Prompt Template Type: zero_shot_prompt

              precision    recall  f1-score   support

       anger       0.35      0.37      0.36        62
     disgust       0.12      0.36      0.19        11
        fear       0.09      0.10      0.10        10
         joy       0.55      0.55      0.55        91
     neutral       0.72      0.63      0.67       238
     sadness       0.47      0.53      0.49        38
    surprise       0.37      0.36      0.36        50

    accuracy                           0.53       500
   macro avg       0.38      0.41      0.39       500
weighted avg       0.56      0.53      0.54       500






In [36]:
_ = evaluate_emotion_classification(
    df=df_test, llm=llama3_8b, prompt=prompt_generator.get_one_shot_template(),
    data_manager=data_manager, print_report=True)

Evaluating one_shot_prompt using llama3.1:8b: 100%|████████████████████████████████████████████████████████████████████████████████████████████████| 500/500 [02:08<00:00,  3.89it/s]


📌 Evaluating with model: llama3.1:8b
📄 Prompt Template Type: one_shot_prompt

              precision    recall  f1-score   support

       anger       0.32      0.44      0.37        62
     disgust       0.06      0.27      0.10        11
        fear       0.11      0.20      0.14        10
         joy       0.61      0.51      0.55        91
     neutral       0.84      0.38      0.53       238
     sadness       0.44      0.50      0.47        38
    surprise       0.29      0.68      0.40        50

    accuracy                           0.44       500
   macro avg       0.38      0.43      0.36       500
weighted avg       0.62      0.44      0.48       500






In [37]:
_ = evaluate_emotion_classification(
    df=df_test, llm=llama3_8b, prompt=prompt_generator.get_few_shot_template(),
    data_manager=data_manager, print_report=True)

Evaluating few_shot_prompt using llama3.1:8b: 100%|████████████████████████████████████████████████████████████████████████████████████████████████| 500/500 [01:57<00:00,  4.26it/s]


📌 Evaluating with model: llama3.1:8b
📄 Prompt Template Type: few_shot_prompt

              precision    recall  f1-score   support

       anger       0.36      0.37      0.37        62
     disgust       0.10      0.27      0.15        11
        fear       0.06      0.10      0.08        10
         joy       0.70      0.47      0.57        91
     neutral       0.77      0.61      0.68       238
     sadness       0.33      0.63      0.44        38
    surprise       0.39      0.56      0.46        50

    accuracy                           0.53       500
   macro avg       0.39      0.43      0.39       500
weighted avg       0.61      0.53      0.56       500






In [38]:
_ = evaluate_emotion_classification(
    df=df_test, llm=llama3_8b, prompt=prompt_generator.get_contextual_template(),
    data_manager=data_manager, print_report=True)

Evaluating contextual_prompt using llama3.1:8b: 100%|██████████████████████████████████████████████████████████████████████████████████████████████| 500/500 [03:39<00:00,  2.28it/s]


📌 Evaluating with model: llama3.1:8b
📄 Prompt Template Type: contextual_prompt

              precision    recall  f1-score   support

       anger       0.37      0.58      0.45        62
     disgust       0.25      0.64      0.36        11
        fear       0.17      0.10      0.12        10
         joy       0.50      0.43      0.46        91
     neutral       0.73      0.38      0.50       238
     sadness       0.30      0.63      0.41        38
    surprise       0.29      0.52      0.37        50

    accuracy                           0.45       500
   macro avg       0.37      0.47      0.38       500
weighted avg       0.55      0.45      0.46       500






In [39]:
_ = evaluate_emotion_classification(
    df=df_test, llm=llama3_8b, prompt=prompt_generator.get_retrieval_augmented_contextual_template(),
    data_manager=data_manager, retriever=retriever, print_report=True)

Evaluating retrieval_augmented_contextual_prompt using llama3.1:8b: 100%|██████████████████████████████████████████████████████████████████████████| 500/500 [05:35<00:00,  1.49it/s]


📌 Evaluating with model: llama3.1:8b
📄 Prompt Template Type: retrieval_augmented_contextual_prompt

              precision    recall  f1-score   support

       anger       0.29      0.65      0.40        62
     disgust       0.11      0.18      0.14        11
        fear       0.20      0.10      0.13        10
         joy       0.62      0.45      0.52        91
     neutral       0.82      0.41      0.54       238
     sadness       0.23      0.45      0.31        38
    surprise       0.29      0.48      0.36        50

    accuracy                           0.44       500
   macro avg       0.37      0.39      0.34       500
weighted avg       0.59      0.44      0.47       500






### LLM: "gpt-4.1-nano"

In [40]:
gpt_4_1_nano = ChatOpenAI(model="gpt-4.1-nano", temperature=0)
gpt_4_1_nano.name = "gpt_4_1_nano"

In [41]:
_ = evaluate_emotion_classification(
    df=df_test, llm=gpt_4_1_nano, prompt=prompt_generator.get_zero_shot_template(),
    data_manager=data_manager, print_report=True)

Evaluating zero_shot_prompt using gpt_4_1_nano: 100%|██████████████████████████████████████████████████████████████████████████████████████████████| 500/500 [03:53<00:00,  2.14it/s]


📌 Evaluating with model: gpt_4_1_nano
📄 Prompt Template Type: zero_shot_prompt

              precision    recall  f1-score   support

       anger       0.40      0.29      0.34        62
     disgust       0.10      0.18      0.12        11
        fear       0.09      0.10      0.10        10
         joy       0.42      0.70      0.52        91
     neutral       0.81      0.58      0.67       238
     sadness       0.70      0.37      0.48        38
    surprise       0.42      0.68      0.52        50

    accuracy                           0.54       500
   macro avg       0.42      0.41      0.39       500
weighted avg       0.61      0.54      0.55       500






In [42]:
_ = evaluate_emotion_classification(
    df=df_test, llm=gpt_4_1_nano, prompt=prompt_generator.get_one_shot_template(),
    data_manager=data_manager, print_report=True)

Evaluating one_shot_prompt using gpt_4_1_nano: 100%|███████████████████████████████████████████████████████████████████████████████████████████████| 500/500 [03:38<00:00,  2.29it/s]


📌 Evaluating with model: gpt_4_1_nano
📄 Prompt Template Type: one_shot_prompt

              precision    recall  f1-score   support

       anger       0.39      0.50      0.44        62
     disgust       0.09      0.18      0.12        11
        fear       0.25      0.10      0.14        10
         joy       0.42      0.60      0.49        91
     neutral       0.92      0.43      0.58       238
     sadness       0.56      0.37      0.44        38
    surprise       0.32      0.82      0.46        50

    accuracy                           0.49       500
   macro avg       0.42      0.43      0.38       500
weighted avg       0.64      0.49      0.51       500






In [43]:
_ = evaluate_emotion_classification(
    df=df_test, llm=gpt_4_1_nano, prompt=prompt_generator.get_few_shot_template(),
    data_manager=data_manager, print_report=True)

Evaluating few_shot_prompt using gpt_4_1_nano: 100%|███████████████████████████████████████████████████████████████████████████████████████████████| 500/500 [03:31<00:00,  2.37it/s]


📌 Evaluating with model: gpt_4_1_nano
📄 Prompt Template Type: few_shot_prompt

              precision    recall  f1-score   support

       anger       0.45      0.45      0.45        62
     disgust       0.15      0.18      0.17        11
        fear       0.20      0.30      0.24        10
         joy       0.57      0.67      0.62        91
     neutral       0.80      0.72      0.76       238
     sadness       0.69      0.47      0.56        38
    surprise       0.48      0.58      0.52        50

    accuracy                           0.63       500
   macro avg       0.48      0.48      0.47       500
weighted avg       0.65      0.63      0.63       500






In [44]:
_ = evaluate_emotion_classification(
    df=df_test, llm=gpt_4_1_nano, prompt=prompt_generator.get_contextual_template(),
    data_manager=data_manager, print_report=True)

Evaluating contextual_prompt using gpt_4_1_nano: 100%|█████████████████████████████████████████████████████████████████████████████████████████████| 500/500 [03:42<00:00,  2.25it/s]


📌 Evaluating with model: gpt_4_1_nano
📄 Prompt Template Type: contextual_prompt

              precision    recall  f1-score   support

       anger       0.68      0.21      0.32        62
     disgust       0.20      0.55      0.29        11
        fear       0.43      0.30      0.35        10
         joy       0.40      0.64      0.49        91
     neutral       0.70      0.65      0.67       238
     sadness       0.76      0.34      0.47        38
    surprise       0.43      0.52      0.47        50

    accuracy                           0.55       500
   macro avg       0.51      0.46      0.44       500
weighted avg       0.60      0.55      0.54       500






In [45]:
_ = evaluate_emotion_classification(
    df=df_test, llm=gpt_4_1_nano, prompt=prompt_generator.get_retrieval_augmented_contextual_template(),
    data_manager=data_manager, retriever=retriever, print_report=True)

Evaluating retrieval_augmented_contextual_prompt using gpt_4_1_nano: 100%|█████████████████████████████████████████████████████████████████████████| 500/500 [04:02<00:00,  2.06it/s]


📌 Evaluating with model: gpt_4_1_nano
📄 Prompt Template Type: retrieval_augmented_contextual_prompt

              precision    recall  f1-score   support

       anger       0.63      0.19      0.30        62
     disgust       0.11      0.55      0.18        11
        fear       0.50      0.30      0.38        10
         joy       0.46      0.64      0.53        91
     neutral       0.78      0.62      0.69       238
     sadness       0.74      0.37      0.49        38
    surprise       0.34      0.58      0.43        50

    accuracy                           0.54       500
   macro avg       0.51      0.46      0.43       500
weighted avg       0.64      0.54      0.56       500






### LLM: "gemini-2.5-flash-lite"

In [46]:
gemini_2_5_flash_lite = ChatVertexAI(model="gemini-2.5-flash-lite", temperature=0)
gemini_2_5_flash_lite.name = "gemini-2.5-flash-lite"

In [47]:
_ = evaluate_emotion_classification(
    df=df_test, llm=gemini_2_5_flash_lite, prompt=prompt_generator.get_zero_shot_template(),
    data_manager=data_manager, print_report=True)

Evaluating zero_shot_prompt using gemini-2.5-flash-lite: 100%|█████████████████████████████████████████████████████████████████████████████████████| 500/500 [03:04<00:00,  2.71it/s]


📌 Evaluating with model: gemini-2.5-flash-lite
📄 Prompt Template Type: zero_shot_prompt

              precision    recall  f1-score   support

       anger       0.56      0.29      0.38        62
     disgust       0.16      0.45      0.23        11
        fear       0.22      0.20      0.21        10
         joy       0.54      0.62      0.58        91
     neutral       0.75      0.72      0.73       238
     sadness       0.46      0.55      0.50        38
    surprise       0.50      0.50      0.50        50

    accuracy                           0.60       500
   macro avg       0.46      0.48      0.45       500
weighted avg       0.62      0.60      0.60       500






In [48]:
_ = evaluate_emotion_classification(
    df=df_test, llm=gemini_2_5_flash_lite, prompt=prompt_generator.get_one_shot_template(),
    data_manager=data_manager, print_report=True)

Evaluating one_shot_prompt using gemini-2.5-flash-lite: 100%|██████████████████████████████████████████████████████████████████████████████████████| 500/500 [03:26<00:00,  2.42it/s]


📌 Evaluating with model: gemini-2.5-flash-lite
📄 Prompt Template Type: one_shot_prompt

              precision    recall  f1-score   support

       anger       0.45      0.40      0.42        62
     disgust       0.15      0.55      0.24        11
        fear       0.15      0.20      0.17        10
         joy       0.53      0.57      0.55        91
     neutral       0.83      0.68      0.74       238
     sadness       0.50      0.50      0.50        38
    surprise       0.52      0.62      0.56        50

    accuracy                           0.59       500
   macro avg       0.45      0.50      0.46       500
weighted avg       0.64      0.59      0.61       500






In [49]:
_ = evaluate_emotion_classification(
    df=df_test, llm=gemini_2_5_flash_lite, prompt=prompt_generator.get_few_shot_template(),
    data_manager=data_manager, print_report=True)

Evaluating few_shot_prompt using gemini-2.5-flash-lite: 100%|██████████████████████████████████████████████████████████████████████████████████████| 500/500 [03:11<00:00,  2.61it/s]


📌 Evaluating with model: gemini-2.5-flash-lite
📄 Prompt Template Type: few_shot_prompt

              precision    recall  f1-score   support

       anger       0.49      0.39      0.43        62
     disgust       0.31      0.36      0.33        11
        fear       0.17      0.10      0.12        10
         joy       0.73      0.49      0.59        91
     neutral       0.70      0.87      0.78       238
     sadness       0.53      0.50      0.51        38
    surprise       0.62      0.46      0.53        50

    accuracy                           0.65       500
   macro avg       0.51      0.45      0.47       500
weighted avg       0.64      0.65      0.63       500






In [50]:
_ = evaluate_emotion_classification(
    df=df_test, llm=gemini_2_5_flash_lite, prompt=prompt_generator.get_contextual_template(),
    data_manager=data_manager, print_report=True)

Evaluating contextual_prompt using gemini-2.5-flash-lite: 100%|████████████████████████████████████████████████████████████████████████████████████| 500/500 [03:06<00:00,  2.68it/s]


📌 Evaluating with model: gemini-2.5-flash-lite
📄 Prompt Template Type: contextual_prompt

              precision    recall  f1-score   support

       anger       0.59      0.27      0.37        62
     disgust       0.18      0.27      0.21        11
        fear       0.50      0.20      0.29        10
         joy       0.69      0.58      0.63        91
     neutral       0.72      0.88      0.79       238
     sadness       0.47      0.58      0.52        38
    surprise       0.65      0.44      0.52        50

    accuracy                           0.66       500
   macro avg       0.54      0.46      0.48       500
weighted avg       0.65      0.66      0.64       500






In [51]:
_ = evaluate_emotion_classification(
    df=df_test, llm=gemini_2_5_flash_lite, prompt=prompt_generator.get_retrieval_augmented_contextual_template(),
    data_manager=data_manager, retriever=retriever, print_report=True)

Evaluating retrieval_augmented_contextual_prompt using gemini-2.5-flash-lite: 100%|████████████████████████████████████████████████████████████████| 500/500 [03:58<00:00,  2.10it/s]


📌 Evaluating with model: gemini-2.5-flash-lite
📄 Prompt Template Type: retrieval_augmented_contextual_prompt

              precision    recall  f1-score   support

       anger       0.52      0.19      0.28        62
     disgust       0.23      0.45      0.30        11
        fear       0.18      0.20      0.19        10
         joy       0.67      0.56      0.61        91
     neutral       0.72      0.83      0.77       238
     sadness       0.46      0.55      0.50        38
    surprise       0.46      0.44      0.45        50

    accuracy                           0.62       500
   macro avg       0.46      0.46      0.44       500
weighted avg       0.62      0.62      0.61       500






### 📊 Visualizing Model Performance with Weighted F1 Scores

To compare the effectiveness of different models on the MELD dataset, I visualize their **weighted F1 scores** using a horizontal bar chart. The weighted F1 score accounts for class imbalance by weighting each class’s F1 score by its support (i.e., number of true instances).

In [52]:
fig = go.Figure(go.Bar(
    x=list(model_scores.values()),
    y=list(model_scores.keys()),
    orientation='h',
    text=[f"{v:.2f}" for v in model_scores.values()],
    textposition='auto'
))

fig.update_layout(
    title="Model Comparison by Weighted F1 Score on the MELD dataset",
    xaxis_title="Weighted F1 Score",
    yaxis_title="Model",
    xaxis_range=[0, 1],
    width=1200,
    height=600,
    margin=dict(l=50, r=50, t=50, b=50),
)

fig.show()
