# Part 2: Speech-to-Text Integration

## Setup

Libraries

In [None]:
#!rm -rf /content/*

In [None]:
!pip install openai-whisper
!pip install spacy

Collecting openai-whisper
  Downloading openai-whisper-20240930.tar.gz (800 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/800.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━[0m [32m583.7/800.5 kB[0m [31m18.2 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m800.5/800.5 kB[0m [31m15.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch->openai-whisper)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch->openai-whisper)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda

In [None]:
import whisper
import numpy as np
import zipfile
import os
import shutil
import pandas as pd
import re
from IPython.display import display
import tensorflow as tf
from tensorflow.keras.models import load_model
from tensorflow.keras import layers
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import MultiLabelBinarizer
from tensorflow.keras.layers import Layer
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import spacy
spacy.cli.download("en_core_web_md")
import pickle
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.utils.class_weight import compute_class_weight
import random


SEED = 42

random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)
os.environ['PYTHONHASHSEED'] = str(SEED)

[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_md')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


Load Data

In [None]:
# Unzip
with zipfile.ZipFile("Audios.zip", 'r') as zip_ref:
    zip_ref.extractall("audios")

# Delete the macOS system folder
shutil.rmtree("audios/__MACOSX", ignore_errors=True)

# Check files
os.listdir("audios")


['Audios']

In [None]:
df_labels_sent = pd.read_csv("label_sent.csv", delimiter = ";")
df_labels_sent.head()

Unnamed: 0,filename,sentiment
0,1.mp4,negative
1,2.mp4,positive
2,3.mp4,negative
3,4.mp4,positive
4,5.mp4,positive


In [None]:
df_labels_emo = pd.read_csv("label_emo.csv", delimiter = ";")
df_labels_emo.head()

Unnamed: 0,filename,emotion
0,1.mp4,disgust
1,2.mp4,admiration
2,3.mp4,disappointment
3,4.mp4,joy
4,5.mp4,love


In [None]:
df_transcription = pd.read_csv("Transcription.csv", delimiter=";")
df_transcription.head()

Unnamed: 0,filename,real_transcription
0,1.mp4,Terrible quality. The ship wasn't even straigh...
1,2.mp4,My husband loved this set. The quality was go...
2,3.mp4,The toy broke within minutes of opening the box.
3,4.mp4,So cute and fluffy
4,5.mp4,"Great little truck, son loves it and plays wit..."


## Whisper Model

Whisper

In [None]:
# Load Whisper model
model = whisper.load_model("base")

# Folder where your .mp4 files are located
folder = "audios/Audios"

# Transcribe each file
transcripts = []
for filename in os.listdir(folder):
    if filename.endswith(".mp4"):
        path = os.path.join(folder, filename)
        result = model.transcribe(path)
        transcripts.append({
            "filename": filename,
            "whisper_transcription": result["text"]
        })

# Create DataFrame
df_transcripts = pd.DataFrame(transcripts)

100%|███████████████████████████████████████| 139M/139M [00:01<00:00, 98.4MiB/s]


In [None]:
df_transcripts.head()

Unnamed: 0,filename,whisper_transcription
0,100.mp4,8 year old son loved it.
1,133.mp4,I got this for my son for his fourth birthday...
2,38.mp4,"Very nice costume, just a bunch that I had to..."
3,14.mp4,"Very cheap quality, but it did the trick."
4,52.mp4,Mike it loves Ligo and loves Minecraft. He li...


In [None]:
df_transcripts.to_csv("whisper.csv", index=False)

In [None]:
df_transcripts = pd.read_csv("whisper.csv")

## Merging the Data

When we have one model for sentiments and other for emotions we will use this datasets

In [None]:
# Merge Whisper transcriptions with labels
df_whisperonemodel_sent = pd.merge(df_transcripts, df_labels_sent, on="filename")

# Merge with real transcriptions
df_realonemodel_sent = pd.merge(df_transcription, df_labels_sent, on="filename")

In [None]:
df_whisperonemodel_sent.head()

Unnamed: 0,filename,whisper_transcription,sentiment
0,100.mp4,8 year old son loved it.,positive
1,133.mp4,I got this for my son for his fourth birthday...,negative
2,38.mp4,"Very nice costume, just a bunch that I had to...",negative
3,14.mp4,"Very cheap quality, but it did the trick.",negative
4,52.mp4,Mike it loves Ligo and loves Minecraft. He li...,positive


In [None]:
def clean_text(text):
    text = str(text).lower()  # Lowercase
    text = re.sub(r'https?://\S+|www\.\S+', '', text)  # Remove URLs
    text = re.sub(r'<.*?>', '', text)  # Remove HTML tags
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra spaces
    return text

df_whisperonemodel_sent["whisper_transcription"] = df_whisperonemodel_sent["whisper_transcription"].apply(clean_text)

In [None]:
df_realonemodel_sent.head()

Unnamed: 0,filename,real_transcription,sentiment
0,1.mp4,Terrible quality. The ship wasn't even straigh...,negative
1,2.mp4,My husband loved this set. The quality was go...,positive
2,3.mp4,The toy broke within minutes of opening the box.,negative
3,4.mp4,So cute and fluffy,positive
4,5.mp4,"Great little truck, son loves it and plays wit...",positive


In [None]:
def clean_text(text):
    text = str(text).lower()  # Lowercase
    text = re.sub(r'https?://\S+|www\.\S+', '', text)  # Remove URLs
    text = re.sub(r'<.*?>', '', text)  # Remove HTML tags
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra spaces
    return text

df_realonemodel_sent["real_transcription"] = df_realonemodel_sent["real_transcription"].apply(clean_text)

In [None]:
# Merge Whisper transcriptions with labels
df_whisperonemodel_emo = pd.merge(df_transcripts, df_labels_emo, on="filename")

# Merge with real transcriptions
df_realonemodel_emo = pd.merge(df_transcription, df_labels_emo, on="filename")

In [None]:
df_whisperonemodel_emo.head()

Unnamed: 0,filename,whisper_transcription,emotion
0,100.mp4,8 year old son loved it.,joy
1,133.mp4,I got this for my son for his fourth birthday...,disappointment
2,38.mp4,"Very nice costume, just a bunch that I had to...",annoyance
3,14.mp4,"Very cheap quality, but it did the trick.",annoyance
4,52.mp4,Mike it loves Ligo and loves Minecraft. He li...,admiration


In [None]:
def clean_text(text):
    text = str(text).lower()  # Lowercase
    text = re.sub(r'https?://\S+|www\.\S+', '', text)  # Remove URLs
    text = re.sub(r'<.*?>', '', text)  # Remove HTML tags
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra spaces
    return text

df_whisperonemodel_emo["whisper_transcription"] = df_whisperonemodel_emo["whisper_transcription"].apply(clean_text)

In [None]:
df_realonemodel_emo.head()

Unnamed: 0,filename,real_transcription,emotion
0,1.mp4,Terrible quality. The ship wasn't even straigh...,disgust
1,2.mp4,My husband loved this set. The quality was go...,admiration
2,3.mp4,The toy broke within minutes of opening the box.,disappointment
3,4.mp4,So cute and fluffy,joy
4,5.mp4,"Great little truck, son loves it and plays wit...",love


In [None]:
def clean_text(text):
    text = str(text).lower()  # Lowercase
    text = re.sub(r'https?://\S+|www\.\S+', '', text)  # Remove URLs
    text = re.sub(r'<.*?>', '', text)  # Remove HTML tags
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra spaces
    return text

df_realonemodel_emo["real_transcription"] = df_realonemodel_emo["real_transcription"].apply(clean_text)

When we have a model for both sentiments and emotions we will use this datasets

In [None]:
# Merge Whisper transcriptions with labels
df_whisperbothmodel_sent = pd.merge(df_transcripts, df_labels_sent, on="filename")

# Merge with real transcriptions
df_realbothmodel_sent = pd.merge(df_transcription, df_labels_sent, on="filename")

In [None]:
df_whisperbothmodel_sent.head()

Unnamed: 0,filename,whisper_transcription,sentiment
0,100.mp4,8 year old son loved it.,positive
1,133.mp4,I got this for my son for his fourth birthday...,negative
2,38.mp4,"Very nice costume, just a bunch that I had to...",negative
3,14.mp4,"Very cheap quality, but it did the trick.",negative
4,52.mp4,Mike it loves Ligo and loves Minecraft. He li...,positive


In [None]:
def clean_text(text):
    text = str(text).lower()  # Lowercase
    text = re.sub(r'https?://\S+|www\.\S+', '', text)  # Remove URLs
    text = re.sub(r'<.*?>', '', text)  # Remove HTML tags
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra spaces
    return text

df_whisperbothmodel_sent["whisper_transcription"] = df_whisperbothmodel_sent["whisper_transcription"].apply(clean_text)

In [None]:
df_realbothmodel_sent.head()

Unnamed: 0,filename,real_transcription,sentiment
0,1.mp4,Terrible quality. The ship wasn't even straigh...,negative
1,2.mp4,My husband loved this set. The quality was go...,positive
2,3.mp4,The toy broke within minutes of opening the box.,negative
3,4.mp4,So cute and fluffy,positive
4,5.mp4,"Great little truck, son loves it and plays wit...",positive


In [None]:
def clean_text(text):
    text = str(text).lower()  # Lowercase
    text = re.sub(r'https?://\S+|www\.\S+', '', text)  # Remove URLs
    text = re.sub(r'<.*?>', '', text)  # Remove HTML tags
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra spaces
    return text

df_realbothmodel_sent["real_transcription"] = df_realbothmodel_sent["real_transcription"].apply(clean_text)

In [None]:
# Merge Whisper transcriptions with labels
df_whisperbothmodel_emo = pd.merge(df_transcripts, df_labels_emo, on="filename")

# Merge with real transcriptions
df_realbothmodel_emo = pd.merge(df_transcription, df_labels_emo, on="filename")

In [None]:
df_whisperbothmodel_emo.head()

Unnamed: 0,filename,whisper_transcription,emotion
0,100.mp4,8 year old son loved it.,joy
1,133.mp4,I got this for my son for his fourth birthday...,disappointment
2,38.mp4,"Very nice costume, just a bunch that I had to...",annoyance
3,14.mp4,"Very cheap quality, but it did the trick.",annoyance
4,52.mp4,Mike it loves Ligo and loves Minecraft. He li...,admiration


In [None]:
def clean_text(text):
    text = str(text).lower()  # Lowercase
    text = re.sub(r'https?://\S+|www\.\S+', '', text)  # Remove URLs
    text = re.sub(r'<.*?>', '', text)  # Remove HTML tags
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra spaces
    return text

df_whisperbothmodel_emo["whisper_transcription"] = df_whisperbothmodel_emo["whisper_transcription"].apply(clean_text)

In [None]:
df_realbothmodel_emo.head()

Unnamed: 0,filename,real_transcription,emotion
0,1.mp4,Terrible quality. The ship wasn't even straigh...,disgust
1,2.mp4,My husband loved this set. The quality was go...,admiration
2,3.mp4,The toy broke within minutes of opening the box.,disappointment
3,4.mp4,So cute and fluffy,joy
4,5.mp4,"Great little truck, son loves it and plays wit...",love


In [None]:
def clean_text(text):
    text = str(text).lower()  # Lowercase
    text = re.sub(r'https?://\S+|www\.\S+', '', text)  # Remove URLs
    text = re.sub(r'<.*?>', '', text)  # Remove HTML tags
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra spaces
    return text

df_realbothmodel_emo["real_transcription"] = df_realbothmodel_emo["real_transcription"].apply(clean_text)

We create a dataframe from the real transcriptions and for the whisper ones to later compare how transcription errors impact sentiment analysis accuracy

## Apply the Models

Load Tokenizers

In [None]:
# Load the saved tokenizer (for sentiment)
with open("tokenizer_sentimento.pkl", "rb") as f:
    tokenizer_sent = pickle.load(f)

# Load the saved vectorizer (for emotion)
with open("tokenizer_emocoes.pkl", "rb") as f:
    tokenizer_emo = pickle.load(f)

# Load the saved maxlen for emotion
with open("maxlen_emocoes.pkl", "rb") as f:
    maxlen_emo = pickle.load(f)

In [None]:
# Define emotion classes
emotion_labels = ["admiration", "amusement", "anger", "annoyance", "approval", "caring",
                  "confusion", "curiosity", "desire", "disappointment", "disapproval",
                  "disgust", "embarrassment", "excitement", "fear", "gratitude", "grief",
                  "joy", "love", "nervousness", "optimism", "pride", "realization", "relief",
                  "remorse", "sadness", "surprise", "neutral"]

### Separate Models for Sentiment and Emotion

Load models

In [None]:
# Load sentiment model
sentiment_model = load_model("best_model.h5", compile=False)

sentiment_model.compile(
    loss='binary_crossentropy',
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
    metrics=['accuracy']
)

In [None]:
class AttentionLayer(Layer):
    def __init__(self, **kwargs):
        super(AttentionLayer, self).__init__(**kwargs)

    def build(self, input_shape):
        self.W = self.add_weight(name="att_weight", shape=(input_shape[-1], 1), initializer="normal")
        self.b = self.add_weight(name="att_bias", shape=(input_shape[1], 1), initializer="zeros")
        super(AttentionLayer, self).build(input_shape)

    def call(self, x):
        et = tf.nn.tanh(tf.matmul(x, self.W) + self.b)  # shape: (batch_size, seq_len, 1)
        at = tf.nn.softmax(et, axis=1)  # shape: (batch_size, seq_len, 1)
        output = x * at  # Apply attention weights (broadcasted multiplication)
        return output  # shape: (batch_size, seq_len, features)

def focal_loss(gamma=2., alpha=0.25):
    def loss(y_true, y_pred):
        epsilon = tf.keras.backend.epsilon()
        y_pred = tf.clip_by_value(y_pred, epsilon, 1. - epsilon)

        cross_entropy = -y_true * tf.math.log(y_pred) - (1 - y_true) * tf.math.log(1 - y_pred)
        p_t = y_true * y_pred + (1 - y_true) * (1 - y_pred)
        alpha_factor = y_true * alpha + (1 - y_true) * (1 - alpha)
        modulating_factor = tf.pow(1.0 - p_t, gamma)

        return tf.reduce_mean(alpha_factor * modulating_factor * cross_entropy)
    return loss

In [None]:
# Load emotion model
emotion_model = load_model(
    'best_model_emo.h5',
    custom_objects={
        'AttentionLayer': AttentionLayer,
        'loss': focal_loss(),
    },
    compile=False
)

emotion_model.compile(
    loss=focal_loss(),
    optimizer='adam',
    metrics=[
        tf.keras.metrics.AUC(name='auc'),
        tf.keras.metrics.Precision(),
        tf.keras.metrics.Recall()
    ]
)

#### Apply Sentiment and Emotion Models to Whisper Transcriptions

In [None]:
def tokenize_and_pad(text, label):
    texts = []
    labels = []

    for t in text:
        texts.append(t)

    for l in label:
        labels.append(l)

    sequences = tokenizer_sent.texts_to_sequences(texts)
    padded = pad_sequences(sequences, maxlen=400, padding="post", truncating="post")

    return tf.data.Dataset.from_tensor_slices((padded, labels)).batch(512).prefetch(tf.data.AUTOTUNE)

X_sent = tokenize_and_pad(df_whisperonemodel_sent['whisper_transcription'], df_whisperonemodel_sent['sentiment'])

sentiment_probs = sentiment_model.predict(X_sent)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2s/step


In [None]:
text_seq_emotion = tokenizer_emo.texts_to_sequences(df_whisperonemodel_emo["whisper_transcription"])
X_padded_emotion = pad_sequences(text_seq_emotion, maxlen=maxlen_emo, padding="post", truncating="post")

mlb = MultiLabelBinarizer(classes=emotion_labels)
df_whisperonemodel_emo["emotion"] = df_whisperonemodel_emo["emotion"].apply(
    lambda x: [x] if isinstance(x, str) else x
)
y_emotion_bin = mlb.fit_transform(df_whisperonemodel_emo["emotion"])

X_emo = tf.data.Dataset.from_tensor_slices((X_padded_emotion, y_emotion_bin))
X_emo = X_emo.shuffle(1000, seed=42, reshuffle_each_iteration=False).batch(256).prefetch(tf.data.AUTOTUNE)

emotion_probs = emotion_model.predict(X_emo)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1s/step


In [None]:
df_whisperonemodel_sent["predicted_sentiment"] = np.where(sentiment_probs >= 0.5, "positive", "negative")

def decode_top_emotion(probs):
    top_idx = np.argmax(probs)
    return [emotion_labels[top_idx]]  # wrapped in a list to match your format

df_whisperonemodel_emo["predicted_emotion"] = [decode_top_emotion(p) for p in emotion_probs]


# Convert true labels
df_whisperonemodel_emo["emotion"] = df_whisperonemodel_emo["emotion"].apply(lambda x: [x] if isinstance(x, str) else x)

# Evaluation

## Sentiment
print("Sentiment Evaluation:")
print("Accuracy:", accuracy_score(df_whisperonemodel_sent["sentiment"], df_whisperonemodel_sent["predicted_sentiment"]))
print(classification_report(df_whisperonemodel_sent["sentiment"], df_whisperonemodel_sent["predicted_sentiment"]))

## Emotion
mlb = MultiLabelBinarizer(classes=emotion_labels)
y_true = mlb.fit_transform(df_whisperonemodel_emo["emotion"])
y_pred = mlb.transform(df_whisperonemodel_emo["predicted_emotion"])

print("Emotion Evaluation:")
print(classification_report(y_true, y_pred, target_names=emotion_labels))


Sentiment Evaluation:
Accuracy: 0.845
              precision    recall  f1-score   support

    negative       0.72      0.48      0.58        44
    positive       0.87      0.95      0.91       156

    accuracy                           0.84       200
   macro avg       0.79      0.71      0.74       200
weighted avg       0.83      0.84      0.83       200

Emotion Evaluation:
                precision    recall  f1-score   support

    admiration       0.10      0.50      0.17        16
     amusement       0.00      0.00      0.00        14
         anger       0.00      0.00      0.00         2
     annoyance       0.00      0.00      0.00        11
      approval       0.00      0.00      0.00        15
        caring       0.00      0.00      0.00        12
     confusion       0.00      0.00      0.00         2
     curiosity       0.00      0.00      0.00         2
        desire       0.00      0.00      0.00         1
disappointment       0.00      0.00      0.00        1

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
df_whisperonemodel = pd.merge(
    df_whisperonemodel_sent,
    df_whisperonemodel_emo,
    on=["filename", "whisper_transcription"]
)

In [None]:
df_whisperonemodel.head()

Unnamed: 0,filename,whisper_transcription,sentiment,predicted_sentiment,emotion,predicted_emotion
0,100.mp4,8 year old son loved it,positive,positive,[joy],[love]
1,133.mp4,i got this for my son for his fourth birthday ...,negative,positive,[disappointment],[love]
2,38.mp4,very nice costume just a bunch that i had to p...,negative,positive,[annoyance],[approval]
3,14.mp4,very cheap quality but it did the trick,negative,positive,[annoyance],[admiration]
4,52.mp4,mike it loves ligo and loves minecraft he like...,positive,positive,[admiration],[love]


#### Apply Sentiment and Emotion Models to Real Transcriptions

In [None]:
def tokenize_and_pad(text, label):
    texts = []
    labels = []

    for t in text:
        texts.append(t)

    for l in label:
        labels.append(l)

    sequences = tokenizer_sent.texts_to_sequences(texts)
    padded = pad_sequences(sequences, maxlen=400, padding="post", truncating="post")

    return tf.data.Dataset.from_tensor_slices((padded, labels)).batch(512).prefetch(tf.data.AUTOTUNE)

X_sent = tokenize_and_pad(df_realonemodel_sent['real_transcription'], df_realonemodel_sent['sentiment'])

sentiment_probs = sentiment_model.predict(X_sent)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 731ms/step


In [None]:
text_seq_emotion = tokenizer_emo.texts_to_sequences(df_realonemodel_emo["real_transcription"])
X_padded_emotion = pad_sequences(text_seq_emotion, maxlen=maxlen_emo, padding="post", truncating="post")

mlb = MultiLabelBinarizer(classes=emotion_labels)
df_realonemodel_emo["emotion"] = df_realonemodel_emo["emotion"].apply(
    lambda x: [x] if isinstance(x, str) else x
)
y_emotion_bin = mlb.fit_transform(df_realonemodel_emo["emotion"])

X_emo = tf.data.Dataset.from_tensor_slices((X_padded_emotion, y_emotion_bin))
X_emo = X_emo.shuffle(1000, seed=42, reshuffle_each_iteration=False).batch(256).prefetch(tf.data.AUTOTUNE)

emotion_probs = emotion_model.predict(X_emo)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 55ms/step


In [None]:
df_realonemodel_sent["predicted_sentiment"] = np.where(sentiment_probs >= 0.5, "positive", "negative")

def decode_top_emotion(probs):
    top_idx = np.argmax(probs)
    return [emotion_labels[top_idx]]

df_realonemodel_emo["predicted_emotion"] = [decode_top_emotion(p) for p in emotion_probs]

# Convert true labels
df_realonemodel_emo["emotion"] = df_realonemodel_emo["emotion"].apply(lambda x: [x] if isinstance(x, str) else x)

# Evaluation

## Sentiment
print("Sentiment Evaluation:")
print("Accuracy:", accuracy_score(df_realonemodel_sent["sentiment"], df_realonemodel_sent["predicted_sentiment"]))
print(classification_report(df_realonemodel_sent["sentiment"], df_realonemodel_sent["predicted_sentiment"]))

## Emotion
mlb = MultiLabelBinarizer(classes=emotion_labels)
y_true = mlb.fit_transform(df_realonemodel_emo["emotion"])
y_pred = mlb.transform(df_realonemodel_emo["predicted_emotion"])

print("Emotion Evaluation:")
print(classification_report(y_true, y_pred, target_names=emotion_labels))

Sentiment Evaluation:
Accuracy: 0.855
              precision    recall  f1-score   support

    negative       0.74      0.52      0.61        44
    positive       0.88      0.95      0.91       156

    accuracy                           0.85       200
   macro avg       0.81      0.74      0.76       200
weighted avg       0.85      0.85      0.85       200

Emotion Evaluation:
                precision    recall  f1-score   support

    admiration       0.10      0.50      0.17        16
     amusement       0.15      0.14      0.15        14
         anger       0.00      0.00      0.00         2
     annoyance       0.00      0.00      0.00        11
      approval       0.10      0.07      0.08        15
        caring       0.00      0.00      0.00        12
     confusion       0.00      0.00      0.00         2
     curiosity       0.00      0.00      0.00         2
        desire       0.00      0.00      0.00         1
disappointment       0.00      0.00      0.00        1

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
df_realonemodel = pd.merge(
    df_realonemodel_sent,
    df_realonemodel_emo,
    on=["filename", "real_transcription"]
)

In [None]:
df_realonemodel.head()

Unnamed: 0,filename,real_transcription,sentiment,predicted_sentiment,emotion,predicted_emotion
0,1.mp4,terrible quality the ship wasnt even straight ...,negative,negative,[disgust],[approval]
1,2.mp4,my husband loved this set the quality was good...,positive,positive,[admiration],[admiration]
2,3.mp4,the toy broke within minutes of opening the box,negative,positive,[disappointment],[admiration]
3,4.mp4,so cute and fluffy,positive,positive,[joy],[admiration]
4,5.mp4,great little truck son loves it and plays with...,positive,positive,[love],[admiration]


### Multi-Task Learning

Load model

In [None]:
def masked_binary_crossentropy(y_true, y_pred):
    mask = tf.cast(tf.not_equal(y_true, -1), tf.float32)
    #y_pred = safe_sigmoid(y_pred)

    # Calculate per-element loss
    per_element_loss = tf.keras.backend.binary_crossentropy(y_true, y_pred)

    # Mask and sum losses per sample, then divide by valid counts
    masked_loss = per_element_loss * mask
    valid_counts = tf.reduce_sum(mask, axis=1) + 1e-7
    per_sample_loss = tf.reduce_sum(masked_loss, axis=1) / valid_counts

    return per_sample_loss  # Shape: (batch_size,)

def masked_focal_loss(gamma=2.0, alpha=0.25):
    def loss(y_true, y_pred):
        mask = tf.cast(tf.not_equal(y_true, -1), tf.float32)
        #y_pred = safe_sigmoid(y_pred)
        p_t = y_true * y_pred + (1-y_true)*(1-y_pred)
        alpha_factor = y_true*alpha + (1-y_true)*(1-alpha)
        modulating_factor = tf.pow(1-p_t, gamma)
        ce_loss = -tf.math.log(tf.maximum(p_t, 1e-7))
        per_element_loss = alpha_factor * modulating_factor * ce_loss
        masked_loss = per_element_loss * mask
        valid_counts = tf.reduce_sum(mask, axis=1) + 1e-7
        return tf.reduce_sum(masked_loss, axis=1) / valid_counts  # Shape: (batch_size,)
    return loss

#### Apply Multi-Task Model to Whisper Transcriptions

Tokenizer

In [None]:
def tokenize_and_pad(text, label):
    texts = []
    labels = []

    for t in text:
        texts.append(t)

    for l in label:
        labels.append(l)

    sequences = tokenizer_sent.texts_to_sequences(texts)
    padded = pad_sequences(sequences, maxlen=400, padding="post", truncating="post")

    return tf.data.Dataset.from_tensor_slices((padded, labels)).batch(512).prefetch(tf.data.AUTOTUNE)

X_sent_w = tokenize_and_pad(df_whisperbothmodel_sent['whisper_transcription'], df_whisperbothmodel_sent['sentiment'])

In [None]:
# Initialize empty lists
X_text_w = []
y_sentiment_w = []

# Loop through the dataset and collect values
for text_batch, label_batch in X_sent_w:
    X_text_w.extend(text_batch.numpy())
    y_sentiment_w.extend(label_batch.numpy())


X_text_w = np.array(X_text_w)
y_sentiment_w = np.array(y_sentiment_w)

In [None]:
text_seq_emotion_w = tokenizer_emo.texts_to_sequences(df_whisperbothmodel_emo["whisper_transcription"])
X_padded_emotion_w = pad_sequences(text_seq_emotion_w, maxlen=maxlen_emo, padding="post", truncating="post")

mlb = MultiLabelBinarizer(classes=emotion_labels)
df_whisperbothmodel_emo["emotion"] = df_whisperbothmodel_emo["emotion"].apply(
    lambda x: [x] if isinstance(x, str) else x
)
y_emotion_bin_w = mlb.fit_transform(df_whisperbothmodel_emo["emotion"])

X_emo_w = tf.data.Dataset.from_tensor_slices((X_padded_emotion_w, y_emotion_bin_w))
X_emo_w = X_emo_w.shuffle(1000, seed=42, reshuffle_each_iteration=False).batch(256).prefetch(tf.data.AUTOTUNE)

In [None]:
X_text_e_w = []
y_emotion_e_w = []

for text_batch, label_batch in X_emo_w:
    X_text_e_w.extend(text_batch.numpy())
    y_emotion_e_w.extend(label_batch.numpy())

X_text_e_w = np.array(X_text_e_w)
y_emotion_e_w = np.array(y_emotion_e_w)

In [None]:
X_text_e_w = pad_sequences(X_text_e_w, maxlen=400, padding='post', truncating='post')

In [None]:
X_text_all_w = np.concatenate([X_text_w, X_text_e_w])

In [None]:
y_sentiment_all_w = np.concatenate([y_sentiment_w, np.full((len(y_emotion_e_w),), -1)])

In [None]:
y_emotion_all_w = np.concatenate([np.full((len(y_sentiment_w), 28), -1), y_emotion_e_w])

In [None]:
test_ds_w = tf.data.Dataset.from_tensor_slices(
    (X_text_all_w, {'sentiment': y_sentiment_all_w, 'emotion': y_emotion_all_w})
)

test_ds_w = test_ds_w.shuffle(buffer_size=len(X_text_all_w), seed=42, reshuffle_each_iteration=False)
test_ds_w = test_ds_w.batch(512).prefetch(tf.data.AUTOTUNE)

Compute the class weights to load the model

In [None]:
# Load model
model = load_model(
    "best_final_model.h5",
    custom_objects={
        'masked_binary_crossentropy': masked_binary_crossentropy,
        'masked_focal_loss': masked_focal_loss
    },
    compile = False
)

optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)

model.compile(
    optimizer=optimizer,
    loss={
        'sentiment': masked_binary_crossentropy,
        'emotion': masked_focal_loss()
    },
    loss_weights={
        'sentiment': 6.0,
        'emotion': 1.0
    },
    metrics={
        'sentiment': 'accuracy',
        'emotion': [tf.keras.metrics.AUC(name='auc'),
        tf.keras.metrics.Precision(),
        tf.keras.metrics.Recall()]
    })

Evaluate

In [None]:
sentiment_preds_w, emotion_preds_w = model.predict(test_ds_w)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3s/step


In [None]:
# Split predictions
sentiment_preds_only_w = sentiment_preds_w[:len(df_whisperbothmodel_sent)]
emotion_preds_only_w = emotion_preds_w[len(df_whisperbothmodel_sent):]

In [None]:
df_whisperbothmodel_sent["predicted_sentiment"] = np.where(
    sentiment_preds_only_w >= 0.5, "positive", "negative"
)

def decode_top_emotion(probs):
    top_idx = np.argmax(probs)
    return [emotion_labels[top_idx]]

df_whisperbothmodel_emo["predicted_emotion"] = [
    decode_top_emotion(p) for p in emotion_preds_only_w
]

# Convert true labels
df_whisperbothmodel_emo["emotion"] = df_whisperbothmodel_emo["emotion"].apply(lambda x: [x] if isinstance(x, str) else x)

# Evaluation

## Sentiment
print("Sentiment Evaluation:")
print("Accuracy:", accuracy_score(df_whisperbothmodel_sent["sentiment"], df_whisperbothmodel_sent["predicted_sentiment"]))
print(classification_report(df_whisperbothmodel_sent["sentiment"], df_whisperbothmodel_sent["predicted_sentiment"]))

## Emotion
mlb = MultiLabelBinarizer(classes=emotion_labels)
y_true = mlb.fit_transform(df_whisperbothmodel_emo["emotion"])
y_pred = mlb.transform(df_whisperbothmodel_emo["predicted_emotion"])

print("Emotion Evaluation:")
print(classification_report(y_true, y_pred, target_names=emotion_labels))

Sentiment Evaluation:
Accuracy: 0.765
              precision    recall  f1-score   support

    negative       0.00      0.00      0.00        44
    positive       0.78      0.98      0.87       156

    accuracy                           0.77       200
   macro avg       0.39      0.49      0.43       200
weighted avg       0.61      0.77      0.68       200

Emotion Evaluation:
                precision    recall  f1-score   support

    admiration       0.08      0.12      0.10        16
     amusement       0.08      0.07      0.08        14
         anger       0.00      0.00      0.00         2
     annoyance       0.00      0.00      0.00        11
      approval       0.00      0.00      0.00        15
        caring       0.00      0.00      0.00        12
     confusion       0.00      0.00      0.00         2
     curiosity       0.00      0.00      0.00         2
        desire       0.00      0.00      0.00         1
disappointment       0.33      0.21      0.26        1

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
df_whisperbothmodel = pd.merge(
    df_whisperbothmodel_sent,
    df_whisperbothmodel_emo,
    on=["filename", "whisper_transcription"]
)

In [None]:
df_whisperbothmodel.head()

Unnamed: 0,filename,whisper_transcription,sentiment,predicted_sentiment,emotion,predicted_emotion
0,100.mp4,8 year old son loved it,positive,positive,[joy],[realization]
1,133.mp4,i got this for my son for his fourth birthday ...,negative,positive,[disappointment],[surprise]
2,38.mp4,very nice costume just a bunch that i had to p...,negative,positive,[annoyance],[amusement]
3,14.mp4,very cheap quality but it did the trick,negative,positive,[annoyance],[realization]
4,52.mp4,mike it loves ligo and loves minecraft he like...,positive,positive,[admiration],[optimism]


#### Apply Multi-Task Model to Real Transcriptions

Tokenizer

In [None]:
def tokenize_and_pad(text, label):
    texts = []
    labels = []

    for t in text:
        texts.append(t)

    for l in label:
        labels.append(l)

    sequences = tokenizer_sent.texts_to_sequences(texts)
    padded = pad_sequences(sequences, maxlen=400, padding="post", truncating="post")

    return tf.data.Dataset.from_tensor_slices((padded, labels)).batch(512).prefetch(tf.data.AUTOTUNE)

X_sent = tokenize_and_pad(df_realbothmodel_sent['real_transcription'], df_realbothmodel_sent['sentiment'])

In [None]:
# Initialize empty lists
X_text = []
y_sentiment = []

# Loop through the dataset and collect values
for text_batch, label_batch in X_sent:
    X_text.extend(text_batch.numpy())
    y_sentiment.extend(label_batch.numpy())


X_text = np.array(X_text)
y_sentiment = np.array(y_sentiment)

In [None]:
text_seq_emotion = tokenizer_emo.texts_to_sequences(df_realbothmodel_emo["real_transcription"])
X_padded_emotion = pad_sequences(text_seq_emotion, maxlen=maxlen_emo, padding="post", truncating="post")

mlb = MultiLabelBinarizer(classes=emotion_labels)
df_realbothmodel_emo["emotion"] = df_realbothmodel_emo["emotion"].apply(
    lambda x: [x] if isinstance(x, str) else x
)
y_emotion_bin = mlb.fit_transform(df_realbothmodel_emo["emotion"])

X_emo = tf.data.Dataset.from_tensor_slices((X_padded_emotion, y_emotion_bin))
X_emo = X_emo.shuffle(1000, seed=42, reshuffle_each_iteration=False).batch(256).prefetch(tf.data.AUTOTUNE)

In [None]:
X_text_e = []
y_emotion_e = []

for text_batch, label_batch in X_emo:
    X_text_e.extend(text_batch.numpy())
    y_emotion_e.extend(label_batch.numpy())

X_text_e = np.array(X_text_e)
y_emotion_e = np.array(y_emotion_e)

In [None]:
X_text_e = pad_sequences(X_text_e, maxlen=400, padding='post', truncating='post')

In [None]:
X_text_all = np.concatenate([X_text, X_text_e])
y_sentiment_all = np.concatenate([y_sentiment, np.full((len(y_emotion_e),), -1)])
y_emotion_all = np.concatenate([np.full((len(y_sentiment), 28), -1), y_emotion_e])

In [None]:
test_ds = tf.data.Dataset.from_tensor_slices(
    (X_text_all, {'sentiment': y_sentiment_all, 'emotion': y_emotion_all})
)

test_ds = test_ds.shuffle(buffer_size=len(X_text_all), seed=42, reshuffle_each_iteration=False)
test_ds = test_ds.batch(512).prefetch(tf.data.AUTOTUNE)

Evaluate

In [None]:
sentiment_preds, emotion_preds = model.predict(test_ds)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2s/step


In [None]:
# Split predictions
sentiment_preds_only = sentiment_preds[:len(df_whisperbothmodel_sent)]
emotion_preds_only = emotion_preds[len(df_whisperbothmodel_sent):]

In [None]:
df_realbothmodel_sent["predicted_sentiment"] = np.where(
    sentiment_preds_only >= 0.5, "positive", "negative"
)

def decode_top_emotion(probs):
    top_idx = np.argmax(probs)
    return [emotion_labels[top_idx]]

df_realbothmodel_emo["predicted_emotion"] = [
    decode_top_emotion(p) for p in emotion_preds_only
]

# Convert true labels
df_realbothmodel_emo["emotion"] = df_realbothmodel_emo["emotion"].apply(lambda x: [x] if isinstance(x, str) else x)

# Evaluation

## Sentiment
print("Sentiment Evaluation:")
print("Accuracy:", accuracy_score(df_realbothmodel_sent["sentiment"], df_realbothmodel_sent["predicted_sentiment"]))
print(classification_report(df_realbothmodel_sent["sentiment"], df_realbothmodel_sent["predicted_sentiment"]))

## Emotion
mlb = MultiLabelBinarizer(classes=emotion_labels)
y_true = mlb.fit_transform(df_realbothmodel_emo["emotion"])
y_pred = mlb.transform(df_realbothmodel_emo["predicted_emotion"])

print("Emotion Evaluation:")
print(classification_report(y_true, y_pred, target_names=emotion_labels))

Sentiment Evaluation:
Accuracy: 0.74
              precision    recall  f1-score   support

    negative       0.00      0.00      0.00        44
    positive       0.77      0.95      0.85       156

    accuracy                           0.74       200
   macro avg       0.39      0.47      0.43       200
weighted avg       0.60      0.74      0.66       200

Emotion Evaluation:
                precision    recall  f1-score   support

    admiration       0.08      0.12      0.10        16
     amusement       0.14      0.14      0.14        14
         anger       0.00      0.00      0.00         2
     annoyance       0.00      0.00      0.00        11
      approval       0.00      0.00      0.00        15
        caring       0.00      0.00      0.00        12
     confusion       0.00      0.00      0.00         2
     curiosity       0.00      0.00      0.00         2
        desire       0.00      0.00      0.00         1
disappointment       0.20      0.14      0.17        14

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
df_realbothmodel = pd.merge(
    df_realbothmodel_sent,
    df_realbothmodel_emo,
    on=["filename", "real_transcription"]
)

In [None]:
df_realbothmodel.head()

Unnamed: 0,filename,real_transcription,sentiment,predicted_sentiment,emotion,predicted_emotion
0,1.mp4,terrible quality the ship wasnt even straight ...,negative,positive,[disgust],[realization]
1,2.mp4,my husband loved this set the quality was good...,positive,positive,[admiration],[grief]
2,3.mp4,the toy broke within minutes of opening the box,negative,positive,[disappointment],[gratitude]
3,4.mp4,so cute and fluffy,positive,positive,[joy],[disappointment]
4,5.mp4,great little truck son loves it and plays with...,positive,positive,[love],[admiration]


### Check the performance

#### Semantic Similarity

Define the function

In [None]:
nlp = spacy.load("en_core_web_md")

def are_emotions_similar(pred_emotion, true_emotion, threshold=0.50):
    doc_pred = nlp(pred_emotion)
    doc_true = nlp(true_emotion)
    return doc_pred.similarity(doc_true) >= threshold

Define the threshold for each dataset

In [None]:
thresholds = [round(t, 2) for t in np.arange(0.5, 0.96, 0.05)]
results = []

for threshold in thresholds:
    correct = sum(
        are_emotions_similar(pred[0], true[0], threshold)
        for pred, true in zip(df_whisperonemodel["predicted_emotion"], df_whisperonemodel["emotion"])
    )
    total = len(df_whisperonemodel)
    acc = correct / total
    results.append((threshold, acc))

best_threshold, best_acc = max(results, key=lambda x: x[1])
print(f"\n Best threshold: {best_threshold:.2f} with Accuracy: {best_acc:.2%}")


 Best threshold: 0.50 with Accuracy: 24.00%


In [None]:
thresholds = [round(t, 2) for t in np.arange(0.5, 0.96, 0.05)]
results = []

for threshold in thresholds:
    correct = sum(
        are_emotions_similar(pred[0], true[0], threshold)
        for pred, true in zip(df_realonemodel["predicted_emotion"], df_realonemodel["emotion"])
    )
    total = len(df_realonemodel)
    acc = correct / total
    results.append((threshold, acc))

best_threshold, best_acc = max(results, key=lambda x: x[1])
print(f"\n Best threshold: {best_threshold:.2f} with Accuracy: {best_acc:.2%}")


 Best threshold: 0.50 with Accuracy: 30.50%


In [None]:
thresholds = [round(t, 2) for t in np.arange(0.5, 0.96, 0.05)]
results = []

for threshold in thresholds:
    correct = sum(
        are_emotions_similar(pred[0], true[0], threshold)
        for pred, true in zip(df_whisperbothmodel["predicted_emotion"], df_whisperbothmodel["emotion"])
    )
    total = len(df_whisperbothmodel)
    acc = correct / total
    results.append((threshold, acc))

best_threshold, best_acc = max(results, key=lambda x: x[1])
print(f"\n Best threshold: {best_threshold:.2f} with Accuracy: {best_acc:.2%}")


 Best threshold: 0.50 with Accuracy: 28.50%


In [None]:
thresholds = [round(t, 2) for t in np.arange(0.5, 0.96, 0.05)]
results = []

for threshold in thresholds:
    correct = sum(
        are_emotions_similar(pred[0], true[0], threshold)
        for pred, true in zip(df_realbothmodel["predicted_emotion"], df_realbothmodel["emotion"])
    )
    total = len(df_realbothmodel)
    acc = correct / total
    results.append((threshold, acc))

best_threshold, best_acc = max(results, key=lambda x: x[1])
print(f"\n Best threshold: {best_threshold:.2f} with Accuracy: {best_acc:.2%}")


 Best threshold: 0.50 with Accuracy: 29.50%


#### Analyze how transcription errors impact sentiment analysis accuracy

This block evaluates the "performance" when using Whisper-generated transcriptions as input, applying one model for sentiment prediction and a different model for emotion prediction.

In [None]:
# Sentiment match
correct_sentiments = sum(
    true[0] in pred for true, pred in zip(df_whisperonemodel["sentiment"], df_whisperonemodel["predicted_sentiment"])
)
total = len(df_whisperonemodel)
print(f"Different Models Whisper Sentiment matches: {correct_sentiments} / {total}")

print(correct_sentiments/total * 100)

# Emotion match

correct_emotions = sum(
    true[0] in pred for true, pred in zip(df_whisperonemodel["emotion"], df_whisperonemodel["predicted_emotion"])
)
total = len(df_whisperonemodel)
print(f"Different Models Whisper Emotion contains correct label: {correct_emotions} / {total}")

print(correct_emotions/total * 100)

# Semantic similarity match
correct_semantic = sum(
    are_emotions_similar(pred[0], true[0])
    for true, pred in zip(df_whisperonemodel["emotion"], df_whisperonemodel["predicted_emotion"])
)

total = len(df_whisperonemodel)
print(f"Different Models Whisper Emotion (semantic similarity) match: {correct_semantic} / {total}")
print(round(correct_semantic / total * 100, 1))

Different Models Whisper Sentiment matches: 169 / 200
84.5
Different Models Whisper Emotion contains correct label: 11 / 200
5.5
Different Models Whisper Emotion (semantic similarity) match: 48 / 200
24.0


Here, the same setup is applied (separate models for sentiment and emotion), but using the manually labeled transcriptions instead of Whisper outputs to compare performance against the same ground truth labels.

In [None]:
# Sentiment match
correct_sentiments = sum(
    true[0] in pred for true, pred in zip(df_realonemodel["sentiment"], df_realonemodel["predicted_sentiment"])
)
total = len(df_realonemodel)

print(f"Different Models Real Sentiment matches: {correct_sentiments} / {total}")

print(correct_sentiments/total * 100)

# Emotion match

correct_emotions = sum(
    true[0] in pred for true, pred in zip(df_realonemodel["emotion"], df_realonemodel["predicted_emotion"])
)
total = len(df_realonemodel)

print(f"Different Models Real Emotion contains correct label: {correct_emotions} / {total}")

print(correct_emotions/total * 100)

# Semantic similarity match
correct_semantic = sum(
    are_emotions_similar(pred[0], true[0])
    for true, pred in zip(df_realonemodel["emotion"], df_realonemodel["predicted_emotion"])
)

total = len(df_realonemodel)
print(f"Different Models Real Emotion (semantic similarity) match: {correct_semantic} / {total}")
print(round(correct_semantic / total * 100, 1))


Different Models Real Sentiment matches: 171 / 200
85.5
Different Models Real Emotion contains correct label: 14 / 200
7.000000000000001
Different Models Real Emotion (semantic similarity) match: 61 / 200
30.5


This part reports results when a single model is used to predict both sentiment and emotion based on Whisper transcriptions.

In [None]:
# Sentiment match
correct_sentiments = sum(
    true[0] in pred for true, pred in zip(df_whisperbothmodel["sentiment"], df_whisperbothmodel["predicted_sentiment"])
)
total = len(df_whisperbothmodel)
print(f"One Model Whisper Sentiment matches: {correct_sentiments} / {total}")
print(round(correct_sentiments / total * 100, 1))

# Emotion match
correct_emotions = sum(
    true[0] in pred for true, pred in zip(df_whisperbothmodel["emotion"], df_whisperbothmodel["predicted_emotion"])
)
print(f"One Model Whisper Emotion contains correct label: {correct_emotions} / {total}")
print(round(correct_emotions / total * 100, 1))

# Semantic similarity match
correct_semantic = sum(
    are_emotions_similar(pred[0], true[0])
    for true, pred in zip(df_whisperbothmodel["emotion"], df_whisperbothmodel["predicted_emotion"])
)

total = len(df_whisperbothmodel)
print(f"One Model Whisper Emotion (semantic similarity) match: {correct_semantic} / {total}")
print(round(correct_semantic / total * 100, 1))


One Model Whisper Sentiment matches: 153 / 200
76.5
One Model Whisper Emotion contains correct label: 10 / 200
5.0
One Model Whisper Emotion (semantic similarity) match: 57 / 200
28.5


This final block uses the same model, but feeds in the real transcriptions to test whether manual text yields better predictions than Whisper input across both sentiment and emotion tasks.

In [None]:
# Sentiment match
correct_sentiments = sum(
    true[0] in pred for true, pred in zip(df_realbothmodel["sentiment"], df_realbothmodel["predicted_sentiment"])
)
total = len(df_realbothmodel)
print(f"One Model Real Sentiment matches: {correct_sentiments} / {total}")
print(round(correct_sentiments / total * 100, 1))

# Emotion match
correct_emotions = sum(
    true[0] in pred for true, pred in zip(df_realbothmodel["emotion"], df_realbothmodel["predicted_emotion"])
)
print(f"One Model Real Emotion contains correct label: {correct_emotions} / {total}")
print(round(correct_emotions / total * 100, 1))

# Semantic similarity match
correct_semantic = sum(
    are_emotions_similar(pred[0], true[0])
    for true, pred in zip(df_realbothmodel["emotion"], df_realbothmodel["predicted_emotion"])
)

total = len(df_realbothmodel)
print(f"One Model Real Emotion (semantic similarity) match: {correct_semantic} / {total}")
print(round(correct_semantic / total * 100, 1))


One Model Real Sentiment matches: 148 / 200
74.0
One Model Real Emotion contains correct label: 9 / 200
4.5
One Model Real Emotion (semantic similarity) match: 59 / 200
29.5


## Innovation

In [114]:
df_music_sent = pd.read_csv("Music.csv", delimiter=";")
df_music_sent.head()

Unnamed: 0,artist_name,track_name,genre,lyrics,topic
0,mukesh,mohabbat bhi jhoothi,pop,hold time feel break feel untrue convince spea...,sadness
1,frankie laine,i believe,pop,believe drop rain fall grow believe darkest ni...,world/life
2,johnnie ray,cry,pop,sweetheart send letter goodbye secret feel bet...,music
3,pérez prado,patricia,pop,kiss lips want stroll charm mambo chacha merin...,romantic
4,giorgos papadopoulos,apopse eida oneiro,pop,till darling till matter know till dream live ...,romantic


In [115]:
df_music_emo = pd.read_csv("Music.csv", delimiter=";")
df_music_emo.head()

Unnamed: 0,artist_name,track_name,genre,lyrics,topic
0,mukesh,mohabbat bhi jhoothi,pop,hold time feel break feel untrue convince spea...,sadness
1,frankie laine,i believe,pop,believe drop rain fall grow believe darkest ni...,world/life
2,johnnie ray,cry,pop,sweetheart send letter goodbye secret feel bet...,music
3,pérez prado,patricia,pop,kiss lips want stroll charm mambo chacha merin...,romantic
4,giorgos papadopoulos,apopse eida oneiro,pop,till darling till matter know till dream live ...,romantic


In [116]:
def tokenize_and_pad(text):
    texts = []

    for t in text:
        texts.append(t)

    sequences = tokenizer_sent.texts_to_sequences(texts)
    padded = pad_sequences(sequences, maxlen=400, padding="post", truncating="post")

    return tf.data.Dataset.from_tensor_slices((padded)).batch(512).prefetch(tf.data.AUTOTUNE)

X_sent = tokenize_and_pad(df_music_sent['lyrics'])

sentiment_probs = sentiment_model.predict(X_sent)

[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m79s[0m 1s/step


In [117]:
text_seq_emotion = tokenizer_emo.texts_to_sequences(df_music_emo["lyrics"])
X_padded_emotion = pad_sequences(text_seq_emotion, maxlen=maxlen_emo, padding="post", truncating="post")

X_emo = tf.data.Dataset.from_tensor_slices((X_padded_emotion))
X_emo = X_emo.shuffle(1000, seed=42, reshuffle_each_iteration=False).batch(256).prefetch(tf.data.AUTOTUNE)

emotion_probs = emotion_model.predict(X_emo)

[1m111/111[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 79ms/step


In [118]:
df_music_sent["predicted_sentiment"] = np.where(sentiment_probs >= 0.5, "positive", "negative")

def decode_top_emotion(probs):
    top_idx = np.argmax(probs)
    return [emotion_labels[top_idx]]  # wrapped in a list to match your format

df_music_emo["predicted_emotion"] = [decode_top_emotion(p) for p in emotion_probs]

In [119]:
df_music = pd.merge(
    df_music_sent,
    df_music_emo,
    on=["artist_name", "track_name", "genre", "lyrics", "topic"]
)
df_music.head()

Unnamed: 0,artist_name,track_name,genre,lyrics,topic,predicted_sentiment,predicted_emotion
0,mukesh,mohabbat bhi jhoothi,pop,hold time feel break feel untrue convince spea...,sadness,negative,[annoyance]
1,frankie laine,i believe,pop,believe drop rain fall grow believe darkest ni...,world/life,negative,[sadness]
2,johnnie ray,cry,pop,sweetheart send letter goodbye secret feel bet...,music,negative,[caring]
3,pérez prado,patricia,pop,kiss lips want stroll charm mambo chacha merin...,romantic,positive,[love]
4,giorgos papadopoulos,apopse eida oneiro,pop,till darling till matter know till dream live ...,romantic,positive,[annoyance]


In [120]:
df_music.to_csv("music_labels.csv", index=False)

# Part 3

The main objective of this part is to implement sentiment analysis using commercial LLMs:

*   Set up API access to OpenAI's GPT and/or Anthropic's Claude
*   Develop prompts for sentiment and emotion classification
*   Compare results with your neural network-based approach


## Import LangChain and Set Up LLM

In [None]:
!pip install -U langchain langchain-community openai
!pip install anthropic

Collecting langchain-community
  Downloading langchain_community-0.3.24-py3-none-any.whl.metadata (2.5 kB)
Collecting openai
  Downloading openai-1.82.0-py3-none-any.whl.metadata (25 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain-community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain-community)
  Downloading pydantic_settings-2.9.1-py3-none-any.whl.metadata (3.8 kB)
Collecting httpx-sse<1.0.0,>=0.4.0 (from langchain-community)
  Downloading httpx_sse-0.4.0-py3-none-any.whl.metadata (9.0 kB)
Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading marshmallow-3.26.1-py3-none-any.whl.metadata (7.3 kB)
Collecting typing-inspect<1,>=0.4.0 (from dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading typing_inspect-0.9.0-py3-none-any.whl.metadata (1.5 kB)
Collecting python-dotenv>=0.21.0 (from pydantic-settings<3.0.0,>=2.4.0->langcha

In [None]:
from langchain.llms import Anthropic
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain

## Set up API key Securely

In [None]:
api_key = "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"


## Create prompt template for sentiment analysis and emotion

In [None]:
from langchain.prompts import PromptTemplate
from anthropic import Anthropic

# Set API key manually
api_key = "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"

# Initialize the client
client = Anthropic(api_key=api_key)

# Define emotion classes
emotion_labels = ["admiration", "amusement", "anger", "annoyance", "approval", "caring",
                  "confusion", "curiosity", "desire", "disappointment", "disapproval",
                  "disgust", "embarrassment", "excitement", "fear", "gratitude", "grief",
                  "joy", "love", "nervousness", "optimism", "pride", "realization", "relief",
                  "remorse", "sadness", "surprise", "neutral"]

# Create the prompt template for sentiment analysis and emotion identification
sentiment_and_emotion_prompt = PromptTemplate(
    input_variables=["text"],
    template=(
        "Analyze the sentiment of the following text and classify it as "
        "positive, negative, or neutral. Additionally, identify the primary emotion expressed in the text "
        "from the following list: admiration, amusement, anger, annoyance, approval, caring, confusion, "
        "curiosity, desire, disappointment, disapproval, disgust, embarrassment, excitement, fear, gratitude, "
        "grief, joy, love, nervousness, optimism, pride, realization, relief, remorse, sadness, surprise, neutral. "
        "Text: {text}"
    )
)


## Compare Results: LLM VS our lstm models for sentiment and emotion

In [None]:
def parse_predicted_output(output):
    # Convert output to lowercase for easier matching
    output_lower = output.lower()
    sentiment = None
    emotion = None

    # Extract sentiment
    for s in ["positive", "negative", "neutral"]:
        if s in output_lower:
            sentiment = s
            break

    # Extract emotion - pick first matching emotion label found in output
    for emo in emotion_labels:
        if emo in output_lower:
            emotion = emo
            break

    return sentiment, emotion

# Example dataframe: df_test with columns 'text', 'sentiment', 'emotion', 'predicted_sentiment', 'predicted_emotion'
# Replace this with your actual dataframe
# df_test = ...

predicted_sentiments = []
predicted_emotions = []

for text in df_whisperbothmodel['whisper_transcription']:
    prompt_text = sentiment_and_emotion_prompt.format(text=text)
    response = client.messages.create(
        model="claude-3-opus-20240229",
        max_tokens=150,
        messages=[{"role": "user", "content": prompt_text}]
    )
    output_text = response.content[0].text if hasattr(response.content[0], "text") else response.content[0]
    sentiment, emotion = parse_predicted_output(output_text)
    predicted_sentiments.append(sentiment)
    predicted_emotions.append(emotion)

df_whisperbothmodel['pred_sentiment_llm'] = predicted_sentiments
df_whisperbothmodel['pred_emotion_llm'] = predicted_emotions

# Evaluate sentiment accuracy
print("Sentiment Evaluation:")
print("Accuracy:", accuracy_score(df_whisperbothmodel["sentiment"], df_whisperbothmodel["pred_sentiment_llm"]))
print(classification_report(df_whisperbothmodel["sentiment"], df_whisperbothmodel["pred_sentiment_llm"]))

# Prepare multilabel binarizer for emotions
mlb = MultiLabelBinarizer(classes=emotion_labels)

# Convert true emotion labels to lists if necessary
df_whisperbothmodel["emotion"] = df_whisperbothmodel["emotion"].apply(lambda x: [x] if isinstance(x, str) else x)

y_true = mlb.fit_transform(df_whisperbothmodel["emotion"])


Sentiment Evaluation:
Accuracy: 0.955
              precision    recall  f1-score   support

    negative       0.89      0.91      0.90        44
    positive       0.97      0.97      0.97       156

    accuracy                           0.95       200
   macro avg       0.93      0.94      0.93       200
weighted avg       0.96      0.95      0.96       200



TypeError: 'NoneType' object is not iterable

In [None]:
# Convert string labels to list labels for MultiLabelBinarizer
df_whisperbothmodel["pred_emotion_llm_list"] = df_whisperbothmodel["pred_emotion_llm"].apply(lambda x: [x])

# Then transform
y_pred = mlb.transform(df_whisperbothmodel["pred_emotion_llm_list"])




In [None]:
print("Emotion Evaluation:")
print(classification_report(y_true, y_pred, target_names=emotion_labels))

Emotion Evaluation:
                precision    recall  f1-score   support

    admiration       0.38      0.38      0.38        16
     amusement       0.67      0.29      0.40        14
         anger       0.00      0.00      0.00         2
     annoyance       0.50      0.18      0.27        11
      approval       0.21      0.87      0.33        15
        caring       0.67      0.17      0.27        12
     confusion       0.00      0.00      0.00         2
     curiosity       0.00      0.00      0.00         2
        desire       0.20      1.00      0.33         1
disappointment       0.30      0.64      0.41        14
   disapproval       0.00      0.00      0.00         9
       disgust       0.00      0.00      0.00         3
 embarrassment       0.00      0.00      0.00         0
    excitement       0.33      0.29      0.31        14
          fear       0.00      0.00      0.00         1
     gratitude       0.67      0.22      0.33         9
         grief       0.00  

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


###  visualize comparison

In order to compare our model with the llm we decided to compare each one with the corrected labels, which we labeled before

In [None]:
def contains_label(true_label, pred_label):
    if true_label is None or true_label == "":
        true_label = []
    elif isinstance(true_label, list):
        true_label = [t.lower().strip() for t in true_label if t]
    else:
        true_label = [true_label.lower().strip()]

    if pred_label is None or pred_label == "":
        pred_label = []
    elif isinstance(pred_label, list):
        pred_label = [p.lower().strip() for p in pred_label if p]
    else:
        pred_label = [pred_label.lower().strip()]

    return any(t in pred_label for t in true_label)


# --- LLM ---

total_both = len(df_whisperbothmodel)

correct_sentiments_llm = sum(
    contains_label(true, pred) for true, pred in zip(df_whisperbothmodel["sentiment"], df_whisperbothmodel["pred_sentiment_llm"])
)
correct_emotions_llm = sum(
    contains_label(true, pred) for true, pred in zip(df_whisperbothmodel["emotion"], df_whisperbothmodel["pred_emotion_llm"])
)

print(f"LLM Sentiment matches: {correct_sentiments_llm} / {total_both} ({round(correct_sentiments_llm / total_both * 100, 1)}%)")
print(f"LLM Emotion contains correct label: {correct_emotions_llm} / {total_both} ({round(correct_emotions_llm / total_both * 100, 1)}%)")


# --- Whisper One Model ---

total_one_sent = len(df_whisperonemodel_sent)
total_one_emo = len(df_whisperonemodel_emo)

correct_sentiments_one = sum(
    contains_label(true, pred) for true, pred in zip(df_whisperonemodel_sent["sentiment"], df_whisperonemodel_sent["predicted_sentiment"])
)

correct_emotions_one = sum(
    contains_label(true, pred) for true, pred in zip(df_whisperonemodel_emo["emotion"], df_whisperonemodel_emo["predicted_emotion"])
)

print(f"LSTM One Model Sentiment matches: {correct_sentiments_one} / {total_one_sent} ({round(correct_sentiments_one / total_one_sent * 100, 1)}%)")
print(f"LSTM One Model Emotion contains correct label: {correct_emotions_one} / {total_one_emo} ({round(correct_emotions_one / total_one_emo * 100, 1)}%)")


LLM Sentiment matches: 191 / 200 (95.5%)
LLM Emotion contains correct label: 59 / 200 (29.5%)
LSTM One Model Sentiment matches: 169 / 200 (84.5%)
LSTM One Model Emotion contains correct label: 11 / 200 (5.5%)
