In [1]:
import tensorflow as tf
from transformers import TFAutoModel, AutoTokenizer
from datasets import load_dataset
from utils_emotiontraining import *

  from .autonotebook import tqdm as notebook_tqdm
Device set to use mps


In [2]:
model = TFAutoModel.from_pretrained("distilroberta-base")
tokenizer = AutoTokenizer.from_pretrained("distilroberta-base")

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFRobertaModel: ['lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.dense.bias']
- This IS expected if you are initializing TFRobertaModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFRobertaModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFRobertaModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaModel for predictions without further training.


In [3]:
# Load the dataset
# if cannot find path to dataset, use this:
import os

if not os.path.exists("augmented_emotion_dataset"):
    emotion_dataset = load_dataset("google-research-datasets/go_emotions", "simplified")

In [4]:
# Select a larger subset of examples for better training
small_train_dataset = emotion_dataset['train'].select(range(10000))
small_test_dataset = emotion_dataset['test'].select(range(1000))

# Create a new small dataset with the reduced splits
small_emotion_dataset = {
    'train': small_train_dataset,
    'test': small_test_dataset
}

In [5]:
emotions_id2label = {
    0: 'admiration',
    1: 'amusement',
    2: 'anger',
    3: 'annoyance',
    4: 'approval',
    5: 'caring',
    6: 'confusion',
    7: 'curiosity',
    8: 'desire',
    9: 'disappointment',
    10: 'disapproval',
    11: 'disgust',
    12: 'embarrassment',
    13: 'excitement',
    14: 'fear',
    15: 'gratitude',
    16: 'grief',
    17: 'joy',
    18: 'love',
    19: 'nervousness',
    20: 'optimism',
    21: 'pride',
    22: 'realization',
    23: 'relief',
    24: 'remorse',
    25: 'sadness',
    26: 'surprise',
    27: 'neutral'  # Last entry (no comma)
}

emotions_label2id = {v: k for k, v in emotions_id2label.items()}

# Print dataset info to verify we understand what we're working with
print("Sample of emotions dataset:")
print(small_emotion_dataset["train"][0])
print(f"Number of labels: {len(emotions_id2label)}")

Sample of emotions dataset:
{'text': "My favourite food is anything I didn't have to cook myself.", 'labels': [27], 'id': 'eebbqej'}
Number of labels: 28


In [7]:
# Cell source for ID: 4c515f62 (Simplified Logic)
MINORITY_THRESHOLD_PERCENT = 1.0
from collections import Counter
from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer
import torch
import re

print("Sample of augmented training data:")
num_samples_to_show = min(5, len(small_train_dataset))
if num_samples_to_show > 0:
    for i in range(num_samples_to_show):
        original_text_display = small_train_dataset[i].get('text', 'N/A')
        augmented_text_display = small_train_dataset[i].get('text', 'N/A')
        labels_display = small_train_dataset[i].get('labels', 'N/A')
        print(f"Original: {original_text_display}")
        print(f"Augmented: {augmented_text_display}")
        print(f"Labels: {labels_display}")
        print()
else:
    print("Augmented dataset appears empty or has fewer than 5 samples.")

Sample of augmented training data:
Original: My favourite food is anything I didn't have to cook myself.
Augmented: My favourite food is anything I didn't have to cook myself.
Labels: [27]

Original: Now if he does off himself, everyone will think hes having a laugh screwing with people instead of actually dead
Augmented: Now if he does off himself, everyone will think hes having a laugh screwing with people instead of actually dead
Labels: [27]

Original: WHY THE FUCK IS BAYLESS ISOING
Augmented: WHY THE FUCK IS BAYLESS ISOING
Labels: [2]

Original: To make her feel threatened
Augmented: To make her feel threatened
Labels: [14]

Original: Dirty Southern Wankers
Augmented: Dirty Southern Wankers
Labels: [3]



In [8]:
def tokenize(batch):
    return tokenizer(batch["text"], padding=True, truncation=True, return_tensors='tf')

# Tokenize the small dataset
small_emotions_encoded = {}
small_emotions_encoded['train'] = small_emotion_dataset['train'].map(tokenize, batched=True, batch_size=None)
small_emotions_encoded['test'] = small_emotion_dataset['test'].map(tokenize, batched=True, batch_size=None)

print(small_emotions_encoded['train'])

Map: 100%|██████████| 10000/10000 [00:00<00:00, 20458.51 examples/s]
Map: 100%|██████████| 1000/1000 [00:00<00:00, 41419.50 examples/s]

Dataset({
    features: ['text', 'labels', 'id', 'input_ids', 'attention_mask'],
    num_rows: 10000
})





In [10]:
import numpy as np
import tensorflow as tf
from collections import Counter

NUM_CLASSES = 28  # Define the number of classes (matches emotions_id2label)

def create_multi_hot_labels(example):
    """Convert the list of labels into a multi-hot encoded vector."""
    multi_hot_label = np.zeros(NUM_CLASSES, dtype=np.float32)
    if 'labels' in example and isinstance(example['labels'], list) and len(example['labels']) > 0:
        for label_id in example['labels']:
            if isinstance(label_id, int) and 0 <= label_id < NUM_CLASSES:
                multi_hot_label[label_id] = 1.0
    example['multi_hot_labels'] = multi_hot_label
    return example

# Apply the conversion to add the new multi-hot label column
small_emotions_encoded['train'] = small_emotions_encoded['train'].map(create_multi_hot_labels)
small_emotions_encoded['test'] = small_emotions_encoded['test'].map(create_multi_hot_labels)

# Remove the original 'labels' column and any leftover 'label_int'
columns_to_remove = [col for col in ['label_int', 'labels'] if col in small_emotions_encoded['train'].features]
if columns_to_remove:
    print(f"Removing existing columns: {columns_to_remove}")
    small_emotions_encoded['train'] = small_emotions_encoded['train'].remove_columns(columns_to_remove)
    small_emotions_encoded['test'] = small_emotions_encoded['test'].remove_columns(columns_to_remove)

# Rename the new column 'multi_hot_labels' to 'labels'
if 'multi_hot_labels' in small_emotions_encoded['train'].features:
    print("Renaming 'multi_hot_labels' to 'labels'")
    small_emotions_encoded['train'] = small_emotions_encoded['train'].rename_column('multi_hot_labels', 'labels')
if 'multi_hot_labels' in small_emotions_encoded['test'].features:
    small_emotions_encoded['test'] = small_emotions_encoded['test'].rename_column('multi_hot_labels', 'labels')

# --- Add Sample Weight Calculation ---
print("Calculating sample weights...")
# Get all multi-hot labels from the training set as a NumPy array
train_labels_np = np.array(small_emotions_encoded['train']['labels'])
# Count frequency of each label (column-wise sum)
label_counts = np.sum(train_labels_np, axis=0)
total_samples = len(train_labels_np)

# Calculate weight for each class (inverse frequency, smoothed)
class_weights_calc = {}
for i in range(NUM_CLASSES):
    # Avoid division by zero for labels that might not appear in the subset
    count = label_counts[i] if label_counts[i] > 0 else 1
    class_weights_calc[i] = total_samples / (NUM_CLASSES * count)

# Calculate weight for each sample: max weight of its positive labels
sample_weights_np = np.zeros(total_samples, dtype=np.float32)
for i in range(total_samples):
    sample_label_indices = np.where(train_labels_np[i] == 1.0)[0]
    if len(sample_label_indices) > 0:
        sample_weights_np[i] = max(class_weights_calc[idx] for idx in sample_label_indices)
    else:
        # Assign a default weight (e.g., 1.0 or average) for samples with no positive labels
        sample_weights_np[i] = 1.0
print("Sample weights calculated.")
# --- End Sample Weight Calculation ---


# Set format to tensorflow
feature_cols = ["input_ids", "attention_mask"]
label_col = "labels"
cols_to_set_format = feature_cols + [label_col]

actual_train_cols = list(small_emotions_encoded['train'].features)
actual_test_cols = list(small_emotions_encoded['test'].features)
final_train_cols = [col for col in cols_to_set_format if col in actual_train_cols]
final_test_cols = [col for col in cols_to_set_format if col in actual_test_cols]

if all(col in final_train_cols for col in cols_to_set_format) and \
   all(col in final_test_cols for col in cols_to_set_format):
    print("Setting dataset format to TensorFlow")
    # Don't set format yet, extract numpy arrays first, then create dataset
else:
     raise ValueError(f"Error: Could not find all necessary columns. Train has: {actual_train_cols}, Test has: {actual_test_cols}. Needed: {cols_to_set_format}")


# Extract features and labels as numpy arrays before creating dataset
train_features_np = {col: np.array(small_emotions_encoded['train'][col]) for col in feature_cols}
train_labels_np = np.array(small_emotions_encoded['train']['labels']) # Already have this from weight calc

test_features_np = {col: np.array(small_emotions_encoded['test'][col]) for col in feature_cols}
test_labels_np = np.array(small_emotions_encoded['test']['labels'])


# Create TensorFlow datasets
BATCH_SIZE = 32 # Keep batch size reasonable

print("Creating tf.data.Dataset objects with sample weights...")
# Modify train_dataset to yield (features, labels, sample_weights)
train_dataset = tf.data.Dataset.from_tensor_slices(
    (train_features_np, train_labels_np, sample_weights_np)
)
train_dataset = train_dataset.shuffle(len(sample_weights_np)).batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE) # Add prefetch

# Test dataset remains (features, labels)
test_dataset = tf.data.Dataset.from_tensor_slices(
    (test_features_np, test_labels_np)
)
test_dataset = test_dataset.batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE) # Add prefetch
print("Datasets created successfully.")


Map: 100%|██████████| 10000/10000 [00:00<00:00, 19836.48 examples/s]
Map: 100%|██████████| 1000/1000 [00:00<00:00, 20911.71 examples/s]


Removing existing columns: ['labels']
Renaming 'multi_hot_labels' to 'labels'
Calculating sample weights...
Sample weights calculated.
Setting dataset format to TensorFlow
Creating tf.data.Dataset objects with sample weights...
Datasets created successfully.


In [11]:
class BERTForClassification(tf.keras.Model):

    def __init__(self, bert_model, num_classes):
        super().__init__()
        self.bert = bert_model
        # Change activation to 'sigmoid' for multi-label classification
        self.fc = tf.keras.layers.Dense(num_classes, activation='sigmoid')

    def call(self, inputs):
        # Make sure we handle the case when inputs is a dictionary
        outputs = self.bert(
            input_ids=inputs['input_ids'],
            attention_mask=inputs['attention_mask'],
            return_dict=True
        )
        pooled_output = outputs.pooler_output
        return self.fc(pooled_output)

In [12]:
# Define NUM_CLASSES if not defined globally earlier
try:
    NUM_CLASSES
except NameError:
    NUM_CLASSES = 28 # Set default if run out of order

# Create a shared emotion prediction function for multi-label output
def predict_emotion(text, model, threshold=0.5):
    """Predict multiple emotions for a given text using the provided model and threshold"""
    inputs = tokenizer(text, return_tensors='tf', padding=True, truncation=True)
    predictions = model(inputs) # Shape: (1, NUM_CLASSES)

    # --- Add this line temporarily ---
    # print(f"Raw probabilities for '{text}': {predictions[0].numpy()}")
    # --- End of added line ---

    predicted_labels_indices = tf.where(predictions[0] > threshold).numpy().flatten()
    predicted_emotions = []
    confidences = []
    if len(predicted_labels_indices) > 0:
        for index in predicted_labels_indices:
            predicted_emotions.append(emotions_id2label[index])
            confidences.append(float(predictions[0][index]))
    else:
        # Optional: If no label passes threshold, predict the highest one or 'neutral'
        highest_prob_index = tf.argmax(predictions, axis=1).numpy()[0]
        predicted_emotions.append(emotions_id2label[highest_prob_index])
        confidences.append(float(predictions[0][highest_prob_index]))


    return {
        'text': text,
        'emotions': predicted_emotions,
        'confidences': confidences
    }

# Define test texts to use for both untrained and trained models
test_texts = [
    "I'm so happy today!",
    "This makes me really angry.",
    "I'm feeling very sad and disappointed.",
    "That's really interesting, tell me more.",
    "I am both excited and nervous about the presentation.", # Example with multiple emotions
]

## Analyze Test Texts with Untrained Model

Let's first create and test our model before training to establish a baseline. This will show how the model performs with random weights, which we can compare to the fine-tuned model later.

In [13]:
# Create an untrained model for baseline comparison
# Ensure the base model 'model' is loaded correctly from cell 2
untrained_classifier = BERTForClassification(model, num_classes=NUM_CLASSES)

# Compile the model for multi-label classification
untrained_classifier.compile(
    optimizer=tf.keras.optimizers.legacy.Adam(learning_rate=2e-5),
    # Use BinaryCrossentropy for multi-label with sigmoid activation
    loss=tf.keras.losses.BinaryCrossentropy(),
    # Use BinaryAccuracy for multi-label evaluation
    metrics=[tf.keras.metrics.BinaryAccuracy(name='accuracy')]
)

print("Predictions with UNTRAINED model (random weights - multi-label):")
print("-------------------------------------------------------------")

# Get predictions from untrained model using our updated shared function
for text in test_texts:
    result = predict_emotion(text, untrained_classifier, threshold=0.1) # Lower threshold for untrained might show more random outputs
    print(f"Text: {result['text']}")
    print(f"Predicted emotions: {result['emotions']}")
    # Zip confidences with emotions for clarity
    emotion_confidence_pairs = list(zip(result['emotions'], result['confidences']))
    print(f"Confidences: {emotion_confidence_pairs}")
    # print(f"Confidences: {[f'{c:.4f}' for c in result['confidences']]}")
    print()

# Evaluating accuracy on the test set for an untrained multi-label model isn't very informative
# untrained_loss, untrained_accuracy = untrained_classifier.evaluate(test_dataset, verbose=0)
# print(f"Untrained model test accuracy (BinaryAccuracy): {untrained_accuracy:.4f}")
# Random baseline for BinaryAccuracy depends on label distribution, harder to interpret than single-label.

Predictions with UNTRAINED model (random weights - multi-label):
-------------------------------------------------------------
Text: I'm so happy today!
Predicted emotions: ['admiration', 'amusement', 'anger', 'annoyance', 'approval', 'caring', 'confusion', 'curiosity', 'desire', 'disappointment', 'disapproval', 'disgust', 'embarrassment', 'excitement', 'fear', 'gratitude', 'grief', 'joy', 'love', 'nervousness', 'optimism', 'pride', 'realization', 'relief', 'remorse', 'sadness', 'surprise', 'neutral']
Confidences: [('admiration', 0.7032246589660645), ('amusement', 0.4195410907268524), ('anger', 0.4252801537513733), ('annoyance', 0.5530133247375488), ('approval', 0.48919641971588135), ('caring', 0.5244767069816589), ('confusion', 0.4554974436759949), ('curiosity', 0.5426892042160034), ('desire', 0.4391227960586548), ('disappointment', 0.6152594685554504), ('disapproval', 0.48842307925224304), ('disgust', 0.4899848699569702), ('embarrassment', 0.4912385940551758), ('excitement', 0.485923

In [14]:
# --- Suggested Change for Cell ID: d127d7b4 ---

# Update num_classes if not already defined
try:
    NUM_CLASSES
except NameError:
    NUM_CLASSES = 28

# Define the model - ensure 'model' (the base BERT model) is loaded
classifier = BERTForClassification(model, num_classes=NUM_CLASSES)

# Compile the model for multi-label classification with more metrics
print("Compiling model with AUC, Precision, Recall...")
classifier.compile(
    optimizer=tf.keras.optimizers.legacy.Adam(learning_rate=2e-5), # Consider trying AdamW later
    loss=tf.keras.losses.BinaryCrossentropy(), # Correct loss for multi-label sigmoid
    metrics=[
        tf.keras.metrics.BinaryAccuracy(name='accuracy'),
        tf.keras.metrics.AUC(multi_label=True, name='auc'), # Good overall multi-label metric
        tf.keras.metrics.Precision(name='precision'), # How many selected items are relevant?
        tf.keras.metrics.Recall(name='recall') # How many relevant items are selected?
        ]
)
print("Model compiled.")

Compiling model with AUC, Precision, Recall...
Model compiled.


In [17]:
# --- Suggested Change for Cell ID: e9df6074 ---

# Train the model
# Sample weights are now included in train_dataset, so no class_weight argument needed
print("Starting multi-label model training with sample weights...")
# callbacks = [
#     tf.keras.callbacks.EarlyStopping(monitor='val_auc', patience=3, mode='max', restore_best_weights=True),
#     tf.keras.callbacks.ModelCheckpoint('best_emotion_model.keras', save_best_only=True, monitor='val_auc', mode='max')
# ]

history = classifier.fit(
    train_dataset,
    epochs=1,  # Adjust epochs as needed, more data might require more/fewer epochs
    validation_data=test_dataset
    # callbacks=callbacks # Uncomment to use callbacks
)
print("Training finished.")

# Evaluate the model on the test set
print("Evaluating model on test set...")
results = classifier.evaluate(test_dataset, verbose=1) # Use verbose=1 to see progress

# Print evaluation results dynamically based on compiled metrics
print("\nTest Set Evaluation Results:")
for name, value in zip(classifier.metrics_names, results):
    print(f"- {name}: {value:.4f}")

# test_loss = results[classifier.metrics_names.index('loss')]
# test_auc = results[classifier.metrics_names.index('auc')]
# print(f"\nTest Loss: {test_loss:.4f}")
# print(f"Test AUC: {test_auc:.4f}")

Starting multi-label model training with sample weights...
Training finished.
Evaluating model on test set...

Test Set Evaluation Results:
- loss: 0.0019
- accuracy: 1.0000
- auc: 0.0000
- precision: 0.0000
- recall: 0.0000


In [22]:
print("Predictions with TRAINED multi-label model:")
print("----------------------------------------")

# Use the updated shared function with the trained model
prediction_threshold = 0.1 # Adjust threshold as needed based on validation performance

test_texts = TEST_TEXTS
for text in test_texts:
    result = predict_emotion(text, classifier, threshold=prediction_threshold)
    print(f"Text: {result['text']}")
    print(f"Predicted emotions: {result['emotions']}")
    # Zip confidences with emotions for clarity
    emotion_confidence_pairs = list(zip(result['emotions'], result['confidences']))
    print(f"Confidences: {emotion_confidence_pairs}")
    # print(f"Confidences: {[f'{c:.4f}' for c in result['confidences']]}")
    print()

Predictions with TRAINED multi-label model:
----------------------------------------
Text: I'm so happy today!
Predicted emotions: ['nervousness']
Confidences: [('nervousness', 0.002956303535029292)]

Text: This makes me really angry.
Predicted emotions: ['nervousness']
Confidences: [('nervousness', 0.002934002550318837)]

Text: I'm feeling very sad and disappointed.
Predicted emotions: ['nervousness']
Confidences: [('nervousness', 0.0029893810860812664)]

Text: That's really interesting, tell me more.
Predicted emotions: ['nervousness']
Confidences: [('nervousness', 0.0029663715977221727)]

Text: I am both excited and nervous about the presentation.
Predicted emotions: ['nervousness']
Confidences: [('nervousness', 0.0029224834870547056)]

Text: My hands were shaking as I opened the letter, unsure what to expect.
Predicted emotions: ['nervousness']
Confidences: [('nervousness', 0.002900628373026848)]

Text: After the argument, the silence in the car was deafening and heavy.
Predicted e

In [24]:
from transformers import pipeline

# Load the pre-trained model and tokenizer
model = 'borisn70/bert-43-multilabel-emotion-detection'
tokenizer = 'borisn70/bert-43-multilabel-emotion-detection'

# Create a pipeline for sentiment analysis
nlp = pipeline('sentiment-analysis', model=model, tokenizer=tokenizer)

testTexts = TEST_TEXTS
result = nlp(testTexts)

# Print the result
print(result)


Device set to use mps:0


[{'label': 'joy', 'score': 0.9998683929443359}, {'label': 'anger', 'score': 0.9998956918716431}, {'label': 'sadness', 'score': 0.9999432563781738}, {'label': 'neutral', 'score': 0.7713387608528137}, {'label': 'fear', 'score': 0.9935625791549683}, {'label': 'fear', 'score': 0.9996892213821411}, {'label': 'anger', 'score': 0.9703493714332581}, {'label': 'joy', 'score': 0.9454078674316406}, {'label': 'joy', 'score': 0.6393844485282898}, {'label': 'neutral', 'score': 0.7897074222564697}, {'label': 'sadness', 'score': 0.9980188608169556}, {'label': 'fear', 'score': 0.5129643082618713}, {'label': 'neutral', 'score': 0.9996825456619263}, {'label': 'joy', 'score': 0.9929666519165039}, {'label': 'fear', 'score': 0.9995587468147278}]
