In [1]:
import pandas as pd
import numpy as np
import torch
import re
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, hamming_loss
from transformers import BertTokenizer, BertForSequenceClassification, TrainingArguments, Trainer, EarlyStoppingCallback
from torch.utils.data import Dataset

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Prefer the GPU when PyTorch reports one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(device)

cuda


In [3]:
# Load the lyrics/emotions table and discard the two columns this notebook never uses.
df = pd.read_csv('data/lyrics_emotions_dataset.csv').drop(columns=["album", "year"])

In [4]:
def preprocess_lyrics(lyrics):
    """Normalise raw lyric text for tokenisation.

    The text is lower-cased, single-line section tags such as ``[chorus]`` or
    ``{verse}`` are removed, and every run of whitespace (newlines included)
    is collapsed to one space.

    Args:
        lyrics: Raw lyric text. Anything that is not a ``str`` (for example the
            ``NaN`` pandas uses for a missing cell, or ``None``) is treated as
            missing.

    Returns:
        The cleaned text, or ``''`` when ``lyrics`` is not a string.
    """
    # Missing values arrive as float NaN / None; calling .lower() on them would raise.
    if not isinstance(lyrics, str):
        return ''

    lyrics = lyrics.lower()
    # Tags are stripped before whitespace is collapsed, so '.' never crosses a
    # line break and only tags written on a single line are removed.
    lyrics = re.sub(r'\[.*?\]', '', lyrics)
    lyrics = re.sub(r'\{.*?\}', '', lyrics)
    # '\s+' already covers '\n', so one pass handles newlines and repeated spaces.
    lyrics = re.sub(r'\s+', ' ', lyrics)
    return lyrics.strip()

# Replace each raw lyric with its cleaned form.
df['lyrics'] = df['lyrics'].map(preprocess_lyrics)

In [5]:
def _parse_labels(raw):
    """Turn a comma-separated label string into a list of clean label names.

    Whitespace around each label is ignored and empty fragments are dropped, so
    ``"a,b"``, ``"a, b"`` and ``"a,  b "`` all give ``['a', 'b']`` and an empty
    string gives ``[]`` instead of a spurious ``''`` label. Non-string values
    (for example lists left by a previous run of this cell) pass through
    unchanged.
    """
    if isinstance(raw, str):
        return [part.strip() for part in raw.split(',') if part.strip()]
    return raw


mlb = MultiLabelBinarizer()

df['labels'] = df['labels'].apply(_parse_labels)

# One 0/1 column per distinct label; column order follows mlb.classes_.
labels_binarized = mlb.fit_transform(df['labels'])

# Reuse df's index so the column-wise concat below aligns row-for-row even if
# df no longer has a default 0..n-1 index.
encoded_df = pd.DataFrame(labels_binarized, columns=mlb.classes_, index=df.index)

binary_columns = list(mlb.classes_)

# Drop indicator columns left by an earlier run of this cell (so re-running does
# not duplicate them), then attach the fresh ones.
df = pd.concat(
    [df.drop(columns=binary_columns, errors="ignore"), encoded_df],
    axis=1,
)

In [6]:
# Inputs are the lyric strings; targets are the binarized label matrix.
X = df['lyrics'].to_list()
y = labels_binarized

# Hold out 20% of the rows; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [7]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Both splits are tokenised with identical settings: truncate to at most 512
# tokens, pad, and return PyTorch tensors.
tokenizer_kwargs = dict(truncation=True, padding=True, return_tensors='pt', max_length=512)

X_train_tokenized = tokenizer(X_train, **tokenizer_kwargs)
X_test_tokenized = tokenizer(X_test, **tokenizer_kwargs)

# Labels become float32 tensors (the dtype used for the multi-label targets) and
# are deliberately left where they are rather than pushed to the GPU here.
# torch.as_tensor also makes the cell safe to re-run: an existing tensor is
# reused instead of triggering torch.tensor's copy-construct warning.
y_train = torch.as_tensor(y_train, dtype=torch.float32)
y_test = torch.as_tensor(y_test, dtype=torch.float32)

In [8]:
class LyricsDataset(Dataset):
    """Map-style dataset pairing tokenizer output with label rows.

    Tensors are stored on the CPU. Indexing returns CPU tensors, and moving a
    batch to the accelerator is left to the training loop. This keeps the whole
    padded corpus out of GPU memory and keeps the dataset usable with
    pinned-memory or multi-worker data loaders.

    Args:
        encodings: Mapping from field name (e.g. ``input_ids``) to a tensor or
            array-like whose first dimension indexes samples.
        labels: Tensor or array-like whose first dimension indexes samples.

    Raises:
        ValueError: If any encoding field has a different number of rows than
            ``labels``.
    """

    def __init__(self, encodings, labels):
        self.encodings = {key: torch.as_tensor(val).cpu() for key, val in encodings.items()}
        self.labels = torch.as_tensor(labels).cpu()

        # Fail early on a shape mismatch instead of at some arbitrary index later.
        expected = len(self.labels)
        mismatched = {key: len(val) for key, val in self.encodings.items() if len(val) != expected}
        if mismatched:
            raise ValueError(
                f"encodings and labels disagree on sample count: labels={expected}, encodings={mismatched}"
            )

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        # One sample: every encoding field sliced at idx, plus its label row.
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['labels'] = self.labels[idx]
        return item
    
# Wrap each split's tokenizer output and label tensor for the Trainer.
train_dataset = LyricsDataset(encodings=X_train_tokenized, labels=y_train)
test_dataset = LyricsDataset(encodings=X_test_tokenized, labels=y_test)

In [9]:
# BERT encoder with a freshly initialised classification head: one output per
# emotion label, configured for multi-label classification.
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=len(mlb.classes_),
    problem_type='multi_label_classification',
).to(device)

training_args = TrainingArguments(
    output_dir='./results',
    # NOTE(review): newer transformers releases spell this `eval_strategy` -
    # confirm against the installed version before renaming.
    evaluation_strategy='epoch',
    learning_rate=3e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=8,
    num_train_epochs=10,
    weight_decay=0.001,
    logging_dir='./logs',
    logging_steps=10,
    # Checkpoint once per epoch, matching the evaluation schedule, so the best
    # epoch by 'f1' can be reloaded when training ends.
    save_strategy='epoch',
    load_best_model_at_end=True,
    metric_for_best_model='f1',
    greater_is_better=True,
    save_total_limit=2,
    seed=42,
    # fp16 mixed precision needs a CUDA device; enable it only when one was
    # selected so the CPU fallback chosen for `device` still works.
    fp16=(device.type == 'cuda'),
    # Two accumulation steps of 16 samples => 32 samples per optimizer update
    # on each device.
    gradient_accumulation_steps=2,
)

def compute_metrics(pred):
    """Score one evaluation pass for the Trainer.

    Args:
        pred: Object exposing ``label_ids`` (reference labels) and
            ``predictions`` (raw logits).

    Returns:
        Dict with ``'f1'`` (macro-averaged F1) and ``'hamming_loss'``, both
        computed after thresholding the sigmoid of the logits at 0.5.
    """
    y_true = pred.label_ids
    probabilities = torch.sigmoid(torch.tensor(pred.predictions))
    # A label counts as predicted when its probability is strictly above 0.5.
    y_pred = (probabilities > 0.5).int().numpy()

    metrics = {}
    metrics['f1'] = f1_score(y_true, y_pred, average='macro')
    metrics['hamming_loss'] = hamming_loss(y_true, y_pred)
    return metrics

# Stop once the monitored metric has failed to improve for 5 consecutive evaluations.
early_stopping = EarlyStoppingCallback(early_stopping_patience=5)

# NOTE(review): test_dataset is used for the per-epoch evaluation that drives
# early stopping and is also the set scored by trainer.evaluate() below, so the
# reported numbers do not come from an untouched hold-out split.
trainer = Trainer(
    model=model.to(device),
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[early_stopping],
)

# Fine-tune the model.
trainer.train()

# Final evaluation on the evaluation dataset.
results = trainer.evaluate()
print(results)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,F1,Hamming Loss,Runtime,Samples Per Second,Steps Per Second
1,0.6369,0.632044,0.129203,0.348659,58.4208,3.971,0.496
2,0.583,0.602785,0.27231,0.323276,53.238,4.358,0.545
3,0.5484,0.566882,0.410822,0.291667,62.219,3.729,0.466
4,0.5064,0.556243,0.447784,0.277778,61.7978,3.754,0.469
5,0.4639,0.547561,0.494364,0.278257,61.33,3.783,0.473
6,0.4379,0.556367,0.510104,0.280651,92.3747,2.512,0.314
7,0.4159,0.548154,0.526277,0.273467,52.887,4.387,0.548
8,0.374,0.550593,0.535853,0.27251,58.0129,3.999,0.5
9,0.3646,0.551884,0.538978,0.269157,52.95,4.381,0.548
10,0.3561,0.551687,0.54575,0.269636,56.3608,4.116,0.515


{'eval_loss': 0.5516871809959412, 'eval_f1': 0.5457501490706075, 'eval_hamming_loss': 0.2696360153256705, 'eval_runtime': 64.4222, 'eval_samples_per_second': 3.601, 'eval_steps_per_second': 0.45, 'epoch': 10.0}


In [21]:
def predict_emotions(lyrics, threshold=0.5):
    """Predict the emotion labels for one piece of lyric text.

    Args:
        lyrics: Raw lyric text; it is cleaned with ``preprocess_lyrics`` first.
        threshold: A label is emitted when its sigmoid probability is strictly
            greater than this value. Defaults to 0.5.

    Returns:
        The result of ``mlb.inverse_transform`` on the thresholded predictions:
        a list with one tuple of label names per input row (a single tuple for
        a single lyric string).
    """
    cleaned = preprocess_lyrics(lyrics)
    encoded = tokenizer(cleaned, truncation=True, padding=True, return_tensors='pt', max_length=512)
    encoded = {key: val.to(device) for key, val in encoded.items()}

    # Inference must not depend on whichever cell last touched the model:
    # switch to eval mode so dropout is disabled, skip autograd bookkeeping,
    # and restore the previous mode afterwards.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            logits = model(**encoded).logits
    finally:
        model.train(was_training)

    preds = (torch.sigmoid(logits) > threshold).int().cpu().numpy()
    return mlb.inverse_transform(preds)

# Sample lyrics used as a quick end-to-end check of predict_emotions.
# The text is passed in raw; predict_emotions applies preprocess_lyrics itself.
test_lyrics = """Risin' up, back on the street
Did my time, took my chances
Went the distance, now I'm back on my feet
Just a man and his will to survive
So many times, it happens too fast
You trade your passion for glory
Don't lose your grip on the dreams of the past
You must fight just to keep them alive
It's the eye of the tiger, it's the thrill of the fight
Risin' up to the challenge of our rival
And the last known survivor stalks his prey in the night
And he's watching us all with the eye of the tiger
Face to face, out on the heat
Hangin' tough, stayin' hungry
They stack the odds still we take to the street
For the kill, with the skill to survive
It's the eye of the tiger, it's the thrill of the fight
Risin' up to the challenge of our rival
And the last known survivor stalks his prey in the night
And he's watching us all with the eye of the tiger
Risin' up, straight to the top
Had the guts, got the glory
Went the distance, now I'm not gonna stop
Just a man and his will to survive
It's the eye of the tiger, it's the thrill of the fight
Risin' up to the challenge of our rival
And the last known survivor stalks his prey in the night
And he's watching us all with the eye of the tiger
The eye of the tiger
The eye of the tiger
The eye of the tiger
The eye of the tiger"""
# Run the fine-tuned model on the sample and show the predicted label names.
predicted_emotions = predict_emotions(test_lyrics)
print(predicted_emotions)

[('Power', 'Solemnity', 'Tension')]
