In [1]:
import random
import torch
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
# enable tqdm in pandas
tqdm.pandas()

# set to True to use the gpu (if there is one available)
use_gpu = True

# select device
device = torch.device('cuda' if use_gpu and torch.cuda.is_available() else 'cpu')
print(f'device: {device.type}')

# random seed
seed = 1234

# set random seed
if seed is not None:
    print(f'random seed: {seed}')
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)

device: cuda
random seed: 1234


In [2]:
from transformers import RobertaTokenizer, RobertaForSequenceClassification

name = 'roberta-base'
tokenizer = RobertaTokenizer.from_pretrained(name)
model = RobertaForSequenceClassification.from_pretrained(name, num_labels=6)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
from datasets import Dataset, load_dataset

# General emotions
hf_diar = load_dataset("dair-ai/emotion")


README.md: 0.00B [00:00, ?B/s]

split/train-00000-of-00001.parquet:   0%|          | 0.00/1.03M [00:00<?, ?B/s]

split/validation-00000-of-00001.parquet:   0%|          | 0.00/127k [00:00<?, ?B/s]

split/test-00000-of-00001.parquet:   0%|          | 0.00/129k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/16000 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/2000 [00:00<?, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [12]:
labels = ["sad", "joy", "love", "anger", "fear", "surprise"]

print(hf_diar)

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 16000
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 2000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 2000
    })
})


In [13]:
model.to(device)

def tokenize(dataset):
    return tokenizer(dataset['text'], truncation=True, padding='max_length')
train_set = hf_diar['train'].map(
    tokenize, batched=True,
    remove_columns=['text'],
)
valid_set = hf_diar['validation'].map(
    tokenize, batched=True,
    remove_columns=['text'],
)
test_set = hf_diar['test'].map(
    tokenize, batched=True,
    remove_columns=['text'],
)

train_set.to_pandas()

Unnamed: 0,label,input_ids,attention_mask
0,0,"[0, 118, 46405, 619, 32386, 2, 1, 1, 1, 1, 1, ...","[1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,0,"[0, 118, 64, 213, 31, 2157, 98, 24418, 7, 98, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
2,3,"[0, 757, 16004, 10, 2289, 7, 618, 939, 619, 34...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, ..."
3,2,"[0, 118, 524, 655, 2157, 28055, 59, 5, 24672, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
4,3,"[0, 118, 524, 2157, 22970, 17414, 2, 1, 1, 1, ...","[1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, ..."
...,...,...,...
15995,0,"[0, 118, 95, 56, 10, 182, 4315, 86, 11, 5, 232...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
15996,0,"[0, 118, 524, 122, 3408, 8, 939, 619, 31790, 1...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
15997,1,"[0, 118, 619, 670, 8, 205, 1374, 2, 1, 1, 1, 1...","[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, ..."
15998,3,"[0, 118, 619, 101, 42, 21, 215, 10, 21820, 112...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."


In [7]:
from transformers import TrainingArguments
from sklearn.metrics import accuracy_score

def compute_metrics(eval_pred):
    y_true = eval_pred.label_ids
    y_pred = np.argmax(eval_pred.predictions, axis=-1)
    return {'accuracy': accuracy_score(y_true, y_pred)}

epochs = 2
batch_size = 16
weight_decay = 0.01
model_name = f"{name}-sentiment-classifications"

train_args = TrainingArguments(
    output_dir=model_name,
    log_level='error',
    num_train_epochs=epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=weight_decay,
)

In [8]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=train_args,
    compute_metrics=compute_metrics,
    train_dataset=train_set,
    eval_dataset=valid_set,
)

In [9]:
trainer.train()

Step,Training Loss
500,0.6043
1000,0.2734
1500,0.1614
2000,0.1331


TrainOutput(global_step=2000, training_loss=0.2930334815979004, metrics={'train_runtime': 636.391, 'train_samples_per_second': 50.284, 'train_steps_per_second': 3.143, 'total_flos': 8419856154624000.0, 'train_loss': 0.2930334815979004, 'epoch': 2.0})

In [10]:
test_set.to_pandas()
output = trainer.predict(test_set)
output

In [14]:
from sklearn.metrics import classification_report

y_true = output.label_ids
y_pred = np.argmax(output.predictions, axis=-1)
target_names = labels
print(classification_report(y_true, y_pred, target_names=target_names))

              precision    recall  f1-score   support

         sad       0.96      0.98      0.97       581
         joy       0.96      0.95      0.95       695
        love       0.85      0.84      0.84       159
       anger       0.94      0.92      0.93       275
        fear       0.90      0.91      0.90       224
    surprise       0.76      0.76      0.76        66

    accuracy                           0.93      2000
   macro avg       0.89      0.89      0.89      2000
weighted avg       0.93      0.93      0.93      2000



In [34]:
# Songs
kg_songs = load_dataset("csv", data_files="datasets/songs.csv")
kg_songs = kg_songs["train"].to_pandas()

# Poems
hf_mteb = load_dataset("mteb/PoemSentimentClassification")
hf_mteb = hf_mteb["train"].to_pandas()
kg_poems = load_dataset("csv", data_files="datasets/final_df_emotions(remove-bias).csv")
kg_poems = kg_poems["train"].to_pandas()
md_perc = pd.read_excel("datasets/PERC_mendelly.xlsx")

In [36]:
print("SONGS:", kg_songs.columns)
print("MTEB:", hf_mteb.columns)
print("KG_POEMS:", kg_poems.columns)
print("PERC:", md_perc.columns)

SONGS: Index(['song_name', 'artist', 'lyrics', 'mood'], dtype='object')
MTEB: Index(['text', 'label'], dtype='object')
KG_POEMS: Index(['poem content', 'pred', 'label', 'score', 'anger', 'disgust', 'fear',
       'joy', 'neutral', 'sadness', 'surprise', 'age', 'type'],
      dtype='object')
PERC: Index(['Poem', 'Emotion'], dtype='object')
