In [2]:
import random
import torch
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
# enable tqdm in pandas
tqdm.pandas()

# set to True to use the gpu (if there is one available)
use_gpu = True

# select device
device = torch.device('cuda' if use_gpu and torch.cuda.is_available() else 'cpu')
print(f'device: {device.type}')

# random seed
seed = 1234

# set random seed
if seed is not None:
    print(f'random seed: {seed}')
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)

device: cuda
random seed: 1234


In [3]:
from transformers import RobertaTokenizer, RobertaForSequenceClassification

name = 'roberta-base'
tokenizer = RobertaTokenizer.from_pretrained(name)
model = RobertaForSequenceClassification.from_pretrained(name, num_labels=6)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
from datasets import Dataset, load_dataset

# General emotions
hf_diar = load_dataset("dair-ai/emotion")

In [5]:
print(hf_diar)

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 16000
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 2000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 2000
    })
})


In [6]:
model.to(device)

def tokenize(dataset):
    return tokenizer(dataset['text'], truncation=True, padding='max_length')
train_set = hf_diar['train'].map(
    tokenize, batched=True,
    remove_columns=['text'],
)
valid_set = hf_diar['validation'].map(
    tokenize, batched=True,
    remove_columns=['text'],
)

train_set.to_pandas()

Unnamed: 0,label,input_ids,attention_mask
0,0,"[0, 118, 46405, 619, 32386, 2, 1, 1, 1, 1, 1, ...","[1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,0,"[0, 118, 64, 213, 31, 2157, 98, 24418, 7, 98, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
2,3,"[0, 757, 16004, 10, 2289, 7, 618, 939, 619, 34...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, ..."
3,2,"[0, 118, 524, 655, 2157, 28055, 59, 5, 24672, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
4,3,"[0, 118, 524, 2157, 22970, 17414, 2, 1, 1, 1, ...","[1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, ..."
...,...,...,...
15995,0,"[0, 118, 95, 56, 10, 182, 4315, 86, 11, 5, 232...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
15996,0,"[0, 118, 524, 122, 3408, 8, 939, 619, 31790, 1...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
15997,1,"[0, 118, 619, 670, 8, 205, 1374, 2, 1, 1, 1, 1...","[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, ..."
15998,3,"[0, 118, 619, 101, 42, 21, 215, 10, 21820, 112...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."


In [7]:
from transformers import TrainingArguments
from sklearn.metrics import accuracy_score

def compute_metrics(eval_pred):
    y_true = eval_pred.label_ids
    y_pred = np.argmax(eval_pred.predictions, axis=-1)
    return {'accuracy': accuracy_score(y_true, y_pred)}

epochs = 5
batch_size = 16
weight_decay = 0.01
model_name = f"{name}-sentiments"

train_args = TrainingArguments(
    output_dir=model_name,
    log_level='error',
    num_train_epochs=epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=weight_decay,
)

In [8]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=train_args,
    compute_metrics=compute_metrics,
    train_dataset=train_set,
    eval_dataset=valid_set,
)

In [None]:
trainer.train()

Step,Training Loss
500,0.6083


In [59]:
trainer.save_model(model_name)
tokenizer.save_pretrained(model_name)

('roberta-base-sentiments/tokenizer_config.json',
 'roberta-base-sentiments/special_tokens_map.json',
 'roberta-base-sentiments/vocab.json',
 'roberta-base-sentiments/merges.txt',
 'roberta-base-sentiments/added_tokens.json')

In [1]:
test_set.to_pandas()
output = trainer.predict(test_set)
output

NameError: name 'test_set' is not defined

In [14]:
from sklearn.metrics import classification_report

labels = ["sad", "joy", "love", "anger", "fear", "surprise"]
y_true = output.label_ids
y_pred = np.argmax(output.predictions, axis=-1)
target_names = labels
print(classification_report(y_true, y_pred, target_names=target_names))

              precision    recall  f1-score   support

         sad       0.96      0.98      0.97       581
         joy       0.96      0.95      0.95       695
        love       0.85      0.84      0.84       159
       anger       0.94      0.92      0.93       275
        fear       0.90      0.91      0.90       224
    surprise       0.76      0.76      0.76        66

    accuracy                           0.93      2000
   macro avg       0.89      0.89      0.89      2000
weighted avg       0.93      0.93      0.93      2000



In [48]:
# Songs
kg_songs = load_dataset("csv", data_files="datasets/songs.csv")
kg_songs = kg_songs["train"].to_pandas()

# Poems
hf_mteb = load_dataset("mteb/PoemSentimentClassification")
hf_mteb = hf_mteb["train"].to_pandas()
kg_poems = load_dataset("csv", data_files="datasets/final_df_emotions(remove-bias).csv")
kg_poems = kg_poems["train"].to_pandas()
md_perc = pd.read_excel("datasets/PERC_mendelly.xlsx")

In [49]:
kg_songs = kg_songs.rename(columns={"mood": "label"})
md_perc = md_perc.rename(columns={"Emotion": "label"})

def print_unique_labels(kg_songs, hf_mteb, kg_poems, md_perc):
    kg_song_labels = kg_songs['label'].unique()
    hf_mteb_labels = hf_mteb['label'].unique()
    kg_poem_labels = kg_poems['label'].unique()
    perc_labels = md_perc['label'].unique()

    out = (
        f"Valid labels: {labels}\n"
        f"kg_songs labels: {kg_song_labels}\n"
        f"hf_mteb labels: {hf_mteb_labels}\n"
        f"kg_poems labels: {kg_poem_labels}\n"
        f"perc labels: {perc_labels}"
    )
    print(out)
    return

print_unique_labels(kg_songs, hf_mteb, kg_poems, md_perc)

Valid labels: ['sad', 'joy', 'love', 'anger', 'fear', 'surprise']
kg_songs labels: ['calm' 'sad' 'excited' 'motivated' 'angry' 'happy' 'relaxed' 'energetic'
 'nostalgic' 'romantic']
hf_mteb labels: [1 2 0 3]
kg_poems labels: ['sadness' 'anger' 'joy' 'disgust' 'fear' 'neutral' 'surprise']
perc labels: ['sad' 'love' 'peace' 'joy' 'courage' 'surprise' 'hate' 'anger' 'fear']


In [50]:
label_map = {
    "sad": "sad",
    "sadness": "sad",
    "joy": "joy",
    "happy": "joy",
    "love": "love",
    "romantic": "love",
    "anger": "anger",
    "angry": "anger",
    "fear": "fear",
    "surprise": "surprise",
    "excited": "surprise"
}

def adjust_labels(df, label_col, label_map):
    df[label_col] = df[label_col].map(label_map)
    return df[df[label_col].isin(labels)].reset_index(drop=True)

kg_songs = adjust_labels(kg_songs, "label", label_map)
hf_mteb = adjust_labels(hf_mteb, "label", label_map)
kg_poems = adjust_labels(kg_poems, "label", label_map)
md_perc = adjust_labels(md_perc, "label", label_map)

print_unique_labels(kg_songs, hf_mteb, kg_poems, md_perc)

Valid labels: ['sad', 'joy', 'love', 'anger', 'fear', 'surprise']
kg_songs labels: ['sad' 'surprise' 'anger' 'joy' 'love']
hf_mteb labels: []
kg_poems labels: ['sad' 'anger' 'joy' 'fear' 'surprise']
perc labels: ['sad' 'love' 'joy' 'surprise' 'anger' 'fear']
