In [None]:
import random
import torch
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
# Hook tqdm into pandas so the `progress_*` variants (e.g. `progress_apply`)
# are available on pandas objects.
tqdm.pandas()

# Switch: request the GPU; it is only used when CUDA is actually available.
use_gpu = True

# Resolve the compute device from the switch above and CUDA availability.
if use_gpu and torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f'device: {device.type}')

# Seed shared by every random number generator below; None skips seeding.
seed = 1234

# Seed the Python, NumPy and PyTorch generators with the same value.
if seed is not None:
    print(f'random seed: {seed}')
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)

In [None]:
from transformers import RobertaTokenizer, RobertaForSequenceClassification

# Checkpoint path (relative to the working directory) used for both the
# tokenizer and the classification model.
name = './roberta-base-sentiments'

tokenizer = RobertaTokenizer.from_pretrained(name)

# Load the classifier and move it onto the selected device in one step.
model = RobertaForSequenceClassification.from_pretrained(name).to(device)

# Last expression of the cell: shows the model summary as the cell output.
model

In [None]:
from datasets import load_dataset, Dataset

# --- Songs -------------------------------------------------------------
# Local CSV; only the "train" split is kept and converted to a DataFrame.
kg_songs = load_dataset("csv", data_files="datasets/songs.csv")["train"].to_pandas()


# --- Poems -------------------------------------------------------------
# Dataset loaded by name; again only the "train" split is used.
hf_mteb = load_dataset("mteb/PoemSentimentClassification")["train"].to_pandas()

# Local CSV of poems.
kg_poems = load_dataset(
    "csv",
    data_files="datasets/final_df_emotions(remove-bias).csv",
)["train"].to_pandas()

# Excel workbook read directly with pandas.
md_perc = pd.read_excel("datasets/PERC_mendelly.xlsx")

In [None]:
# Bring the frames onto shared column names ("label" / "text").
# NOTE(review): only the label column of md_perc is renamed here; its text
# column name is not visible in this notebook -- confirm before tokenizing it.
md_perc = md_perc.rename(columns={"Emotion": "label"})
kg_songs = kg_songs.rename(
    columns={
        "mood": "label",
        "lyrics": "text",
    }
)

# Canonical emotion classes kept after relabelling.
labels = ["sad", "joy", "love", "anger", "fear", "surprise"]

# Raw dataset label -> canonical class. Written as canonical -> aliases groups
# and flattened, so every alias of a class sits on one line.
label_map = {
    alias: canonical
    for canonical, aliases in {
        "sad": ("sad", "sadness"),
        "joy": ("joy", "happy"),
        "love": ("love", "romantic"),
        "anger": ("anger", "angry"),
        "fear": ("fear",),
        "surprise": ("surprise", "excited"),
    }.items()
    for alias in aliases
}

def adjust_labels(df, label_col, label_map, allowed_labels=None):
    """Translate raw labels to canonical ones and drop rows that do not fit.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the label column. It is left untouched; the work is
        done on a copy so re-running a cell never sees half-adjusted data.
    label_col : str
        Name of the column whose values are translated.
    label_map : dict
        Raw label -> canonical label. Values absent from the mapping become
        NaN and are therefore filtered out.
    allowed_labels : iterable, optional
        Canonical labels to keep. When omitted, the module-level ``labels``
        list is used.

    Returns
    -------
    pandas.DataFrame
        New frame with translated labels, restricted to the allowed labels,
        with the index renumbered from 0.

    Raises
    ------
    KeyError
        If ``label_col`` is not a column of ``df``.
    """
    if allowed_labels is None:
        # Fall back to the notebook-wide canonical label list.
        allowed_labels = labels

    # Copy first: assigning into `df` directly would rewrite the caller's frame.
    adjusted = df.copy()
    adjusted[label_col] = adjusted[label_col].map(label_map)

    keep = adjusted[label_col].isin(list(allowed_labels))
    return adjusted[keep].reset_index(drop=True)

# Relabel every corpus with the shared mapping; rows whose label is not in
# label_map (or not among the canonical labels) are dropped by adjust_labels.
# NOTE(review): this assumes each frame has a "label" column whose values are
# the string keys of label_map -- confirm for hf_mteb and kg_poems.
kg_songs, hf_mteb, kg_poems, md_perc = (
    adjust_labels(frame, "label", label_map)
    for frame in (kg_songs, hf_mteb, kg_poems, md_perc)
)

In [None]:
# Preview the relabelled songs: report the size, then display only the first
# rows (rich table output) instead of printing the whole frame as text.
print(f"kg_songs shape: {kg_songs.shape}")
kg_songs.head()

In [8]:
from transformers import Trainer
def tokenize(dataset):
    """Tokenize the 'text' field of a batch with the notebook's tokenizer.

    `dataset` is whatever `Dataset.map` passes in; only its 'text' entry is
    read. Truncation is enabled; no padding argument is passed here.
    Returns the tokenizer's output unchanged.
    """
    texts = dataset['text']
    encoded = tokenizer(texts, truncation=True)
    return encoded

# Wrap the songs frame in a `datasets.Dataset` so it can be mapped in batches.
kg_songs_ds = Dataset.from_pandas(kg_songs)

# Tokenize batch-wise and drop the song metadata columns from the result.
test1 = kg_songs_ds.map(tokenize, remove_columns=['song_name', 'artist'], batched=True)

Map:   0%|          | 0/498 [00:00<?, ? examples/s]

In [9]:
# Show the tokenized dataset's summary (feature names and number of rows).
print(test1)

Dataset({
    features: ['text', 'label', 'input_ids', 'attention_mask'],
    num_rows: 498
})


In [None]:
from transformers import DataCollatorWithPadding

# `test1` holds unpadded, variable-length `input_ids` (tokenize() only
# truncates), so batches must be padded at collation time. Without a tokenizer
# or collator, Trainer falls back to a collator that does not pad.
trainer = Trainer(
    model=model,
    data_collator=DataCollatorWithPadding(tokenizer),
)

# The 'label' column contains strings ("sad", "joy", ...), which cannot be
# turned into a label tensor, and 'text' is not a model input. Predict on the
# encoded inputs only; the gold labels stay available in `test1["label"]`.
# Consequently `output.label_ids` is None and `output.predictions` holds the
# model's raw outputs (logits).
# TODO(review): to get loss/metrics from predict(), encode the labels as
# integer ids matching the checkpoint's label order -- that order is not
# visible in this notebook.
output = trainer.predict(test1.remove_columns(['text', 'label']))