# Downloads

In [None]:
# Only needed for Colab
!pip install --quiet spacy pystemmer

# Import

In [81]:
import os
import spacy
import Stemmer
import torch
import seaborn as sns
import pandas as pd
import numpy as np
import re
import gensim.downloader as api
from torch import nn
from torch import optim
from torch.utils.data import TensorDataset, DataLoader
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import ConfusionMatrixDisplay, classification_report

# Loading data

## Google Colab

In [None]:
ROOT_PATH = "drive/MyDrive/Навчання/Диплом"
DATASET_PATH = f"{ROOT_PATH}/lyrics.csv"
df = pd.read_csv(DATASET_PATH)

## Local

In [130]:
DATASET_PATH = "../lyrics.csv"
df = pd.read_csv(DATASET_PATH)

In [104]:
def load_lyrics_by_label(root_folder="../lyrics"):
    data = []

    for label in os.listdir(root_folder):
        label_dir = os.path.join(root_folder, label)
        if not os.path.isdir(label_dir):
            continue

        for filename in os.listdir(label_dir):
            if filename.endswith(".txt"):
                artist, title = filename[:-4].replace("&", "/").split(" - ", 1)
                filepath = os.path.join(label_dir, filename)

                with open(filepath, "r", encoding="utf-8") as f:
                    lyrics = f.read()

                data.append(
                    {"label": label, "artist": artist, "title": title, "lyrics": lyrics}
                )

    return pd.DataFrame(data)

In [105]:
df = load_lyrics_by_label()

## Common

In [131]:
df = df[~df["lyrics"].str.startswith("ERROR")]
min_label_count = df["label"].value_counts().min()
balanced_df = df.groupby("label").sample(n=min_label_count, random_state=42)

In [107]:
balanced_df["label"].value_counts()

label
angry      584
happy      584
relaxed    584
sad        584
Name: count, dtype: int64

In [132]:
lyrics, labels = balanced_df["lyrics"].values, balanced_df["label"].values

# Preprocessing

## Utils

In [133]:
nlp = spacy.blank("en")
stemmer = Stemmer.Stemmer("english")

In [134]:
def clean(text):
    text = text.replace("\n", " ").replace("\r", " ")
    text = re.sub(" +", " ", text)
    text = re.sub(r"\[.*?\]", "", text)
    text = re.sub(r"^\d+ Contributors", "", text)
    text = re.sub(r"^(.*?)Lyrics", "", text, flags=re.MULTILINE)
    return text

In [135]:
def tokenize(text):
    return [
        str(token).lower()
        for token in nlp(text)
        if not token.is_stop and str(token).isalpha()
    ]

In [136]:
def stem(tokens):
    return stemmer.stemWords(tokens)

In [137]:
def preprocess(text):
    text = clean(text)
    tokens = tokenize(text)
    return stem(tokens)

## TF-IDF

In [91]:
vectorizer = TfidfVectorizer(analyzer=preprocess, max_features=512)
X = vectorizer.fit_transform(lyrics).toarray()

## Word2Vec

In [92]:
word2vec = api.load("word2vec-google-news-300")

In [138]:
def get_sentence_embedding(sentence, model, vector_size=300):
    tokens = tokenize(clean(sentence))
    vectors = [model[word] for word in tokens if word in model]
    if len(vectors) == 0:
        return np.zeros(vector_size)
    return np.mean(vectors, axis=0)

In [139]:
X = torch.tensor(np.array([get_sentence_embedding(sentence, word2vec) for sentence in lyrics]))

## Word2Vec sequence

In [None]:
word2vec = api.load("word2vec-google-news-300")

In [None]:
tokenized_lyrics = [tokenize(clean(l)) for l in lyrics]
max_sequence_length = len(max(tokenized_lyrics, key=lambda x: len(x)))

In [None]:
def get_tokens_embedding_sequence(tokens, word2vec, max_sequence_length=64):
    vectors = [word2vec[word] for word in tokens if word in word2vec]
    if len(vectors) == 0:
        return np.zeros((max_sequence_length, 300))
    if len(vectors) > max_sequence_length:
        vectors = vectors[:max_sequence_length]
    else:
        vectors += [np.zeros(300)] * (max_sequence_length - len(vectors))
    return vectors

In [None]:
X = np.array([get_tokens_embedding_sequence(l, word2vec) for l in tokenized_lyrics])

## Train-Test dataset split

In [140]:
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(labels)

In [141]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=12
)

In [142]:
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.long)
y_test_tensor = torch.tensor(y_test, dtype=torch.long)

  X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
  X_test_tensor = torch.tensor(X_test, dtype=torch.float32)


In [143]:
batch_size = 16
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

# DNN definition

In [144]:
class SentimentDNN(nn.Module):
    def __init__(self, input_size, hidden_sizes, dropout_rates, output_size):
        super(SentimentDNN, self).__init__()

        layers = []
        for i, hidden_size in enumerate(hidden_sizes):
            layers.append(
                nn.Linear(input_size if i == 0 else hidden_sizes[i - 1], hidden_size)
            )
            layers.append(nn.ReLU())
            if dropout_rates[i] > 0:
                layers.append(nn.Dropout(dropout_rates[i]))

        layers.append(nn.Linear(hidden_sizes[-1], output_size))
        self.model = nn.Sequential(*layers)

    def forward(self, x):
        y = self.model(x)
        return y

# RNN definition

In [None]:
class SentimentRNN(nn.Module):
    def __init__(self, input_size, hidden_size, num_classes):
        super(SentimentRNN, self).__init__()
        self.rnn = nn.RNN(input_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        rnn_out, hidden = self.rnn(x)
        return self.fc(hidden.squeeze(0))

# Machine learning


## Model creation

### TF-IDF

In [None]:
input_size = X_train.shape[1]
hidden_sizes = [128, 64]
dropout_rates = [0.2, 0.2]
output_size = len(label_encoder.classes_)
model = SentimentDNN(input_size, hidden_sizes, dropout_rates, output_size)

### Word2Vec

In [145]:
input_size = X_train.shape[1]
hidden_sizes = [128, 64]
dropout_rates = [0.2, 0.2]
output_size = len(label_encoder.classes_)
model = SentimentDNN(input_size, hidden_sizes, dropout_rates, output_size)

### Word2Vec sequence(RNN)

In [None]:
model = SentimentRNN(
    input_size=300, hidden_size=64, num_classes=len(label_encoder.classes_)
)

### Criterion and optimizer

In [146]:
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.0001)

## Training

In [147]:
num_epochs = 64
for epoch in range(num_epochs):
    model.train()
    total_loss = 0

    for batch_X, batch_y in train_loader:
        outputs = model(batch_X)
        loss = criterion(outputs, batch_y)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f"Epoch [{epoch + 1}/{num_epochs}], Loss: {total_loss:.4f}")

Epoch [1/64], Loss: 154.3498
Epoch [2/64], Loss: 151.1439
Epoch [3/64], Loss: 144.0711
Epoch [4/64], Loss: 131.4385
Epoch [5/64], Loss: 115.9266
Epoch [6/64], Loss: 102.3310
Epoch [7/64], Loss: 92.5541
Epoch [8/64], Loss: 83.9755
Epoch [9/64], Loss: 76.7378
Epoch [10/64], Loss: 72.9694
Epoch [11/64], Loss: 67.6725
Epoch [12/64], Loss: 64.1982
Epoch [13/64], Loss: 62.2580
Epoch [14/64], Loss: 58.7711
Epoch [15/64], Loss: 57.0395
Epoch [16/64], Loss: 52.9350
Epoch [17/64], Loss: 52.6392
Epoch [18/64], Loss: 51.6779
Epoch [19/64], Loss: 48.3343
Epoch [20/64], Loss: 48.2797
Epoch [21/64], Loss: 46.5135
Epoch [22/64], Loss: 46.1372
Epoch [23/64], Loss: 44.6243
Epoch [24/64], Loss: 44.0994
Epoch [25/64], Loss: 41.5078
Epoch [26/64], Loss: 41.8435
Epoch [27/64], Loss: 41.3764
Epoch [28/64], Loss: 40.4020
Epoch [29/64], Loss: 39.3708
Epoch [30/64], Loss: 38.6872
Epoch [31/64], Loss: 37.4936
Epoch [32/64], Loss: 36.8817
Epoch [33/64], Loss: 37.1080
Epoch [34/64], Loss: 35.4278
Epoch [35/64], Lo

## Evaluation

In [148]:
with torch.no_grad():
    model.eval()
    y_pred = model(X_test_tensor)
    y_pred_classes = torch.argmax(y_pred, axis=1)
    accuracy = (y_pred_classes == y_test_tensor).sum().item() / len(y_test_tensor)
    print(f"Test Accuracy: {accuracy:.4f}")

Test Accuracy: 0.8747


In [None]:
clf_report = classification_report(
    y_test, y_pred_classes, target_names=label_encoder.classes_, output_dict=True
)
sns.heatmap(pd.DataFrame(clf_report).iloc[:-1, :].T, annot=True)

In [None]:
ConfusionMatrixDisplay.from_predictions(
    y_test, y_pred_classes, display_labels=label_encoder.classes_
)

# Saving model

In [None]:
torch.save(model.state_dict(), "sentiment-dnn.pt")