# Downloads

In [None]:
# Only needed for Colab: installs spaCy and PyStemmer, which the import cell
# below uses as `spacy` and `Stemmer`.
# NOTE(review): consider `%pip` instead of `!pip` so the install targets the
# kernel's environment, and pin versions for reproducibility.
!pip install --quiet spacy pystemmer

# Imports

In [1]:
import pickle
import os
import spacy
import Stemmer
import torch
import seaborn as sns
import pandas as pd
import numpy as np
import re
import gensim.downloader as api
from torch import nn
from torch import optim
from torch.utils.data import TensorDataset, DataLoader
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import ConfusionMatrixDisplay, classification_report

# Loading data

## Google Colab

In [None]:
# Colab variant: read the dataset CSV from a folder under "drive/MyDrive".
# NOTE(review): presumably Google Drive has to be mounted before this relative
# path resolves — confirm.
ROOT_PATH = "drive/MyDrive/Навчання/Диплом"
DATASET_PATH = f"{ROOT_PATH}/lyrics.csv"
df = pd.read_csv(DATASET_PATH)

## Local

In [6]:
# Local variant: read the dataset CSV from the parent directory.
# NOTE(review): this cell and the folder-based loader in this section both
# assign `df`; whichever was executed last determines the data used downstream.
DATASET_PATH = "../lyrics.csv"
df = pd.read_csv(DATASET_PATH)

In [4]:
def load_lyrics_by_label(root_folder="../lyrics"):
    """Load lyrics stored as ``<root_folder>/<label>/<artist> - <title>.txt``.

    Every sub-directory of ``root_folder`` is treated as one label and every
    ``.txt`` file inside it as one song. Directories and files are visited in
    sorted order so that the row order of the result does not depend on the
    file system.

    Args:
        root_folder: Directory whose sub-directories are named after labels.

    Returns:
        pandas.DataFrame with the columns ``label``, ``artist``, ``title`` and
        ``lyrics``, one row per ``.txt`` file. The columns are present even
        when no file was found.
    """
    columns = ["label", "artist", "title", "lyrics"]
    data = []

    # os.listdir() returns entries in arbitrary order; sorting keeps the row
    # order (and any seeded sampling done on the frame) stable across machines.
    for label in sorted(os.listdir(root_folder)):
        label_dir = os.path.join(root_folder, label)
        if not os.path.isdir(label_dir):
            continue

        for filename in sorted(os.listdir(label_dir)):
            if not filename.endswith(".txt"):
                continue

            # "&" in a file name is mapped back to "/" (presumably because "/"
            # cannot appear in file names — confirm against the scraper).
            song_name = filename[:-4].replace("&", "/")
            artist, separator, title = song_name.partition(" - ")
            if not separator:
                # No "artist - title" separator: keep the song instead of
                # failing, with an empty artist and the whole name as title.
                artist, title = "", song_name

            filepath = os.path.join(label_dir, filename)
            with open(filepath, "r", encoding="utf-8") as f:
                lyrics = f.read()

            data.append(
                {"label": label, "artist": artist, "title": title, "lyrics": lyrics}
            )

    return pd.DataFrame(data, columns=columns)

In [5]:
# Alternative to the CSV cells: build `df` from the default "../lyrics" folder tree.
df = load_lyrics_by_label()

## Common

In [8]:
# Rows without lyrics (NaN, e.g. an empty CSV field) produce NaN in the
# startswith() mask, which cannot be negated with `~`, and they could not be
# tokenized later either — drop them first.
df = df.dropna(subset=["lyrics"])
# Discard entries whose lyrics text starts with "ERROR".
df = df[~df["lyrics"].str.startswith("ERROR")]
# Downsample every label to the size of the rarest one so classes are balanced;
# the fixed random_state keeps the sample repeatable.
min_label_count = df["label"].value_counts().min()
balanced_df = df.groupby("label").sample(n=min_label_count, random_state=42)

In [9]:
# Sanity check: number of rows per label after balancing.
balanced_df["label"].value_counts()

label
angry      558
happy      558
relaxed    558
sad        558
Name: count, dtype: int64

In [10]:
# Raw lyric texts and their labels as two parallel arrays.
lyrics = balanced_df["lyrics"].values
labels = balanced_df["label"].values

# Preprocessing

## Utils

In [11]:
# Blank English spaCy pipeline; tokenize() runs texts through it.
nlp = spacy.blank("en")
# English PyStemmer instance; stem() delegates to it.
stemmer = Stemmer.Stemmer("english")

In [12]:
def clean(text):
    """Flatten a lyrics text to one line and strip scraper boilerplate.

    Line breaks become spaces, runs of spaces are collapsed, ``[...]``
    annotations are removed, then a leading ``"<digits> Contributors"`` prefix
    and everything up to the first ``"Lyrics"`` are cut off. The steps run in
    exactly this order, so spaces left behind by later removals are kept.
    """
    result = text.replace("\n", " ").replace("\r", " ")
    # (pattern, replacement, flags), applied one after another.
    substitutions = (
        (" +", " ", 0),
        (r"\[.*?\]", "", 0),
        (r"^\d+ Contributors", "", 0),
        (r"^(.*?)Lyrics", "", re.MULTILINE),
    )
    for pattern, replacement, flags in substitutions:
        result = re.sub(pattern, replacement, result, flags=flags)
    return result

In [13]:
def tokenize(text):
    """Tokenize ``text`` with the module-level ``nlp`` pipeline.

    Returns the lower-cased text of every token that is not flagged as a stop
    word and consists of alphabetic characters only.
    """
    words = []
    for token in nlp(text):
        surface = str(token)
        if token.is_stop or not surface.isalpha():
            continue
        words.append(surface.lower())
    return words

In [14]:
def stem(tokens):
    """Return the stems of ``tokens`` as produced by the module-level ``stemmer``."""
    stemmed_tokens = stemmer.stemWords(tokens)
    return stemmed_tokens

In [15]:
def preprocess(text):
    """Run the full text pipeline (clean -> tokenize -> stem) and return the stems."""
    return stem(tokenize(clean(text)))

## TF-IDF

In [16]:
# TF-IDF features: preprocess() is used as the analyzer (it returns the stemmed
# tokens of a document) and the vocabulary is capped at 512 terms.
# NOTE(review): the vectorizer is fitted on all lyrics before the train/test
# split, so test documents influence the vocabulary and IDF weights — consider
# fitting on the training part only.
vectorizer = TfidfVectorizer(analyzer=preprocess, max_features=512)
X = vectorizer.fit_transform(lyrics).toarray()

## Word2Vec

In [None]:
# Load the "word2vec-google-news-300" vectors through gensim's downloader API.
word2vec = api.load("word2vec-google-news-300")

In [None]:
def get_sentence_embedding(sentence, model, vector_size=300):
    """Embed a text as the mean of the vectors of its in-vocabulary tokens.

    Args:
        sentence: Raw text; it is passed through clean() and tokenize().
        model: Word-vector lookup supporting ``word in model`` and
            ``model[word]``.
        vector_size: Length of the zero vector returned when no token of the
            text is found in ``model``.

    Returns:
        1-D NumPy array: the element-wise mean of the found word vectors, or
        zeros of length ``vector_size`` if none was found.
    """
    tokens = tokenize(clean(sentence))
    vectors = [model[word] for word in tokens if word in model]
    if not vectors:
        # NumPy's default dtype is float64; use float32 so the fallback matches
        # the mean of float32 word vectors (assumes the model stores float32,
        # as gensim's pretrained vectors do — confirm for other models).
        return np.zeros(vector_size, dtype=np.float32)
    return np.mean(vectors, axis=0)

In [None]:
# Stack the per-song mean embeddings into a single 2-D NumPy array.
# Building a torch tensor directly from a Python list of ndarrays is slow, and
# the split / tensor-conversion cells further down work on array input, which
# is also what the other feature sections produce.
X = np.stack([get_sentence_embedding(sentence, word2vec) for sentence in lyrics])

## Word2Vec sequence

In [None]:
# Same vectors as loaded in the "Word2Vec" section; loaded again here,
# presumably so this section can be run on its own — confirm.
word2vec = api.load("word2vec-google-news-300")

In [None]:
# Token list for every song, plus the token count of the longest song.
tokenized_lyrics = [tokenize(clean(text)) for text in lyrics]
max_sequence_length = max(len(tokens) for tokens in tokenized_lyrics)

In [None]:
def get_tokens_embedding_sequence(tokens, word2vec, max_sequence_length=64):
    """Turn a token list into a fixed-length matrix of word vectors.

    Tokens missing from ``word2vec`` are skipped. The remaining vectors are
    truncated to ``max_sequence_length`` rows, or padded with zero rows at the
    end when there are fewer.

    Args:
        tokens: Iterable of token strings.
        word2vec: Word-vector lookup supporting ``word in word2vec`` and
            ``word2vec[word]``. Its ``vector_size`` attribute is used for the
            row width when present, otherwise 300 is assumed.
        max_sequence_length: Number of rows of the result.

    Returns:
        float32 NumPy array of shape ``(max_sequence_length, vector_size)``.
        The same type and shape are returned whether or not any token was
        found.
    """
    embedding_dim = getattr(word2vec, "vector_size", 300)
    # Start from all-zero rows so padding needs no separate step.
    sequence = np.zeros((max_sequence_length, embedding_dim), dtype=np.float32)

    known = [word2vec[word] for word in tokens if word in word2vec]
    known = known[:max_sequence_length]
    if known:
        sequence[: len(known)] = known
    return sequence

In [None]:
# Embed every tokenized song as a fixed-length sequence of word vectors.
# NOTE(review): the `max_sequence_length` computed in the previous cell is not
# passed here, so the function's default of 64 applies — confirm this is intended.
X = np.array([get_tokens_embedding_sequence(l, word2vec) for l in tokenized_lyrics])

## Train-Test dataset split

In [17]:
# Encode the string labels as integer class ids; `label_encoder.classes_` is
# used further down for the number of classes and their display names.
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(labels)

In [18]:
# Hold out 20% of the samples for testing; random_state is fixed.
# NOTE(review): no `stratify=y` is passed, so class proportions in the two
# parts are not forced to match.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=12
)

In [19]:
# Convert the splits to tensors: float32 features and long (integer) class ids.
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.long)
y_test_tensor = torch.tensor(y_test, dtype=torch.long)

In [20]:
# Serve the training tensors in mini-batches of 16 with shuffle=True.
batch_size = 16
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

# DNN definition

In [22]:
class SentimentDNN(nn.Module):
    """Fully connected classifier.

    Each hidden layer is ``Linear -> ReLU``, followed by ``Dropout`` when its
    rate is positive. A final ``Linear`` maps to ``output_size`` raw scores
    (no softmax is applied).

    Args:
        input_size: Number of input features.
        hidden_sizes: Width of each hidden layer, in order. May be empty, in
            which case the network is a single linear layer.
        dropout_rates: Dropout probability per hidden layer, indexed in
            parallel with ``hidden_sizes``.
        output_size: Number of output scores.
    """

    def __init__(self, input_size, hidden_sizes, dropout_rates, output_size):
        super().__init__()

        layers = []
        in_features = input_size
        for i, hidden_size in enumerate(hidden_sizes):
            layers.append(nn.Linear(in_features, hidden_size))
            layers.append(nn.ReLU())
            # A non-positive rate adds no module at all, which also determines
            # the positions (and state_dict keys) of the following layers.
            if dropout_rates[i] > 0:
                layers.append(nn.Dropout(dropout_rates[i]))
            in_features = hidden_size

        # `in_features` is still `input_size` when there are no hidden layers.
        layers.append(nn.Linear(in_features, output_size))
        self.model = nn.Sequential(*layers)

    def forward(self, x):
        """Return the output of the layer stack for ``x``."""
        return self.model(x)

# RNN definition

In [None]:
class SentimentRNN(nn.Module):
    """RNN classifier: the hidden state returned by the RNN feeds a linear layer."""

    def __init__(self, input_size, hidden_size, num_classes):
        super().__init__()
        self.rnn = nn.RNN(input_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        """Return one score per class for ``x``.

        The per-step outputs of the RNN are ignored; only the hidden state it
        returns is used, with its leading axis squeezed away.
        """
        _, final_hidden = self.rnn(x)
        features = final_hidden.squeeze(dim=0)
        return self.fc(features)

# Machine learning


## Model creation

### TF-IDF

In [23]:
# Seed torch so that weight initialisation — and, when training is run right
# after this cell, dropout and batch shuffling — are repeatable, in line with
# the fixed random_state used for sampling and splitting the data.
torch.manual_seed(42)

# DNN over the feature matrix: input width taken from the data, two hidden
# layers with dropout, one output per class.
input_size = X_train.shape[1]
hidden_sizes = [128, 64]
dropout_rates = [0.2, 0.2]
output_size = len(label_encoder.classes_)
model = SentimentDNN(input_size, hidden_sizes, dropout_rates, output_size)

### Word2Vec

In [None]:
# Seed torch so that weight initialisation — and, when training is run right
# after this cell, batch shuffling — are repeatable, in line with the fixed
# random_state used for sampling and splitting the data.
torch.manual_seed(42)

# DNN over the feature matrix: input width taken from the data, two hidden
# layers without dropout, one output per class.
input_size = X_train.shape[1]
hidden_sizes = [64, 32]
dropout_rates = [0, 0]
output_size = len(label_encoder.classes_)
model = SentimentDNN(input_size, hidden_sizes, dropout_rates, output_size)

### Word2Vec sequence (RNN)

In [None]:
# Seed torch so that weight initialisation — and, when training is run right
# after this cell, batch shuffling — are repeatable, in line with the fixed
# random_state used for sampling and splitting the data.
torch.manual_seed(42)

# RNN over sequences of 300-dimensional word vectors, one output per class.
model = SentimentRNN(
    input_size=300, hidden_size=64, num_classes=len(label_encoder.classes_)
)

### Criterion and optimizer

In [24]:
# Cross-entropy loss on the model outputs; Adam over all parameters of
# whichever `model` was created last.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.0001)

## Training

In [25]:
# Train for a fixed number of epochs over `train_loader`.
num_epochs = 64
for epoch in range(num_epochs):
    model.train()  # switch to training mode at the start of every epoch
    total_loss = 0  # running sum of the per-batch losses of this epoch

    for batch_X, batch_y in train_loader:
        # Forward pass and loss for the current mini-batch.
        outputs = model(batch_X)
        loss = criterion(outputs, batch_y)

        # Clear old gradients, backpropagate, then update the parameters.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    # The printed value is the sum of batch losses, not their mean.
    print(f"Epoch [{epoch + 1}/{num_epochs}], Loss: {total_loss:.4f}")

Epoch [1/64], Loss: 155.6825
Epoch [2/64], Loss: 154.8291
Epoch [3/64], Loss: 153.4650
Epoch [4/64], Loss: 149.9794
Epoch [5/64], Loss: 142.5696
Epoch [6/64], Loss: 129.4210
Epoch [7/64], Loss: 111.9171
Epoch [8/64], Loss: 92.9567
Epoch [9/64], Loss: 76.4101
Epoch [10/64], Loss: 62.9600
Epoch [11/64], Loss: 53.4069
Epoch [12/64], Loss: 46.1110
Epoch [13/64], Loss: 40.9770
Epoch [14/64], Loss: 36.5133
Epoch [15/64], Loss: 33.1190
Epoch [16/64], Loss: 30.0497
Epoch [17/64], Loss: 27.6893
Epoch [18/64], Loss: 25.8499
Epoch [19/64], Loss: 24.1974
Epoch [20/64], Loss: 22.5803
Epoch [21/64], Loss: 21.1686
Epoch [22/64], Loss: 19.8339
Epoch [23/64], Loss: 18.8786
Epoch [24/64], Loss: 17.6409
Epoch [25/64], Loss: 16.9364
Epoch [26/64], Loss: 16.4857
Epoch [27/64], Loss: 15.5781
Epoch [28/64], Loss: 15.2980
Epoch [29/64], Loss: 14.2462
Epoch [30/64], Loss: 13.3793
Epoch [31/64], Loss: 12.7409
Epoch [32/64], Loss: 12.3090
Epoch [33/64], Loss: 12.0814
Epoch [34/64], Loss: 11.4039
Epoch [35/64], L

## Evaluation

In [26]:
# Evaluate on the held-out test tensors without tracking gradients.
model.eval()
with torch.no_grad():
    y_pred = model(X_test_tensor)
    # Predicted class = index of the largest score in each row.
    y_pred_classes = y_pred.argmax(dim=1)
    accuracy = (y_pred_classes == y_test_tensor).sum().item() / len(y_test_tensor)
    print(f"Test Accuracy: {accuracy:.4f}")

Test Accuracy: 0.9351


In [None]:
# Classification report as a dict, shown as an annotated heatmap.
# `.iloc[:-1, :]` drops the last row of the report table before transposing
# (presumably the `support` counts — confirm).
clf_report = classification_report(
    y_test, y_pred_classes, target_names=label_encoder.classes_, output_dict=True
)
sns.heatmap(pd.DataFrame(clf_report).iloc[:-1, :].T, annot=True)

In [None]:
# Confusion matrix of true vs. predicted test labels, labelled with class names.
ConfusionMatrixDisplay.from_predictions(
    y_test, y_pred_classes, display_labels=label_encoder.classes_
)

# Saving model

In [None]:
# Save only the model's state_dict (its parameters), not the model object.
# NOTE(review): the file name says "dnn" whichever model was trained, and the
# fitted vectorizer / label encoder are not saved in this cell.
torch.save(model.state_dict(), "sentiment-dnn.pt")