# Data Preprocessing

In [1]:
# Download the Spotify Million Song Dataset CSV (Google Drive file
# https://drive.google.com/file/d/1fdGUg5fQl7FGUNYyK2hiVL84uc2EnzGo/view#)
# and save it in the working directory as data.csv.
!wget 'https://docs.google.com/uc?export=download&id=1fdGUg5fQl7FGUNYyK2hiVL84uc2EnzGo' -O data.csv

--2024-01-16 14:22:27--  https://docs.google.com/uc?export=download&id=1fdGUg5fQl7FGUNYyK2hiVL84uc2EnzGo
Resolving docs.google.com (docs.google.com)... 2607:f8b0:4002:c08::8a, 2607:f8b0:4002:c08::8b, 2607:f8b0:4002:c08::64, ...
Connecting to docs.google.com (docs.google.com)|2607:f8b0:4002:c08::8a|:443... connected.
HTTP request sent, awaiting response... 303 See Other
Location: https://drive.usercontent.google.com/download?id=1fdGUg5fQl7FGUNYyK2hiVL84uc2EnzGo&export=download [following]
--2024-01-16 14:22:27--  https://drive.usercontent.google.com/download?id=1fdGUg5fQl7FGUNYyK2hiVL84uc2EnzGo&export=download
Resolving drive.usercontent.google.com (drive.usercontent.google.com)... 2607:f8b0:4002:c0c::84, 74.125.138.132
Connecting to drive.usercontent.google.com (drive.usercontent.google.com)|2607:f8b0:4002:c0c::84|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 74864162 (71M) [application/octet-stream]
Saving to: ‘data.csv’


2024-01-16 14:22:40 (7.04 MB/s) - 

In [126]:
import nltk
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import RegexpTokenizer
import numpy as np
import pandas as pd
import random
import re
from matplotlib import pyplot as plt
import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

In [90]:
# English stop-word list from NLTK (all lower-case entries); displayed for inspection.
# NOTE(review): presumably requires the NLTK 'stopwords' corpus to be available locally — confirm.
stop_words = stopwords.words('english')
stop_words

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

Load in data from CSV file

In [91]:
# Read the downloaded CSV into a DataFrame and preview its first rows.
data_all = pd.read_csv("data.csv")

data_all.head()

Unnamed: 0,artist,song,link,text
0,ABBA,Ahe's My Kind Of Girl,/a/abba/ahes+my+kind+of+girl_20598417.html,"Look at her face, it's a wonderful face \r\nA..."
1,ABBA,"Andante, Andante",/a/abba/andante+andante_20002708.html,"Take it easy with me, please \r\nTouch me gen..."
2,ABBA,As Good As New,/a/abba/as+good+as+new_20003033.html,I'll never know why I had to go \r\nWhy I had...
3,ABBA,Bang,/a/abba/bang_20598415.html,Making somebody happy is a question of give an...
4,ABBA,Bang-A-Boomerang,/a/abba/bang+a+boomerang_20002668.html,Making somebody happy is a question of give an...


### Cleaning

In [92]:
# Song counts for the 8 artists with the most songs; the index holds the artist names.
top_data = data_all["artist"].value_counts().nlargest(8)
top_data

Donna Summer        191
Gordon Lightfoot    189
Bob Dylan           188
George Strait       188
Loretta Lynn        187
Cher                187
Alabama             187
Reba Mcentire       187
Name: artist, dtype: int64

In [93]:
# Keep only the songs by the 8 most frequent artists.
# .copy() makes `data` an independent frame, so the column assignments made in
# the later cells do not trip pandas' chained-assignment (SettingWithCopy) check.
data = data_all.loc[data_all["artist"].isin(top_data.index)].copy()
data

Unnamed: 0,artist,song,link,text
361,Alabama,Calling All Angels,/a/alabama/calling+all+angels_20005071.html,"Calling, calling all angels, oh I'm calling, c..."
362,Alabama,Can't Keep A Good Man Down,/a/alabama/cant+keep+a+good+man+down_20522156....,I thought it was forever \r\nI thought it wou...
363,Alabama,Carolina Mountain Dew,/a/alabama/carolina+mountain+dew_20175804.html,Somewhere in the mountains......... In norther...
364,Alabama,Christmas In Dixie,/a/alabama/christmas+in+dixie_20005147.html,"By now in New York City, there's snow on the g..."
365,Alabama,Christmas In Your Arms,/a/alabama/christmas+in+your+arms_20005047.html,All my friends are asking me where I plan to s...
...,...,...,...,...
50506,Reba Mcentire,Old Man River,/r/reba+mcentire/old+man+river_20114622.html,"(ronny scaife, danny hogan) \r\n \r\nCool br..."
50507,Reba Mcentire,On My Own,/r/reba+mcentire/on+my+own_20114592.html,So many times I said it was forever \r\nSaid ...
50508,Reba Mcentire,"One Child, One Day",/r/reba+mcentire/one+child+one+day_20114569.html,"Three wise men, a shining star \r\nA mother a..."
50509,Reba Mcentire,One Promise Too Late,/r/reba+mcentire/one+promise+too+late_20114392...,I would have waited forever \r\nIf I'd known ...


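Clean the text: delete square-bracketed annotations (e.g. `[Chorus]`) together with their content, and strip the round-bracket characters while keeping the words inside them.
This removes non-lyric markup such as section labels; the bracketed ad-libs sampled above are sung words and stay in the text.
Remove line breaks as well.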

In [94]:
# Gather the inner text of every (...) and [...] group across all lyrics.
# The matches are non-greedy, so each bracket pair yields its own entry.
text_in_round_brackets = [
    snippet
    for lyric in data['text']
    for snippet in re.findall(r'\((.*?)\)', lyric)
]
text_in_square_brackets = [
    snippet
    for lyric in data['text']
    for snippet in re.findall(r'\[(.*?)\]', lyric)
]

print(f'Number of round brackets: {len(text_in_round_brackets)}')
print(f'Number of square brackets: {len(text_in_square_brackets)}')

Number of round brackets: 540
Number of square brackets: 707


In [95]:
# Print 5 randomly chosen snippets (sampled with replacement) of each bracket type.
# The seed is fixed so the same samples are drawn on every run.
random.seed(0)
print(random.choices(text_in_round_brackets, k=5))
print(random.choices(text_in_square_brackets, k=5))

['sometimes', 'she is the lady of the night', 'Oh, oh baby', 'Ah', 'I']
['Chorus:', 'Chorus', 'Chorus:', 'Chorus', 'Chorus:']


In [96]:
# Turn off pandas' chained-assignment warning for the column overwrite below.
pd.set_option('mode.chained_assignment', None)

# Apply the three clean-up passes in order:
#   1. drop "(" and ")" characters (the text between them is kept),
#   2. drop "[...]" groups that are followed by a space (content included),
#   3. drop line breaks.
# NOTE(review): pass 2 only matches when a space follows "]", and pass 3
# replaces line breaks with nothing rather than a space — confirm both are intended.
data['text'] = (
    data['text']
    .map(lambda s: re.sub(r'\(|\)', '', s))
    .map(lambda s: re.sub(r'\[(.*?)\] ', '', s))
    .map(lambda s: re.sub(r'\r\n|\n', '', s))
)
data

Unnamed: 0,artist,song,link,text
361,Alabama,Calling All Angels,/a/alabama/calling+all+angels_20005071.html,"Calling, calling all angels, oh I'm calling, c..."
362,Alabama,Can't Keep A Good Man Down,/a/alabama/cant+keep+a+good+man+down_20522156....,I thought it was forever I thought it would l...
363,Alabama,Carolina Mountain Dew,/a/alabama/carolina+mountain+dew_20175804.html,Somewhere in the mountains......... In norther...
364,Alabama,Christmas In Dixie,/a/alabama/christmas+in+dixie_20005147.html,"By now in New York City, there's snow on the g..."
365,Alabama,Christmas In Your Arms,/a/alabama/christmas+in+your+arms_20005047.html,All my friends are asking me where I plan to s...
...,...,...,...,...
50506,Reba Mcentire,Old Man River,/r/reba+mcentire/old+man+river_20114622.html,"ronny scaife, danny hogan Cool breeze on th..."
50507,Reba Mcentire,On My Own,/r/reba+mcentire/on+my+own_20114592.html,So many times I said it was forever Said our ...
50508,Reba Mcentire,"One Child, One Day",/r/reba+mcentire/one+child+one+day_20114569.html,"Three wise men, a shining star A mother and a..."
50509,Reba Mcentire,One Promise Too Late,/r/reba+mcentire/one+promise+too+late_20114392...,I would have waited forever If I'd known that...


### Feature Engineering

In [97]:
# Tokenize each lyric into runs of word characters; everything else
# (punctuation, whitespace, apostrophes) acts as a separator.
tokenizer = RegexpTokenizer(r'\w+')
data['tokenized'] = data['text'].map(tokenizer.tokenize)

Remove stop words and tokens shorter than 3 characters, then stem the remaining tokens (strip suffixes)

In [98]:
# Drop stop words and tokens shorter than 3 characters.
# A set gives constant-time membership tests; the filtered result is unchanged.
# NOTE(review): the comparison is case-sensitive and the stop-word list is
# lower-case, so capitalised stop words such as "The" or "And" are kept.
stop_word_set = set(stop_words)
data['stop_words_removed'] = data['tokenized'].map(
    lambda tokens: [tok for tok in tokens if len(tok) >= 3 and tok not in stop_word_set]
)

# Stem every distinct token once and cache the result; the statistics below are
# computed over ALL tokens (before stop-word removal).
stemmer = SnowballStemmer("english")
token_to_stem = {}
token_count = 0

for tokens in data['tokenized']:
    token_count += len(tokens)
    for tok in tokens:
        if tok not in token_to_stem:
            token_to_stem[tok] = stemmer.stem(tok)

# Map the filtered tokens to their cached stems.
data['stemmed'] = data['stop_words_removed'].map(
    lambda tokens: [token_to_stem[tok] for tok in tokens]
)

print('Number of tokens: {}'.format(token_count))
print('Number of unique tokens: {}'.format(len(token_to_stem.keys())))
print('Number of unique stems: {}'.format(len(set(token_to_stem.values()))))

Number of tokens: 324661
Number of unique tokens: 11530
Number of unique stems: 7060


In [12]:
# Inspect the frame with the tokenized, stop-word-filtered and stemmed columns added.
data

Unnamed: 0,artist,song,link,text,tokenized,stop_words_removed,stemmed
361,Alabama,Calling All Angels,/a/alabama/calling+all+angels_20005071.html,"Calling, calling all angels, oh I'm calling, c...","[Calling, calling, all, angels, oh, I, m, call...","[Calling, calling, angels, calling, calling, a...","[call, call, angel, call, call, angel, the, ni..."
362,Alabama,Can't Keep A Good Man Down,/a/alabama/cant+keep+a+good+man+down_20522156....,I thought it was forever I thought it would l...,"[I, thought, it, was, forever, I, thought, it,...","[thought, forever, thought, would, last, Gotta...","[thought, forev, thought, would, last, gotta, ..."
363,Alabama,Carolina Mountain Dew,/a/alabama/carolina+mountain+dew_20175804.html,Somewhere in the mountains......... In norther...,"[Somewhere, in, the, mountains, In, northern, ...","[Somewhere, mountains, northern, Alabama, The,...","[somewher, mountain, northern, alabama, the, c..."
364,Alabama,Christmas In Dixie,/a/alabama/christmas+in+dixie_20005147.html,"By now in New York City, there's snow on the g...","[By, now, in, New, York, City, there, s, snow,...","[New, York, City, snow, ground, And, Californi...","[new, york, citi, snow, ground, and, californi..."
365,Alabama,Christmas In Your Arms,/a/alabama/christmas+in+your+arms_20005047.html,All my friends are asking me where I plan to s...,"[All, my, friends, are, asking, me, where, I, ...","[All, friends, asking, plan, spend, holidays, ...","[all, friend, ask, plan, spend, holiday, peopl..."
...,...,...,...,...,...,...,...
50506,Reba Mcentire,Old Man River,/r/reba+mcentire/old+man+river_20114622.html,"ronny scaife, danny hogan Cool breeze on th...","[ronny, scaife, danny, hogan, Cool, breeze, on...","[ronny, scaife, danny, hogan, Cool, breeze, ri...","[ronni, scaif, danni, hogan, cool, breez, rive..."
50507,Reba Mcentire,On My Own,/r/reba+mcentire/on+my+own_20114592.html,So many times I said it was forever Said our ...,"[So, many, times, I, said, it, was, forever, S...","[many, times, said, forever, Said, love, would...","[mani, time, said, forev, said, love, would, a..."
50508,Reba Mcentire,"One Child, One Day",/r/reba+mcentire/one+child+one+day_20114569.html,"Three wise men, a shining star A mother and a...","[Three, wise, men, a, shining, star, A, mother...","[Three, wise, men, shining, star, mother, fath...","[three, wise, men, shine, star, mother, father..."
50509,Reba Mcentire,One Promise Too Late,/r/reba+mcentire/one+promise+too+late_20114392...,I would have waited forever If I'd known that...,"[I, would, have, waited, forever, If, I, d, kn...","[would, waited, forever, known, could, shared,...","[would, wait, forev, known, could, share, live..."


Removing unused columns

In [13]:
# Only the artist label and the stemmed tokens are needed from here on.
data = data.drop(columns=['link', 'tokenized', 'stop_words_removed', 'song', 'text'])

Vectorize

In [109]:
# TfidfVectorizer expects one string per document, so re-join each song's stems.
join_stemmed = data['stemmed'].map(" ".join)

# Fit TF-IDF on every song and convert the sparse result to a dense array
# (one row per song, one column per vocabulary term).
# NOTE(review): the vectorizer is fitted before the train/test split, so the
# IDF weights also see the test songs — confirm this is acceptable.
vectorizer = TfidfVectorizer()
vectorized = vectorizer.fit_transform(join_stemmed.values).toarray()

# Store each row as a plain Python list in the frame.
data['vectorized'] = vectorized.tolist()

# Despite the name, this holds the full (documents, features) shape tuple.
vector_length = vectorized.shape
print("Number of Documents, Length of Vector:", vector_length)

Number of Documents, Length of Vector: (1504, 6868)


### Data Splitting

In [15]:
# 80/20 split, stratified by artist so both splits keep the same class balance;
# random_state fixes the shuffle for reproducibility.
train_df, test_df = train_test_split(data, test_size=0.2, stratify = data['artist'], random_state=0)

In [16]:
# Per-artist song counts in the training split.
print(train_df['artist'].value_counts())

Donna Summer        153
Gordon Lightfoot    151
George Strait       150
Bob Dylan           150
Reba Mcentire       150
Cher                150
Loretta Lynn        150
Alabama             149
Name: artist, dtype: int64


In [17]:
# Per-artist song counts in the test split.
print(test_df['artist'].value_counts())

George Strait       38
Bob Dylan           38
Alabama             38
Donna Summer        38
Gordon Lightfoot    38
Loretta Lynn        37
Reba Mcentire       37
Cher                37
Name: artist, dtype: int64


In [51]:
# Display the training split.
train_df

Unnamed: 0,artist,stemmed,vectorized
33891,George Strait,"[you, say, want, talk, want, tri, but, way, wo...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
34334,Gordon Lightfoot,"[four, month, ago, april, daycoach, came, and,...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4413,Donna Summer,"[undercov, cop, car, came, screech, halt, bodi...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
25795,Bob Dylan,"[utter, idl, word, reprob, mind, cling, strang...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
50459,Reba Mcentire,"[thought, send, rose, for, reason, send, rose,...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
...,...,...,...
4401,Donna Summer,"[perfect, love, never, find, say, goodby, left...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
50443,Reba Mcentire,"[love, take, patienc, job, that, mama, alway, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
11996,Loretta Lynn,"[know, see, touch, sun, pain, wet, get, hurt, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
50485,Reba Mcentire,"[daddi, use, tell, ran, fast, fall, hurt, but,...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


# Traditional Machine Learning

In [19]:
from sklearn.neighbors import KNeighborsClassifier

from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import plot_tree

from sklearn.ensemble import RandomForestClassifier

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import LabelEncoder

from sklearn.metrics import accuracy_score, confusion_matrix, ConfusionMatrixDisplay
from IPython.display import clear_output

In [121]:
class Classifier:
    """Fit one scikit-learn classifier on the TF-IDF vectors and report its test accuracy.

    The algorithm is chosen by name through the ``algorithm`` attribute
    ("KNN", "Decision Tree", "Random Forest" or "Gradient Boosting") and its
    single tuned hyper-parameter is supplied as an integer to ``train``.

    Attributes:
        train_df, test_df: frames with an 'artist' column and a 'vectorized'
            column holding one feature vector per row.
        algorithm: name of the algorithm to use (may be assigned after construction).
        slider: last hyper-parameter value passed to ``set_classifier``.
        slider_name: human-readable label for that hyper-parameter.
        classifier: the current scikit-learn estimator, or None before training.
    """

    SUPPORTED_ALGORITHMS = ("KNN", "Decision Tree", "Random Forest", "Gradient Boosting")

    def __init__(self, train_df, test_df, algorithm=None):
        self.train_df = train_df
        self.test_df = test_df
        self.algorithm = algorithm

        # Filled in by set_classifier(); no module-level state is read here.
        self.slider = None
        self.slider_name = None
        self.classifier = None

    def plot(self):
        """Draw the fitted tree(s) for the tree-based algorithms; no-op otherwise."""
        if self.classifier is None:
            # Nothing has been fitted yet, so there is nothing to draw.
            return

        if self.algorithm == "Decision Tree":
            print("Decision Tree...")

            # Plot the estimator that was just fitted; it must not be replaced here.
            plot_tree(self.classifier, filled=True)
            plt.show()

        elif self.algorithm == "Random Forest":
            print("Random Forest...")

            # One figure per tree in the forest, truncated to depth 5 for legibility.
            for i, tree in enumerate(self.classifier.estimators_):
                plot_tree(tree, filled=True, max_depth=5)
                plt.title(f"Tree {i + 1}")
                plt.show()

    def set_classifier(self, slider):
        """Create a fresh, unfitted estimator for ``self.algorithm``.

        Args:
            slider: integer hyper-parameter — neighbours for KNN, maximum depth
                for the decision tree, number of estimators for the ensembles.

        Raises:
            ValueError: if ``self.algorithm`` is not one of SUPPORTED_ALGORITHMS.
        """
        if self.algorithm not in self.SUPPORTED_ALGORITHMS:
            raise ValueError(
                f"Unknown algorithm {self.algorithm!r}; expected one of {self.SUPPORTED_ALGORITHMS}"
            )

        self.slider = slider

        if self.algorithm == "KNN":
            self.classifier = KNeighborsClassifier(n_neighbors=slider)
            self.slider_name = "Number of neighbors: "

        elif self.algorithm == "Decision Tree":
            self.classifier = DecisionTreeClassifier(max_depth=slider, random_state=100)
            self.slider_name = "Maximum Depth: "

        elif self.algorithm == "Random Forest":
            self.classifier = RandomForestClassifier(n_estimators=slider, random_state=100)
            self.slider_name = "Number of estimators: "

        else:  # "Gradient Boosting"
            self.classifier = GradientBoostingClassifier(n_estimators=slider, learning_rate=0.05, random_state=100)
            self.slider_name = "Number of estimators: "

    def train(self, slider_val, plot=False):
        """Fit on the training split, then print accuracy and show the confusion matrix.

        Args:
            slider_val: hyper-parameter value forwarded to ``set_classifier``.
            plot: when True, also draw the fitted tree(s) via ``plot``.
        """
        print("Computing...")
        self.set_classifier(slider_val)

        # Stack the per-row vectors into 2-D (samples, features) arrays.
        train_vec = np.vstack(self.train_df['vectorized'])
        test_vec = np.vstack(self.test_df['vectorized'])
        actual = self.test_df['artist']

        self.classifier.fit(train_vec, self.train_df['artist'])

        predicted = self.classifier.predict(test_vec)
        classifier_accuracy = accuracy_score(actual, predicted)
        classifier_accuracy_percentage = round(classifier_accuracy * 100, 2)

        # The same label order is used for the matrix rows and the tick labels.
        labels = actual.unique()
        matrix = confusion_matrix(actual, predicted, labels=labels, normalize='true')
        classifier_disp = ConfusionMatrixDisplay(confusion_matrix=matrix, display_labels=labels)

        clear_output(wait=True)
        # slider_name already ends with ": ", so no extra separator is added.
        print(f"{self.slider_name}{self.slider}")
        print(f"{self.algorithm} Accuracy: {classifier_accuracy_percentage}%")

        classifier_disp.plot(xticks_rotation=75, values_format='.2f', colorbar = False)
        plt.title(f'{self.algorithm} Confusion Matrix')
        plt.show()

        if plot:
            self.plot()


# Traditional Machine Learning Visualization

In [119]:
import ipywidgets
from ipywidgets import interact

def visualize_ml(algorithm):
    """Select ``algorithm`` on the shared classifier and attach the training widget.

    Every supported algorithm is trained through the same slider-driven call,
    so a single membership test replaces the per-algorithm branches. Any other
    name only updates ``classifier.algorithm``.
    """
    classifier.algorithm = algorithm

    supported = ("KNN", "Decision Tree", "Random Forest", "Gradient Boosting")
    if algorithm in supported:
        interact(classifier.train, slider_val=slider)


# Integer slider (1..100 in steps of 3) supplying the hyper-parameter value to Classifier.train.
slider = ipywidgets.IntSlider(value=1, min=1, max=100, step=3, description='Value:')

# Dropdown for choosing the algorithm; starts on the first entry.
algorithms = ["KNN", "Decision Tree", "Random Forest", "Gradient Boosting"]
algorithm_dropdown = ipywidgets.Dropdown(options=algorithms, value=algorithms[0], description='Algorithm')


# Shared classifier wrapper; visualize_ml sets its algorithm on each selection.
classifier = Classifier(train_df, test_df)

# Re-run visualize_ml whenever the dropdown value changes.
interactive_plot = interact(visualize_ml, algorithm=algorithm_dropdown)

interactive(children=(Dropdown(description='Algorithm', options=('KNN', 'Decision Tree', 'Random Forest', 'Gra…

# Deep Learning

In [22]:
from collections import Counter
import torch
import torch.nn as nn
from torch import optim
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, Dataset, RandomSampler

# Tensors and models in this notebook are placed on the CPU.
device = torch.device('cpu')

### Feature Engineering Extended

In [120]:
PADDING_VALUE = 0
UNK_VALUE = 1

def generate_vocab(df, min_count):
    """Build a token -> integer id map from the 'stemmed' column of ``df``.

    Ids 0 and 1 are reserved for the padding entry ("") and the unknown-token
    entry ("UNK"). Every token that occurs strictly more than ``min_count``
    times receives the next free id, in order of first appearance.

    Args:
        df: frame whose 'stemmed' column holds one list of tokens per row.
        min_count: tokens must occur MORE than this many times to be included.

    Returns:
        dict mapping token strings to integer ids.
    """
    occurrences = Counter()
    for stems in df['stemmed']:
        occurrences.update(stems)

    vocab = {"": PADDING_VALUE, "UNK": UNK_VALUE}
    frequent_tokens = (tok for tok, n in occurrences.items() if n > min_count)
    for next_id, tok in enumerate(frequent_tokens, start=2):
        vocab[tok] = next_id

    return vocab


def generate_labels(df):
    """Map each distinct artist in ``df`` to an integer class id.

    Ids follow the order in which artists first appear in ``df['artist']``.

    Args:
        df: frame with an 'artist' column.

    Returns:
        dict mapping artist name to integer id (0, 1, 2, ...).
    """
    # Use the frame that was passed in rather than a module-level one.
    artists = df['artist'].unique()
    return {artist: index for index, artist in enumerate(artists)}


def collate_fn(batch, padding_value=PADDING_VALUE):
    """DataLoader collate function for variable-length token sequences.

    Args:
        batch: iterable of (token_tensor, label) pairs.
        padding_value: value used to right-pad shorter sequences.

    Returns:
        (padded_tokens, labels_tensor): a (batch, longest_sequence) long tensor
        and a 1-D long tensor of labels, both moved to ``device``.
    """
    token_seqs, label_seq = zip(*batch)

    padded_tokens = pad_sequence(token_seqs, batch_first=True, padding_value=padding_value)
    padded_tokens = padded_tokens.long().to(device)

    labels_tensor = torch.tensor(label_seq).long().to(device)

    return padded_tokens, labels_tensor

In [24]:
class VectorDataset(Dataset):
    """Dataset yielding (feature vector, label id) pairs from a frame.

    Each row's 'vectorized' entry becomes a float tensor and its 'artist'
    entry is translated to an integer id through ``labels``.
    """

    def __init__(self, df, labels):
        self.df, self.labels = df, labels

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        record = self.df.iloc[index]

        features = torch.tensor(record['vectorized']).float().to(device)
        target = torch.tensor(self.labels[record['artist']]).long().to(device)

        return features, target

In [25]:
class LyricDataset(Dataset):
    """Dataset yielding (token-id sequence, label id) pairs from a frame.

    Only the first ``max_length`` stems of each song are used. Stems missing
    from ``vocab`` are replaced by the id stored under 'UNK'.
    """

    def __init__(self, df, vocab, labels, max_length=200):
        self.df = df
        self.vocab = vocab
        self.labels = labels
        self.max_length = max_length

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        record = self.df.iloc[index]
        truncated = record['stemmed'][:self.max_length]

        # The 'UNK' entry is only looked up when an out-of-vocabulary stem occurs.
        ids = [
            self.vocab[stem] if stem in self.vocab else self.vocab['UNK']
            for stem in truncated
        ]

        tokenized_tensor = torch.tensor(ids).long()
        label = torch.tensor(self.labels[record['artist']]).long()

        return tokenized_tensor, label

In [125]:
# Number of samples per mini-batch for every loader below.
BATCH_SIZE = 16

vocab = generate_vocab(train_df, 3) # token -> id map; only tokens seen more than 3 times in train_df
labels = generate_labels(train_df) # artist -> integer class id map

# Loaders over the dense 'vectorized' features (used by the feed-forward and recurrent models).
train_vector_dataset = VectorDataset(train_df, labels)
test_vector_dataset = VectorDataset(test_df, labels)

# RandomSampler draws the samples in random order.
train_vector_sampler = RandomSampler(train_vector_dataset)
test_vector_sampler = RandomSampler(test_vector_dataset)

train_vector_loader = DataLoader(train_vector_dataset, batch_size=BATCH_SIZE, sampler=train_vector_sampler)
test_vector_loader = DataLoader(test_vector_dataset, batch_size=BATCH_SIZE, sampler=test_vector_sampler)

# Loaders over token-id sequences (used by the neural embedding model).
train_lyric_dataset = LyricDataset(train_df, vocab, labels)
test_lyric_dataset = LyricDataset(test_df, vocab, labels)

train_lyric_sampler = RandomSampler(train_lyric_dataset)
test_lyric_sampler = RandomSampler(test_lyric_dataset)

# collate_fn pads the variable-length sequences of each batch to a common length.
train_lyric_loader = DataLoader(train_lyric_dataset, batch_size=BATCH_SIZE, sampler=train_lyric_sampler, collate_fn=collate_fn)
test_lyric_loader = DataLoader(test_lyric_dataset, batch_size=BATCH_SIZE, sampler=test_lyric_sampler, collate_fn=collate_fn)

### Training and Evaluation

In [111]:
def train(model, criterion, optim, iterator):
    """Run one training epoch and return its accuracy and mean per-example loss.

    Args:
        model: module mapping a batch of inputs to per-class scores.
        criterion: loss called as ``criterion(scores, label)``.
        optim: optimizer over ``model``'s parameters.
        iterator: iterable of (input_data, label) batches.

    Returns:
        (accuracy, avg_loss): accuracy as a percentage (float) and the loss
        averaged over examples (float). Both are 0.0 for an empty iterator.
    """
    model.train()  # enable dropout etc.
    total_correct, total_loss, num_examples = 0, 0.0, 0

    for input_data, label in iterator:
        batch_size = len(label)

        scores = model(input_data)  # forward pass
        loss = criterion(scores, label)

        optim.zero_grad()  # clear gradients from the previous step
        loss.backward()    # compute gradients
        optim.step()       # update parameters

        total_correct += (scores.argmax(dim=1) == label).sum().item()
        # Assumes criterion returns the batch MEAN (the default reduction of
        # nn.CrossEntropyLoss); weighting by batch size turns the running total
        # into a per-example sum, so uneven last batches are handled correctly.
        total_loss += loss.item() * batch_size
        num_examples += batch_size

    if num_examples == 0:
        return 0.0, 0.0

    accuracy = 100. * total_correct / num_examples
    avg_loss = total_loss / num_examples

    return accuracy, avg_loss


def evaluate(model, criterion, iterator):
    """Score ``model`` on ``iterator`` without updating it.

    Args:
        model: module mapping a batch of inputs to per-class scores.
        criterion: loss called as ``criterion(scores, label)``.
        iterator: iterable of (input_data, label) batches.

    Returns:
        (accuracy, avg_loss): accuracy as a percentage (float) and the loss
        averaged over examples (float). Both are 0.0 for an empty iterator.
    """
    model.eval()  # disable dropout etc.
    total_correct, total_loss, num_examples = 0, 0.0, 0

    # No gradients are needed for evaluation.
    with torch.no_grad():
        for input_data, label in iterator:
            batch_size = len(label)

            scores = model(input_data)
            loss = criterion(scores, label)

            total_correct += (scores.argmax(dim=1) == label).sum().item()
            # Assumes criterion returns the batch MEAN (the default reduction of
            # nn.CrossEntropyLoss); weight by batch size for a per-example sum.
            total_loss += loss.item() * batch_size
            num_examples += batch_size

    if num_examples == 0:
        return 0.0, 0.0

    accuracy = 100. * total_correct / num_examples
    avg_loss = total_loss / num_examples

    return accuracy, avg_loss


def train_loop(model, criterion, optim, train_loader, test_loader, epochs):
    """Train for ``epochs`` epochs and leave ``model`` at its best-scoring weights.

    After each epoch the model is scored on ``test_loader``; whenever the
    accuracy improves, the weights are saved to "state.pth". At the end the
    best saved weights are loaded back into ``model``.

    NOTE(review): the checkpoint is selected on ``test_loader``, so the accuracy
    reported afterwards on the same loader is optimistically biased.

    Args:
        model: module to train (modified in place).
        criterion: loss function.
        optim: optimizer over ``model``'s parameters.
        train_loader: batches used for training.
        test_loader: batches used to pick the best epoch.
        epochs: number of epochs to run.
    """
    # Start below any reachable accuracy so the first epoch always writes a
    # checkpoint; otherwise a leftover "state.pth" from an earlier run (possibly
    # of a different model) could be loaded below.
    best_test_acc = float("-inf")
    checkpoint_written = False

    for epoch in range(epochs):
        print("Epoch:", epoch)
        train_acc, train_loss = train(model, criterion, optim, train_loader)
        test_acc, test_loss = evaluate(model, criterion, test_loader)

        if test_acc > best_test_acc:
            print(f"Highest Accuracy during epoch {epoch}: {test_acc}")
            best_test_acc = test_acc
            torch.save(model.state_dict(), "state.pth")
            checkpoint_written = True

    # Only restore a checkpoint that this call produced (none when epochs <= 0).
    if checkpoint_written:
        model.load_state_dict(torch.load("state.pth"))


def predict_plot_confusion_matrix(model, iterator, test_df, label_map=None):
    """Predict on ``iterator`` and build a row-normalised confusion-matrix display.

    The batches carry integer class ids, so the tick labels must be listed in
    class-id order. That order is taken from ``label_map`` (artist -> id); when
    it is not given, the ``labels`` dict of ``iterator.dataset`` is used if
    present. Only if neither is available does the function fall back to the
    order in which artists appear in ``test_df``, which is not guaranteed to
    match the class ids.

    Args:
        model: module mapping a batch of inputs to per-class scores.
        iterator: iterable of (input_data, label) batches.
        test_df: frame with an 'artist' column (fallback source of tick labels).
        label_map: optional dict mapping artist name to integer class id.

    Returns:
        (disp, accuracy_percentage): a ConfusionMatrixDisplay and the accuracy
        formatted as a string such as "87.5%".
    """
    model.eval()
    actuals, predictions = [], []

    with torch.no_grad():
        for input_data, label in iterator:
            preds = model(input_data).argmax(dim=1)
            predictions.extend(preds.tolist())
            actuals.extend(label.tolist())

    actuals, predictions = np.array(actuals), np.array(predictions)

    accuracy = np.sum(actuals == predictions) / len(actuals) if len(actuals) else 0.0
    accuracy_percentage = str(round(accuracy * 100, 2)) + "%"

    if label_map is None:
        label_map = getattr(getattr(iterator, "dataset", None), "labels", None)

    if isinstance(label_map, dict) and label_map:
        # Order artist names by their class id so row/column i is labelled with class i.
        ordered = sorted(label_map.items(), key=lambda item: item[1])
        class_ids = [class_id for _, class_id in ordered]
        display_labels = [artist for artist, _ in ordered]
        matrix = confusion_matrix(actuals, predictions, labels=class_ids, normalize='true')
    else:
        display_labels = test_df['artist'].unique()
        matrix = confusion_matrix(actuals, predictions, normalize='true')

    disp = ConfusionMatrixDisplay(confusion_matrix=matrix, display_labels=display_labels)

    return disp, accuracy_percentage

# Models

### Feed Forward Neural Network

In [28]:
class FeedForwardModel(nn.Module):
    """Two-hidden-layer MLP that maps a feature vector to per-class logits.

    The network returns raw (unnormalised) scores. nn.CrossEntropyLoss applies
    log-softmax itself, so a Softmax layer here would normalise twice and
    flatten the gradients; ``argmax`` over the logits gives the same prediction
    as over probabilities. Apply ``torch.softmax(out, dim=1)`` if probabilities
    are needed.

    Args:
        input_size: length of each input vector.
        num_labels: number of output classes.
        dropout: dropout probability after each hidden activation.
    """

    def __init__(self, input_size, num_labels, dropout=0.5):
        super().__init__()

        self.classifier = nn.Sequential(
            nn.Linear(in_features=input_size, out_features=50),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(in_features=50, out_features=50),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(in_features=50, out_features=num_labels),
        )

    def forward(self, x):
        """Return logits of shape (batch, num_labels) for inputs of shape (batch, input_size)."""
        return self.classifier(x)

In [29]:
# Size the input layer from the feature vectors themselves instead of a
# hard-coded 6868, so the model still fits if the TF-IDF vocabulary changes.
feed_forward_model = FeedForwardModel(
    input_size=len(train_df['vectorized'].iloc[0]),
    num_labels=len(labels),
).to(device)

### Neural Embedding

In [57]:
class NeuralEmbedding(nn.Module):
    """Bag-of-embeddings classifier: average the token embeddings, then an MLP.

    Padding positions are excluded from the average, so a song's representation
    does not depend on how much padding its batch needed. The network returns
    raw logits; nn.CrossEntropyLoss applies log-softmax itself.

    Args:
        vocab_size: number of rows in the embedding table.
        num_labels: number of output classes.
        embedding_dim: size of each token embedding.
        dropout: dropout probability after each hidden activation.
        padding_idx: token id used for padding (ignored in the average and
            kept at a zero embedding); pass None to average over all positions.
    """

    def __init__(self, vocab_size, num_labels, embedding_dim, dropout=0.5, padding_idx=0):
        super().__init__()

        self.padding_idx = padding_idx
        self.embedding = nn.Embedding(
            num_embeddings=vocab_size,
            embedding_dim=embedding_dim,
            padding_idx=padding_idx,
        )

        self.classifier = nn.Sequential(
            nn.Linear(in_features=embedding_dim, out_features=50),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(in_features=50, out_features=50),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(in_features=50, out_features=num_labels),
        )

    def forward(self, x):
        """Return logits of shape (batch, num_labels) for token ids of shape (batch, seq)."""
        token_ids = x.long()
        embedded = self.embedding(token_ids)

        if self.padding_idx is None:
            averaged = embedded.mean(dim=1)
        else:
            # 1.0 for real tokens, 0.0 for padding; broadcast over the embedding axis.
            mask = (token_ids != self.padding_idx).unsqueeze(-1).to(embedded.dtype)
            # clamp avoids 0/0 for a sequence made only of padding.
            lengths = mask.sum(dim=1).clamp(min=1)
            averaged = (embedded * mask).sum(dim=1) / lengths

        return self.classifier(averaged)

In [58]:
# One embedding row per vocabulary entry, 200-dimensional embeddings.
neural_embedding_model = NeuralEmbedding(
    vocab_size=len(vocab),
    num_labels=len(labels),
    embedding_dim=200,
).to(device)

### Recurrent Neural Network

In [32]:
class RecurrentModel(nn.Module):
    """LSTM followed by an MLP head that returns per-class logits.

    A 2-D input of shape (batch, input_size) is treated as a batch of
    length-1 sequences. Without that step nn.LSTM would read the 2-D tensor
    as ONE unbatched sequence, carrying hidden state from one sample of the
    batch into the next. For 3-D input (batch, seq, input_size) the output at
    the last time step is classified. The network returns raw logits;
    nn.CrossEntropyLoss applies log-softmax itself.

    Args:
        input_size: number of features per time step.
        num_labels: number of output classes.
        hidden_size: size of the LSTM hidden state.
        dropout: dropout probability after each hidden activation.
    """

    def __init__(self, input_size, num_labels, hidden_size, dropout=0.5):
        super().__init__()

        self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True)

        self.classifier = nn.Sequential(
            nn.Linear(in_features=hidden_size, out_features=50),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(in_features=50, out_features=50),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(in_features=50, out_features=num_labels),
        )

    def forward(self, x):
        """Return logits of shape (batch, num_labels)."""
        if x.dim() == 2:
            # (batch, features) -> (batch, 1, features): one time step per sample.
            x = x.unsqueeze(1)

        lstm_out, _ = self.lstm(x)

        # Classify from the last time step of every sequence.
        return self.classifier(lstm_out[:, -1, :])

In [33]:
# Size the LSTM input from the feature vectors themselves instead of a
# hard-coded 6868, so the model still fits if the TF-IDF vocabulary changes.
recurrent_model = RecurrentModel(
    input_size=len(train_df['vectorized'].iloc[0]),
    num_labels=len(labels),
    hidden_size=30,
).to(device)

# Neural Model Trainer

In [60]:
class NeuralModelTrainer:
    """Train a neural model with a chosen optimizer and show its confusion matrix.

    Attributes:
        model, criterion, train_loader, test_loader: training components; they
            may be given to the constructor or assigned afterwards.
        lr: learning rate used for every optimizer.
        epochs: number of epochs per training run.
        optimizer_type: the optimizer instance built by ``optimize``.
        optimizer_title: name of the selected optimizer (used in the printout).
        model_title: name of the model (used in the plot title).
    """

    def __init__(self, model=None, criterion=None, train_loader=None, test_loader=None, lr=0.005, epochs=10):
        # Keep every constructor argument; none of them is silently dropped.
        self.model = model
        self.criterion = criterion
        self.train_loader = train_loader
        self.test_loader = test_loader
        self.lr = lr
        self.epochs = epochs

        self.optimizer_type = None
        self.model_title = None
        self.optimizer_title = None

    def optimize(self, optimizer_type):
        """Build the optimizer named ``optimizer_type`` over the model's parameters.

        Args:
            optimizer_type: "SGD" (momentum 0.9), "RMSprop" or "Adam".

        Raises:
            ValueError: for any other name, so a previously built optimizer is
                never reused by accident.
        """
        params = self.model.parameters()

        if optimizer_type == "SGD":
            self.optimizer_type = optim.SGD(params, lr=self.lr, momentum=0.9)
        elif optimizer_type == "RMSprop":
            self.optimizer_type = optim.RMSprop(params, lr=self.lr)
        elif optimizer_type == "Adam":
            self.optimizer_type = optim.Adam(params, lr=self.lr)
        else:
            raise ValueError(f"Unknown optimizer {optimizer_type!r}; expected 'SGD', 'RMSprop' or 'Adam'")

        self.optimizer_title = optimizer_type

    def train_and_evaluate(self, optimizer_type):
        """Train with the given optimizer, then print accuracy and plot the confusion matrix.

        NOTE(review): the model's weights are not reset here, so training
        continues from whatever state the model already has — results for
        successive optimizer choices are not independent.
        """
        self.optimize(optimizer_type)

        print("Calculating...")
        train_loop(self.model, self.criterion, self.optimizer_type, self.train_loader, self.test_loader, self.epochs)

        # Prefer the frame behind the loader being evaluated; fall back to the
        # notebook-level test_df when the dataset does not expose one.
        eval_df = getattr(getattr(self.test_loader, "dataset", None), "df", None)
        if eval_df is None:
            eval_df = test_df
        disp, accuracy = predict_plot_confusion_matrix(self.model, self.test_loader, eval_df)

        clear_output(wait=True)
        print(f"{self.optimizer_title} Optimizer Accuracy: {accuracy}")
        disp.plot(xticks_rotation=75, values_format='.2f', colorbar=False)
        plt.title(f'{self.model_title} Confusion Matrix')
        plt.show()

# Deep Learning Visualization

In [114]:
def visualize_dl(algorithm):
    """Point the shared trainer at the chosen model and attach the optimizer widget.

    The feed-forward and recurrent models use the dense-vector loaders; the
    neural embedding model uses the token-sequence loaders. Any other name
    only updates ``neural_model.model_title``.
    """
    neural_model.model_title = algorithm

    # Pick (model, train loader, test loader) for the selected algorithm.
    if algorithm == "Feed Forward Neural Network":
        selection = (feed_forward_model, train_vector_loader, test_vector_loader)
    elif algorithm == "Neural Embedding":
        selection = (neural_embedding_model, train_lyric_loader, test_lyric_loader)
    elif algorithm == "Recurrent Neural Network":
        selection = (recurrent_model, train_vector_loader, test_vector_loader)
    else:
        return

    neural_model.model, neural_model.train_loader, neural_model.test_loader = selection

    interact(neural_model.train_and_evaluate, optimizer_type=optimizer_dropdown)


# Dropdown for the optimizer; starts on "Adam" (index 2).
optimizers = ["SGD", "RMSprop", "Adam"]
optimizer_dropdown = ipywidgets.Dropdown(options=optimizers, value=optimizers[2], description='Optimizer')

# Dropdown for the model; note that `algorithms` and `algorithm_dropdown`
# replace the values assigned in the traditional-ML visualization cell.
algorithms = ["Feed Forward Neural Network", "Neural Embedding", "Recurrent Neural Network"]
algorithm_dropdown = ipywidgets.Dropdown(options=algorithms, value=algorithms[0], description='Algorithm')


# Shared trainer; visualize_dl fills in the model and loaders on each selection.
neural_model = NeuralModelTrainer()
neural_model.criterion = nn.CrossEntropyLoss()
neural_model.optimizer_dropdown = optimizer_dropdown

# Re-run visualize_dl whenever the model dropdown changes.
interactive_plot = interact(visualize_dl, algorithm=algorithm_dropdown)

interactive(children=(Dropdown(description='Algorithm', options=('Feed Forward Neural Network', 'Neural Embedd…