<a href="https://colab.research.google.com/github/brianvoha/Lyric-based-Classification-of-Musical-Genres/blob/master/FINALPROJECT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
# NOTE(review): langdetect is not imported by any cell visible in this notebook
# (language filtering uses whatthelang) - confirm this install is still needed.
!pip install langdetect

In [0]:
# whatthelang provides the WhatTheLang detector that the preprocessing cell uses
# to keep only English lyrics.
!pip install whatthelang

In [0]:
# DON'T RUN in a normal session: one-off preprocessing that builds clean.csv from the raw lyrics.csv.
import pandas as pd
import matplotlib.pyplot as plt
from whatthelang import WhatTheLang

# Load the raw lyrics dump; rows the parser cannot read are skipped instead of aborting.
songList = pd.read_csv("lyrics.csv", engine='python', error_bad_lines=False, encoding='utf-8')

# replace '\n' with space (applied to every column of the frame)
songList = songList.replace({'\n': ' '}, regex=True)

# Clean up the dataset (from Lyrics Genre Analysis - Machine Learning):
#   1. strip punctuation, 2. blank out whitespace-only lyrics,
#   3. blank out lyrics made only of digits/whitespace.
# regex=True is passed explicitly because the default of `regex` in Series.str.replace
# differs between pandas versions; with regex=False these patterns would be matched
# as literal text and nothing would be cleaned.
songList['lyrics'] = songList['lyrics'].str.replace(r'[^\w\s]+', '', regex=True)
songList['lyrics'] = songList['lyrics'].str.replace(r'^\s+$', '', regex=True)
songList['lyrics'] = songList['lyrics'].str.replace(r'^[\s\d\s]+$', '', regex=True)

# remove all songs with <13 characters (Includes all the Instrumental songs)
songList = songList[songList['lyrics'].apply(lambda x: len(str(x)) >= 13)]
print(str(len(songList))+" rows left.")

# Remove duplicate lyrics fields, keep the first entry.
# A new frame is returned instead of mutating the filtered slice in place.
songList = songList.drop_duplicates(subset="lyrics", keep='first')
print(str(len(songList))+" rows left.")

# Remove songs with 'Not Available' or 'Other' as their genres, then drop every row
# that still has a missing value in any column.
songList = songList[~songList['genre'].isin(['Not Available', 'Other'])].dropna()
print(str(len(songList))+" rows left.")

# Remove songs that are not in english
wtl = WhatTheLang()
is_english = songList['lyrics'].apply(lambda x: wtl.predict_lang(str(x)) == "en")
# .copy() so the column assignment below writes to an independent frame, not a slice.
songList = songList[is_english].copy()
print(str(len(songList))+" rows left.")

# Number of whitespace-separated tokens per song; used by the word-count filter that follows.
songList['word_count'] = songList['lyrics'].str.split().str.len()

# Remove all songs with <100 words or >1000 words.
# Both bounds are applied in a single mask: filtering `songList` once per bound and
# assigning each result to `df_clean` would let the second filter overwrite the first.
MIN_WORDS, MAX_WORDS = 100, 1000
df_clean = songList[songList['word_count'].between(MIN_WORDS, MAX_WORDS)]

# Per-genre word-count summary (printed, since a bare expression mid-cell shows nothing).
print(df_clean['word_count'].groupby(df_clean['genre']).describe())
print(str(len(df_clean))+" rows left.")

# NOTE(review): the balancing cell samples a hard-coded 13353 songs per genre; re-check
# that number against the genre counts of the file written here.
df_clean.to_csv("clean.csv", encoding='utf-8')

In [0]:
#DONT RUN
import random
import pandas as pd

In [0]:
#UPLOAD CLEAN.CSV AND START HERE
import pandas as pd

songList = pd.read_csv("clean.csv", engine='python', error_bad_lines=False, encoding='utf-8')

# The 5 genres with the least amount of songs; they are left out of the balanced set.
EXCLUDED_GENRES = {'Electronic', 'Folk', 'Indie', 'Jazz', 'R&B'}
# Target number of songs drawn from each remaining genre.
SAMPLES_PER_GENRE = 13353

# Empty seed frame: fixes the leading column order of the result and keeps the
# concat below valid even when no genre is left.
index = []
columns = ['index','song','year','artist','genre','lyrics']
final = pd.DataFrame(index=index,columns=columns)

# Set difference tolerates an excluded genre that is absent from the data.
# Sorted so the row order of balanced.csv (and therefore any later seeded split of
# `final`) does not depend on set iteration order, which varies between sessions.
a = sorted(set(songList['genre'].dropna()) - EXCLUDED_GENRES)

# Never ask for more rows than the smallest remaining genre has; sample() would raise.
genre_sizes = songList['genre'].value_counts()
n_per_genre = min([SAMPLES_PER_GENRE] + [int(genre_sizes[g]) for g in a])
if n_per_genre < SAMPLES_PER_GENRE:
    print("Smallest remaining genre has only "+str(n_per_genre)+" songs; sampling that many per genre.")

# Take a random sample of the same size from each remaining genre and combine them
# into `final` with a single concat.
samples = [
    songList[songList['genre'] == g].sample(n=n_per_genre, random_state=1)
    for g in a
]
final = pd.concat([final] + samples)
print(len(final))
final.to_csv("balanced.csv", encoding='utf-8')

In [0]:
# DONT RUN
# remove stopwords from final
# import nltk
# from nltk.corpus import stopwords
# stop_words = stopwords.words('english')
# tokenized = final['lyrics'].apply(lambda x: x.split())
# tokenized = tokenized.apply(lambda x: [item for item in x if item not in stop_words])
# detokenized_doc = [] 
# for i in range(len(final)): 
#     t = ' '.join(tokenized[i]) 
#     detokenized_doc.append(t) 
# final['lyrics'] = detokenized_doc

In [0]:
from fastai.text import *
from sklearn.model_selection import train_test_split

import numpy as np  # explicit; otherwise only available through the fastai star import

# NOTE: this cell rebinds `final` to the train/val portion, so running it twice
# shrinks the data further. Re-create `final` before re-running.

# Drop rows with a missing value in any column *before* splitting, so the held-out
# test rows are cleaned the same way as the train/val rows.
final = final.dropna()

# split data into train/val and test sets
final, test = train_test_split(final, test_size=0.2, random_state=1)
# Independent copies: both frames get new columns later (here and at prediction time).
final, test = final.copy(), test.copy()
print(final.shape)
print(test.shape)

# split train/val into train and val sets (about 80/20); a seeded generator makes the
# assignment reproducible across runs.
rng = np.random.RandomState(1)
final['train'] = rng.choice(a=[True,False], size=len(final), p=[0.8,0.2])
print(final.groupby('train').size())
df_full = final.copy()

In [0]:
# Create the language-model and classifier data bunches.
# Both use the same partition of `final`, selected by its boolean 'train' column.
is_train = final['train']
train_rows = final[is_train]
valid_rows = final[~is_train]

data_lm2 = TextLMDataBunch.from_df(
    path='.',
    train_df=train_rows,
    valid_df=valid_rows,
    text_cols='lyrics',
)

# The classifier bunch is built with the language model's vocabulary.
data2 = TextClasDataBunch.from_df(
    path='.',
    train_df=train_rows,
    valid_df=valid_rows,
    text_cols='lyrics',
    label_cols='genre',
    vocab=data_lm2.train_ds.vocab,
    bs=32,
)

In [0]:
# Save both data bunches to disk so a later session can reload them with load_data
# instead of rebuilding them from the DataFrame.
data_lm2.save('data_lm2_export.pkl')
data2.save('data2_export.pkl')

In [0]:
# Reload the saved data bunches from the current directory.
# Note: the classifier bunch is reloaded with bs=16, whereas it was created with bs=32.
data_lm2 = load_data('.', 'data_lm2_export.pkl')
data2 = load_data('.', 'data2_export.pkl', bs=16)

In [0]:
# Start fine-tuning the pre-trained AWD_LSTM language model on our lyrics:
# one one-cycle epoch at learning rate 1e-2.
# Note: the name `learn2` is reused later for the classifier learner.
learn2 = language_model_learner(data_lm2, AWD_LSTM, drop_mult=0.5)
learn2.fit_one_cycle(1, 1e-2)

In [0]:
# Unfreeze the language model and train one more one-cycle epoch at the lower rate 1e-3.
learn2.unfreeze()
learn2.fit_one_cycle(1, 1e-3)

In [0]:
# Save the fine-tuned language model's encoder as 'ft_enc'; the classifier learner
# loads it by the same name.
learn2.save_encoder('ft_enc')

In [0]:
# Rebind `learn2` to a text classifier learner on the labelled data bunch and load
# the encoder saved as 'ft_enc'.
learn2 = text_classifier_learner(data2, AWD_LSTM, drop_mult=0.5)
learn2.load_encoder('ft_enc')

In [0]:
# Display a sample batch from the classifier data bunch as a sanity check.
data2.show_batch()

In [0]:
# First classifier training pass: one one-cycle epoch at learning rate 1e-2.
learn2.fit_one_cycle(1, 1e-2)

In [0]:
# Freeze everything except the last two layer groups (freeze_to(-2)), then train one
# epoch with learning rates spread between 2.5e-3 and 5e-3 via slice().
learn2.freeze_to(-2)
learn2.fit_one_cycle(1, slice(5e-3/2., 5e-3))

In [0]:
# Unfreeze the whole classifier and train one epoch with learning rates spread
# between 2e-5 and 2e-3 via slice().
learn2.unfreeze()
learn2.fit_one_cycle(1, slice(2e-3/100, 2e-3))

In [0]:
# Build an interpretation object from the trained classifier; the next cells use it
# for most_confused and plot_confusion_matrix.
interp2 = ClassificationInterpretation.from_learner(learn2)

In [0]:
# List the genre pairs the classifier confuses most often.
# NOTE(review): slice_size=10 presumably bounds the chunk size used while building
# the confusion matrix - confirm against the fastai docs.
interp2.most_confused(slice_size=10)

In [0]:
# Plot the confusion matrix held by the interpretation object.
interp2.plot_confusion_matrix()

In [0]:
# Smoke test: classify one hand-pasted lyric excerpt and print the predicted genre.
lyric = "MidnightYou come and pick me up, no headlights Long driv Could end in burning flames or paradise Fade into view, oh It's been a while since I have even heard from you And I should just tell you to leave, 'cause I Know exactly where it leads, but I Watch us go 'round and 'round each time You got that James Dean daydream look in your eye And I got that red lip classic thing that you like And when we go crashing down, we come back every time 'Cause we never go out of style, we never go out of style You got that long hair, slicked back, white t-shirt And I got that good girl faith and a tight little skirt And when we go crashing down, we come back every time 'Cause we never go out of style, we never go out of style"
# predict() returns a tuple; its first element is the predicted category.
predicted_category = learn2.predict(lyric)[0]
print(str(predicted_category))

In [0]:
# Predict a genre for every held-out song and store it (as a string) in a new
# 'predicted' column. A single column assignment replaces row-by-row .at writes, so
# the column exists even when `test` is empty.
test['predicted'] = [str(learn2.predict(text)[0]) for text in test['lyrics']]

In [0]:
# Boolean Series: True where the predicted genre equals the labelled genre.
seriesObj = test['predicted'] == test['genre']
# Number of correct predictions (count of True values).
numOfRows = int(seriesObj.sum())

print("There are "+str(numOfRows)+" correctly identified songs.")
print("Accuracy: "+str(numOfRows/float(len(test))))

In [0]:
# palettable supplies the YlOrBr_4 colour palette used for the confusion-matrix heatmap.
# Written as an explicit shell escape, like the other install cells, rather than
# relying on IPython automagic to interpret a bare `pip`.
!pip install palettable

In [0]:
from sklearn.metrics import confusion_matrix
import seaborn as sn
from palettable.colorbrewer.sequential import YlOrBr_4
from matplotlib.colors import ListedColormap
import palettable

# Optional: reload saved predictions instead of using the in-memory `test` frame.
# test = pd.read_csv("predictFINAL.csv", engine='python', error_bad_lines=False, encoding='utf-8')

# Cross-tabulate labelled genre (rows) against predicted genre (columns).
df_confusion = pd.crosstab(test['genre'], test['predicted'], rownames=['Actual'], colnames=['Predicted'])

# Draw the counts as an annotated heatmap using the 4-colour YlOrBr palette.
fig, ax = plt.subplots(figsize=(6, 5))
cmap = ListedColormap(YlOrBr_4.mpl_colors)
sn.heatmap(df_confusion, annot=True, fmt='g', cmap=cmap, ax=ax)
ax.set_title("Genre confusion matrix (test set)")

In [0]:
# Persist the test frame, including the 'predicted' column added by the prediction loop.
# NOTE(review): the commented-out loader in the confusion-matrix cell reads
# "predictFINAL.csv", not "predict.csv" - confirm which file name is intended.
test.to_csv("predict.csv", encoding='utf-8')

In [0]:
## Tim's code for Naive Bayes, Logistic Regression, KNN, SVC
import pandas as pd
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import MultiLabelBinarizer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models import Word2Vec
from sklearn.model_selection import train_test_split
from nltk.stem import WordNetLemmatizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
import seaborn as sn

import matplotlib.pyplot as plt  # needed for the heatmap; not imported elsewhere in this cell

lemmatizer = WordNetLemmatizer()
stop = set(stopwords.words('english'))

songList = pd.read_csv("balanced.csv", engine='python', error_bad_lines=False, encoding='utf-8')
# Rows without lyrics or a genre can be neither vectorised nor used as labels.
songList = songList.dropna(subset=['lyrics', 'genre'])


def _normalize_lyrics(text):
    """Lower-case the whitespace-separated tokens of `text`, drop English stop words, lemmatize the rest.

    Tokens are lower-cased *before* the stop-word test because the stop-word list is
    lower-case; testing the raw token would let capitalised stop words through.
    """
    tokens = (tok.lower() for tok in text.split())
    return " ".join(lemmatizer.lemmatize(tok) for tok in tokens if tok not in stop)


lyrics = [_normalize_lyrics(text) for text in songList['lyrics']]
genre = songList['genre'].tolist()

# Split the documents first, then fit TF-IDF on the training documents only, so the
# vocabulary and IDF weights are not influenced by the validation documents.
lyrics_train, lyrics_val, ytrain, yval = train_test_split(lyrics, genre, test_size=0.2, random_state=9)
tfidf_vectorizer = TfidfVectorizer(max_features=10000)
xtrain = tfidf_vectorizer.fit_transform(lyrics_train)
xval = tfidf_vectorizer.transform(lyrics_val)

# Fit the four baseline classifiers on the same training matrix.
knn = KNeighborsClassifier(n_neighbors=30)
knn.fit(xtrain,ytrain)
nb = MultinomialNB()
nb.fit(xtrain,ytrain)
lg = LogisticRegression()
lg.fit(xtrain,ytrain)
svm = LinearSVC()
svm.fit(xtrain,ytrain)

# The confusion matrix and heatmap below describe the logistic-regression predictions.
pred = lg.predict(xval)

# Sorted labels give a stable order that matches the alphabetical order of the crosstab.
a = sorted(set(genre))
print(a)
print(confusion_matrix(yval,pred,labels= a))

# Wrap both label sequences in Series: crosstab treats a plain Python list as a
# *list of arrays*, which does not match the single row name given here.
df_confusion = pd.crosstab(pd.Series(yval), pd.Series(pred), rownames=['Actual'], colnames=['Predicted'])
plt.figure(figsize = (6,5))
sn.heatmap(df_confusion, annot=True, fmt='g', cmap='Blues')

# Validation accuracy of each model.
print("nb:",nb.score(xval,yval))
print("knn:",knn.score(xval,yval))
print("lg:",lg.score(xval,yval))
print("svm:",svm.score(xval,yval))

In [0]:
import pandas as pd

songList = pd.read_csv("clean.csv", engine='python', error_bad_lines=False, encoding='utf-8')

# Print the number of songs per genre, largest first. value_counts() counts every
# genre in one pass and gives a deterministic order; the loop variable is named so it
# does not overwrite the `genre` list built by the baseline-models cell.
for genre_name, n_songs in songList['genre'].value_counts().items():
    print(genre_name, n_songs)