In [2]:
import pandas as pd
import numpy as np
import csv
from nltk.stem import WordNetLemmatizer
from gensim.models.doc2vec import TaggedDocument



In [3]:
df_general = pd.read_csv('lyrics_dfclean_598.csv')
del (df_general['Unnamed: 0'])
df_general

Unnamed: 0,Song_Name,Artist,Genre,Lyrics,SongID
0,No Tears Left to Cry,Ariana Grande,Pop,right state mind wanna like time got tears lef...,1
1,FRIENDS,Marshmello & Anne-Marie,Pop,ooooh oh ooooh woh ooooh oh ooooh woh say love...,2
2,A Million Dreams,Ziv Zaifman,Pop,close eyes see world waiting call dark door on...,3
3,The Middle,Zedd,Pop,take seat right sat stairs stay leave cabinets...,4
4,Rewrite the Stars,Zac Efron,Pop,know want secret try hide know want keep sayin...,5
5,âlovely,Billie Eilish & Khalid,Pop,thought found way thought found way found neve...,6
6,Be Careful,Cardi B,Pop,yeah care care care uh yeah look wanna get mar...,7
7,Te BotÃ© (Remix),"Nio GarcÃ­a, Casper MÃ¡gico & Bad Bunny",Pop,lyrics,8
8,One Kiss,Calvin Harris & Dua Lipa,Pop,one kiss takes fallin love possibilities look ...,9
9,SAD!,XXXTENTACION,Pop,yeah someone afraid let go uh decide ever gonn...,10


In [4]:
wnl = WordNetLemmatizer()

class Sentences(object):
    
    def __init__(self, filename, column):
        self.filename = filename
        self.column = column
        
    @staticmethod
    def get_tokens(text):
        """Helper function for tokenizing data"""
        return [wnl.lemmatize(r.lower()) for r in text.split()]
 
    def __iter__(self):
        reader = csv.DictReader(open(self.filename, 'r' ))
        for row in reader:
            words = self.get_tokens(row[self.column])
            tags = ['%s|%s' % (row['Artist'], row['SongID'])]
            yield TaggedDocument(words=words, tags=tags)

In [5]:
filename = 'lyrics_dfclean_598.csv'
sentences = Sentences(filename=filename, column='Lyrics')

# for song lookups
df_train = pd.read_csv(filename)

In [6]:
n_dim = 300

In [7]:

from gensim.models.doc2vec import Doc2Vec

model = Doc2Vec(
    alpha=0.025,
    min_alpha=0.025,
    workers=15, 
    min_count=2,
    window=10,
    size=300,
    iter=20,
    sample=0.001,
    negative=5
)



In [8]:
model.build_vocab(sentences)

In [9]:
model.save('lyrics598.doc2vec')

model = Doc2Vec.load('lyrics598.doc2vec')

In [10]:
model.wv.word_vec('look')

array([ 6.43691339e-04, -8.43589893e-04, -2.64992676e-04, -6.27334113e-04,
       -8.44512775e-04,  1.19857886e-03, -7.49203435e-04, -9.30247945e-04,
        8.11470207e-04,  7.08626816e-04,  5.78158826e-04,  4.07293846e-04,
       -7.65120203e-04,  1.63657044e-03, -2.01943316e-04,  5.53486578e-04,
        8.50060664e-04,  3.05975205e-04,  3.30239505e-04, -8.87740985e-04,
        6.43991516e-04, -6.40007784e-04,  6.78881246e-04,  7.79793598e-04,
       -1.62045099e-03, -1.39574485e-03, -1.12805853e-03, -1.60233211e-03,
       -1.26680607e-04, -3.43428808e-04,  1.17881200e-03,  1.17479044e-03,
       -1.59273541e-03,  1.06306886e-03, -1.16418011e-03, -4.40394768e-04,
       -3.41445586e-04,  6.14566787e-04, -2.62759277e-04,  1.51088473e-03,
        9.19725280e-04,  6.79241566e-05, -3.24375840e-04,  1.62622298e-03,
       -5.38365683e-04, -5.53829712e-04,  1.17916102e-03,  8.93750868e-04,
       -5.80287830e-04,  1.61022018e-03, -1.52601671e-04,  1.43184327e-04,
        1.24416733e-03, -

In [12]:
def print_songs(results):
    lookup = lambda x: df_train[
        df_train.SongID==int(x)
    ].Song_Name.values[0]
    return [
        [
            i[0].split('|')[0], 
            lookup(i[0].split('|')[1]), 
            i[1]
        ] for i in results
    ]

In [13]:
print_songs(
    model.docvecs.most_similar([model['country']], topn=10)
)

[['Dolly Parton', 'Jolene', 0.16160523891448975],
 ['Virgoun', 'Bukti', 0.15511712431907654],
 ['Larray', 'First Place (The Race - Remix)', 0.14716432988643646],
 ['Hugh Jackman', 'The Other Side', 0.13595302402973175],
 ['Demi Lovato', 'Sorry Not Sorry', 0.13429497182369232],
 ['Taylor Swift', 'Enchanted', 0.13356564939022064],
 ['Johnny Cash', 'You Are My Sunshine', 0.1256883293390274],
 ['5 Seconds of Summer', 'Youngblood', 0.11990809440612793],
 ['Cakra Khan', 'Kekasih Bayangan', 0.11939827352762222],
 ['Lady Gaga', 'Million Reasons', 0.11657141149044037]]

In [14]:
model['country'].size

300

In [15]:
from gensim.models.doc2vec import TaggedDocument
from sklearn.cross_validation import train_test_split



In [16]:
X = df_general.loc[:,'Lyrics']
y = df_general.Genre

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [18]:
from sklearn.preprocessing import normalize

In [19]:
def getVecs_doc2vec(model, corpus, size):
    vec = np.zeros(size).reshape((1,size))
    count = 0.
    for word in corpus:
        try:
            vec += model[word].reshape((1,size))
            count += 1.
        except KeyError:
            continue
    if count != 0 :
        vec /= count
    return vec

In [20]:
train_vecs_dm = np.concatenate([getVecs_doc2vec(model, z,n_dim) for z in X_train])
train_vecs_dm = normalize(train_vecs_dm)

In [21]:
test_vecs_dm = np.concatenate([getVecs_doc2vec(model, z,n_dim) for z in X_test])
test_vecs_dm = normalize(test_vecs_dm)

In [22]:
from sklearn.linear_model import SGDClassifier

lr = SGDClassifier(loss = 'log', penalty='l1')

In [23]:
lr = SGDClassifier(loss = 'log', penalty='l1')
lr.fit(train_vecs_dm, y_train)
print('test accuraccy: %.2f'%lr.score(test_vecs_dm,y_test))

test accuraccy: 0.51




In [24]:
lr = SGDClassifier(loss = 'log', penalty='l2')
lr.fit(train_vecs_dm, y_train)
print('test accuraccy: %.2f'%lr.score(test_vecs_dm,y_test))

test accuraccy: 0.52




In [25]:
from sklearn import svm
clf = svm.SVC(C=0.8, kernel='linear',  decision_function_shape='ovo')
clf.fit(train_vecs_dm,y_train)
print(clf.score(train_vecs_dm,y_train))
print(clf.score(test_vecs_dm,y_test))

0.5669856459330144
0.5166666666666667


In [26]:
clf = svm.SVC(C=0.7, kernel='rbf', gamma=0.8, decision_function_shape='ovr')
clf.fit(train_vecs_dm,y_train)
print(clf.score(train_vecs_dm,y_train))
print(clf.score(test_vecs_dm,y_test))

0.5933014354066986
0.55


In [27]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
param_test1_dm = {'n_estimators':range(10,251,10)}
gsearch1_dm = GridSearchCV(estimator = RandomForestClassifier(n_jobs= 4), param_grid = param_test1_dm, scoring='accuracy',cv=10)
gsearch1_dm.fit(train_vecs_dm,y_train)

GridSearchCV(cv=10, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=4,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'n_estimators': range(10, 251, 10)},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='accuracy', verbose=0)

In [28]:
print(gsearch1_dm.best_params_)
print(gsearch1_dm.best_score_)

{'n_estimators': 30}
0.7966507177033493


In [29]:
clf_dm = RandomForestClassifier(n_estimators=180, n_jobs=4,)
clf_dm.fit(train_vecs_dm,y_train)

clf_dm.score(test_vecs_dm,y_test)

0.8222222222222222