In [2]:
import pandas as pd
import numpy as np
import csv
from nltk.stem import WordNetLemmatizer
from gensim.models.doc2vec import TaggedDocument



In [3]:
wnl = WordNetLemmatizer()

class Sentences(object):
    
    def __init__(self, filename, column):
        self.filename = filename
        self.column = column
        
    @staticmethod
    def get_tokens(text):
        """Helper function for tokenizing data"""
        return [wnl.lemmatize(r.lower()) for r in text.split()]
 
    def __iter__(self):
        reader = csv.DictReader(open(self.filename, 'r' ))
        for row in reader:
            words = self.get_tokens(row[self.column])
            tags = ['%s|%s' % (row['Artist'], row['SongID'])]
            yield TaggedDocument(words=words, tags=tags)

In [4]:
filename = 'C:/Users/alienware/Desktop/660project/preprocess/lyrics_dfclean_rap100.csv'
sentences = Sentences(filename=filename, column='Lyrics')

# for song lookups
df_train = pd.read_csv(filename)

In [5]:

from gensim.models.doc2vec import Doc2Vec

model = Doc2Vec(
    alpha=0.002,
    min_alpha=0.002,
    workers=15, 
    min_count=2,
    window=10,
    size=300,
    iter=20,
    sample=0.001,
    negative=5
)



In [6]:
model.build_vocab(sentences)

In [7]:
model.save('lyrics.doc2vec')

model = Doc2Vec.load('lyrics.doc2vec')

In [11]:
model.wv.word_vec('look')

array([-1.59364099e-05, -1.00325723e-03,  1.76167567e-04,  7.59857416e-04,
        2.26444346e-04, -1.98105146e-04, -6.91738329e-04,  3.60095219e-05,
       -1.38420798e-03, -7.80572518e-05, -9.34609736e-04, -5.72831195e-04,
       -7.84504402e-04,  1.24225859e-03, -6.61123195e-04, -1.32618903e-03,
        4.33237234e-04, -4.23180551e-04, -1.12534373e-03, -3.25649220e-04,
        1.11146411e-03,  1.59883033e-03,  1.49207993e-03, -1.43145444e-03,
        1.50021887e-03, -9.12959105e-04, -1.09146081e-03,  8.34635284e-04,
        2.14894724e-04, -7.75872555e-04,  9.20016842e-04,  1.94030596e-04,
        9.62458667e-04, -1.45337603e-04,  5.99248306e-05,  1.60731049e-03,
       -9.30978102e-04,  6.64379681e-04, -1.44066871e-03, -6.36127661e-04,
        1.55860023e-03,  7.31624721e-04, -1.36226212e-04,  9.65996529e-04,
       -6.76926793e-05, -1.66124944e-03,  7.81188603e-04, -1.04021758e-03,
        1.45327812e-03,  7.63955759e-04, -1.97798479e-04,  9.61203943e-04,
        1.34088262e-03, -

In [17]:
model.wv.most_similar('brother')

[('sound', 0.18308037519454956),
 ('friendsnthey', 0.18186749517917633),
 ('save', 0.18142272531986237),
 ('else', 0.17627644538879395),
 ('xanax', 0.17517533898353577),
 ('yountell', 0.16326946020126343),
 ('holy', 0.16178609430789948),
 ('jelly', 0.16161216795444489),
 ('near', 0.157787024974823),
 ('talk', 0.15720215439796448)]

In [26]:
def print_songs(results):
    lookup = lambda x: df_train[
        df_train.SongID==int(x)
    ].Song.values[0]
    return [
        [
            i[0].split('|')[0], 
            lookup(i[0].split('|')[1]), 
            i[1]
        ] for i in results
    ]

In [27]:
print_songs(
    model.docvecs.most_similar([model['rap']], topn=10)
)

[['G-Eazy x Bebe Rexha', 'Me, Myself & I', 0.13875354826450348],
 ['DLOW', 'Do It Like Me', 0.10179801285266876],
 ['Eminem Featuring Ed Sheeran', 'River', 0.10104384273290634],
 ['O.T. Genasis Featuring Young Dolph', 'Cut It', 0.0972680002450943],
 ['J. Cole', 'Deja Vu', 0.09491831064224243],
 ['G-Eazy Featuring A$AP Rocky & Cardi B', 'No Limit', 0.08829905092716217],
 ['Drake', 'Feel No Ways', 0.08621341735124588],
 ['Drake Featuring The Throne', 'Pop Style', 0.08593448996543884],
 ['G-Eazy & Halsey', 'Him & I', 0.08452640473842621],
 ['Drake Featuring 21 Savage', "Sneakin'", 0.08221504092216492]]