In [1]:
import pandas as pd
import numpy as np
import csv
from nltk.stem import WordNetLemmatizer
from gensim.models.doc2vec import TaggedDocument



In [2]:
wnl = WordNetLemmatizer()

class Sentences(object):
    
    def __init__(self, filename, column):
        self.filename = filename
        self.column = column
        
    @staticmethod
    def get_tokens(text):
        """Helper function for tokenizing data"""
        return [wnl.lemmatize(r.lower()) for r in text.split()]
 
    def __iter__(self):
        reader = csv.DictReader(open(self.filename, 'r' ))
        for row in reader:
            words = self.get_tokens(row[self.column])
            tags = ['%s|%s' % (row['Artist'], row['SongID'])]
            yield TaggedDocument(words=words, tags=tags)

In [3]:
filename = 'lyrics_dfclean_rap100.csv'
sentences = Sentences(filename=filename, column='Lyrics')

# for song lookups
df_train = pd.read_csv(filename)

In [4]:

from gensim.models.doc2vec import Doc2Vec

model = Doc2Vec(
    alpha=0.025,
    min_alpha=0.025,
    workers=15, 
    min_count=2,
    window=10,
    size=300,
    iter=20,
    sample=0.001,
    negative=5
)



In [5]:
model.build_vocab(sentences)

In [6]:
model.save('rap-lyrics.doc2vec')

model = Doc2Vec.load('rap-lyrics.doc2vec')

In [7]:
model.wv.word_vec('rap')

array([ 3.02083616e-04, -4.68715036e-04, -9.93132358e-04, -1.42299151e-03,
        9.13617143e-04, -7.04606296e-04,  6.47708075e-04,  6.83385937e-04,
       -1.13368803e-03,  1.42135704e-03, -3.34205717e-04,  3.60242964e-04,
       -7.07598927e-04,  6.29830232e-04,  8.25769617e-04,  1.20889198e-03,
        1.47062074e-03, -1.64852128e-03,  1.10768282e-03,  1.56695547e-03,
        1.43725704e-03,  1.24008581e-03,  1.32028142e-03, -1.49414514e-03,
        6.84477738e-04, -8.10408150e-04,  1.47308991e-03, -1.28575671e-03,
       -9.74034119e-05,  4.93510801e-04, -1.23450672e-03, -1.49352697e-03,
        1.90216451e-04, -1.22913145e-04, -1.33403728e-03,  1.10410643e-03,
        1.29718450e-03, -7.44064164e-04,  1.60618790e-03, -1.89146100e-04,
       -2.78893043e-04, -1.12125615e-03, -2.11936145e-04, -1.22347346e-03,
       -5.02556832e-05, -1.65929215e-03,  4.66009107e-04, -1.47322298e-03,
        1.07947516e-03,  9.51986527e-04,  1.05475228e-04, -9.28394031e-04,
        7.30948814e-04,  

In [8]:
model.wv.most_similar('you')

[('loud', 0.20162919163703918),
 ('living', 0.18209415674209595),
 ('stackin', 0.1763698011636734),
 ('orange', 0.16115154325962067),
 ('so', 0.15814511477947235),
 ('backnwake', 0.15596804022789001),
 ('forgot', 0.15453220903873444),
 ('ask', 0.1531437486410141),
 ('yet', 0.1510993391275406),
 ('givenyoull', 0.14993122220039368)]

In [9]:
def print_titles(results):
    lookup = lambda x: df_train[
        df_train.song_id==int(x)
    ].song.values[0]
    return [
        [
            i[0].split('|')[0], 
            lookup(i[0].split('|')[1]), 
            i[1]
        ] for i in results
    ]

In [10]:
print_titles(
    model.docvecs.most_similar([model['??????']], topn=10)
)

KeyError: "tag '??????' not seen in training corpus/invalid"