In [1]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import nltk
from nltk.tokenize import word_tokenize
import pandas as pd
import re
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn import utils

In [2]:
# Run nltk downloader
nltk.download('punkt')

[nltk_data] Downloading package punkt to /Users/tom/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
tracks_by_artist = pd.read_csv("./data/tracks_with_lyrics_since_2013.csv")

In [3]:
tracks_by_artist.head()

Unnamed: 0,artist,album,track,lyrics
0,2 Chainz,ColleGrove,Not Invited,[Intro]\nOf course I'ma stunt\nOf course I'ma ...
1,2 Chainz,Based On A T.R.U. Story,Money Machine,"[Intro]\nI told them, get on my level\nBitch, ..."
2,2 Chainz,ColleGrove,MF'N Right,[Produced by Mike WiLL Made It & Zaytoven]\n\n...
3,2 Chainz,ColleGrove,100 Joints,[Hook]\nYeah uum\nNo matter what they say I sm...
4,2 Chainz,Based On A T.R.U. Story,I'm Different,"[Intro: YG]\nMustard on the beat, ho!\n\n[Hook..."


In [4]:
tracks_by_artist = tracks_by_artist[~tracks_by_artist["lyrics"].isnull()]
tracks_by_artist.tail()

Unnamed: 0,artist,album,track,lyrics
1049,Z-Ro,Drankin & Drivin,Devil Ass City,[Hook]\nI only fear god in this devil ass city...
1051,Z-Ro,Legendary,"Dome, Kush, and Codeine","[Hook]\nDome, kush, and codeine\nYou know that..."
1053,Z-Ro,Drankin & Drivin,New Shit,[Hook]\nIf you don't like my new shit you can ...
1056,Z-Ro,Legendary,Out His Mind,[Chorus: Z-Ro]\nZ-ro must be out his mind\nThi...
1058,Z-Ro,Drankin & Drivin,Hate Me So Much,"[Verse 1]\nY'all can keep on talking, dropping..."


In [6]:
tracks_by_artist.groupby("artist").agg("count")["lyrics"]

artist
2 Chainz                      20
A$AP Rocky                    20
Atmosphere                    17
Big Sean                      19
Cardi B                       17
Chief Keef                    18
Danny Brown                   17
Drake                         20
Eminem                        20
Future                        20
G-Eazy                        18
Gucci Mane                    17
Hopsin                        18
Isaiah Rashad                 18
J. Cole                       20
Jeezy                         16
Kendrick Lamar                19
Kevin Gates                   18
Kid Cudi                      20
Kodak Black                   20
Lecrae                        14
Lil Uzi Vert                  20
Lil Wayne                     20
Lil Yachty                    17
Logic                         19
Mac Miller                    19
Machine Gun Kelly             16
Migos                         20
Mike Stud                     17
Moneybagg Yo                  15
NF 

In [60]:
artist_sample = tracks_by_artist[tracks_by_artist["artist"].isin(["J. Cole", "Lil Uzi Vert"])]

In [61]:
artist_sample

Unnamed: 0,artist,album,track,lyrics
320,J. Cole,2014 Forest Hills Drive,Note to Self,[Chorus]\nAnd wherever we go\nAnd whatever we ...
321,J. Cole,KOD,Intro (KOD),Can someone please turn off my mind?\nMy thoug...
322,J. Cole,2014 Forest Hills Drive,Hello,"[Chorus]\nHello, hello, hello\nHello, hello, h..."
323,J. Cole,KOD,Photograph,[Chorus]\nFell in love through photograph\nI d...
324,J. Cole,2014 Forest Hills Drive,G.O.M.D.,[Produced by J.Cole]\n[Directed by Lawrence La...
325,J. Cole,Born Sinner,Mo Money (Interlude),"[Verse]\nMo Money, yeah\nMo Money, Blow Money,..."
326,J. Cole,Born Sinner,Miss America,[Intro]\nThis is a public service announcement...
327,J. Cole,2014 Forest Hills Drive,03' Adolescence,[Produced by Willie B]\n\n[Verse 1]\nI grew up...
328,J. Cole,2014 Forest Hills Drive,Apparently,[Produced by J. Cole and Omen]\n[Directed by S...
329,J. Cole,4 Your Eyez Only,"She's Mine, Pt. 2","[Intro]\nCatch me, don't you\nCatch me, don't ..."


In [67]:
# train, test = train_test_split(tracks_by_artist, test_size=0.3, random_state=42)
train, test = train_test_split(artist_sample, test_size=0.3, random_state=40)

In [68]:
train.groupby("artist").agg("count")["track"].size

2

In [69]:
test.groupby("artist").agg("count")["track"].size

2

In [70]:
train.groupby("artist").agg("count")["track"]

artist
J. Cole         15
Lil Uzi Vert    13
Name: track, dtype: int64

In [71]:
test.groupby("artist").agg("count")["track"]

artist
J. Cole         5
Lil Uzi Vert    7
Name: track, dtype: int64

In [72]:
train_artists = train["artist"].tolist()
train_lyrics = train["lyrics"].tolist()

test_artists = test["artist"].tolist()
test_lyrics = test["lyrics"].tolist()

In [73]:
# preprocess lyrics ...
def preprocess(raw_lyrics):
    # Some choices here are specific to the format of Genius lyrics, want to remove non-vocalised 
    # text in square brackets, and other text not part of the main song body such as adlibs in 
    # round brackets
    raw_lyrics = re.sub("([\(\[].*?[\)\]])|([^\w\d'\s]+)", "", raw_lyrics)
    raw_lyrics = re.sub("[\n]+", " ", raw_lyrics)
    raw_lyrics = raw_lyrics.strip()
    raw_lyrics = raw_lyrics.lower()

    return raw_lyrics

In [74]:
train_tagged = [TaggedDocument(words=word_tokenize(preprocess(train_lyrics[idx])), tags=[train_artists[idx]]) for idx, _ in enumerate(train_artists)]
test_tagged = [TaggedDocument(words=word_tokenize(preprocess(test_lyrics[idx])), tags=[test_artists[idx]]) for idx, _ in enumerate(test_artists)]

In [75]:
import multiprocessing
cores = multiprocessing.cpu_count()

In [76]:
model_dbow = Doc2Vec(dm=0, vector_size=300, negative=5, hs=0, min_count=2, sample = 0, workers=cores)
model_dbow.build_vocab(train_tagged)

In [77]:
model_dbow.corpus_count

28

In [78]:
max_epochs = 30
for epoch in range(max_epochs):
    print('iteration {0}'.format(epoch))
    model_dbow.train(utils.shuffle(train_tagged), total_examples=model_dbow.corpus_count, epochs=1)
    model_dbow.alpha -= 0.002
    model_dbow.min_alpha = model_dbow.alpha
#     previouse example
#     model_dbow.train(tagged_lyrics,
#                 total_examples=model.corpus_count,
#                 epochs=model.iter)
#     # decrease the learning rate
#     model.alpha -= 0.0002
#     # fix the learning rate, no decay
#     model.min_alpha = model.alpha

model_dbow.save("dbow.model")
print("Model Saved")

iteration 0
iteration 1
iteration 2
iteration 3
iteration 4
iteration 5
iteration 6
iteration 7
iteration 8
iteration 9
iteration 10
iteration 11
iteration 12
iteration 13
iteration 14
iteration 15
iteration 16
iteration 17
iteration 18
iteration 19
iteration 20
iteration 21
iteration 22
iteration 23
iteration 24
iteration 25
iteration 26
iteration 27
iteration 28
iteration 29
Model Saved


In [79]:
def vec_for_learning(model, tagged_docs):
    targets, regressors = zip(*[(doc.tags[0], model.infer_vector(doc.words, steps=20)) for doc in tagged_docs])
    return targets, regressors

In [80]:
y_train, X_train = vec_for_learning(model_dbow, train_tagged)
y_test, X_test = vec_for_learning(model_dbow, test_tagged)

In [81]:
logreg = LogisticRegression(n_jobs=1, C=1e5)
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)

from sklearn.metrics import accuracy_score, f1_score

print('Testing accuracy %s' % accuracy_score(y_test, y_pred))
print('Testing F1 score: {}'.format(f1_score(y_test, y_pred, average='weighted')))

Testing accuracy 0.75
Testing F1 score: 0.7517482517482517


In [82]:
from sklearn import svm

svc = svm.SVC(kernel="poly")
svc.fit(X_train, y_train)
y_pred = svc.predict(X_test)

print('Testing accuracy %s' % accuracy_score(y_test, y_pred))
print('Testing F1 score: {}'.format(f1_score(y_test, y_pred, average='weighted')))

Testing accuracy 0.4166666666666667
Testing F1 score: 0.2450980392156863


  'precision', 'predicted', average, warn_for)


In [59]:
model_dmm = Doc2Vec(dm=1, dm_mean=1, vector_size=300, window=10, negative=5, min_count=1, workers=5, alpha=0.065, min_alpha=0.065)
model_dmm.build_vocab(train_tagged)

In [60]:
for epoch in range(30):
    model_dmm.train(utils.shuffle(train_tagged), total_examples=len(train_tagged), epochs=1)
    model_dmm.alpha -= 0.002
    model_dmm.min_alpha = model_dmm.alpha

In [61]:
y_train, X_train = vec_for_learning(model_dmm, train_tagged)
y_test, X_test = vec_for_learning(model_dmm, test_tagged)

logreg = LogisticRegression(n_jobs=1, C=1e5)
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)

from sklearn.metrics import accuracy_score, f1_score

print('Testing accuracy %s' % accuracy_score(y_test, y_pred))
print('Testing F1 score: {}'.format(f1_score(y_test, y_pred, average='weighted')))

Testing accuracy 0.11956521739130435
Testing F1 score: 0.11944127014002791


  'precision', 'predicted', average, warn_for)


In [12]:
topn = len(set(artists))-1

In [13]:
closest_drake = model.docvecs.most_similar("Drake", topn=topn)
closest_drake.sort(key=lambda x: x[1], reverse=True)
closest_drake

[('Russ', 0.5853328108787537),
 ('Kevin Gates', 0.5152955055236816),
 ('YoungBoy Never Broke Again', 0.48948970437049866),
 ('Post Malone', 0.4561176002025604),
 ('BROCKHAMPTON', 0.4287121593952179),
 ('Lil Yachty', 0.35661354660987854),
 ('NF', 0.324750155210495),
 ('Future', 0.2802601456642151),
 ('Lil Uzi Vert', 0.2795082628726959),
 ('J. Cole', 0.2701497972011566),
 ('Wiz Khalifa', 0.2618672251701355),
 ('Moneybagg Yo', 0.24229663610458374),
 ('Migos', 0.23129615187644958),
 ('Kendrick Lamar', 0.19112615287303925),
 ('Kodak Black', 0.18138381838798523),
 ('Eminem', 0.09486167132854462),
 ('Gucci Mane', -0.01438937522470951)]

In [14]:
closest_j_cole = model.docvecs.most_similar("J. Cole", topn=topn)
closest_j_cole.sort(key=lambda x: x[1], reverse=True)
closest_j_cole

[('Kendrick Lamar', 0.5769816637039185),
 ('Gucci Mane', 0.4730648994445801),
 ('Lil Yachty', 0.4245752990245819),
 ('BROCKHAMPTON', 0.39214181900024414),
 ('YoungBoy Never Broke Again', 0.38885417580604553),
 ('Eminem', 0.38101840019226074),
 ('Kevin Gates', 0.3272707462310791),
 ('Kodak Black', 0.272230863571167),
 ('Drake', 0.27014976739883423),
 ('NF', 0.26713502407073975),
 ('Wiz Khalifa', 0.18016491830348969),
 ('Migos', 0.15268343687057495),
 ('Moneybagg Yo', 0.13472585380077362),
 ('Lil Uzi Vert', 0.07694961130619049),
 ('Russ', 0.07243375480175018),
 ('Post Malone', 0.06938178092241287),
 ('Future', 0.045791659504175186)]

In [15]:
closest_nf = model.docvecs.most_similar("NF", topn=topn)
closest_nf.sort(key=lambda x: x[1], reverse=True)
closest_nf

[('BROCKHAMPTON', 0.5922399163246155),
 ('Eminem', 0.558570384979248),
 ('Russ', 0.5573232769966125),
 ('Drake', 0.3247501254081726),
 ('Kendrick Lamar', 0.3203350305557251),
 ('J. Cole', 0.26713502407073975),
 ('Migos', 0.20939365029335022),
 ('Wiz Khalifa', 0.18576228618621826),
 ('Post Malone', 0.18536292016506195),
 ('Gucci Mane', 0.1551463007926941),
 ('Kevin Gates', 0.12770546972751617),
 ('Moneybagg Yo', 0.11673309653997421),
 ('Lil Uzi Vert', 0.11299404501914978),
 ('Future', 0.07157484441995621),
 ('Lil Yachty', -0.009864097461104393),
 ('YoungBoy Never Broke Again', -0.02485552616417408),
 ('Kodak Black', -0.1735169142484665)]

In [16]:
closest_brockhampton = model.docvecs.most_similar("BROCKHAMPTON", topn=topn)
closest_brockhampton.sort(key=lambda x: x[1], reverse=True)
closest_brockhampton

[('Russ', 0.6033599972724915),
 ('NF', 0.5922399759292603),
 ('Post Malone', 0.44411501288414),
 ('Drake', 0.4287121295928955),
 ('Lil Uzi Vert', 0.3980545401573181),
 ('Kendrick Lamar', 0.3945070207118988),
 ('J. Cole', 0.39214178919792175),
 ('Gucci Mane', 0.3679358661174774),
 ('Eminem', 0.30388718843460083),
 ('Lil Yachty', 0.27308738231658936),
 ('Kodak Black', 0.1908894181251526),
 ('YoungBoy Never Broke Again', 0.07527803629636765),
 ('Kevin Gates', 0.017246827483177185),
 ('Wiz Khalifa', 0.01604609377682209),
 ('Future', 0.0073099881410598755),
 ('Migos', -0.006686776876449585),
 ('Moneybagg Yo', -0.02164846658706665)]

In [110]:
closest_lil_yachty = model.docvecs.most_similar("Lil Yachty", topn=topn)
closest_lil_yachty.sort(key=lambda x: x[1], reverse=True)
closest_lil_yachty

[('Wiz Khalifa', 0.339235782623291),
 ('Post Malone', 0.2566002607345581),
 ('Gucci Mane', 0.14417427778244019),
 ('NF', 0.0957019180059433),
 ('Kendrick Lamar', 0.08413362503051758),
 ('Migos', 0.03958903253078461),
 ('J. Cole', 0.030617788434028625),
 ('Drake', 0.004214301705360413),
 ('BROCKHAMPTON', -0.013500861823558807),
 ('Kodak Black', -0.1294480860233307),
 ('Lil Uzi Vert', -0.14480730891227722),
 ('Eminem', -0.23411545157432556),
 ('YoungBoy Never Broke Again', -0.29086756706237793),
 ('Kevin Gates', -0.2930602431297302),
 ('Russ', -0.37881702184677124),
 ('Future', -0.37943166494369507),
 ('Moneybagg Yo', -0.5447709560394287)]