In [43]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import nltk
from nltk.tokenize import word_tokenize
import pandas as pd
import re

In [8]:
# Run nltk downloader
nltk.download('punkt')

[nltk_data] Downloading package punkt to /Users/tom/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [3]:
tracks_by_artist = pd.read_csv("./data/complete_tracks_with_lyrics.csv")
tracks_by_artist = tracks_by_artist[["artist", "album", "track", "lyrics"]]

In [4]:
tracks_by_artist.head()

Unnamed: 0,artist,album,track,lyrics
0,BROCKHAMPTON,iridescence,WEIGHT,[Verse 1: Kevin Abstact]\nThey split my world ...
1,BROCKHAMPTON,iridescence,VIVID,"[Intro: Matt Champion]\n""Yo, get—[censored]—tu..."
2,BROCKHAMPTON,iridescence,TAPE,"[Verse 1: Kevin Abstract]\nI can barely rap, I..."
3,BROCKHAMPTON,Saturation III,STAINS,[Verse 1: Ameer Vann]\nI spent like a year and...
4,BROCKHAMPTON,iridescence,DISTRICT,"[Intro]\n""I'm Sammy Jo, and my favorite colors..."


In [27]:
artists = tracks_by_artist["artist"].tolist()
lyrics = tracks_by_artist["lyrics"].tolist()

In [56]:
# preprocess lyrics ...
def preprocess(raw_lyrics):
    # Some choices here are specific to the format of Genius lyrics, want to remove non-vocalised 
    # text in square brackets, and other text not part of the main song body such as adlibs in 
    # round brackets
    raw_lyrics = re.sub("([\(\[].*?[\)\]])|([^\w\d'\s]+)", "", raw_lyrics)
    raw_lyrics = re.sub("[\n]+", " ", raw_lyrics)
    raw_lyrics = raw_lyrics.strip()
    raw_lyrics = raw_lyrics.lower()

    return raw_lyrics

In [64]:
tagged_lyrics = [TaggedDocument(words=word_tokenize(preprocess(lyrics[idx])), tags=[artists[idx]]) for idx, _ in enumerate(artists)]

In [65]:
tagged_lyrics

[TaggedDocument(words=['they', 'split', 'my', 'world', 'into', 'pieces', 'i', 'ai', "n't", 'heard', 'from', 'my', 'nieces', 'i', 'been', 'feeling', 'defeated', 'like', 'i', "'m", 'the', 'worst', 'in', 'the', 'boyband', 'i', 'ai', "n't", 'sleep', 'in', 'some', 'weekends', 'tryna', 'headline', 'both', 'weekends', 'leave', 'my', 'niggas', "y'all", 'sheepin', "'", 'i', 'keep', 'the', 'world', 'in', 'my', 'hands', 'i', 'know', 'accounts', 'should', 'be', 'deleted', 'i', 'know', 'some', 'niggas', 'should', 'stop', 'hitting', 'my', 'phone', 'whenever', 'they', 'needing', 'money', 'or', 'favors', 'done', "'cause", 'i', "'m", 'still', 'worried', "'bout", 'when', 'ashlan', 'finna', 'put', 'the', 'razor', 'down', 'so', 'i', 'do', "n't", 'really', 'give', 'a', 'fuck', 'about', 'what', 'story', 'they', 'done', 'spun', 'and', 'i', 'ai', "n't", 'done', 'yeah', 'and', 'i', 'ai', "n't", 'done', 'you', 'heard', 'me', 'i', 'ai', "n't", 'done', 'i', 'really', 'miss', 'the', 'old', 'days', 'before', 'the',

In [67]:
# example Doc2Vec model
# don't know about these hyperparameters
vector_size = 20
alpha = 0.025

model = Doc2Vec(vector_size=vector_size,
                alpha=alpha,
                min_alpha=0.00025,
                min_count=1,
                dm=1)

model.build_vocab(tagged_lyrics)

In [68]:
max_epochs = 100
for epoch in range(max_epochs):
    print('iteration {0}'.format(epoch))
    model.train(tagged_data,
                total_examples=model.corpus_count,
                epochs=model.iter)
    # decrease the learning rate
    model.alpha -= 0.0002
    # fix the learning rate, no decay
    model.min_alpha = model.alpha

model.save("d2v.model")
print("Model Saved")

iteration 0
iteration 1
iteration 2
iteration 3
iteration 4
iteration 5
iteration 6
iteration 7
iteration 8
iteration 9
iteration 10
iteration 11
iteration 12
iteration 13
iteration 14
iteration 15
iteration 16
iteration 17
iteration 18
iteration 19
iteration 20
iteration 21
iteration 22
iteration 23
iteration 24
iteration 25
iteration 26
iteration 27
iteration 28
iteration 29
iteration 30
iteration 31
iteration 32
iteration 33
iteration 34
iteration 35
iteration 36
iteration 37
iteration 38
iteration 39
iteration 40
iteration 41
iteration 42
iteration 43
iteration 44
iteration 45
iteration 46
iteration 47


  


iteration 48
iteration 49
iteration 50
iteration 51
iteration 52
iteration 53
iteration 54
iteration 55
iteration 56
iteration 57
iteration 58
iteration 59
iteration 60
iteration 61
iteration 62
iteration 63
iteration 64
iteration 65
iteration 66
iteration 67
iteration 68
iteration 69
iteration 70
iteration 71
iteration 72
iteration 73
iteration 74
iteration 75
iteration 76
iteration 77
iteration 78
iteration 79
iteration 80
iteration 81
iteration 82
iteration 83
iteration 84
iteration 85
iteration 86
iteration 87
iteration 88
iteration 89
iteration 90
iteration 91
iteration 92
iteration 93
iteration 94
iteration 95
iteration 96
iteration 97
iteration 98
iteration 99
Model Saved


In [105]:
topn = len(set(artists))-1

In [106]:
closest_drake = model.docvecs.most_similar("Drake", topn=topn)
closest_drake.sort(key=lambda x: x[1], reverse=True)
closest_drake

[('BROCKHAMPTON', 0.5834943056106567),
 ('Kendrick Lamar', 0.3099706172943115),
 ('Gucci Mane', 0.3050391674041748),
 ('Future', 0.29996204376220703),
 ('Lil Uzi Vert', 0.24502570927143097),
 ('Eminem', 0.23232199251651764),
 ('Kevin Gates', 0.16570186614990234),
 ('Wiz Khalifa', 0.15436489880084991),
 ('YoungBoy Never Broke Again', 0.1274348944425583),
 ('Migos', 0.1251392811536789),
 ('NF', 0.12338171899318695),
 ('J. Cole', 0.025200296193361282),
 ('Lil Yachty', 0.0042143166065216064),
 ('Russ', -0.0032776780426502228),
 ('Post Malone', -0.021515265107154846),
 ('Kodak Black', -0.02439533919095993),
 ('Moneybagg Yo', -0.0632888525724411)]

In [107]:
closest_j_cole = model.docvecs.most_similar("J. Cole", topn=topn)
closest_j_cole.sort(key=lambda x: x[1], reverse=True)
closest_j_cole

[('Moneybagg Yo', 0.27738481760025024),
 ('YoungBoy Never Broke Again', 0.2330983281135559),
 ('Russ', 0.16563192009925842),
 ('Migos', 0.15500503778457642),
 ('BROCKHAMPTON', 0.03454535827040672),
 ('Lil Yachty', 0.03061780333518982),
 ('Eminem', 0.02904406189918518),
 ('Drake', 0.025200318545103073),
 ('NF', 0.01582556962966919),
 ('Future', -0.0340365469455719),
 ('Kodak Black', -0.051008984446525574),
 ('Wiz Khalifa', -0.07333488762378693),
 ('Post Malone', -0.12903784215450287),
 ('Kendrick Lamar', -0.21508406102657318),
 ('Gucci Mane', -0.23787468671798706),
 ('Kevin Gates', -0.26148560643196106),
 ('Lil Uzi Vert', -0.2954378128051758)]

In [108]:
closest_nf = model.docvecs.most_similar("NF", topn=topn)
closest_nf.sort(key=lambda x: x[1], reverse=True)
closest_nf

[('Gucci Mane', 0.27874672412872314),
 ('Drake', 0.12338172644376755),
 ('BROCKHAMPTON', 0.10381519794464111),
 ('Lil Yachty', 0.0957019105553627),
 ('Migos', 0.09279169142246246),
 ('Kendrick Lamar', 0.06990151107311249),
 ('Wiz Khalifa', 0.06391143053770065),
 ('Kodak Black', 0.03149725869297981),
 ('J. Cole', 0.01582556962966919),
 ('Eminem', 0.0066891685128211975),
 ('Moneybagg Yo', 0.0031755492091178894),
 ('Lil Uzi Vert', -0.08019638806581497),
 ('Future', -0.1338222324848175),
 ('Post Malone', -0.20343756675720215),
 ('Kevin Gates', -0.27603739500045776),
 ('YoungBoy Never Broke Again', -0.29641976952552795),
 ('Russ', -0.41504597663879395)]

In [109]:
closest_brockhampton = model.docvecs.most_similar("BROCKHAMPTON", topn=topn)
closest_brockhampton.sort(key=lambda x: x[1], reverse=True)
closest_brockhampton

[('Drake', 0.5834944248199463),
 ('Kendrick Lamar', 0.27372413873672485),
 ('Future', 0.2563052475452423),
 ('Post Malone', 0.20900899171829224),
 ('Kevin Gates', 0.18217532336711884),
 ('Eminem', 0.15968912839889526),
 ('Gucci Mane', 0.14827388525009155),
 ('NF', 0.10381520539522171),
 ('Wiz Khalifa', 0.058491531759500504),
 ('Moneybagg Yo', 0.04541568085551262),
 ('J. Cole', 0.034545354545116425),
 ('Russ', 0.005877792835235596),
 ('Lil Yachty', -0.013500861823558807),
 ('Lil Uzi Vert', -0.03992360830307007),
 ('Kodak Black', -0.05744391679763794),
 ('Migos', -0.13090144097805023),
 ('YoungBoy Never Broke Again', -0.2590065002441406)]

In [110]:
closest_lil_yachty = model.docvecs.most_similar("Lil Yachty", topn=topn)
closest_lil_yachty.sort(key=lambda x: x[1], reverse=True)
closest_lil_yachty

[('Wiz Khalifa', 0.339235782623291),
 ('Post Malone', 0.2566002607345581),
 ('Gucci Mane', 0.14417427778244019),
 ('NF', 0.0957019180059433),
 ('Kendrick Lamar', 0.08413362503051758),
 ('Migos', 0.03958903253078461),
 ('J. Cole', 0.030617788434028625),
 ('Drake', 0.004214301705360413),
 ('BROCKHAMPTON', -0.013500861823558807),
 ('Kodak Black', -0.1294480860233307),
 ('Lil Uzi Vert', -0.14480730891227722),
 ('Eminem', -0.23411545157432556),
 ('YoungBoy Never Broke Again', -0.29086756706237793),
 ('Kevin Gates', -0.2930602431297302),
 ('Russ', -0.37881702184677124),
 ('Future', -0.37943166494369507),
 ('Moneybagg Yo', -0.5447709560394287)]