## Train a Word2Vec model with Spotify lists ("sentences") of tracks ("words")

### Prepare the data

In [1]:
import os
import csv
import time
import gensim
from gensim.models.callbacks import CallbackAny2Vec

try: # are we running on Google colab?
    # Load the Drive helper and mount
    from google.colab import drive
    IN_COLAB = True
except:
    IN_COLAB = False

if IN_COLAB:
    # This will prompt for authorization.
    drive.mount('/content/drive')
    os.chdir('/content/drive/My Drive/Colab Notebooks')
   
csv.field_size_limit(1000000) # for really long playlists!

print('Read in tracks...')
tracks = {}
with open('tracks.csv', "r", encoding='utf-8') as csvfile:
    spamreader = csv.reader(csvfile, delimiter=';')
    for row in spamreader:
        columns = str(row)[2:-2].split(';')
        tracks[columns[0]] = [columns[1] + ' - ' +
                              columns[2], columns[3]] # title - artist, url
print(f'Number of tracks: {len(tracks)}')

print('Read in playlists...')
playlists = []
with open('playlists.csv', 'r', encoding='utf-8') as csvfile:
    spamreader = csv.reader(csvfile, delimiter=';')
    for row in spamreader:
        columns = str(row)[2:-2].split(';')
        if columns[1] != 'Spotify': # ignore Spotify playlists which are commercial, not curated
            playlist = []
            for column in columns[2:]:
                playlist.append(column)
            playlists.append(playlist)
print(f'Number of playlists: {len(playlists)}')



Read in tracks...
Number of tracks: 3981436
Read in playlists...
Number of playlists: 238593


### Train the model...

In [2]:
# hyperparameters

min_count = 10      # minimum number of occurences of a track in all playlists
window = 2          # length of sequences
embedding_dim = 100 # number of dimensions in hidden layer
batch_words = 10000 # number of tracks to process in each batch
iter = 15           # number of iterations
sg = 0              # skip-gram (1) or Continuous Bag Of Words (0)

# ------------------------------------------------------------------------------

valid_examples = [
    '2NMgVh5qaPprKTEzFe3501', # The Police - Roxanne
    '3Ti0GdlrotgwsAVBBugv0I', # A Tribe Called Quest - Can I Kick It?
    '0nyrltZrQGAJMBZc1bYvuQ', # James Brown - Get Up Offa That Thing
    '4hy4fb5D1KL50b3sng9cjw', # Nirvana - Smells Like Teen Spirit
    '1P49MJhU5vzttesFxw3dOM', # Bob Marley & The Wailers - Three Little Birds
    '76GlO5H5RT6g7y0gev86Nk', # The Cure - Just Like Heaven
    '40tAOP3DPqmVD6L1h45Jp6', # Frank Sinatra - My Way
    '4IMvgp0WZqr9mRqpEvDKxI', # The Clash - Rock the Casbah
    '1iDcKYNvo6gglrOG6lvnHL', # The Rolling Stones - Sympathy For The Devil
    '5uvosCdMlFdTXhoazkTI5R', # The Doors - Light My Fire
    '15JINEqzVMv3SvJTAXAKED', # Eminem - Love The Way You Lie
    '69kOkLUCkxIZYexIgSG8rq', # Daft Punk - Get Lucky
    '6oVY50pmdXqLNVeK8bzomn', # John Coltrane - My Favorite Things
    '6ui6l3ZNvlrGQZArwo8195', # Sex Pistols - God Save The Queen
    '0YammaEkYSeo9vQYZ1OwS6', # David Guetta - Say My Name
    '4SHZsQIdS2N1E5yqvoXF8o'  # Andy Williams - Can't Take My Eyes Off You
]

# ------------------------------------------------------------------------------

class logger(CallbackAny2Vec):
    def __init__(self):
        print('Starting...')
        self.epoch = 0
        self.loss = 0

    def on_train_begin(self, model):
        self.start = time.time()

    def on_epoch_end(self, model):
        elapsed = time.time() - self.start
        print('#{}'.format(self.epoch), 'loss =',
              (model.get_latest_training_loss() - self.loss) / batch_words,
              'elapsed time =', elapsed // 60, 'minutes', elapsed % 60, 'seconds')
        self.epoch += 1
        self.loss = model.get_latest_training_loss()
        print('Saving model...')
        model.save('word2vec.model')
        _model = gensim.models.Word2Vec.load('word2vec.model')
        for track in valid_examples:
            similar = _model.wv.most_similar(positive=[track], topn=8)
            most_similar = ''
            for i in range(0, 8):
                most_similar = most_similar + '%s (%.2f)' % (tracks[similar[i][0]][0], similar[i][1]) + ', '
            print('  %s -> %s' % (tracks[track][0], most_similar))
        print()
        del _model

*class gensim.models.word2vec.Word2Vec(sentences=None, corpus_file=None, size=100, alpha=0.025, window=5, min_count=5, max_vocab_size=None, sample=0.001, seed=1, workers=3, min_alpha=0.0001, sg=0, hs=0, negative=5, ns_exponent=0.75, cbow_mean=1, hashfxn=<built-in function hash>, iter=5, null_word=0, trim_rule=None, sorted_vocab=1, batch_words=10000, compute_loss=False, callbacks=(), max_final_vocab=None)*

In [5]:
model = gensim.models.Word2Vec(sentences=playlists, size=embedding_dim,
                               min_count=min_count, window=window, iter=iter,
                               batch_words=batch_words, compute_loss=True,
                               callbacks=[logger()])
print(model)

Starting...
#0 loss = 2442.885 elapsed time = 0.0 minutes 57.41188287734985 seconds
Saving model...
  The Police - Roxanne -> Eagles - Hotel California - Remastered (0.99), Depeche Mode - Personal Jesus (0.99), Sting - Englishman In New York (0.99), Depeche Mode - Enjoy the Silence (0.99), Mike & The Mechanics - Over My Shoulder (0.99), The Cure - Friday I'm In Love (0.99), Crowded House - Weather With You (0.99), R.E.M. - Losing My Religion (0.99), 
  A Tribe Called Quest - Can I Kick It? -> A Tribe Called Quest - Electric Relaxation (0.98), OutKast - ATLiens (0.97), A Tribe Called Quest - Award Tour (0.97), Pete Rock & C.L. Smooth - They Reminisce Over You (0.97), Nas - The World Is Yours (0.97), Ol' Dirty Bastard - Shimmy Shimmy Ya (0.97), OutKast - Rosa Parks (0.97), A Tribe Called Quest - Buggin' Out (0.96), 
  James Brown - Get Up Offa That Thing -> Electric Light Orchestra - Mr. Blue Sky (0.99), The O'Jays - Love Train (0.99), The Isley Brothers - Shout, Pts. 1 & 2 (0.99), The M

### ...or load one you made previously

In [3]:
model = gensim.models.Word2Vec.load('word2vec.model')
print(model)

Word2Vec(vocab=452130, size=100, alpha=0.025)


### Save only tracks which appear in a minimum number of playlists

In [None]:
popular_tracks = {}
for track in tracks:
    if tracks[track][3] >= min_count:
        popular_tracks[track] = [tracks[track][0], tracks[track][1], tracks[track][2]]
print(len(popular_tracks))
with open('popular_tracks.csv', 'a+', newline='', encoding='utf-8') as csvfile_tracks:
    spamwriter_tracks = csv.writer(csvfile_tracks, delimiter=';')
    for track in popular_tracks:
        spamwriter_tracks.writerow([track + ';' + popular_tracks[track][0] + ';' + popular_tracks[track][1] + ';' + popular_tracks[track][2] + ';'])

### Get the Spotify id of a particular song

In [4]:
search = 'despacito'
for track in tracks:
    if tracks[track][0].lower().find(search.lower()) != -1 and track in model.wv.vocab:
        print(f'{track} : {tracks[track][0]} : {tracks[track][1]}')

5cj54CVe4pQZ9cUKgbsZrG : Pentatonix - Despacito x Shape Of You : https://p.scdn.co/mp3-preview/442d8fce3c6f11c4b947dbf736020896b7d871c0?cid=194086cb37be48ebb45b9ba4ce4c5936
5CtI0qwDJkDQGwXD1H1cLb : Luis Fonsi - Despacito - Remix : 
4E096IghHI9bFtVVQf5SET : Luis Fonsi - Despacito : 
4aWmUDTfIPGksMNLV2rQP2 : Luis Fonsi - Despacito (Featuring Daddy Yankee) : 
4vxA3aI7l73i0Hi819OQhH : Madilyn Bailey - Despacito : https://p.scdn.co/mp3-preview/076f200df2b0f8b361ca1e6c268dba878ad998ce?cid=194086cb37be48ebb45b9ba4ce4c5936
3vqyauyYbp9MqUwrCEmWJ0 : Ramon Ayala - Despacito : https://p.scdn.co/mp3-preview/697d6cda3fb5ad7946358b71ae50a86c2a32ca14?cid=194086cb37be48ebb45b9ba4ce4c5936
6rPO02ozF3bM7NnOV4h6s2 : Luis Fonsi - Despacito - Remix : https://p.scdn.co/mp3-preview/01f2b0e71b6ac51b5aada80ff829c9027df5de8b?cid=194086cb37be48ebb45b9ba4ce4c5936
7CUYHcu0RnbOnMz4RuN07w : Luis Fonsi - Despacito (Featuring Daddy Yankee) : https://p.scdn.co/mp3-preview/1f37421e137fd8d765c99f79b6718191df19d23f?cid=1940

### Get some suggestions of similar songs

In [6]:
id = '1hVRTl4yhWmGW7ImZoO22e'
print(f'Tracks most similar to {tracks[id][0]}')
most_similar = model.wv.most_similar(positive=[id], topn=20)
for track in most_similar:
    print(f'{track[0]} : {tracks[track[0]][0]} : {tracks[track[0]][1]}')

import matplotlib.pyplot as plt
import matplotlib.image as mpimg

print()
filename = 'spectrograms/' + id + '.png'
if os.path.exists(filename):
    print(f'{tracks[id][0]}')
    img = mpimg.imread(filename)
    plt.imshow(img)
    plt.show()
for track in most_similar:
    filename = 'spectrograms/' + track[0] + '.png'
    if os.path.exists(filename):
        print(f'{tracks[track[0]][0]}')
        img = mpimg.imread(filename)
        plt.imshow(img)
        plt.show()

Tracks most similar to John Coltrane - Naima
5p95DUZplmVxY0XCQ3hxUV : John Coltrane - I'm Old Fashioned - Remastered 2003/Rudy Van Gelder Edition : https://p.scdn.co/mp3-preview/a33f887540677271ab17f1f40759cbc1657f5096?cid=194086cb37be48ebb45b9ba4ce4c5936
6HvyFdP5JrCQdmxq2qi7MC : The Dave Brubeck Quartet - Strange Meadow Lark : https://p.scdn.co/mp3-preview/2e01e4b73c4266af97571587acf5f2a63145a357?cid=194086cb37be48ebb45b9ba4ce4c5936
3eMfftdPSD4PI1cux5w97b : Paul Desmond - Autumn Leaves : https://p.scdn.co/mp3-preview/7e68c3bb967370697d64741bd49aa6f94209c33d?cid=194086cb37be48ebb45b9ba4ce4c5936
4ymHy4hzJ09WxvvT7p0Azy : Art Pepper - You Go To My Head : https://p.scdn.co/mp3-preview/9e8242895ca9151bc8c732c3c41a4cf7e53d7dc4?cid=194086cb37be48ebb45b9ba4ce4c5936
6f6OQJv9qjqyJq9NHin45n : Phil Woods - Blue Ballad : https://p.scdn.co/mp3-preview/8cd094238997fb3f6e5196ba8cda8346b886a922?cid=194086cb37be48ebb45b9ba4ce4c5936
6ZgaUtMlUzUf7SqbUUnfbu : John Coltrane - Blue Train - Remastered 2003 : 

### Generate a playlist based on a few (seed) tracks

In [None]:
seed_tracks = [
    '7rMeV0rIjzePISDXZTfM1v',
    '30UTvW5IQWXmvdcZ1zbF6R',
    '5313VQMjcM2w5gTRZtyd6J',
    '6Xio661Y2iL7SDKFFOG9cv',
    ]
num_tracks = 10
for i in range(0, num_tracks - len(seed_tracks)):
    next_track = model.predict_output_word(context_words_list=seed_tracks[-(window - 1):], topn=1)
    if next_track == None:
        break
    seed_tracks.append(next_track[0][0])
for track in seed_tracks:
    print(f'{track} : {tracks[track][0]} : {tracks[track][1]}')

7rMeV0rIjzePISDXZTfM1v : Gang Starr - You Know My Steez : https://p.scdn.co/mp3-preview/112f54e5101577c0ace390127a204309a8def24f?cid=194086cb37be48ebb45b9ba4ce4c5936
30UTvW5IQWXmvdcZ1zbF6R : De La Soul - Me, Myself And I : 
5313VQMjcM2w5gTRZtyd6J : A Tribe Called Quest - Bonita Applebum : https://p.scdn.co/mp3-preview/fca10e0beaa2ec9c0dddad75e8fd27cdf2b936fc?cid=194086cb37be48ebb45b9ba4ce4c5936
6Xio661Y2iL7SDKFFOG9cv : Black Moon - Who Got Da Props : https://p.scdn.co/mp3-preview/2c381515b49ddc5ef8f92978e72a57541947ab92?cid=194086cb37be48ebb45b9ba4ce4c5936
6Xio661Y2iL7SDKFFOG9cv : Black Moon - Who Got Da Props : https://p.scdn.co/mp3-preview/2c381515b49ddc5ef8f92978e72a57541947ab92?cid=194086cb37be48ebb45b9ba4ce4c5936
6Xio661Y2iL7SDKFFOG9cv : Black Moon - Who Got Da Props : https://p.scdn.co/mp3-preview/2c381515b49ddc5ef8f92978e72a57541947ab92?cid=194086cb37be48ebb45b9ba4ce4c5936
6Xio661Y2iL7SDKFFOG9cv : Black Moon - Who Got Da Props : https://p.scdn.co/mp3-preview/2c381515b49ddc5ef8f9

### Hmmm, that didn't work too well. Let's try a different approach

In [7]:
seed_tracks = [
    '4iNgXqTD2uux9GH4lTnNaN',
    '3PsGlZaMR6W3luCmp3RgRc',
    '1zLfdcVHsyB4Nw6IVfAPtV',
    ]
num_tracks = 10
seed_window = num_tracks
for i in range(0, num_tracks - len(seed_tracks)):
    next_track = model.wv.most_similar(positive=seed_tracks[-seed_window:], topn=1)
    if next_track == None:
        break
    seed_tracks.append(next_track[0][0])
for track in seed_tracks:
    print(f'{track} : {tracks[track][0]} : {tracks[track][1]}')

4iNgXqTD2uux9GH4lTnNaN : Milton Nascimento - Tudo O Que Você Podia Ser : https://p.scdn.co/mp3-preview/0991fd670055bf0cfaeb6f3e937fef0931fd9205?cid=194086cb37be48ebb45b9ba4ce4c5936
3PsGlZaMR6W3luCmp3RgRc : Edu Lobo - Ponteio : https://p.scdn.co/mp3-preview/2cadb8b2e6d77f38063bfadef4e08f06a6473c83?cid=194086cb37be48ebb45b9ba4ce4c5936
1zLfdcVHsyB4Nw6IVfAPtV : Jorge Ben Jor - Oba, Lá Vem Ela : 
5tt394udRU89Pn2s3qPGaQ : Tom Zé - Vai (Menina Amanhã de Manhã) : 
5V0HmhGqwjoDq0cHlI05GS : Novos Baianos - Swing de Campo Grande : https://p.scdn.co/mp3-preview/84b03050271342ef6fe15f5f428f65409089f55f?cid=194086cb37be48ebb45b9ba4ce4c5936
3GsfYbpS0RNVoerNGRNxAq : Jorge Ben Jor - Chove Chuva : 
4aYvf6sss8oMFBfPt4vobj : Banda Eddie - O Baile de Betinha : https://p.scdn.co/mp3-preview/1f90b1d090dad6c6e09382af11fcea5abc740ed5?cid=194086cb37be48ebb45b9ba4ce4c5936
1ZetCeq3ZOSD2k64UNEBqf : Novos Baianos - Acabou Chorare : https://p.scdn.co/mp3-preview/7a85b562f1d00842861e02756e9f0b837c470ec1?cid=194086cb3

In [None]:
# display embedding
# dash
# RNN for playlist generation
# spectrograms