In [None]:
import logging
import multiprocessing
import os
import random
import time

import numpy as np
import pandas as pd
from gensim.models import Word2Vec
from gensim.models.callbacks import CallbackAny2Vec
from sklearn.model_selection import train_test_split
from tqdm import tqdm


In [None]:
# Colab-only setup: mount Google Drive and point the dataset paths at it.
# Outside Colab the `google.colab` package is not installed; skipping the cell
# (instead of raising ImportError) keeps "Restart & Run All" working locally.
try:
    from google.colab import drive
except ImportError:
    IN_COLAB = False
else:
    IN_COLAB = True
    drive.mount('/content/drive', force_remount=True)
    SONGS_PATH = '/content/drive/MyDrive/cs4100_data/songs.csv'
    PLAYLISTS_PATH = '/content/drive/MyDrive/cs4100_data/playlists_0.npy'


In [None]:
# Local (non-Colab) dataset locations.
# They are applied only when the currently configured paths (e.g. the Google
# Drive paths set by the Colab cell) are missing or do not point at existing
# files, so running every cell in order no longer clobbers working paths.
_configured_paths_exist = all(
    os.path.exists(globals().get(name, ''))
    for name in ('SONGS_PATH', 'PLAYLISTS_PATH')
)
if not _configured_paths_exist:
    SONGS_PATH = 'dataset/spotify2018/songs.csv'
    PLAYLISTS_PATH = 'dataset/spotify2018/playlists_0.npy'

In [None]:
# Load the song metadata table, keeping only these four columns.
SONG_COLUMNS = ['ids', 'track_name', 'artist_name', 'artist_uri']
songs = pd.read_csv(SONGS_PATH).loc[:, SONG_COLUMNS]

In [None]:
# Preview the first rows of the song metadata table.
songs.head()

In [None]:
# (number of rows, number of columns) of the song metadata table.
songs.shape

In [None]:
# Load the playlists array from disk.
# NOTE(review): allow_pickle=True lets NumPy unpickle arbitrary Python objects,
# which can execute code embedded in the file -- only load files you trust.
playlists = np.load(PLAYLISTS_PATH, allow_pickle=True)

In [None]:
# Number of playlists that were loaded.
len(playlists)

In [None]:
# Hold out a small fraction of the playlists for evaluation. The fixed seed
# keeps the split identical across runs. `train_test_split` is already imported
# in the notebook's import cell, so it is not re-imported here.
TEST_FRACTION = 0.01
SPLIT_SEED = 4100
playlist_train, playlist_test = train_test_split(
    playlists,
    test_size=TEST_FRACTION,
    shuffle=True,
    random_state=SPLIT_SEED,
)

In [None]:
# Configure root logging: each record shows timestamp, level and message,
# and records at INFO level or above are emitted.
logging.basicConfig(format="%(asctime)s : %(levelname)s : %(message)s", level=logging.INFO)

class Callback(CallbackAny2Vec):
    """Print and record the training loss of each epoch.

    The value returned by ``model.get_latest_training_loss()`` is treated as a
    running total over the epochs of one ``train()`` call, so the loss of a
    single epoch is the difference between two consecutive readings. The
    previous reading is kept separately from the per-epoch history; subtracting
    the last *per-epoch* value instead (as an earlier version did) is only
    correct for the first two epochs.
    """

    def __init__(self):
        # 1-based number of the epoch whose end is reported next.
        self.epoch = 1
        # Per-epoch losses, in epoch order.
        self.training_loss = []
        # Running total reported at the end of the previous epoch.
        self._previous_cumulative_loss = 0.0

    def on_epoch_end(self, model):
        """Compute, print and store the loss accumulated during this epoch.

        Args:
            model: the model being trained; must provide
                ``get_latest_training_loss()``.
        """
        cumulative_loss = model.get_latest_training_loss()
        current_loss = cumulative_loss - self._previous_cumulative_loss
        print(f"Loss after epoch {self.epoch}: {current_loss}")
        self.training_loss.append(current_loss)
        self._previous_cumulative_loss = cumulative_loss
        self.epoch += 1

# Word2Vec hyper-parameters, gathered in one place so they are easy to tune.
word2vec_params = dict(
    size=256,
    window=15,
    min_count=1,
    sg=0,
    negative=5,
    workers=7,
)
model = Word2Vec(**word2vec_params)

# Lift any logging suppression before the vocabulary pass
# (presumably so the library's progress messages are visible).
logging.disable(logging.NOTSET)
start = time.time()
print('Building vocab...')
model.build_vocab(playlist_train)
vocab_seconds = round(time.time() - start)
print('Finished building vocab in {}s'.format(vocab_seconds))

# Silence records at INFO level and below while training; the per-epoch
# loss is printed by the callback instead.
logging.disable(logging.INFO)
callback = Callback()

start = time.time()
print('Training model...')
model.train(
    playlist_train,
    total_examples=model.corpus_count,
    epochs=100,
    compute_loss=True,
    callbacks=[callback],
)
training_seconds = round(time.time() - start)
print('Finished training model in {}s'.format(training_seconds))

In [None]:
def mean_vector_for_playlist(playlist):
    """Average the embedding vectors of the playlist's songs.

    Songs for which ``model.wv`` raises ``KeyError`` are skipped.

    Args:
        playlist: iterable of song keys to look up in ``model.wv``.

    Returns:
        The element-wise mean of the vectors that were found, or a NaN scalar
        (``np.float64``) when none of the songs could be looked up. The NaN
        sentinel is the same value ``np.mean`` would produce for an empty
        list, but is returned explicitly so no "Mean of empty slice"
        RuntimeWarning is emitted.
    """
    vectors = []
    for song in playlist:
        try:
            vectors.append(model.wv[song])
        except KeyError:
            # Unknown song: leave it out of the average.
            continue
    if not vectors:
        return np.float64(np.nan)
    return np.mean(vectors, axis=0)

def similar_songs(playlist_vector, topn=3):
    """Return the ``topn`` entries of ``model.wv`` closest to ``playlist_vector``.

    Thin wrapper around ``model.wv.similar_by_vector``; its result is
    returned unchanged.
    """
    return model.wv.similar_by_vector(playlist_vector, topn=topn)

def hit_rate(playlist, window, n_songs, n_trials=3):
    """Estimate how often a held-out song is recommended back.

    For each trial one random position of the playlist is masked, the mean
    vector of the remaining songs is computed, and the trial counts as a hit
    when the masked song is among the ``n_songs`` recommendations.

    Args:
        playlist: sequence of song keys.
        window: unused; kept so existing callers keep working.
        n_songs: number of recommendations requested per trial.
        n_trials: number of random positions to mask (drawn with replacement).

    Returns:
        The fraction of trials that were hits (between 0 and 1), or -1 when
        the playlist cannot be evaluated (it is empty, or a masked playlist
        has no song with a known vector).

    Raises:
        ValueError: if ``n_trials`` is smaller than 1.
    """
    if n_trials < 1:
        raise ValueError("n_trials must be at least 1")
    if len(playlist) == 0:
        # random.randrange(0) would raise; report "not evaluable" instead.
        return -1

    masked_positions = [random.randrange(len(playlist)) for _ in range(n_trials)]

    hits = 0
    for position in masked_positions:
        target = playlist[position]
        context = [song for i, song in enumerate(playlist) if i != position]
        context_vector = mean_vector_for_playlist(context)
        # A scalar NaN (rather than an array) signals that no context song
        # had a vector to average.
        if np.ndim(context_vector) == 0 and np.isnan(context_vector):
            return -1
        # NOTE(review): the recommendations are not filtered against the
        # context songs, so they may occupy some of the n_songs slots -- confirm
        # this is the intended evaluation protocol.
        recommended = [song for song, _ in similar_songs(context_vector, topn=n_songs)]
        hits += int(target in recommended)

    # Normalise by the number of trials, not by the playlist length, so the
    # result is a true rate that does not shrink for longer playlists.
    return hits / len(masked_positions)

def evaluate_model(playlists, window, n_songs):
    """Average ``hit_rate`` over ``playlists``, ignoring the -1 sentinel.

    Args:
        playlists: iterable of playlists; iterated under a tqdm progress bar.
        window: forwarded to ``hit_rate``.
        n_songs: forwarded to ``hit_rate`` as the number of recommendations.

    Returns:
        The mean of all hit rates that are not -1, computed via ``pd.Series``.
    """
    per_playlist = (
        hit_rate(playlist, window, n_songs=n_songs) for playlist in tqdm(playlists)
    )
    usable = [score for score in per_playlist if score != -1]
    return pd.Series(usable).mean()


In [None]:
# One mean embedding vector per held-out playlist.
playlist_vec = [mean_vector_for_playlist(playlist) for playlist in playlist_test]

In [None]:
# The evaluation masks randomly chosen songs; seeding `random` first makes
# re-running this cell on the same model reproduce the same score.
EVAL_SEED = 4100
random.seed(EVAL_SEED)
evaluate_model(playlist_test, window=model.window, n_songs=30)

In [None]:
def recommend_songs(playlist, n):
    """Print the songs of ``playlist`` followed by ``n`` recommendations.

    Given songs are printed as track name and artist name, looked up in the
    ``songs`` table with ``songs.loc`` (assumes playlist entries are row labels
    of ``songs`` -- TODO confirm). Recommendations are printed as
    ``[similarity] key`` exactly as returned by ``similar_songs``.

    Args:
        playlist: iterable of song keys.
        n: number of recommendations to print.
    """
    print("Given Songs:")
    for song_id in playlist:
        track = songs.loc[song_id, "track_name"]
        artist = songs.loc[song_id, "artist_name"]
        print(track, artist)

    print('Recommended Songs:')
    neighbours = similar_songs(mean_vector_for_playlist(playlist), n)
    for candidate, score in neighbours:
        print("[{}] {}".format(score, candidate))
