In [None]:
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.metrics import pairwise_distances_argmin_min
import matplotlib.pyplot as plt
from sklearn.neighbors import NearestNeighbors

import pickle
import numpy as np

import cornac
from cornac.data import Reader
from cornac.eval_methods import BaseMethod, RatioSplit
from cornac.models import Recommender, BPR, WMF

# --- Input data (paths relative to the notebook) ---------------------------
# User/song play-count triplets, split into a train and a test sample.
TRAIN_FILEPATH = './triplets_train_sample.csv'
TEST_FILEPATH = './triplets_test_sample.csv'
# Song table (joined on song_id; also supplies artist_id).
SONGS_FILEPATH = './songs_artist.csv'
# Per-song table holding the `spotify_*` feature columns.
SPOTIFY_FILEPATH = './spotify.csv'

# --- Trained model artefacts (one directory up) ----------------------------
# Pickled recommender plus the single-column user / item mapping files.
MODEL_FILEPATH = '../final_bpr.pkl'
USER_ID2IDX_FILEPATH = '../user_idx2id_bpr.csv'
ITEM_ID2IDX_FILEPATH = '../item_idx2id_bpr.csv'

# --- Tunables --------------------------------------------------------------
# Number of existing users looked up as neighbours of the new user.
NUM_NEIGHBOURS = 5
# Number of items kept in the final recommendation list.
NUM_RECOMMENDATIONS = 10

In [None]:
# Play-count triplets: the train and test samples are stacked into one frame.
train = pd.read_csv(TRAIN_FILEPATH, sep=",")
test = pd.read_csv(TEST_FILEPATH, sep=",")
df = pd.concat([train, test])

songs = pd.read_csv(SONGS_FILEPATH)
spotify = pd.read_csv(SPOTIFY_FILEPATH)
# Feature columns are picked by name; every later cell relies on this list,
# so nothing here depends on the column order of the CSV.
spotify_columns = [col for col in spotify.columns if col.startswith("spotify_")]

# SECURITY: pickle.load runs arbitrary code stored in the file. Only load a
# model file that you produced yourself or otherwise trust.
with open(MODEL_FILEPATH, "rb") as f:
    model = pickle.load(f)

# playcount df, enriched with the song table (joined on song_id)
df = df.merge(songs, on="song_id")

# spotify vectors for each artist: mean of the features over the artist's songs.
# Uses `spotify_columns` instead of the positional slice `spotify.columns[1:]`,
# which silently assumed that song_id is the first column of the file.
artist_spotify = (
    songs
    .merge(spotify, on="song_id")
    .groupby("artist_id")[spotify_columns]
    .mean()
    .reset_index()
)


In [None]:
# spotify vectors for each existing user: the mean feature vector of the
# songs the user played most.
TOP_SONGS_PER_USER = 20  # how many most-played songs make up a user's profile

users_spotify = (
    df
    .sort_values("play_count", ascending=False)
    .groupby("user_id")
    .head(TOP_SONGS_PER_USER)  # rows are sorted, so this keeps the top plays per user
    .merge(spotify, on="song_id")
    # Select features by name (`spotify_columns`) rather than by position
    # (`spotify.columns[1:]`), matching what the kNN cell uses later.
    .groupby("user_id")[spotify_columns]
    .mean()
    .reset_index()
)

## New user: build a profile from selected artists

In [None]:
selected_artists = ['ARJ7KF01187B98D717', 'ARS8GNX1187B9B5141', 'ARF2EHS1187B994F4E']

# Based on the selected artists, build a Spotify feature vector that represents
# the new user: the mean of the selected artists' feature vectors.
selected_artist_rows = artist_spotify.loc[artist_spotify.artist_id.isin(selected_artists)]
if selected_artist_rows.empty:
    # Without this guard the mean would be all-NaN and the failure would only
    # surface later, inside the nearest-neighbour query.
    raise ValueError(f"None of the selected artists were found in artist_spotify: {selected_artists}")

# Average only the numeric feature columns. Calling .mean() on the whole frame
# also hits the string column `artist_id`, which raises TypeError on pandas >= 2.0.
new_user_vector = selected_artist_rows[spotify_columns].mean()

In [None]:
# Find the existing users whose averaged Spotify profile is closest to the new user's.
# NOTE(review): the features are compared on their raw scales, so wide-ranged
# columns (e.g. tempo) presumably dominate the distance -- consider scaling.
knn = NearestNeighbors(n_neighbors=NUM_NEIGHBOURS)
knn.fit(users_spotify[spotify_columns])

# Query with a one-row DataFrame carrying the same column names the model was
# fitted on; a plain list would trigger sklearn's feature-name mismatch warning.
new_user_features = new_user_vector[spotify_columns].to_frame().T
dist, indices = knn.kneighbors(new_user_features)

# Neighbour weights, normalised to sum to 1. Closer neighbours must count more,
# so the weights are proportional to the INVERSE distance. The previous
# `dist / dist.sum()` gave the farthest neighbour the largest weight.
# The small epsilon keeps an exact match (distance 0) from dividing by zero.
# The name `perc_dist` is kept because later cells read it.
inverse_dist = 1.0 / (dist[0] + 1e-9)
perc_dist = inverse_dist / inverse_dist.sum()

In [None]:
# User mapping: a single-column file whose row label is exposed below as `model_idx`.
user_id2idx = pd.read_csv(USER_ID2IDX_FILEPATH)
user_id2idx.columns = ["user_id"]
# Song mapping, same single-column layout.
item_id2idx = pd.read_csv(ITEM_ID2IDX_FILEPATH)
item_id2idx.columns = ["item_id"]

# Row positions (within users_spotify) of the neighbours returned by the kNN query.
model_indices = indices[0]
neighbour_user_ids = users_spotify.iloc[model_indices].user_id

# One row per neighbour: its user id, raw distance and perc_dist weight.
new_df = pd.DataFrame(
    {
        "user_id": neighbour_user_ids.values,
        "dist": dist[0],
        "perc_dist": perc_dist,
    }
)

# Look the neighbours up in the user mapping; reset_index() turns the row label
# into a column, which is then named `model_idx`.
# NOTE(review): presumably the row order of the mapping file matches the model's
# internal user index -- confirm against how the file was written.
is_neighbour = user_id2idx.user_id.isin(neighbour_user_ids)
user_mapped = user_id2idx.loc[is_neighbour].reset_index()
user_mapped.columns = ["model_idx", "user_id"]

# Inner join: a neighbour that is missing from the mapping is dropped here.
new_df = new_df.merge(user_mapped, on="user_id")
new_df

Unnamed: 0,user_id,dist,perc_dist,model_idx
0,feedd3be3e82eb2e25efa7acf65b733952409a22,0.166605,0.150117,34212
1,5b4154dd7e70953e87135e75f99d0c5edd64c720,0.223516,0.201395,19835
2,787e24589c45432510a87c70f9024e93c6dda21a,0.233914,0.210764,6200
3,b21b6ddabd4155284323ba253da778d2fe6e3097,0.236648,0.213227,4277
4,fdf9132531e7ea1d5ef5277dd6d433825690d8f6,0.249157,0.224498,29272


In [None]:
# Score vector of each neighbour, scaled by that neighbour's perc_dist weight.
weighted_scores = [
    model.score(neighbour.model_idx) * neighbour.perc_dist
    for neighbour in new_df.itertuples()
]
# Element-wise sum over neighbours gives one combined score per position.
adjusted_scores = np.sum(weighted_scores, axis=0)

# Partition so the NUM_RECOMMENDATIONS highest scores land at the tail (unordered) ...
partition_point = len(adjusted_scores) - NUM_RECOMMENDATIONS
top_n_recommendations_idx = np.argpartition(adjusted_scores, partition_point)[-NUM_RECOMMENDATIONS:]
# ... then order just those by ascending score, so the best one comes last.
ascending_order = np.argsort(adjusted_scores[top_n_recommendations_idx])
sorted_asc_recommendation_idx = top_n_recommendations_idx[ascending_order]

In [None]:
# Translate the selected score positions back to item ids; rows appear in
# ascending score order, so the strongest recommendation is the last row.
# NOTE(review): assumes the row labels of item_id2idx line up with the
# positions in the model's score vector -- confirm.
recommended_items = item_id2idx.loc[sorted_asc_recommendation_idx]
recommended_items

Unnamed: 0,item_id
212,SOCVTLJ12A6310F0FD
184,SOUFTBI12AB0183F65
655,SOKLRPJ12A8C13C3FE
182,SOWCKVR12A8C142411
88,SOHTKMO12AB01843B0
32,SOSXLTC12AF72A7F54
100,SONYKOW12AB01849C9
553,SOPUCYA12A8C13A694
303,SOBONKR12A58A7A7E0
576,SOFRQTD12A81C233C0


In [None]:
# Inspect the new user's profile: the mean feature vector of the selected artists.
new_user_vector

spotify_danceability          0.463099
spotify_energy                0.649037
spotify_key                   5.654337
spotify_loudness             -7.231669
spotify_mode                  0.707973
spotify_speechiness           0.054561
spotify_acousticness          0.273597
spotify_instrumentalness      0.109664
spotify_liveness              0.194205
spotify_valence               0.417454
spotify_tempo               125.164010
spotify_time_signature        3.934819
dtype: float64

In [None]:
# Inspect the feature profiles of the nearest existing users, for comparison
# with the new user's vector. `model_indices` holds row POSITIONS returned by
# the kNN query, so positional `.iloc` is used, as in the mapping cell.
users_spotify.iloc[model_indices]

Unnamed: 0,user_id,spotify_danceability,spotify_energy,spotify_key,spotify_loudness,spotify_mode,spotify_speechiness,spotify_acousticness,spotify_instrumentalness,spotify_liveness,spotify_valence,spotify_tempo,spotify_time_signature
41690,feedd3be3e82eb2e25efa7acf65b733952409a22,0.498833,0.672728,5.555556,-7.176333,0.666667,0.073517,0.262416,0.070873,0.2131,0.473739,125.204389,4.0
14872,5b4154dd7e70953e87135e75f99d0c5edd64c720,0.526333,0.668417,5.666667,-7.31,0.75,0.097025,0.247575,0.130099,0.203017,0.419667,125.349,3.916667
19635,787e24589c45432510a87c70f9024e93c6dda21a,0.593222,0.662889,5.666667,-7.268889,0.777778,0.041722,0.214253,0.100733,0.1569,0.557778,125.229,3.888889
29066,b21b6ddabd4155284323ba253da778d2fe6e3097,0.5198,0.6122,5.7,-7.34715,0.8,0.043875,0.192757,1.4e-05,0.155645,0.33333,125.1554,3.95
41539,fdf9132531e7ea1d5ef5277dd6d433825690d8f6,0.489611,0.699111,5.555556,-7.408667,0.611111,0.055767,0.333007,0.151601,0.1983,0.393411,125.184167,3.888889
