In [2]:
import pandas as pd
import numpy as np
import scipy.sparse as sp
import json

Using Theano backend.


# CF Using Neural Collaborative Filtering
In the world of recommender systems, collaborative filtering is usually done through matrix factorization using the inner product on the latent features of users and items. According to Xiangnan He from the National University of Singapore, forcing the features to be orthogonal using the inner product limits the features that can be extracted. He believes this can have significant performance implications, especially when dealing with implicit feedback (1 if consumed, 0 if not consumed), where natural negative feedback is not easily available. 

In his paper entitled [Neural Collaborative Filtering](https://arxiv.org/pdf/1708.05031.pdf), he argues that using a neural network to do the dimensionality reduction is a form of generalized matrix factorization (GMF) that can improve performance. Here we explore that possibility using Spotify's 1M Challenge Data. 

Below is his proposed network for GMF that we will implement on a subset of our 1M Challenge Data. 

![GMF.png](GMF.png)

## Preparing Dataset
Let's begin by reading in our dataset.

In [32]:
#Create a function to read in json 
def readin_json(start):
    """Load one 1000-playlist slice file into a DataFrame.

    Parameters
    ----------
    start : int
        First playlist id (pid) of the slice. Must be one of
        0, 1000, 2000, ..., 999000.

    Returns
    -------
    pandas.DataFrame
        One row per entry of the ``'playlists'`` list stored in
        ``data/mpd.slice.<start>-<start+999>.json``.

    Raises
    ------
    ValueError
        If ``start`` is not a valid slice start pid. ``ValueError`` is a
        subclass of ``Exception``, so callers catching ``Exception`` still work.
    """
    if start not in np.arange(0, 1000000, 1000):
        raise ValueError('Invalid start pid! Start pids must be {0, 1000, 2000, ..., 999000}')
    end = start + 1000
    path = 'data/mpd.slice.' + str(start) + "-" + str(end - 1) + '.json'
    # Use a context manager so the file handle is closed even if parsing fails.
    with open(path, 'r') as slice_file:
        d = json.load(slice_file)
    thisslice = pd.DataFrame.from_dict(d['playlists'], orient='columns')
    return thisslice

In [57]:
#Read in a subset of our 1M Playlists
# Load the slices starting at pid 0, 1000 and 2000, then stack them
# into a single playlist-level training frame.
first1000, second1000, third1000 = map(readin_json, (0, 1000, 2000))
traindata = pd.concat([first1000, second1000, third1000])

In [59]:
#Also read in the challenge dataset which has missing songs we 
#want our model to predict
# Open the file in a context manager so the handle is closed after parsing.
with open('data/challenge_set.json') as challenge_file:
    t = json.load(challenge_file)
challenge_df = pd.DataFrame.from_dict(t['playlists'], orient='columns')
#Combine train and challenge so we can use cat code to map
#track ids to an index 0-N across both datasets
train_challengedata = pd.concat([traindata, challenge_df])

In [62]:
#Turn playlist level dataframe into song level dataframe
# Emit one record per (playlist, track) pair: the track's URI, artist and
# name, together with the pid and num_holdouts of the playlist it came from.
songPlaylistArray = [
    [track['track_uri'], track['artist_name'], track['track_name'], row['pid'], row['num_holdouts']]
    for index, row in train_challengedata.iterrows()
    for track in row['tracks']
]
songPlaylist = pd.DataFrame(
    songPlaylistArray,
    columns=['trackid', 'artist_name', 'track_name', 'pid', 'num_holdouts'],
)

print(songPlaylist.shape)
songPlaylist.head(10)   # df of all track ids with corresponding artist names, track names and playlist ids

(480874, 5)


Unnamed: 0,trackid,artist_name,track_name,pid,num_holdouts
0,spotify:track:0UaMYEvWZi0ZqiDOoHU3YI,Missy Elliott,Lose Control (feat. Ciara & Fat Man Scoop),0,
1,spotify:track:6I9VzXrHxO9rA9A5euc8Ak,Britney Spears,Toxic,0,
2,spotify:track:0WqIKmW4BTrj3eJFmnCKMv,Beyoncé,Crazy In Love,0,
3,spotify:track:1AWQoqb9bSvzTjaLralEkT,Justin Timberlake,Rock Your Body,0,
4,spotify:track:1lzr43nnXAijIGYnCT8M8H,Shaggy,It Wasn't Me,0,
5,spotify:track:0XUfyU2QviPAs6bxSpXYG4,Usher,Yeah!,0,
6,spotify:track:68vgtRHr7iZHpzGpon6Jlo,Usher,My Boo,0,
7,spotify:track:3BxWKCI06eQ5Od8TY2JBeA,The Pussycat Dolls,Buttons,0,
8,spotify:track:7H6ev70Weq6DdpZyyTmUXk,Destiny's Child,Say My Name,0,
9,spotify:track:2PpruBYCo4H7WOBJ7Q2EwM,OutKast,Hey Ya! - Radio Mix / Club Mix,0,


In [63]:
# Map each distinct track id to its integer category code so that tracks
# can be addressed by a 0-N index.
track_categories = songPlaylist['trackid'].astype('category')
songPlaylist['trackindex'] = track_categories.cat.codes
print(songPlaylist['trackindex'].nunique())
songPlaylist.head(10)

110716


Unnamed: 0,trackid,artist_name,track_name,pid,num_holdouts,trackindex
0,spotify:track:0UaMYEvWZi0ZqiDOoHU3YI,Missy Elliott,Lose Control (feat. Ciara & Fat Man Scoop),0,,7076
1,spotify:track:6I9VzXrHxO9rA9A5euc8Ak,Britney Spears,Toxic,0,,89461
2,spotify:track:0WqIKmW4BTrj3eJFmnCKMv,Beyoncé,Crazy In Love,0,,7619
3,spotify:track:1AWQoqb9bSvzTjaLralEkT,Justin Timberlake,Rock Your Body,0,,16714
4,spotify:track:1lzr43nnXAijIGYnCT8M8H,Shaggy,It Wasn't Me,0,,25336
5,spotify:track:0XUfyU2QviPAs6bxSpXYG4,Usher,Yeah!,0,,7769
6,spotify:track:68vgtRHr7iZHpzGpon6Jlo,Usher,My Boo,0,,87321
7,spotify:track:3BxWKCI06eQ5Od8TY2JBeA,The Pussycat Dolls,Buttons,0,,45441
8,spotify:track:7H6ev70Weq6DdpZyyTmUXk,Destiny's Child,Say My Name,0,,103422
9,spotify:track:2PpruBYCo4H7WOBJ7Q2EwM,OutKast,Hey Ya! - Radio Mix / Club Mix,0,,34437


In [76]:
# Split apart training and challenge data: rows with a null num_holdouts
# are training rows, rows with a value are challenge rows.
# Take explicit copies so that later in-place edits (e.g. reassigning
# challenge pids with .loc) operate on independent frames rather than on
# views of songPlaylist, which would raise SettingWithCopyWarning.
train = songPlaylist[pd.isnull(songPlaylist['num_holdouts'])].copy()
challenge = songPlaylist[pd.notnull(songPlaylist['num_holdouts'])].copy()

In [82]:
# Preview the first ten rows of the training subset (rows whose num_holdouts is null).
train.head(10)

Unnamed: 0,trackid,artist_name,track_name,pid,num_holdouts,trackindex
0,spotify:track:0UaMYEvWZi0ZqiDOoHU3YI,Missy Elliott,Lose Control (feat. Ciara & Fat Man Scoop),0,,7076
1,spotify:track:6I9VzXrHxO9rA9A5euc8Ak,Britney Spears,Toxic,0,,89461
2,spotify:track:0WqIKmW4BTrj3eJFmnCKMv,Beyoncé,Crazy In Love,0,,7619
3,spotify:track:1AWQoqb9bSvzTjaLralEkT,Justin Timberlake,Rock Your Body,0,,16714
4,spotify:track:1lzr43nnXAijIGYnCT8M8H,Shaggy,It Wasn't Me,0,,25336
5,spotify:track:0XUfyU2QviPAs6bxSpXYG4,Usher,Yeah!,0,,7769
6,spotify:track:68vgtRHr7iZHpzGpon6Jlo,Usher,My Boo,0,,87321
7,spotify:track:3BxWKCI06eQ5Od8TY2JBeA,The Pussycat Dolls,Buttons,0,,45441
8,spotify:track:7H6ev70Weq6DdpZyyTmUXk,Destiny's Child,Say My Name,0,,103422
9,spotify:track:2PpruBYCo4H7WOBJ7Q2EwM,OutKast,Hey Ya! - Radio Mix / Club Mix,0,,34437


In [80]:
#Save data in dok matrix (optimized sparse matrix object)
    #Create a sparse pid x trackindex matrix
    #If a pid i has song j, mat[i,j]=1
# Derive the matrix shape from the data instead of hard-coding it, so the
# cell keeps working when a different number of slices is loaded.
# Rows cover every training pid; columns cover every track index assigned
# across the combined train + challenge frame.
num_playlists = int(train['pid'].max()) + 1
num_tracks = int(songPlaylist['trackindex'].max()) + 1
mat = sp.dok_matrix((num_playlists, num_tracks), dtype=np.float32)
for pid, trackindex in zip(train['pid'], train['trackindex']):
    mat[pid, trackindex] = 1.0

## Building an NN Using Keras

In [83]:
import numpy as np
import theano.tensor as T
import keras
from keras import backend as K
from keras import initializations
from keras.models import Sequential, Model, load_model, save_model
from keras.layers.core import Dense, Lambda, Activation
from keras.layers import Embedding, Input, Dense, merge, Reshape, Merge, Flatten
from keras.optimizers import Adam
from keras.regularizers import l2
from time import time
import multiprocessing as mp
import sys
import math

def init_normal(shape, name=None):
    """Weight initializer: delegates to ``initializations.normal`` with scale 0.01."""
    weights = initializations.normal(shape, name=name, scale=0.01)
    return weights

def get_model(num_users, num_items, latent_dim, regs=[0,0]):
    """Assemble the GMF network.

    Two integer inputs (a user id and an item id, one value per sample) are
    each looked up in their own embedding table of width ``latent_dim``; the
    two flattened embeddings are multiplied element-wise and passed through a
    single sigmoid unit.

    ``regs[0]`` and ``regs[1]`` are the L2 regularization strengths applied to
    the user and item embedding weights respectively.

    Returns the (uncompiled) Keras ``Model``.
    """
    # Settings shared by both embedding tables.
    shared_embedding_args = dict(output_dim=latent_dim, init=init_normal, input_length=1)

    # One id per sample for each input.
    user_input = Input(shape=(1,), dtype='int32', name='user_input')
    item_input = Input(shape=(1,), dtype='int32', name='item_input')

    user_embedding_layer = Embedding(input_dim=num_users, name='user_embedding',
                                     W_regularizer=l2(regs[0]), **shared_embedding_args)
    item_embedding_layer = Embedding(input_dim=num_items, name='item_embedding',
                                     W_regularizer=l2(regs[1]), **shared_embedding_args)

    # The embedding output has an extra length-1 axis; flatten it away.
    user_latent = Flatten()(user_embedding_layer(user_input))
    item_latent = Flatten()(item_embedding_layer(item_input))

    # Combine the two latent vectors by element-wise multiplication.
    predict_vector = merge([user_latent, item_latent], mode='mul')

    # Single sigmoid unit producing the final score.
    output_layer = Dense(1, activation='sigmoid', init='lecun_uniform', name='prediction')
    prediction = output_layer(predict_vector)

    return Model(input=[user_input, item_input], output=prediction)

def get_train_instances(train, num_negatives):
    """Expand a sparse interaction matrix into (user, item, label) samples.

    For every stored (user, item) key of ``train`` one positive sample
    (label 1) is emitted, followed by ``num_negatives`` negative samples
    (label 0) for the same user whose items are drawn uniformly at random
    from the columns of ``train`` and are not stored for that user.

    Parameters
    ----------
    train : scipy.sparse.dok_matrix
        Interaction matrix; its stored keys are the positive pairs.
    num_negatives : int
        Number of negative samples to draw per positive sample.

    Returns
    -------
    (list, list, list)
        Parallel lists ``user_input``, ``item_input`` and ``labels``.

    Notes
    -----
    The item range is taken from ``train.shape[1]`` rather than from a
    module-level ``num_items``, so the function works on any matrix passed in.
    Negative sampling retries until an unseen item is found, so it does not
    terminate for a user that has every item stored.
    """
    user_input, item_input, labels = [], [], []
    num_items = train.shape[1]
    positive_keys = list(train.keys())
    # Set membership is O(1) and works the same on Python 2 and 3
    # (dict.has_key does not exist on Python 3).
    positives = set(positive_keys)
    for (u, i) in positive_keys:
        # positive instance
        user_input.append(u)
        item_input.append(i)
        labels.append(1)
        # negative instances
        for _ in range(num_negatives):
            j = np.random.randint(num_items)
            while (u, j) in positives:
                j = np.random.randint(num_items)
            user_input.append(u)
            item_input.append(j)
            labels.append(0)
    return user_input, item_input, labels

# Specify hyperparameters
num_factors = 8          # width of the user / item embeddings
regs = [0,0]             # L2 strengths for the user and item embeddings
num_negatives = 4        # negative samples drawn per positive sample
learner = 'adam'
learning_rate = 0.001
epochs = 15
batch_size = 200
verbose = 1              # NOTE(review): model.fit below passes verbose=0, not this value

# Save model
model_out_file = './GMF_%d_%d.h5' %(num_factors, time())
    
# Loading data
# NOTE(review): this rebinds `train` from the song-level DataFrame to the
# sparse playlist x track matrix; cells that expect the DataFrame must be
# run before this one.
train = mat
num_users, num_items = train.shape
print("Load data done")

# Build model
model = get_model(num_users, num_items, num_factors, regs)
model.compile(optimizer=Adam(lr=learning_rate), loss='binary_crossentropy', metrics=['accuracy'])
# summary() prints the table itself and returns None, so wrapping it in
# print() would emit a stray "None" line.
model.summary()
    
# Train model
for epoch in range(epochs):
    # Generate training instances (negatives are re-sampled every epoch)
    user_input, item_input, labels = get_train_instances(train, num_negatives)

    # Training: a single pass over the freshly sampled instances
    hist = model.fit([np.array(user_input), np.array(item_input)], #input
                     np.array(labels), # labels 
                     validation_split=0.20, batch_size=batch_size, nb_epoch=1, verbose=0, shuffle=True)
    print(hist.history)

Load data done
____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
item_input (InputLayer)          (None, 1)             0                                            
____________________________________________________________________________________________________
user_input (InputLayer)          (None, 1)             0                                            
____________________________________________________________________________________________________
item_embedding (Embedding)       (None, 1, 8)          885728      item_input[0][0]                 
____________________________________________________________________________________________________
user_embedding (Embedding)       (None, 1, 8)          24000       user_input[0][0]                 
____________________________________________________________________________

## Generating Recommendations for Challenge Playlists

Having trained our model, we will use it to predict the missing songs in the challenge playlists. While we don't have access to Spotify's evaluation script, we can do a sanity check by just examining the recommendations to see if they make any sense. 

In [114]:
#Challenge pids are not contiguous, so manually remap the first four
#challenge pids onto row indices 0-3.
for new_pid, old_pid in enumerate([1000000, 1000016, 1000020, 1000023]):
    challenge.loc[challenge['pid'] == old_pid, 'pid'] = new_pid

In [121]:
# Take challenge rows at positions 1 through 19 as a small sample.
# NOTE(review): the slice starts at 1, so the first challenge row is left out - confirm this is intended.
sample_challenge = challenge.iloc[1:20]
sample_challenge

Unnamed: 0,trackid,artist_name,track_name,pid,num_holdouts,trackindex
199875,spotify:track:5MhsZlmKJG6X5kTHkdwC4B,AronChupa,I'm an Albatraoz,0,70.0,76258
199876,spotify:track:0GZoB8h0kqXn7XFm4Sj06k,Lorde,Yellow Flicker Beat - From The Hunger Games: M...,0,70.0,3871
199877,spotify:track:35kahykNu00FPysz3C2euR,Lorde,White Teeth Teens,0,70.0,44050
199878,spotify:track:3G6hD9B2ZHOsgf4WfNu7X1,Lorde,Team,0,70.0,46400
199879,spotify:track:6WQLkih8nE0JdUCEyLaGnQ,Alesso,Heroes (we could be),1,73.0,92718
199880,spotify:track:37sINbJZcFdHFAsVNsPq1i,The Script,Superheroes,1,73.0,44532
199881,spotify:track:0yhPEz5KxlDwckGJaMlZqM,Fall Out Boy,Centuries,1,73.0,13981
199882,spotify:track:5j9iuo3tMmQIfnEEQOOjxh,American Authors,Best Day Of My Life,1,73.0,81402
199883,spotify:track:4eLSCSELtKxZwXnFbNLXT5,Imagine Dragons,On Top Of The World,1,73.0,66069
199884,spotify:track:4PvD06Pmbm2rHG2JjSlElF,Banks,Beggin For Thread,2,63.0,62807


In [132]:
#Store the sampled challenge playlists in the same format as the training
#data, i.e. a sparse dok matrix with a 1 wherever a playlist row holds a track.
chall_mat = sp.dok_matrix((4, 110716), dtype=np.float32)
challenge_pairs = zip(sample_challenge['pid'], sample_challenge['trackindex'])
for playlist_row, track_col in challenge_pairs:
    chall_mat[playlist_row, track_col] = 1.0

In [133]:
# Zero negatives per positive: only the stored (playlist, track) pairs are returned, all labelled 1.
playlist_input, track_input, labels = get_train_instances(chall_mat, num_negatives=0)

In [135]:
# Score every (playlist, track) pair with the trained model.
challenge_inputs = [np.array(playlist_input), np.array(track_input)]
predictions = model.predict(challenge_inputs)

In [136]:
# Display the model's score for each sampled (playlist, track) pair, one row per pair.
predictions


array([[  1.84105467e-02],
       [  1.93946925e-03],
       [  9.88554180e-01],
       [  2.85070785e-03],
       [  1.51838904e-04],
       [  3.03956727e-03],
       [  3.86640383e-03],
       [  4.51667984e-05],
       [  1.20616639e-02],
       [  2.82539101e-03],
       [  4.91999381e-05],
       [  7.64941284e-03],
       [  1.88684076e-04],
       [  1.38368189e-01],
       [  4.61007096e-03],
       [  2.47132294e-02],
       [  1.54368463e-03],
       [  9.71875131e-01],
       [  2.28365549e-04]], dtype=float32)