In [1]:
import keras
from keras.layers import Activation, Dense, Input, Flatten, Dropout, Lambda, Softmax
from keras.models import Model
from keras import regularizers
from keras import backend as K
import tensorflow as tf

import numpy as np
from scipy import sparse

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
data = sparse.load_npz('raw_sparse.npz')

In [3]:
import pandas as pd

df = pd.read_csv('raw_df.csv', index_col=0)

  mask |= (ar1 == a)


In [4]:
df.head()

Unnamed: 0,user_id,track_id
0,0,38738
1,0,14546
2,0,23065
3,0,39808
4,0,53221


In [5]:
n_playlists = data.shape[0]
n_songs = data.shape[1]

n_test = 10000
n_val = 10000

playlist_ids = np.random.permutation(n_playlists)

test_ids = playlist_ids[:n_test]
val_ids = playlist_ids[n_test:n_test+n_val]
train_ids = playlist_ids[n_test+n_val:]

train_data = data.tocsr()[train_ids]


In [6]:
val_df = df.loc[df.user_id.isin(val_ids)]

val_df_test_inds = []
for user_id, group in val_df.groupby(['user_id']):
    if len(group.index) > 5:
        test_inds = np.random.choice(group.index, int(0.2 * len(group.index)), replace=False)
        val_df_test_inds += list(test_inds)

val_df_train_inds = list(set(val_df.index) - set(val_df_test_inds))

val_test_df = df.iloc[val_df_test_inds]
val_train_df = df.iloc[val_df_train_inds]

In [7]:
len(val_test_df.user_id.unique())

10000

In [8]:
print (len(val_df))
print (len(val_test_df))
print (len(val_train_df))
print (len(df))

994983
194963
800020
39106490


In [9]:
val_train_data = sparse.coo_matrix(([1] * len(val_train_df), (val_train_df.user_id, val_train_df.track_id))).tocsr()
val_test_data = sparse.coo_matrix(([1] * len(val_test_df), (val_test_df.user_id, val_test_df.track_id))).tocsr()

In [10]:
val_train_data = val_train_data[val_train_data.getnnz(1)>0]
val_test_data = val_test_data[val_test_data.getnnz(1)>0]

In [11]:
train_data = train_data[train_data.getnnz(1)>0]

In [12]:
train_data.shape

(373740, 69678)

In [13]:
del df

In [14]:
K.clear_session()

h = 200

inputs = Input(shape=(n_songs,), name='dae_input')
inputs_normed = Lambda(lambda  x: tf.nn.l2_normalize(x, dim=1))(inputs)
x = Dropout(0.5)(inputs_normed)
latent = Dense(h, activation='tanh', name='latent_vector', kernel_regularizer=regularizers.l2(0.01))(x)

encoder = Model(inputs, latent, name='encoder')
encoder.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dae_input (InputLayer)       (None, 69678)             0         
_________________________________________________________________
lambda_1 (Lambda)            (None, 69678)             0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 69678)             0         
_________________________________________________________________
latent_vector (Dense)        (None, 200)               13935800  
Total params: 13,935,800
Trainable params: 13,935,800
Non-trainable params: 0
_________________________________________________________________


In [15]:
latent_inputs = Input(shape=(h,), name='dae_latent')

logits = Dense(n_songs, name='dae_output', kernel_regularizer=regularizers.l2(0.01))(latent_inputs)
probs = Softmax(name='dae_probs')(logits) 
decoder = Model(latent_inputs, probs, name='decoder')
decoder.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dae_latent (InputLayer)      (None, 200)               0         
_________________________________________________________________
dae_output (Dense)           (None, 69678)             14005278  
_________________________________________________________________
dae_probs (Softmax)          (None, 69678)             0         
Total params: 14,005,278
Trainable params: 14,005,278
Non-trainable params: 0
_________________________________________________________________


In [35]:
dae = Model(inputs=inputs, outputs=decoder(encoder(inputs)), name='autoencoder')
dae.summary()

dae.compile(loss='categorical_crossentropy', optimizer='adam')

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dae_input (InputLayer)       (None, 69678)             0         
_________________________________________________________________
encoder (Model)              (None, 200)               13935800  
_________________________________________________________________
decoder (Model)              (None, 69678)             14005278  
Total params: 27,941,078
Trainable params: 27,941,078
Non-trainable params: 0
_________________________________________________________________


In [36]:
from sklearn.utils import shuffle

def data_generator(data, batch_size):
    N = data.shape[0]
    while True:
        data = shuffle(data)
        for i in range(0, N, batch_size):
            batch = data[i:i+batch_size]
            yield batch.todense(), batch.todense()

In [37]:
from keras.callbacks import Callback

def calculate_recall(preds, test, batch_size=100, R=20):
    N = test.shape[0]
    M = test.shape[1]
    total_recall = 0.
    for i in range(0, N, batch_size):
        test_batch = test[i:i+batch_size]
        pred_batch = preds[i:i+batch_size]
        
        dense_test = test_batch.todense()
        
        test_total_items = np.sum(dense_test, axis=1)
        ranked_items = np.argsort(pred_batch, axis=1)
         
        recall = np.expand_dims(np.sum(np.where(ranked_items >= M-R, dense_test, 0), axis=1), 1)/np.minimum(test_total_items, 20)
        total_recall += np.sum(recall)
       
    return total_recall/N

class RecMetrics(Callback):
    
    def __init__(self, val_train_data, val_test_data):
        self.val_train_data = val_train_data
        self.val_test_data = val_test_data
        return
    
    def on_train_begin(self, logs={}):
        self.ndcgs = []
        return
 
    def on_train_end(self, logs={}):
        return
 
    def on_epoch_begin(self, epoch, logs={}):
        return
 
    def on_epoch_end(self, epoch, logs={}):
        preds = np.asarray(self.model.predict(val_train_data))
        recall = calculate_recall(preds, self.val_test_data)
        print ("Recall: {}".format(recall))
        return
 
    def on_batch_begin(self, batch, logs={}):
        return
 
    def on_batch_end(self, batch, logs={}):
        return
    
rmetrics = RecMetrics(val_train_data, val_test_data)

In [None]:
dae.fit(x=train_data[0:320], y=train_data[0:320], 
        callbacks=[rmetrics],
        epochs=30)

Epoch 1/30
Recall: 0.0002399657652289232
Epoch 2/30
Recall: 0.000331331519357835
Epoch 3/30
