In [1]:
import keras
from keras.layers import Activation, Dense, Input, Flatten, Dropout, Lambda, Softmax
from keras.models import Model
from keras import regularizers
from keras import backend as K
import tensorflow as tf

import numpy as np
from scipy import sparse

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
data = sparse.load_npz('raw_sparse.npz')

In [22]:
import pandas as pd

df = pd.read_csv('raw_df.csv', index_col=0)

  mask |= (ar1 == a)


In [23]:
df.head()

Unnamed: 0,user_id,track_id
0,0,38738
1,0,14546
2,0,23065
3,0,39808
4,0,53221


In [24]:
n_playlists = data.shape[0]
n_songs = data.shape[1]

n_test = 10000
n_val = 10000

playlist_ids = np.random.permutation(n_playlists)

test_ids = playlist_ids[:n_test]
val_ids = playlist_ids[n_test:n_test+n_val]
train_ids = playlist_ids[n_test+n_val:]

train_data = data.tocsr()[train_ids]


In [25]:
val_df = df.loc[df.user_id.isin(val_ids)]

val_df_test_inds = []
for user_id, group in val_df.groupby(['user_id']):
    if len(group.index) > 5:
        test_inds = np.random.choice(group.index, int(0.2 * len(group.index)), replace=False)
        val_df_test_inds += list(test_inds)

val_df_train_inds = list(set(val_df.index) - set(val_df_test_inds))

val_test_df = df.iloc[val_df_test_inds]
val_train_df = df.iloc[val_df_train_inds]

In [None]:
len(val_test_df.user_id.unique())

In [26]:
val_train_df.shape

(803025, 2)

In [8]:
print (len(val_df))
print (len(val_test_df))
print (len(val_train_df))
print (len(df))

996537
195299
801238
39106490


In [27]:
val_train_data = sparse.coo_matrix(([1] * len(val_train_df), (val_train_df.user_id, val_train_df.track_id))).tocsr()
val_test_data = sparse.coo_matrix(([1] * len(val_test_df), (val_test_df.user_id, val_test_df.track_id))).tocsr()

In [28]:
val_train_data = val_train_data[val_train_data.getnnz(1)>0]
val_test_data = val_test_data[val_test_data.getnnz(1)>0]

In [29]:
train_data = train_data[train_data.getnnz(1)>0]

In [32]:
val_test_data.shape

(10000, 69678)

In [33]:
del df

In [34]:
K.clear_session()

h = 200

inputs = Input(shape=(n_songs,), name='dae_input')
inputs_normed = Lambda(lambda  x: tf.nn.l2_normalize(x, dim=1))(inputs)
x = Dropout(0.5)(inputs_normed)
latent = Dense(h, activation='tanh', name='latent_vector', kernel_regularizer=regularizers.l2(0.01))(x)

encoder = Model(inputs, latent, name='encoder')
encoder.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dae_input (InputLayer)       (None, 69678)             0         
_________________________________________________________________
lambda_1 (Lambda)            (None, 69678)             0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 69678)             0         
_________________________________________________________________
latent_vector (Dense)        (None, 200)               13935800  
Total params: 13,935,800
Trainable params: 13,935,800
Non-trainable params: 0
_________________________________________________________________


In [35]:
latent_inputs = Input(shape=(h,), name='dae_latent')

logits = Dense(n_songs, name='dae_output', kernel_regularizer=regularizers.l2(0.01))(latent_inputs)
probs = Softmax(name='dae_probs')(logits) 
decoder = Model(latent_inputs, probs, name='decoder')
decoder.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dae_latent (InputLayer)      (None, 200)               0         
_________________________________________________________________
dae_output (Dense)           (None, 69678)             14005278  
_________________________________________________________________
dae_probs (Softmax)          (None, 69678)             0         
Total params: 14,005,278
Trainable params: 14,005,278
Non-trainable params: 0
_________________________________________________________________


In [36]:
dae = Model(inputs=inputs, outputs=decoder(encoder(inputs)), name='autoencoder')
dae.summary()

dae.compile(loss='categorical_crossentropy', optimizer='adam')

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dae_input (InputLayer)       (None, 69678)             0         
_________________________________________________________________
encoder (Model)              (None, 200)               13935800  
_________________________________________________________________
decoder (Model)              (None, 69678)             14005278  
Total params: 27,941,078
Trainable params: 27,941,078
Non-trainable params: 0
_________________________________________________________________


In [37]:
from sklearn.utils import shuffle

def data_generator(data, batch_size):
    N = data.shape[0]
    while True:
        data = shuffle(data)
        for i in range(0, N, batch_size):
            batch = data[i:i+batch_size]
            yield batch.todense(), batch.todense()

In [38]:
from keras.callbacks import Callback

# def calculate_recall(preds, test, batch_size=100, R=20):
#     N = test.shape[0]
#     M = test.shape[1]
#     total_recall = 0.
#     for i in range(0, N, batch_size):
#         test_batch = test[i:i+batch_size]
#         pred_batch = preds[i:i+batch_size]
        
#         dense_test = test_batch.todense()
        
#         test_total_items = np.sum(dense_test, axis=1)
#         ranked_items = np.argsort(pred_batch, axis=1)
        
#         recall = np.expand_dims(np.sum(np.where(ranked_items >= M-R, dense_test, 0), axis=1), 1)/np.minimum(test_total_items, 20)
#         total_recall += np.sum(recall)
       
#     return total_recall/N


def calculate_recall_ndcg(preds, test, batch_size=100, R=20):
    N = test.shape[0]
    M = test.shape[1]
    total_recall = 0.
    
    ndcg = np.zeros(test.shape[0], dtype='float32')
    
    for i in range(0, N, batch_size):
        test_batch = test[i:i+batch_size]
        pred_batch = preds[i:i+batch_size]
        
        dense_test = test_batch.todense()
        
        test_total_items = np.sum(dense_test, axis=1)
        ranked_items = np.argsort(pred_batch, axis=1)
        
        recall = np.expand_dims(np.sum(np.where(ranked_items >= M-R, dense_test, 0), axis=1), 1)/np.minimum(test_total_items, 20)
        total_recall += np.sum(recall)
        
        ranks = np.where(ranked_items >= M-R, M - ranked_items , 0)
        matches = np.where(ranked_items >= M-R, dense_test, 0)
        dcg = np.sum(np.divide(matches, np.log(ranks + 1), out=np.zeros_like(matches, dtype='float32'), where=ranks!=0), axis=1)
       
        ndcg[i:i+batch_size] = dcg
    
    ndcg = ndcg/np.max(ndcg)
    total_ndcg = np.sum(ndcg)
    
    return total_recall/N, total_ndcg/N



class RecMetrics(Callback):
    
    def __init__(self, val_train_data, val_test_data):
        self.val_train_data = val_train_data
        self.val_test_data = val_test_data
        return
    
    def on_train_begin(self, logs={}):
        self.ndcgs = []
        return
 
    def on_train_end(self, logs={}):
        return
 
    def on_epoch_begin(self, epoch, logs={}):
        return
 
    def on_epoch_end(self, epoch, logs={}):
        preds = np.asarray(self.model.predict(val_train_data))
        recall, ndcg = calculate_recall_ndcg(preds, self.val_test_data)
        print ("Recall: {}".format(recall))
        print ("NDCG: {}".format(recall))
        return
 
    def on_batch_begin(self, batch, logs={}):
        return
 
    def on_batch_end(self, batch, logs={}):
        return
    
rmetrics = RecMetrics(val_train_data, val_test_data)

In [43]:
dae.fit(x=train_data[:64000], y=train_data[:64000], 
        callbacks=[rmetrics],
        batch_size = 64,
        epochs=30)

Epoch 1/30
Recall: 0.00033171350708115406
NDCG: 0.00033171350708115406
Epoch 2/30
Recall: 0.0003461392159611974
NDCG: 0.0003461392159611974
Epoch 3/30
Recall: 0.00034985510963065134
NDCG: 0.00034985510963065134
Epoch 4/30
Recall: 0.0002545414058765451
NDCG: 0.0002545414058765451
Epoch 5/30
Recall: 0.00023371592846438047
NDCG: 0.00023371592846438047
Epoch 6/30
Recall: 0.0002826813301078007
NDCG: 0.0002826813301078007
Epoch 7/30
Recall: 0.0003726471318499181
NDCG: 0.0003726471318499181
Epoch 8/30
Recall: 0.00046536419248106533
NDCG: 0.00046536419248106533
Epoch 9/30
Recall: 0.0005227430490046585
NDCG: 0.0005227430490046585
Epoch 10/30
Recall: 0.00033332853378131996
NDCG: 0.00033332853378131996
Epoch 11/30
Recall: 0.00031115440115440116
NDCG: 0.00031115440115440116
Epoch 12/30

KeyboardInterrupt: 

In [None]:
def calculate_recall_ndcg(preds, test, batch_size=100, R=20):
    N = test.shape[0]
    M = test.shape[1]
    total_recall = 0.
    
    ndcg = np.zeros(test.shape[0], dtype='float32')
    
    for i in range(0, N, batch_size):
        test_batch = test[i:i+batch_size]
        pred_batch = preds[i:i+batch_size]
        
        dense_test = test_batch.todense()
        
        test_total_items = np.sum(dense_test, axis=1)
        ranked_items = np.argsort(pred_batch, axis=1)
        
        recall = np.expand_dims(np.sum(np.where(ranked_items >= M-R, dense_test, 0), axis=1), 1)/np.minimum(test_total_items, 20)
        total_recall += np.sum(recall)
        
        ranks = np.where(ranked_items >= M-R, M - ranked_items , 0)
        matches = np.where(ranked_items >= M-R, dense_test, 0)
        dcg = np.sum(np.divide(matches, np.log(ranks + 1), out=np.zeros_like(matches, dtype='float32'), where=ranks!=0), axis=1)
       
        ndcg[i:i+batch_size] = dcg
    
    ndcg = ndcg/np.max(ndcg)
    total_ndcg = np.sum(ndcg)
    
    return total_recall/N, total_ndcg/N


In [None]:
preds = np.asarray(dae.predict(val_train_data))
        

In [None]:
i = 0
batch_size = 128

test = val_test_data
test_batch = test[i:i+batch_size]
pred_batch = preds[i:i+batch_size]

dense_test = test_batch.todense()

test_total_items = np.sum(dense_test, axis=1)
ranked_items = np.argsort(pred_batch, axis=1)

# recall = np.expand_dims(np.sum(np.where(ranked_items >= M-R, dense_test, 0), axis=1), 1)/np.minimum(test_total_items, 20)
# total_recall += np.sum(recall)

In [None]:
M = test.shape[1]
R = 20
ranks = np.where(ranked_items >= M-R, M - ranked_items , 0)
matches = np.where(ranked_items >= M-R, dense_test, 0)
dcg = np.sum(np.divide(matches, np.log(ranks + 1), out=np.zeros_like(matches, dtype='float32'), where=ranks!=0), axis=1)

In [None]:
np.max(ndcg)

In [None]:
ndcg = np.zeros(test.shape[0], dtype='float32')
ndcg[i:i+batch_size] = dcg

In [None]:
ndcg = ndcg/np.max(ndcg)