In [1]:
import keras
from keras.layers import Activation, Dense, Input, Flatten, Dropout, Lambda, Softmax
from keras.models import Model
from keras import regularizers
from keras import backend as K
import tensorflow as tf

import numpy as np
from scipy import sparse

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
data = sparse.load_npz('raw_sparse.npz')

In [3]:
import pandas as pd

df = pd.read_csv('raw_df.csv', index_col=0)

  mask |= (ar1 == a)


In [4]:
df.head()

Unnamed: 0,user_id,track_id
0,0,38738
1,0,14546
2,0,23065
3,0,39808
4,0,53221


In [5]:
data.shape

(393740, 69678)

In [6]:
n_playlists = data.shape[0]
n_songs = data.shape[1]

n_test = 20000
n_val = 20000

playlist_ids = np.random.permutation(n_playlists)

test_ids = playlist_ids[:n_test]
val_ids = playlist_ids[n_test:n_test+n_val]
train_ids = playlist_ids[n_test+n_val:]

train_data = data.tocsr()[train_ids]


In [7]:
val_id_map = {id1: id2 for id1, id2 in enumerate(val_ids)}
rev_val_id_map = {id2: id1 for id1, id2 in enumerate(val_ids)}


In [8]:
val_holdout_songs = np.array([np.random.choice(n_songs, int(0.2 * n_songs), replace=False) for i in range(n_val)])

def binarize(test_inds):
    row = np.zeros((n_songs))
    row[test_inds] = 1
    return row

val_is_holdout = np.apply_along_axis(binarize, 1, val_holdout_songs)

In [10]:
val_df = df.loc[df.user_id.isin(val_ids)]

val_df_test_inds = []

for row, data in val_df.iterrows():
    val_user_id = rev_val_id_map[data['user_id']]
    if data['track_id'] in val_holdout_songs[val_user_id]:
        val_df_test_inds += [row]
        
# for user_id, group in val_df.groupby(['user_id']):
#     if len(group.index) > 5:
#         test_inds = np.random.choice(group.index, int(0.2 * len(group.index)), replace=False)
#         val_df_test_inds += list(test_inds)


            

In [11]:

val_df_train_inds = list(set(val_df.index) - set(val_df_test_inds))

val_test_df = df.iloc[val_df_test_inds]
val_train_df = df.iloc[val_df_train_inds]

print (len(val_test_df))
print (len(val_train_df))

396097
1585436


In [12]:
val_train_data = sparse.coo_matrix(([1] * len(val_train_df), (val_train_df.user_id, val_train_df.track_id))).tocsr()
val_test_data = sparse.coo_matrix(([1] * len(val_test_df), (val_test_df.user_id, val_test_df.track_id))).tocsr()

In [13]:
val_train_data = val_train_data[val_train_data.getnnz(1)>0]
val_test_data = val_test_data[val_test_data.getnnz(1)>0]

In [14]:
train_data = train_data[train_data.getnnz(1)>0]

In [17]:
val_test_data.shape

(20000, 69678)

In [18]:
del df

In [19]:
K.clear_session()

h1 = 600
h2 = 200

enc_inputs = Input(shape=(n_songs,), name='enc_input')
inputs_normed = Lambda(lambda  x: tf.nn.l2_normalize(x, dim=1))(enc_inputs)
x = Dropout(0.5)(inputs_normed)

latent1_enc = Dense(h1, activation='tanh', name='latent1_enc', kernel_regularizer=regularizers.l2(0.01))(x)
latent2_enc = Dense(h2, activation='tanh', name='latent2_enc', kernel_regularizer=regularizers.l2(0.01))(latent1_enc)

encoder = Model(enc_inputs, latent2_enc, name='encoder')
encoder.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
enc_input (InputLayer)       (None, 69678)             0         
_________________________________________________________________
lambda_1 (Lambda)            (None, 69678)             0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 69678)             0         
_________________________________________________________________
latent1_enc (Dense)          (None, 600)               41807400  
_________________________________________________________________
latent2_enc (Dense)          (None, 200)               120200    
Total params: 41,927,600
Trainable params: 41,927,600
Non-trainable params: 0
_________________________________________________________________


In [20]:
dec_inputs = Input(shape=(h2,), name='dec_inputs')

latent1_dec = Dense(h1, name='latent2_dec', kernel_regularizer=regularizers.l2(0.01))(dec_inputs)
logits = Dense(n_songs, name='logits_dec', kernel_regularizer=regularizers.l2(0.01))(latent1_dec)
probs = Softmax(name='probs_dec')(logits) 
decoder = Model(dec_inputs, probs, name='decoder')
decoder.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dec_inputs (InputLayer)      (None, 200)               0         
_________________________________________________________________
latent2_dec (Dense)          (None, 600)               120600    
_________________________________________________________________
logits_dec (Dense)           (None, 69678)             41876478  
_________________________________________________________________
probs_dec (Softmax)          (None, 69678)             0         
Total params: 41,997,078
Trainable params: 41,997,078
Non-trainable params: 0
_________________________________________________________________


In [116]:
dae = Model(inputs=enc_inputs, outputs=decoder(encoder(enc_inputs)), name='autoencoder')
dae.summary()

dae.compile(loss='categorical_crossentropy', optimizer='adam')

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
enc_input (InputLayer)       (None, 69678)             0         
_________________________________________________________________
encoder (Model)              (None, 200)               41927600  
_________________________________________________________________
decoder (Model)              (None, 69678)             41997078  
Total params: 83,924,678
Trainable params: 83,924,678
Non-trainable params: 0
_________________________________________________________________


In [117]:
from sklearn.utils import shuffle

def data_generator(data, batch_size):
    N = data.shape[0]
    while True:
        data = shuffle(data)
        for i in range(0, N, batch_size):
            batch = data[i:i+batch_size]
            yield batch.todense(), batch.todense()

In [119]:
from keras.callbacks import Callback

# def calculate_recall(preds, test, batch_size=100, R=20):
#     N = test.shape[0]
#     M = test.shape[1]
#     total_recall = 0.
#     for i in range(0, N, batch_size):
#         test_batch = test[i:i+batch_size]
#         pred_batch = preds[i:i+batch_size]
        
#         dense_test = test_batch.todense()
        
#         test_total_items = np.sum(dense_test, axis=1)
#         ranked_items = np.argsort(pred_batch, axis=1)
        
#         recall = np.expand_dims(np.sum(np.where(ranked_items >= M-R, dense_test, 0), axis=1), 1)/np.minimum(test_total_items, 20)
#         total_recall += np.sum(recall)
       
#     return total_recall/N


def calculate_recall_ndcg(preds, test, val_is_holdout, batch_size=100, recall_k=20, ndcg_k=50):
    N = test.shape[0]
    M = test.shape[1]
    total_recall = 0.
    
    ndcg = np.zeros(test.shape[0], dtype='float32')
    
    for i in range(0, N, batch_size):
        test_batch = test[i:i+batch_size]
        pred_batch = preds[i:i+batch_size]
        test_inds_batch = val_is_holdout[i:i+batch_size]
        
        pred_batch = np.where(test_inds_batch, preds[i:i+batch_size], 0)

        dense_test = test_batch.todense()
        
        test_total_items = np.sum(dense_test, axis=1)
        ranked_items = np.argsort(pred_batch, axis=1)
        
        recall = np.expand_dims(np.sum(np.where(ranked_items > M-recall_k, dense_test, 0), axis=1), 1)/np.minimum(test_total_items, 20)
        total_recall += np.sum(recall)
        
        ranks = np.where(ranked_items > M-ndcg_k, M - ranked_items , 0)
        matches = np.where(ranked_items > M-ndcg_k, dense_test, 0)
        dcg = np.sum(np.divide(matches, np.log(ranks + 1), out=np.zeros_like(matches, dtype='float32'), where=ranks!=0), axis=1)
       
        ndcg[i:i+batch_size] = dcg
    
    ndcg = ndcg/np.max(ndcg)
    total_ndcg = np.sum(ndcg)
    
    return total_recall/N, total_ndcg/N

class RecMetrics(Callback):
    
    def __init__(self, val_train_data, val_test_data, val_is_holdout):
        self.val_train_data = val_train_data
        self.val_test_data = val_test_data
        self.val_is_holdout = val_is_holdout
        return
    
    def on_train_begin(self, logs={}):
        self.ndcg_hist = []
        self.recall_hist = []
        return
 
    def on_train_end(self, logs={}):
        return
 
    def on_epoch_begin(self, epoch, logs={}):
        return
 
    def on_epoch_end(self, epoch, logs={}):
        preds = np.asarray(self.model.predict(self.val_train_data))
        recall, ndcg = calculate_recall_ndcg(preds, self.val_test_data, self.val_is_holdout)
        print ("Recall: {}".format(recall))
        print ("NDCG: {}".format(recall))
        self.recall_hist.append(recall)
        self.ndcg_hist.append(ndcg)
        return
 
    def on_batch_begin(self, batch, logs={}):
        if (batch + 1) % 100 == 0:
            preds = np.asarray(self.model.predict(self.val_train_data))
            recall, ndcg = calculate_recall_ndcg(preds, self.val_test_data, self.val_is_holdout)
            print ("")
            print ("Recall: {}".format(recall))
            print ("NDCG: {}".format(recall))
        return
 
    def on_batch_end(self, batch, logs={}):
        return
    
rec_metrics = RecMetrics(val_train_data, val_test_data, val_is_holdout)

In [120]:
val_holdout_songs.shape

(20000, 13935)

In [121]:
dae.fit(x=train_data, y=train_data, 
        batch_size = 128,
        callbacks = [rec_metrics],
        epochs=30)

Epoch 1/30
 12672/353740 [>.............................] - ETA: 3:06:07 - loss: 911.2678
Recall: 0.00029111743433724843
NDCG: 0.00029111743433724843
 25472/353740 [=>............................] - ETA: 4:03:50 - loss: 898.3143
Recall: 0.0002945127144527297
NDCG: 0.0002945127144527297
 38272/353740 [==>...........................] - ETA: 4:11:18 - loss: 889.9550
Recall: 0.00031656908429547733
NDCG: 0.00031656908429547733
 51072/353740 [===>..........................] - ETA: 4:10:11 - loss: 886.2078
Recall: 0.00030394764935786587
NDCG: 0.00030394764935786587
 63872/353740 [====>.........................] - ETA: 4:04:59 - loss: 882.1180
Recall: 0.00031090660350141736
NDCG: 0.00031090660350141736
 68992/353740 [====>.........................] - ETA: 4:18:36 - loss: 880.7896

KeyboardInterrupt: 

In [63]:
preds = np.asarray(dae.predict(val_train_data[0:5000]))

In [113]:
calculate_recall_ndcg(preds, val_test_data[0:5000], val_is_holdout[0:5000], val_holdout_songs.shape[1])

(0.0003290238270501429, 0.002361402130126953)