In [21]:
import keras
from keras.layers import Activation, Dense, Input, Flatten, Dropout, Lambda, Softmax
from keras.models import Model
from keras import regularizers
from keras import backend as K
import tensorflow as tf

import numpy as np
from scipy import sparse
import pandas as pd
import os
import bottleneck as bn

In [2]:
unique_sid = list()
with open(os.path.join('pro_sg', 'unique_sid.txt'), 'r') as f:
    for line in f:
        unique_sid.append(line.strip())

n_items = len(unique_sid)

In [3]:
df = pd.read_csv('pro_sg/train.csv')
n_users = df['uid'].max() + 1

rows, cols = df['uid'], df['sid']
train = sparse.csr_matrix((np.ones_like(rows),
                         (rows, cols)), dtype='float64',
                         shape=(n_users, n_items))

In [4]:
def load_tr_te_data(csv_file_tr, csv_file_te):
    tp_tr = pd.read_csv(csv_file_tr)
    tp_te = pd.read_csv(csv_file_te)

    start_idx = min(tp_tr['uid'].min(), tp_te['uid'].min())
    end_idx = max(tp_tr['uid'].max(), tp_te['uid'].max())

    rows_tr, cols_tr = tp_tr['uid'] - start_idx, tp_tr['sid']
    rows_te, cols_te = tp_te['uid'] - start_idx, tp_te['sid']

    data_tr = sparse.csr_matrix((np.ones_like(rows_tr),
                             (rows_tr, cols_tr)), dtype='float64', shape=(end_idx - start_idx + 1, n_items))
    data_te = sparse.csr_matrix((np.ones_like(rows_te),
                             (rows_te, cols_te)), dtype='float64', shape=(end_idx - start_idx + 1, n_items))
    return data_tr, data_te

In [5]:
valid_train, valid_test = load_tr_te_data('pro_sg/validation_tr.csv', 'pro_sg/validation_te.csv')

In [24]:
def NDCG_binary_at_k_batch(X_pred, heldout_batch, k=100):
    '''
    normalized discounted cumulative gain@k for binary relevance
    ASSUMPTIONS: all the 0's in heldout_data indicate 0 relevance
    '''
    batch_users = X_pred.shape[0]
    idx_topk_part = bn.argpartition(-X_pred, k, axis=1)
    topk_part = X_pred[np.arange(batch_users)[:, np.newaxis],
                       idx_topk_part[:, :k]]
    idx_part = np.argsort(-topk_part, axis=1)
    # X_pred[np.arange(batch_users)[:, np.newaxis], idx_topk] is the sorted
    # topk predicted score
    idx_topk = idx_topk_part[np.arange(batch_users)[:, np.newaxis], idx_part]
    # build the discount template
    tp = 1. / np.log2(np.arange(2, k + 2))

    DCG = (heldout_batch[np.arange(batch_users)[:, np.newaxis],
                         idx_topk].toarray() * tp).sum(axis=1)
    IDCG = np.array([(tp[:min(n, k)]).sum()
                     for n in heldout_batch.getnnz(axis=1)])
    return DCG / IDCG

In [52]:
K.clear_session()

h1 = 200

enc_inputs = Input(shape=(n_items,), name='enc_input')
inputs_normed = Lambda(lambda  x: tf.nn.l2_normalize(x, dim=1))(enc_inputs)
x = Dropout(0.5)(inputs_normed)

latent1_enc = Dense(h1, activation='tanh', name='latent1_enc', kernel_regularizer=regularizers.l2(0.01))(x)

encoder = Model(enc_inputs, latent1_enc, name='encoder')
encoder.summary()



_________________________________________________________________
Layer (type)                 Output Shape              Param #   
enc_input (InputLayer)       (None, 69675)             0         
_________________________________________________________________
lambda_1 (Lambda)            (None, 69675)             0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 69675)             0         
_________________________________________________________________
latent1_enc (Dense)          (None, 200)               13935200  
Total params: 13,935,200
Trainable params: 13,935,200
Non-trainable params: 0
_________________________________________________________________


In [53]:
dec_inputs = Input(shape=(h1,), name='dec_inputs')

logits = Dense(n_items, name='logits_dec', kernel_regularizer=regularizers.l2(0.01))(dec_inputs)
probs = Softmax(name='probs_dec')(logits) 
decoder = Model(dec_inputs, probs, name='decoder')
decoder.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dec_inputs (InputLayer)      (None, 200)               0         
_________________________________________________________________
logits_dec (Dense)           (None, 69675)             14004675  
_________________________________________________________________
probs_dec (Softmax)          (None, 69675)             0         
Total params: 14,004,675
Trainable params: 14,004,675
Non-trainable params: 0
_________________________________________________________________


In [54]:

def customLoss(yTrue,yPred):
    return -K.mean(K.sum(yTrue * K.log(yPred))


dae = Model(inputs=enc_inputs, outputs=decoder(encoder(enc_inputs)), name='autoencoder')
dae.summary()

dae.compile(loss=customLoss, optimizer='adam')

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
enc_input (InputLayer)       (None, 69675)             0         
_________________________________________________________________
encoder (Model)              (None, 200)               13935200  
_________________________________________________________________
decoder (Model)              (None, 69675)             14004675  
Total params: 27,939,875
Trainable params: 27,939,875
Non-trainable params: 0
_________________________________________________________________


In [55]:
from sklearn.utils import shuffle

def data_generator(data, batch_size):
    N = data.shape[0]
    while True:
        roworder = np.random.permutation(N)
        for start in range(0, N, batch_size):
            end = min(N, start+batch_size)
            batch = data[roworder[start:end]]
            batch = batch.toarray().astype('float32')
            yield (batch, batch)
    

In [56]:
from keras.callbacks import Callback


class RecMetrics(Callback):
    
    def __init__(self, val_train_data, val_test_data):
        self.val_train_data = val_train_data
        self.val_test_data = val_test_data
        return
    
    def on_train_begin(self, logs={}):
        self.ndcg_hist = []
        self.recall_hist = []
        return
 
    def on_train_end(self, logs={}):
        return
 
    def on_epoch_begin(self, epoch, logs={}):
        return
 
    def on_epoch_end(self, epoch, logs={}):
        N = self.val_train_data.shape[0]
        ndcg_list = []
        for start in range(0, N, 500):
            end = min(N, start+500)
            X = self.val_train_data[start:end]
            X = X.toarray().astype('float32')
            preds = np.asarray(self.model.predict(X))
            preds[X.nonzero()] = -np.inf
            ndcg_list.append(NDCG_binary_at_k_batch(preds, self.val_test_data[start:end]))
        print ("NDCG: {}".format(np.mean(ndcg_list)))
        
        return
 
    def on_batch_begin(self, batch, logs={}):
        return
 
    def on_batch_end(self, batch, logs={}):
        return
    
rec_metrics = RecMetrics(valid_train, valid_test)

In [None]:
import gc
gc.collect()
dae.fit_generator(train_generator(train, 500),
                 verbose=1, 
                 epochs=30,
                 steps_per_epoch = train.shape[0]//500,
                 callbacks=[rec_metrics])
# dae.fit(x=train, y=train,
#         batch_size = 500,
#         verbose=2,
#         epochs=30)

Epoch 1/30
NDCG: 0.2694218756511447
Epoch 2/30
NDCG: 0.3037702374050533
Epoch 3/30
NDCG: 0.32158573017953457
Epoch 4/30