
## Graph Autoencoder based Collaborative Filtering
#Update 2020.03.21

 - As the model is so big we need to save and reload the last save epoch with checkpoint.

#Update 2020.03.14
- Deep and wide neighours n_order, k_neighbour -> $n^k$ inputs
- **Note** the model consumes lots of RAM with deeper and wider nodes

Main settings in 4A: 

#Update 2020.03.02
- Integrated validation set during training
- Integrated early stopping with delta = 1e-5
- Use 'adadelta' optimizer for dynamic learning rate
- user n neibor + item n neibor
- @base: 3/03/20 discussion



#[New Model](https://drive.google.com/file/d/1kN5loA18WyF1-I7BskOw6c9P1bdArxk7/view?usp=sharing): 

![Click](https://drive.google.com/file/d/1kN5loA18WyF1-I7BskOw6c9P1bdArxk7/view?usp=sharing)


#Model implementation framework

TF2.0 and Keras implementation

- Create GMF model
    - Create helper methods: User/item latent
    - Create loss functions
    - Handle input $u_i, v_j$
    - Handle output $\hat{r}_{ij}$

## Organise imports


In [1]:
#@title
#import
#tensorflow_version 2.x
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.layers import Input, Dense, Concatenate, Embedding, Dropout, BatchNormalization
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.regularizers import l1, l2, l1_l2
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.callbacks import ReduceLROnPlateau
from tensorflow.keras.layers import dot, add
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

In [2]:
#dt_dir_name= "C:/Users/jiyu/Desktop/Mo/sample_data/ml-1m"
dt_dir_name= "C:/Users/thinguyen/Desktop/PhD_2020/Python Code/GNN/Mo/sample_data/Amazon_Book_small"


In [3]:
#prepare folder structures

saved_model_dir = 'saved_models_WiHi_MLP(2,1)/'
!mkdir -p "saved_models_WiHi_MLP(2,1)"

A subdirectory or file saved_models_WiHi_MLP(2,1) already exists.
Error occurred while processing: saved_models_WiHi_MLP(2,1).


In [4]:
load_saved_model = True
n_order = 2
k_neighbour=1
lr = 0.0005
l1_reg=1e-5
l2_reg=1e-4
k=20

In [None]:
dataset = pd.read_csv(dt_dir_name +'/'+ 'ratings.csv', names=['user_id', 'item_id', 'rating'])
#dataset = pd.read_csv(dt_dir_name +'/'+ "ratings.csv")


In [None]:
#reindex from 0 ids
dataset.user_id = dataset.user_id.astype('category').cat.codes.values
dataset.item_id = dataset.item_id.astype('category').cat.codes.values
#createMFModel(dataset=dataset)

##Turn original dataset to negative sample dataset

In [None]:
#Version 1.2 (flexible + superfast negative sampling uniform)
import random
import time
import scipy

def neg_sampling(ratings_df, n_neg=1, neg_val=0, pos_val=1, percent_print=5):
  """version 1.2: 1 positive 1 neg (2 times bigger than the original dataset by default)

    Parameters:
    input rating data as pandas dataframe: userId|movieId|rating
    n_neg: include n_negative / 1 positive

    Returns:
    negative sampled set as pandas dataframe
            userId|movieId|interact (implicit)
  """
  sparse_mat = scipy.sparse.coo_matrix((ratings_df.rating, (ratings_df.user_id, ratings_df.item_id)))
  dense_mat = np.asarray(sparse_mat.todense())
  print(dense_mat.shape)

  nsamples = ratings_df[['user_id', 'item_id']]
  nsamples['rating'] = nsamples.apply(lambda row: 1, axis=1)
  length = dense_mat.shape[0]
  printpc = int(length * percent_print/100)

  nTempData = []
  i = 0
  start_time = time.time()
  stop_time = time.time()

  extra_samples = 0
  for row in dense_mat:
    if(i%printpc==0):
      stop_time = time.time()
      print("processed ... {0:0.2f}% ...{1:0.2f}secs".format(float(i)*100 / length, stop_time - start_time))
      start_time = stop_time

    n_non_0 = len(np.nonzero(row)[0])
    zero_indices = np.where(row==0)[0]
    if(n_non_0 * n_neg + extra_samples >= len(zero_indices)):
      print(i, "non 0:", n_non_0,": len ",len(zero_indices))
      neg_indices = zero_indices.tolist()
      extra_samples = n_non_0 * n_neg + extra_samples - len(zero_indices)
    else:
      neg_indices = random.sample(zero_indices.tolist(), n_non_0 * n_neg + extra_samples)
      extra_samples = 0

    nTempData.extend([(uu, ii, rr) for (uu, ii, rr) in zip(np.repeat(i, len(neg_indices))
                    , neg_indices, np.repeat(neg_val, len(neg_indices)))])
    i+=1

  nsamples=nsamples.append(pd.DataFrame(nTempData, columns=["user_id","item_id", "rating"]),ignore_index=True)
  nsamples.reset_index(drop=True)
  return nsamples

In [None]:
neg_dataset = neg_sampling(dataset, n_neg=1)
neg_dataset.shape

##Create train test set


In [None]:
from sklearn.model_selection import train_test_split
train, test = train_test_split(neg_dataset, test_size=0.2, random_state=2020)


#Create deep embedding using MLP of the [model](https://drive.google.com/file/d/1kN5loA18WyF1-I7BskOw6c9P1bdArxk7/view?usp=sharing)

In [None]:
uids = np.sort(dataset.user_id.unique())
iids = np.sort(dataset.item_id.unique())

n_users = len(uids)
n_items = len(iids)

In [None]:
n_users, n_items

## Create deep autoencoder (Skipped this)


Reference: [keras](https://blog.keras.io/building-autoencoders-in-keras.html)

#Create rating matrix

In [None]:
import scipy
sparse_mat = scipy.sparse.coo_matrix((neg_dataset.rating, (neg_dataset.user_id, neg_dataset.item_id)))
rating_matrix = np.asarray(sparse_mat.todense())

In [None]:
rating_matrix

#Helper functions

In [None]:
def create_hidden_size(n_hidden_layers = 3, n_latent_factors = 8):
  """Sizes of each hidden layer, decreasing order"""
  hidden_size = [n_latent_factors*2**i for i in reversed(range(n_hidden_layers))]
  return hidden_size

In [None]:
create_hidden_size()

### Create nearest neighbour (using cosine similarity)

Deep and wide version! n order + k neighbour 
Total: $k + k^2 + ...+ k^n$
This is fuking insane!
- Order 2: first $k$ rows
- Order 3: next $k^2$ rows
- Order 4: next $k^3$ rows

Important pattern when parsing data:


$[order 2 \rightarrow order 3 \rightarrow order 4]$

samples:

$[k \rightarrow k^2 \rightarrow k^3 ]$

**Note**: don't care about loop (self-loop) e.g. $\Delta$

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def create_closest_neighbour_list(rating_matrix, n_order, k_neighbour):
  """return index list of most (k) similar rows that sorted descendingly of 2, 3,..n order
  
  Params:
     n_order: 1 -> its self, 2-> depth = 2 (include 1 further)
     k_neighour: number of neighour nodes each node in each order from 1 -> n.

  """
  k_nb = []
  idx = 0
  cos_matrix = cosine_similarity(rating_matrix, rating_matrix)
  #print(cos_matrix)
  for row in cos_matrix:
    k_largest = np.argsort(-row)[:k_neighbour+1]
    k_largest = k_largest.tolist()
    if idx in k_largest:
      k_largest.remove(idx)
    k_nb.append(k_largest[:k_neighbour])
    idx += 1
  k_nb_2nd = np.stack(k_nb, axis=1)
  #print(k_nb_2nd)

  temp = k_nb_2nd
  for o in range(2, n_order):
    start_idx = sum([k_neighbour*k_neighbour**i for i in range(o-2)])

    #print([k_neigbour*k_neigbour**i for i in range(o-2)],"start:", start_idx)
    temp1 = np.concatenate([np.asarray([k_nb_2nd[:, k] for k in row]).T for row in temp[start_idx:,:]])
    temp = np.concatenate([temp,temp1])

  return temp



#Create model with Keras with shared autoencoder layers

Reference: shared vision model: https://keras.io/getting-started/functional-api-guide/#shared-vision-model

Problem: graph disconnect : https://github.com/keras-team/keras/issues/11151


###Create custom loss for ui,um,& items

Currently not in use!!!

###Create shared autoencoder

In [None]:
def createSharedAutoEncoder(input_shape, hidden_size, names=['user_encoder', 'user_decoder']):
    """This method is to create autoencoder

    Parameters: 
    input_shape: tuble for shape. For this method, one value is expected, e.g. (30, ).
    hidden_size: the array that contains number of neuron each layers, e.g. [10, 20, 1]
  
    Returns: 
    encoder: the encoder model
    decoder: the decoder model
    """
    # shared autoencoder
    input=Input(shape=input_shape)
    encoded = input
    for nn in hidden_size[:-1]:
        encoded = Dense(nn, activation='relu',kernel_initializer='he_uniform')(encoded) 
    encoded = Dense(hidden_size[-1], activation='relu',kernel_initializer='he_uniform',
                  name=names[0])(encoded) 
    encoder = Model(input, encoded, name=names[0])

    #------- decoder model
    hidden_size.reverse()
    decoderinput = Input(shape=(hidden_size[0]))
    decoded = decoderinput
    for nn in hidden_size[1:]:
      decoded = Dense(nn, activation='relu', kernel_initializer='he_uniform')(decoded)
    decoded = Dense(input_shape[0], activation='relu', kernel_initializer='he_uniform', name=names[1])(decoded)
    decoder = Model(decoderinput, decoded, name=names[1])
    return encoder, decoder

###Integrate autoencoders + mlp + custom loss

In [None]:
import numpy as np
def get_input_weights(n_order, k_neighbour, decay=4):
  layer_weights = [np.repeat(decay**(n_order-o-1), k_neighbour**o) for o in range(n_order)]
  layer_weights_flat = np.concatenate(layer_weights).ravel()
  layer_weights_sum = np.sum(layer_weights_flat)
  layer_weights_normalized = layer_weights_flat / layer_weights_sum
  return layer_weights_normalized

get_input_weights(2, 1, 4)


In [None]:
def create_model(n_users, n_items, n_order=2, k_neighbour=1, latent_factors=64, lr = 0.0005, l1_reg=1e-5, l2_reg=1e-4):
  """
  number of depth = n_order, n_order=2: 1 node + 1 deeper node
  """

  #user shared autoencoder
  hidden_size = create_hidden_size() #for autoencoder
  uencoder, udecoder = createSharedAutoEncoder((n_items,), hidden_size)
  #item shared autoencoder
  hidden_size = create_hidden_size() #for autoencoder
  iencoder, idecoder = createSharedAutoEncoder((n_users,), 
                                               hidden_size,['item_encoder','item_decoder'])   

  #create n inputs + shared autoencoder
  u_inputs = []
  v_inputs = []

  u_encoded = []
  v_encoded = []
  u_decoded = []
  v_decoded = []
  #n-order proximity by comparing n embedded vecs
  input_weights = get_input_weights(n_order, k_neighbour, decay=4)

  for i in range(n_order):
    u_inputs.extend([Input(shape=(n_items,), name= f'ui{i}{k}') for k in range(k_neighbour**i)])
    v_inputs.extend([Input(shape=(n_users,), name= f'vj{i}{k}') for k in range(k_neighbour**i)])

  u_encoded.extend([uencoder(u_i) for u_i in u_inputs])
  v_encoded.extend([iencoder(v_j) for v_j in v_inputs])
  
  u_decoded.extend([udecoder(u_en) for u_en in u_encoded])
  v_decoded.extend([idecoder(v_en) for v_en in v_encoded])

  #get ALL COMBINED embeddings from 2 encoders(Need work with combining method)
  uii_encoded = add([u_encoded[i]*input_weights[i] for i in range(len(u_encoded))]) if n_order > 1 and k_neighbour > 0 else u_encoded[0]
  vji_encoded = add([v_encoded[i]*input_weights[i] for i in range(len(u_encoded))]) if n_order > 1 and k_neighbour > 0 else v_encoded[0]

  concat = layers.concatenate([uii_encoded, vji_encoded])
  mlp = concat
  for i in range(3,-1,-1):
    if i == 0:
      mlp = Dense(8**i, activation='sigmoid', name="mlp")(mlp)
    else:
      mlp = Dense(8*2**i, activation='sigmoid')(mlp)
      if i >= 2:
        mlp = BatchNormalization()(mlp)
        mlp = Dropout(0.2)(mlp)

  model = Model(inputs=[u_inputs, v_inputs], 
                outputs=[u_decoded, v_decoded, mlp])
  
  udecoder_names=["user_decoder" if x==0 else f"user_decoder_{x}" for x in range(len(input_weights))]
  vdecoder_names=["item_decoder" if x==0 else f"item_decoder_{x}" for x in range(len(input_weights))]

  udecoder_dict = {ukey: 'mean_squared_error' for ukey in udecoder_names}
  vdecoder_dict = {vkey: 'mean_squared_error' for vkey in vdecoder_names}

  udecoder_metric_dict = {ukey: 'mse' for ukey in udecoder_names}
  vdecoder_metric_dict = {vkey: 'mse' for vkey in udecoder_names}
  

  losses={'mlp':'binary_crossentropy', **udecoder_dict, **vdecoder_dict}
  
  metrics={'mlp':['binary_accuracy'
                        ], 
              **udecoder_metric_dict,
              **vdecoder_metric_dict
           }
  adadelta=tf.keras.optimizers.Adadelta(learning_rate=lr)
  model.compile(optimizer='adadelta', loss=losses, metrics=metrics)

  model.summary()

  return  model

##Argparse

Store all settings here

In [None]:

import os
if load_saved_model:
  saved_list = os.listdir(saved_model_dir)
  saved_list.sort()
  print(saved_list)
  if(len(saved_list) != 0): 
    last_saved = saved_list[-1]
    model = tf.keras.models.load_model(saved_model_dir+'/'+last_saved)
  else:
    model = create_model(n_users, n_items, n_order, k_neighbour)


###Create data generator using rating matrix

It takes rating matrix and generate a sequence of users, items, and ratings

In [None]:
closest_uneighbor = create_closest_neighbour_list(rating_matrix, n_order, k_neighbour)
closest_ineighbor = create_closest_neighbour_list(rating_matrix.T, n_order,k_neighbour)
closest_uneighbor.shape, closest_ineighbor.shape

In [None]:
from tensorflow.keras.utils import Sequence
import math

class DataGenerator(Sequence):
    def __init__(self, dataset, rating_matrix, batch_size=100, n_order = 2, k_neighbour=1, shuffle=True):
        'Initialization'
        self.n_order = n_order
        self.batch_size = batch_size
        self.dataset = dataset
        self.shuffle = shuffle
        self.indices = self.dataset.index
        self.rating_matrix = rating_matrix

        self.on_epoch_end()

    def __len__(self):
        'Denotes the number of batches per epoch'
        return math.floor(len(self.dataset) / self.batch_size)

    def __getitem__(self, index):
        'Generate one batch of data'
        # Generate indexes of the batch
        idxs = [i for i in range(index*self.batch_size,(index+1)*self.batch_size)]

        # Find list of IDs
        list_IDs_temp = [self.indices[k] for k in idxs]

        # Generate data
        uids = self.dataset.iloc[list_IDs_temp,[0]].to_numpy().reshape(-1)
        iids = self.dataset.iloc[list_IDs_temp,[1]].to_numpy().reshape(-1)

        Users = np.stack([rating_matrix[row] for row in uids])
        Items = np.stack([rating_matrix[:, col] for col in iids])
        ratings = self.dataset.iloc[list_IDs_temp,[2]].to_numpy().reshape(-1)

        if n_order > 1 and k_neighbour > 0:
          u_neighbors = [closest_uneighbor[:,index] for index in uids ]
          i_neighbors = [closest_ineighbor[:,index] for index in iids]
          #print([np.stack([rating_matrix[row] for row in u_neighbors[i]]) for i in range(len(u_neighbors))])

          User_neighbors =list(zip(*[[rating_matrix[rowId] for rowId in u_neighbors[i]] for i in range(len(u_neighbors))]))
          #print([u for u in User_neighbors])#, User_neighbors.shape)
          User_neighbors = np.array([np.stack(batch) for batch in User_neighbors])

          Item_neighbors =list(zip(*[[rating_matrix[:,colId] for colId in i_neighbors[i]] for i in range(len(i_neighbors))]))
          Item_neighbors = np.array([np.stack(batch) for batch in Item_neighbors])

          return [Users, *User_neighbors, Items, *Item_neighbors],[Users,*User_neighbors, Items, *Item_neighbors, ratings]
        else:
          return [Users, Items],[Users, Items, ratings]
     

    def on_epoch_end(self):
        'Updates indexes after each epoch'
        self.indices = np.arange(len(self.dataset))
        if self.shuffle == True:
            np.random.shuffle(self.indices)


##Training with data generator

In [None]:
#early_stop = EarlyStopping(monitor='val_mlp_loss', min_delta = 0.0001, patience=10)
# reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2,
#                               patience=10, min_lr=0.000001)

In [None]:
checkpoint_path= saved_model_dir + "/model-{epoch:02d}-{mlp_binary_accuracy:.2f}.hdf5"
cp_callback = tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_path, monitor='mlp_binary_accuracy',verbose=1, save_best_only=True, mode='max')

In [None]:
train_generator = DataGenerator(train, rating_matrix, batch_size=256, n_order=n_order, k_neighbour=k_neighbour, shuffle=False)
#val_generator = DataGenerator(val, rating_matrix, batch_size=512, n_order=n_order, k_neighbour=k_neighbour,  shuffle=False)

In [None]:
history = model.fit(train_generator,
                    # validation_data=val_generator,
                    epochs=100, 
                    verbose=2, callbacks=[cp_callback,
                                          # early_stop
                                          ],
                    #workers=4,
                    shuffle=False)

## Plot losses

There are several losses, pick the one we need

Let's now see how our model does! I'll do a small post-processing step to round off our prediction to the nearest integer. This is usually not done, and thus just a whimsical step, since the training ratings are all integers! There are better ways to encode this intger requirement (one-hot encoding!), but we won't discuss them in this post.

In [None]:
test_datagenerator = DataGenerator(test, rating_matrix)

results = model.evaluate(test_datagenerator)

print(results)

In [None]:
#####################################################################
#Cal HR according to NCF
#Create user & item list:
tmp_lst_u=train.user_id.unique()
#tmp_lst_i=train.item_id.unique()
tmp_lst_i=dataset.item_id.unique()
tmp_lst_u.sort(), tmp_lst_i.sort()
lst_user=tmp_lst_u.tolist()
lst_item=tmp_lst_i.tolist()


In [None]:
def Top_100_Unused_item(user_id):
    tmp_df_used_item=train.loc[(train['user_id']==user_id) & (train['rating']==1)]
    tmp_lst=tmp_df_used_item['item_id'].values.tolist()
    #lst_un_item= set(lst_item) - set(tmp_lst)
    lst_un_item=[x for x in lst_item if x not in tmp_lst]
    
    #random 100 items:
    tmp_no=100000
    np.random.seed(2020)
    lst_100_un_item=(np.random.choice(lst_un_item,tmp_no))
    
    
    #Create DataFrame
    
    tmp_df=pd.DataFrame(columns=['user_id', 'item_id', 'rating', 'prediction'])
    tmp_df['item_id']=lst_100_un_item
    tmp_df['user_id']=user_id
    tmp_df['rating']=0.0

 
    top_datagenerator = DataGenerator(tmp_df, rating_matrix)
    tmp_y_hat = model.predict(top_datagenerator)
    y_hat= tmp_y_hat[4]
    tmp_arr=y_hat.flatten().tolist()
    tmp_df['prediction']=tmp_arr
    return tmp_df
    
    
#tạo item_id array for each user:
def recommend(df,u,k):
    tmp_df=df.sort_values(by=['prediction'],ascending=False)
    tmp_df=tmp_df.head(k)
    
    
    #reset index sẽ dễ cho việc .iloc hoặc .loc
    tmp_df.reset_index(drop=True, inplace=True) 
    
    tmp_arrItem=tmp_df['item_id'].to_numpy()
    return (tmp_arrItem,tmp_df)

def dcg_at_k(r, k):
    assert k >= 1
    r = np.asfarray(r)[:k] != 0
    if r.size:
        return np.sum(np.subtract(np.power(2, r), 1) / np.log2(np.arange(2, r.size + 2)))
    return 0.

def ndcg_at_k(r, k):
    assert k >= 1
    idcg = dcg_at_k(sorted(r, reverse=True), k)
    if not idcg:
        return 0.
    return dcg_at_k(r, k) / idcg

In [None]:
import random

test_2=test.copy()
test_2.reset_index(drop=True, inplace=True)


k=20
rd_no =10
np.random.seed(2020)
rd_lst_usr=np.random.choice(lst_user,rd_no)
#rd_lst_usr=lst_user
#________________________________________________________________________________________________
#tạo dataframe HR
df_HR=pd.DataFrame(columns=['user_id', 'HR','NDCG'])
df_HR['user_id']=rd_lst_usr
df_HR=df_HR.sort_values(by=['user_id'],ascending=True)

for u in rd_lst_usr:
    df_100_Unused=Top_100_Unused_item(u)
    #get top 20 prediction:
    arr_top_k,df_top_k=recommend(df_100_Unused,u,k)
    
    #Check_with_TestData(df_top_k,test_2)
    for i in range(len(df_top_k)):
    #Column sort: "user_id -> item_id -> rating -> prediction
        usr=df_top_k.iloc[i,0]
        itm=df_top_k.iloc[i,1]
        #check xem có row nào trong test_2 thỏa mãn ko, nếu có sẽ tạo ra 1 df có >=1 row
        chk=len(test_2.loc[(test_2["user_id"]==usr) & (test_2["item_id"]==itm) & (test_2["rating"]==1)])
        if chk==1: 
            df_top_k.loc[(df_top_k["user_id"]==usr) & (df_top_k["item_id"]==itm),"rating"]=1
        
    rating_lst=df_top_k['rating'].tolist()   
    #################################################
    #Tính HR:
    tmp_cnt=0
    for r in rating_lst:
        if r!=0:
            tmp_cnt += 1
    tmp_hr = tmp_cnt/len(rating_lst)
    df_HR.loc[df_HR["user_id"]==int(u),["HR"]]=tmp_hr
       
    ##########################################################
    #Tính NDCG:
    
    ndcg=ndcg_at_k(rating_lst, k)    
    df_HR.loc[df_HR["user_id"]==int(u),["NDCG"]]=ndcg
#print(df_HR)    

In [None]:
df_HR

In [None]:
#Calculate HR and NDCG for the model
HR_temp= df_HR.sum(0)

HR=HR_temp[1]/(len(df_HR))

NDCG=HR_temp[2]/(len(df_HR))
print("HR= ", HR)
print("NDCG= ", NDCG)


#References

Input layer:

- Embedding layer: [Link](https://gdcoder.com/-what-is-an-embedding-layer/)
- Embedding lookup: [link text](https://keras.io/layers/embeddings/)
- Multi input: [link text](https://keras.io/getting-started/functional-api-guide/#multi-input-and-multi-output-models)
    