In [41]:
import sys
import os
import papermill as pm
import scrapbook as sb
import pandas as pd
import numpy as np
import tensorflow as tf
# Raise TensorFlow's logger threshold so only error-level records are shown.
tf_logger = tf.get_logger()
tf_logger.setLevel('ERROR')

In [42]:
from libs.timer import Timer
from libs.lightgcn import LightGCN
from libs.ImplicitCF import ImplicitCF
from libs.python_splitters import python_stratified_split
from libs.python_evaluation import map_at_k, ndcg_at_k, precision_at_k, recall_at_k
from libs.constants import SEED as DEFAULT_SEED
from libs.deeprec_utils import prepare_hparams

In [43]:
# Location of the LightGCN hyper-parameter file.
# The imports above resolve `libs` as a top-level package and the
# hyper-parameter cell uses "./libs/lightgcn.yaml"; the previous value here
# ("../libs/lightgcn.yaml") disagreed with both and was only harmless because
# it was overwritten before being read.
yaml_file = "./libs/lightgcn.yaml"

# Read the review records and drop the 'title', 'text' and 'property_dict'
# columns in a single expression, so re-running this cell always starts
# from the file on disk.
df = pd.read_json('dataset100k.json').drop(columns=['title', 'text', 'property_dict'])

# Preview the first rows of the frame that is passed to the splitter.
print(df.head())

                                           hotel_url         author  \
0  Hotel_Review-g60978-d568121-Reviews-Belle_View...       Sammy2co   
1  Hotel_Review-g60978-d568121-Reviews-Belle_View...  Itravel_1983I   
2  Hotel_Review-g60978-d568121-Reviews-Belle_View...     happy02117   
3  Hotel_Review-g60978-d568121-Reviews-Belle_View...        neruals   
4  Hotel_Review-g60978-d568121-Reviews-Belle_View...         fitztp   

        date  rating  
0 2010-07-01       5  
1 2010-06-01       5  
2 2010-06-01       4  
3 2010-05-01       4  
4 2010-05-01       5  


In [44]:
# Ratio handed to the splitter.
# NOTE(review): presumably the share of rows that goes to `train` - confirm
# against libs.python_splitters.
TRAIN_RATIO = 0.75

# Split the interactions, then bundle both parts into the ImplicitCF
# container that the LightGCN constructor receives.
train, test = python_stratified_split(df, ratio=TRAIN_RATIO)
data = ImplicitCF(
    train=train,
    test=test,
    seed=DEFAULT_SEED,
)

print(data)

  df = train if test is None else train.append(test)


<libs.ImplicitCF.ImplicitCF object at 0x1d37939d0>


In [45]:
# Number of items to recommend per user; also forwarded to hparams as top_k.
TOP_K = 10

# NOTE(review): this constant is not referenced by any cell shown here, and
# the data loaded above comes from 'dataset100k.json' rather than MovieLens.
# It is kept only in case a later cell still reads it.
MOVIELENS_DATA_SIZE = '100k'

# Training hyper-parameters, named here so they can be tuned in one place.
EPOCHS = 50
BATCH_SIZE = 1024
N_LAYERS = 3
LEARNING_RATE = 0.005
EVAL_EPOCH = 5  # the training log reports metrics on epochs 5 and 10

yaml_file = "./libs/lightgcn.yaml"

# Keyword arguments passed to prepare_hparams alongside the YAML file.
# NOTE(review): presumably these take precedence over the YAML values -
# confirm in libs.deeprec_utils.
hparam_kwargs = dict(
    n_layers=N_LAYERS,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    learning_rate=LEARNING_RATE,
    eval_epoch=EVAL_EPOCH,
    top_k=TOP_K,
)
hparams = prepare_hparams(yaml_file, **hparam_kwargs)

In [46]:
# Seed passed to the model; assign None instead for non-deterministic results.
SEED = DEFAULT_SEED

# Build the LightGCN model from the hyper-parameters and the ImplicitCF data.
model = LightGCN(
    hparams,
    data,
    seed=SEED,
)

Already create adjacency matrix.
Already normalize adjacency matrix.
Using xavier initialization.


In [47]:
# Run training inside the Timer context so the elapsed time can be reported.
with Timer() as train_time:
    model.fit()

# Report the interval recorded by the timer.
timing_message = "Took {} seconds for training."
print(timing_message.format(train_time.interval))

Epoch 1 (train)37.1s: train loss = 0.19702 = (mf)0.19678 + (embed)0.00024
Epoch 2 (train)37.1s: train loss = 0.00964 = (mf)0.00917 + (embed)0.00047
Epoch 3 (train)35.1s: train loss = 0.00444 = (mf)0.00391 + (embed)0.00053
Epoch 4 (train)35.9s: train loss = 0.00277 = (mf)0.00221 + (embed)0.00056
Epoch 5 (train)34.8s + (eval)0.7s: train loss = 0.00195 = (mf)0.00137 + (embed)0.00058, recall = 0.00559, ndcg = 0.00379, precision = 0.00056, map = 0.00326
Epoch 6 (train)34.3s: train loss = 0.00158 = (mf)0.00099 + (embed)0.00059
Epoch 7 (train)34.6s: train loss = 0.00136 = (mf)0.00076 + (embed)0.00059
Epoch 8 (train)34.7s: train loss = 0.00118 = (mf)0.00059 + (embed)0.00059
Epoch 9 (train)34.4s: train loss = 0.00108 = (mf)0.00050 + (embed)0.00058
Epoch 10 (train)34.4s + (eval)0.3s: train loss = 0.00097 = (mf)0.00040 + (embed)0.00057, recall = 0.00559, ndcg = 0.00379, precision = 0.00056, map = 0.00326
Epoch 11 (train)34.6s: train loss = 0.00090 = (mf)0.00035 + (embed)0.00056
Epoch 12 (train)34

In [48]:
# Ask the trained model for the TOP_K highest-scoring items per test user.
# NOTE(review): remove_seen=True presumably excludes items the user already
# interacted with in training - confirm in libs.lightgcn.
topk_scores = model.recommend_k_items(
    test,
    top_k=TOP_K,
    remove_seen=True,
)

# Show the last rows of the recommendation table.
topk_scores.tail()

Unnamed: 0,author,hotel_url,prediction
3575,yungkara,Hotel_Review-g319807-d1442948-Reviews-Cherry_H...,6.119183
3576,yungkara,Hotel_Review-g55287-d10632707-Reviews-Hampton_...,5.794817
3577,yungkara,Hotel_Review-g662620-d1223003-Reviews-Sahas_St...,5.685108
3578,yungkara,Hotel_Review-g635614-d2149964-Reviews-Villa_Di...,5.683023
3579,yungkara,Hotel_Review-g1390118-d2440868-Reviews-Shwe_Th...,5.603535


In [49]:
# Ranking metrics for the recommendations against the held-out test split,
# all evaluated at the same cut-off TOP_K.
eval_map = map_at_k(test, topk_scores, k=TOP_K)
eval_ndcg = ndcg_at_k(test, topk_scores, k=TOP_K)
eval_precision = precision_at_k(test, topk_scores, k=TOP_K)
eval_recall = recall_at_k(test, topk_scores, k=TOP_K)

# One line per metric, printed in the same order as they were computed.
report_lines = [
    "MAP:\t%f" % eval_map,
    "NDCG:\t%f" % eval_ndcg,
    "Precision@K:\t%f" % eval_precision,
    "Recall@K:\t%f" % eval_recall,
]
print(*report_lines, sep='\n')

MAP:	0.003192
NDCG:	0.003724
Precision@K:	0.000559
Recall@K:	0.005587
