# LightGCN (simplified GCN model for recommendation)
Linear and neat Graph Convolution Network model for recommendation.

In [1]:
import sys
import os
import scrapbook as sb
import pandas as pd
import numpy as np
import tensorflow as tf
tf.get_logger().setLevel('ERROR') # only show error messages

from recommenders.utils.timer import Timer
from recommenders.models.deeprec.models.graphrec.lightgcn import LightGCN
from recommenders.models.deeprec.DataModel.ImplicitCF import ImplicitCF
from recommenders.datasets import movielens
from recommenders.datasets.python_splitters import python_stratified_split
from recommenders.evaluation.python_evaluation import map_at_k, ndcg_at_k, precision_at_k, recall_at_k
from recommenders.utils.constants import SEED as DEFAULT_SEED
from recommenders.models.deeprec.deeprec_utils import prepare_hparams

# Report the interpreter and key library versions used for this run.
for label, version in (
    ("System", sys.version),
    ("Pandas", pd.__version__),
    ("Tensorflow", tf.__version__),
):
    print(f"{label} version: {version}")

System version: 3.7.13 (default, Mar 29 2022, 02:18:16) 
[GCC 7.5.0]
Pandas version: 1.3.5
Tensorflow version: 2.7.3


In [10]:
# Number of items to recommend per user; also used as the cutoff k of the evaluation metrics
TOP_K = 10

# Select MovieLens data size: 100k, 1m, 10m, or 20m
MOVIELENS_DATA_SIZE = '100k'

# Model parameters (passed to prepare_hparams as overrides of the yaml settings)
EPOCHS = 50
BATCH_SIZE = 1024

SEED = DEFAULT_SEED  # Set None for non-deterministic results

# LightGCN hyper-parameter file, resolved relative to the working directory.
# The copy shipped with the library lives at
# "../../recommenders/models/deeprec/config/lightgcn.yaml".
yaml_file = 'lightgcn.yaml'
# Output CSV paths for the user / item embeddings exported by model.infer_embedding
user_file = "../../tests/resources/deeprec/lightgcn/user_embeddings.csv"
item_file = "../../tests/resources/deeprec/lightgcn/item_embeddings.csv"

### Load and split data

In [4]:
# Ratings are read from a local CSV named after the configured dataset size
# ('Movielens_100k.csv' for the default '100k'), so MOVIELENS_DATA_SIZE
# actually selects the data. If that file is not present, fall back to the
# `movielens` loader imported at the top so the notebook still runs from a
# fresh checkout.
RATING_COLUMNS = ['userID', 'itemID', 'rating', 'timestamp']
ratings_csv = 'Movielens_{}.csv'.format(MOVIELENS_DATA_SIZE)

if os.path.exists(ratings_csv):
    df = pd.read_csv(ratings_csv)
    # The local file is expected to hold exactly these four columns, in this order.
    df.columns = RATING_COLUMNS
else:
    df = movielens.load_pandas_df(size=MOVIELENS_DATA_SIZE, header=RATING_COLUMNS)

# Stratified train/test split with ratio=0.75 for the training portion.
train, test = python_stratified_split(df, ratio=0.75)

### Process data
`ImplicitCF`는 훈련 과정을 위해 데이터를 초기화하고 로드하는 클래스이다. 0보다 큰 평점은 implicit positive interaction으로 바뀌고, adjacency 행렬 *R*이 생성된다.   
`get_norm_adj_mat` : user-item 그래프의 정규화된 adjacency 행렬이 `adj_dir`에 있으면 로드, 아니면 `create_norm_adj_mat` 호출하여 행렬을 만들어 `adj_dir`에 저장   
`train_loader` : 훈련 데이터의 batch 생성 - 유저의 배치 샘플하고, 각 유저에 대해 positive item 하나, negative item 하나 샘플한다.

In [6]:
# Build the implicit-feedback dataset from the train/test split.
# Use the notebook-level SEED (not DEFAULT_SEED directly) so that changing SEED
# in the configuration cell -- e.g. to None -- also applies here, consistent
# with LightGCN(..., seed=SEED) below.
data = ImplicitCF(train=train, test=test, seed=SEED)

### Prepare hyper-parameters
`LightGCN` 모델에서 중요한 매개변수들은,   
- `data` : 모델이 사용할 데이터 객체 (이 노트북에서는 위에서 생성한 `ImplicitCF` 객체)
- `epochs` , `n_layers`, `top_k`
- `eval_epoch` : `None`이 아니라면 검증 메트릭이 테스트셋에서 모든 'eval_epoch' 에폭마다 계산된다. 이 경우, 훈련 과정에서 모델의 효과를 관찰할 수 있다.  

완성된 매개변수 리스트는 `yaml_file`에서 찾을 수 있다. yaml 파일을 읽기 위해 `prepare_hparams`를 사용, 매개변수 풀세트를 준비. 함수의 매개변수로 전달된 매개변수들은 yaml 세팅을 덮어쓴다.

In [11]:
# Values listed here are passed as keyword arguments and override the
# corresponding settings read from `yaml_file`.
hparam_overrides = dict(
    n_layers=3,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    learning_rate=0.005,
    eval_epoch=5,
    top_k=TOP_K,
)
hparams = prepare_hparams(yaml_file, **hparam_overrides)


### Create and train model

In [12]:
# Instantiate LightGCN from the prepared hyper-parameters and the dataset object;
# SEED is the value chosen in the configuration cell.
model = LightGCN(hparams, data, seed=SEED)

Already create adjacency matrix.
Already normalize adjacency matrix.
Using xavier initialization.


2022-07-22 16:04:44.442051: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-07-22 16:04:44.482198: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:939] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-07-22 16:04:44.501737: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:939] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-07-22 16:04:44.502390: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:939] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA 

In [None]:
# Train inside a Timer context; the elapsed time is read afterwards from train_t.interval.
with Timer() as train_t:
    model.fit()
    

In [14]:
# Report how long the fit above took, as measured by the Timer.
print(f"Took {train_t.interval} seconds for training.")

Took 33.17596260500068 seconds for training.


### Recommendation and Evaluation
추천과 평가는 훈련 과정 중에 지정된 테스트 셋에 대해 이미 수행된다. 훈련이 끝난 후에는 다른 데이터에 대해서도 추천과 평가를 수행할 수 있다.   

In [15]:
# recommendation
topk_scores = model.recommend_k_items(test, top_k=TOP_K, remove_seen=True)
topk_scores.head()

Unnamed: 0,userID,itemID,prediction
0,1,7,5.792503
1,1,475,5.48312
2,1,919,5.352049
3,1,89,5.296583
4,1,1,5.276996


In [16]:
# evaluation
eval_map = map_at_k(test, topk_scores, k=TOP_K)
eval_ndcg = ndcg_at_k(test, topk_scores, k=TOP_K)
eval_precision = precision_at_k(test, topk_scores, k=TOP_K)
eval_recall = recall_at_k(test, topk_scores, k=TOP_K)

print("MAP:\t%f" % eval_map,
      "NDCG:\t%f" % eval_ndcg,
      "Precision@K:\t%f" % eval_precision,
      "Recall@K:\t%f" % eval_recall, sep='\n')

MAP:	0.135738
NDCG:	0.455456
Precision@K:	0.400424
Recall@K:	0.213484


In [17]:
# Record the evaluation metrics with scrapbook (`sb.glue`) for tests
for scrap_name, scrap_value in (
    ("map", eval_map),
    ("ndcg", eval_ndcg),
    ("precision", eval_precision),
    ("recall", eval_recall),
):
    sb.glue(scrap_name, scrap_value)

### Infer embeddings
training 세트에서 유저와 아이템의 임베딩을 csv 파일로 추출할 수 있다.

In [18]:
# Export the user and item embeddings to the CSV paths defined in the configuration cell.
model.infer_embedding(user_file, item_file)