In [1]:
import sys
sys.path.append("../../")
import os
import papermill as pm
import pandas as pd
import numpy as np
import tensorflow as tf
from reco_utils.common.timer import Timer
from reco_utils.recommender.deeprec.models.graphrec.lightgcn import LightGCN
from reco_utils.recommender.deeprec.DataModel.ImplicitCF import ImplicitCF
from reco_utils.dataset import movielens
from reco_utils.dataset.python_splitters import python_stratified_split
from reco_utils.evaluation.python_evaluation import map_at_k, ndcg_at_k, precision_at_k, recall_at_k
from reco_utils.common.constants import SEED as DEFAULT_SEED
from reco_utils.recommender.deeprec.deeprec_utils import prepare_hparams
from reco_utils.recommender.deeprec.deeprec_utils import cal_metric
from utils.general import *
from utils.data_helper import *
from utils.task_helper import *

In [2]:
lightgcn_dir = 'data_folder/my/LightGCN-training-folder'
rawdata_dir = 'data_folder/my/DKN-training-folder'
tag = 'small'

In [3]:
create_dir(lightgcn_dir)
prepare_dataset(lightgcn_dir, rawdata_dir, tag)

load_instance_file: train_small.txt   done.
load_instance_file: valid_small.txt   done.
load_instance_file: test_small.txt   done.


In [4]:
df_train = pd.read_csv(
        os.path.join(lightgcn_dir, 'lightgcn_train_{0}.txt'.format(tag)),
        sep=' ',
        engine="python",
        names=['userID', 'itemID', 'rating'],
        header=0
    )

In [5]:
df_train.head()

Unnamed: 0,userID,itemID,rating
0,2585321750,2088267428,0
1,2585321750,2149729472,0
2,2585321750,2111412754,0
3,2585321750,2286875509,0
4,2151433223,3025653783,1


In [6]:
df_valid = pd.read_csv(
        os.path.join(lightgcn_dir, 'lightgcn_valid_{0}.txt'.format(tag)),
        sep=' ',
        engine="python",
        names=['userID', 'itemID', 'rating'],
        header=0
    )
# df_test = pd.read_csv(
#         os.path.join(path, 'test.txt'),
#         sep=' ',
#         engine="python",
#         names=['userID', 'itemID', 'rating'],
#         header=0
#     )

In [7]:
data = ImplicitCF(
    train=df_train, test=df_valid, seed=0,
    col_user='userID',
    col_item='itemID',
    col_rating='rating'
)

In [8]:
yaml_file = './lightgcn.yaml'


hparams = prepare_hparams(yaml_file,                          
                          learning_rate=0.005,
                          eval_epoch=5,
                          top_k=10,
                          save_model=True,
                          epochs=10,
                          save_epoch=5
                         )
hparams.MODEL_DIR = os.path.join(lightgcn_dir, 'saved_models')
hparams.values


The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.



<bound method HParams.values of HParams([('DNN_FIELD_NUM', None), ('EARLY_STOP', 100), ('FEATURE_COUNT', None), ('FIELD_COUNT', None), ('L', None), ('MODEL_DIR', 'data_folder/my/LightGCN-training-folder/saved_models'), ('PAIR_NUM', None), ('SUMMARIES_DIR', None), ('T', None), ('activation', None), ('att_fcn_layer_sizes', None), ('attention_activation', None), ('attention_dropout', 0.0), ('attention_layer_sizes', None), ('attention_size', None), ('batch_size', 1024), ('cate_embedding_dim', None), ('cate_vocab', None), ('contextEmb_file', None), ('cross_activation', 'identity'), ('cross_l1', 0.0), ('cross_l2', 0.0), ('cross_layer_sizes', None), ('cross_layers', None), ('data_format', None), ('decay', 0.0001), ('dilations', None), ('dim', None), ('doc_size', None), ('dropout', [0.0]), ('dtype', 32), ('embed_l1', 0.0), ('embed_l2', 0.0), ('embed_size', 64), ('embedding_dropout', 0.3), ('enable_BN', False), ('entityEmb_file', None), ('entity_dim', None), ('entity_embedding_method', None), (

In [9]:
hparams.values

<bound method HParams.values of HParams([('DNN_FIELD_NUM', None), ('EARLY_STOP', 100), ('FEATURE_COUNT', None), ('FIELD_COUNT', None), ('L', None), ('MODEL_DIR', 'data_folder/my/LightGCN-training-folder/saved_models'), ('PAIR_NUM', None), ('SUMMARIES_DIR', None), ('T', None), ('activation', None), ('att_fcn_layer_sizes', None), ('attention_activation', None), ('attention_dropout', 0.0), ('attention_layer_sizes', None), ('attention_size', None), ('batch_size', 1024), ('cate_embedding_dim', None), ('cate_vocab', None), ('contextEmb_file', None), ('cross_activation', 'identity'), ('cross_l1', 0.0), ('cross_l2', 0.0), ('cross_layer_sizes', None), ('cross_layers', None), ('data_format', None), ('decay', 0.0001), ('dilations', None), ('dim', None), ('doc_size', None), ('dropout', [0.0]), ('dtype', 32), ('embed_l1', 0.0), ('embed_l2', 0.0), ('embed_size', 64), ('embedding_dropout', 0.3), ('enable_BN', False), ('entityEmb_file', None), ('entity_dim', None), ('entity_embedding_method', None), (

In [10]:
model = LightGCN(hparams, data, seed=0)


Already create adjacency matrix.
Already normalize adjacency matrix.



  d_inv = np.power(rowsum, -0.5).flatten()


Using xavier initialization.









In [11]:
with Timer() as train_time:
    model.fit()

print("Took {} seconds for training.".format(train_time.interval))

Epoch 1 (train)23.9s: train loss = 0.08026 = (mf)0.07923 + (embed)0.00103
Epoch 2 (train)25.5s: train loss = 0.01845 = (mf)0.01661 + (embed)0.00184
Epoch 3 (train)25.7s: train loss = 0.01158 = (mf)0.00934 + (embed)0.00225
Epoch 4 (train)24.7s: train loss = 0.00890 = (mf)0.00642 + (embed)0.00248
Save model to path /data/home/jialia/jialia/kdd2020tutorial/formal_02/recommenders/scenarios/KDD2020-tutorial/data_folder/my/LightGCN-training-folder/saved_models/epoch_5
Epoch 5 (train)21.9s + (eval)2.0s: train loss = 0.00749 = (mf)0.00490 + (embed)0.00259, recall = 0.26090, ndcg = 0.13775, precision = 0.02609, map = 0.10053
Epoch 6 (train)22.1s: train loss = 0.00629 = (mf)0.00366 + (embed)0.00262
Epoch 7 (train)25.6s: train loss = 0.00563 = (mf)0.00302 + (embed)0.00261
Epoch 8 (train)26.3s: train loss = 0.00510 = (mf)0.00255 + (embed)0.00255
Epoch 9 (train)26.1s: train loss = 0.00483 = (mf)0.00235 + (embed)0.00248
Instructions for updating:
Use standard file APIs to delete files with this pref

In [12]:
user_emb_file = os.path.join(lightgcn_dir, 'user.emb.txt')
item_emb_file = os.path.join(lightgcn_dir, 'item.emb.txt')
model.infer_embedding(
    user_emb_file,
    item_emb_file    
)

In [13]:
def group_labels(labels, preds, group_keys):
    """Devide labels and preds into several group according to values in group keys.
    Args:
        labels (list): ground truth label list.
        preds (list): prediction score list.
        group_keys (list): group key list.
    Returns:
        all_labels: labels after group.
        all_preds: preds after group.
    """
    all_keys = list(set(group_keys))
    group_labels = {k: [] for k in all_keys}
    group_preds = {k: [] for k in all_keys}
    for l, p, k in zip(labels, preds, group_keys):
        group_labels[k].append(l)
        group_preds[k].append(p)
    all_labels = []
    all_preds = []
    for k in all_keys:
        all_labels.append(group_labels[k])
        all_preds.append(group_preds[k])
    return all_labels, all_preds

def load_emb_file(emb_file):
    res = {}
    with open(emb_file, 'r') as rd:
        while True:
            line = rd.readline()
            if not line:
                break
            words = line.strip().split('\t')
            values = [float(a) for a in words[1].split(' ')]
            res[words[0]] = np.asarray(values, dtype=np.float32)
    return res

In [14]:
def infer_scores_via_embeddings(test_filename, user_emb_file, item_emb_file):
    print('loading embedding file...', end=' ')
    user2vec = load_emb_file(user_emb_file)
    item2vec = load_emb_file(item_emb_file)
    preds, labels, groupids = [], [], []
    with open(test_filename, 'r') as rd:
        while True:
            line = rd.readline()
            if not line:
                break
            words = line.strip().split('%')
            tokens = words[0].split(' ')
            userid = words[1]
            itemid = tokens[2]
            pred = user2vec[userid].dot(item2vec[itemid])
            preds.append(pred)
            labels.append(int(tokens[0]))
            groupids.append(userid)
    print('done')
    return labels, preds, groupids
            

In [15]:
test_filename = os.path.join(rawdata_dir, 'test_{}.txt'.format(tag)) 
labels, preds, group_keys = infer_scores_via_embeddings(test_filename, user_emb_file, item_emb_file)
group_labels, group_preds = group_labels(labels, preds, group_keys)


loading embedding file... done


In [16]:
res_pairwise = cal_metric(
                group_labels, group_preds, ['ndcg@2;4;6', "group_auc"]
            )
print(res_pairwise)
res_pointwise = cal_metric(labels, preds, ['auc'])
print(res_pointwise)    

{'ndcg@2': 0.4044, 'ndcg@4': 0.4965, 'ndcg@6': 0.5334, 'group_auc': 0.8178}
{'auc': 0.8207}
