In [1]:
import math
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.parameter import Parameter
from torch.nn.init import xavier_normal_, xavier_uniform_
import argparse
import numpy as np
import time
import sys
from os.path import abspath
import random
import pandas as pd

# from util.sampler import  next_batch_pairwise
from util.conf import OptionConf
import torch
import torch.nn as nn 
import torch.nn.functional as F
from scipy.sparse import coo_matrix
from util.loss_torch import bpr_loss, l2_reg_loss, EmbLoss, contrastLoss
from util.init import *
from base.torch_interface import TorchGraphInterface
import os
import numpy as np 
import time 
from torch.optim.lr_scheduler import ReduceLROnPlateau

from util.conf import ModelConf
from base.recommender import Recommender
from util.algorithm import find_k_largest
from time import strftime, localtime
from data.loader import FileIO
from util.evaluation import ranking_evaluation
from util.sampler import  next_batch_pairwise
from data.ui_graph import Interaction

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Raise the TensorFlow C++ log threshold to level 3 via the environment variable.
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"

# Use the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

## Base Recommender

In [3]:
class GraphRecommender(Recommender):
    """Top-N recommender base that ranks items from user/item embedding matrices.

    The class wraps an ``Interaction`` dataset, produces per-user top-N lists by
    scoring ``user_emb @ item_emb.T``, tracks the best epoch seen so far and
    writes weights / loss curves / metric logs under ``results/``.

    Attributes such as ``self.ranking``, ``self.output``, ``self.config``,
    ``self.recOutput``, ``self.model_log``, ``self.model_name`` and ``self.save``
    are used here but not defined here; presumably they come from the
    ``Recommender`` base class (not visible in this notebook).
    """

    def __init__(self, conf, training_set, test_set,knowledge_set, **kwargs):
        """Build the interaction data and read the ``-topN`` cut-offs from the ranking config."""
        super(GraphRecommender, self).__init__(conf, training_set, test_set, knowledge_set,**kwargs)
        self.data = Interaction(conf, training_set, test_set)
        # Empty until the first fast_evaluation call; afterwards [best_epoch, {metric: value}].
        self.bestPerformance = []
        top = self.ranking['-topN'].split(',')
        self.topN = [int(num) for num in top]
        self.max_N = max(self.topN)

    def print_model_info(self):
        """Print the base-class model info followed by training/test set sizes."""
        super(GraphRecommender, self).print_model_info()
        # print dataset statistics
        print('Training Set Size: (user number: %d, item number %d, interaction number: %d)' % (self.data.training_size()))
        print('Test Set Size: (user number: %d, item number %d, interaction number: %d)' % (self.data.test_size()))
        print('=' * 80)

    def build(self):
        """Hook for subclasses; does nothing here."""
        pass

    def train(self):
        """Hook for subclasses; does nothing here."""
        pass

    def predict(self, u):
        """Hook for subclasses; does nothing here."""
        pass

    @staticmethod
    def _progress_bar(num, total):
        """Redraw a 50-character text progress bar on the current stdout line."""
        # An empty test set used to raise ZeroDivisionError; treat it as complete.
        rate = float(num) / total if total else 1.0
        ratenum = int(50 * rate)
        r = '\rProgress: [{}{}]{}%'.format('+' * ratenum, ' ' * (50 - ratenum), ratenum*2)
        sys.stdout.write(r)
        sys.stdout.flush()

    def test(self, user_emb, item_emb):
        """Produce the top-``max_N`` recommendation list for every test user.

        Args:
            user_emb: tensor indexed by internal user id (first axis).
            item_emb: 2-D tensor indexed by internal item id (first axis).

        Returns:
            dict mapping each test-set user to a list of ``(item_name, score)``
            pairs as returned by ``find_k_largest``.
        """
        rec_list = {}
        user_count = len(self.data.test_set)
        for i, user in enumerate(self.data.test_set):
            user_id = self.data.get_user_id(user)
            score = torch.matmul(user_emb[user_id], item_emb.transpose(0, 1))
            candidates = score.cpu().numpy()

            # Push items the user already rated to the bottom of the ranking.
            rated_list, _ = self.data.user_rated(user)
            for item in rated_list:
                candidates[self.data.item[item]] = -10e8

            ids, scores = find_k_largest(self.max_N, candidates)
            item_names = [self.data.id2item[iid] for iid in ids]
            rec_list[user] = list(zip(item_names, scores))
            if i % 1000 == 0:
                self._progress_bar(i, user_count)
        self._progress_bar(user_count, user_count)
        print('')
        return rec_list

    def evaluate(self, rec_list):
        """Write the recommendation lists and the ranking metrics to the output directory.

        Args:
            rec_list: dict as returned by :meth:`test`.
        """
        self.recOutput.append('userId: recommendations in (itemId, ranking score) pairs, * means the item is hit.\n')
        for user in self.data.test_set:
            line = str(user) + ':'
            for item in rec_list[user]:
                line += ' (' + str(item[0]) + ',' + str(item[1]) + ')'
                # Mark recommended items that appear in the user's test interactions.
                if item[0] in self.data.test_set[user]:
                    line += '*'
            line += '\n'
            self.recOutput.append(line)
        current_time = strftime("%Y-%m-%d %H-%M-%S", localtime(time.time()))
        # output prediction result
        out_dir = self.output['-dir']
        file_name = self.config['model.name'] + '@' + current_time + '-top-' + str(self.max_N) + 'items' + '.txt'
        FileIO.write_file(out_dir, file_name, self.recOutput)
        print('The result has been output to ', abspath(out_dir), '.')
        file_name = self.config['model.name'] + '@' + current_time + '-performance' + '.txt'
        self.result = ranking_evaluation(self.data.test_set, rec_list, self.topN)
        self.model_log.add('###Evaluation Results###')
        self.model_log.add(self.result)
        FileIO.write_file(out_dir, file_name, self.result)
        print('The result of %s:\n%s' % (self.model_name, ''.join(self.result)))

    @staticmethod
    def _parse_measure(measure):
        """Turn ``['<header>', 'Name:value\\n', ...]`` into ``{name: float(value)}``, skipping the first entry."""
        performance = {}
        for m in measure[1:]:
            k, v = m.strip().split(':')
            performance[k] = float(v)
        return performance

    def _save_best(self, kwargs):
        """Call ``self.save(kwargs)``, falling back to ``self.save()`` if that fails.

        The fallback stays best-effort, but only for ``Exception`` subclasses, so
        Ctrl-C / ``SystemExit`` are no longer swallowed as with the bare ``except``.
        """
        try:
            self.save(kwargs)
        except Exception:
            self.save()

    def fast_evaluation(self, epoch, user_embed, item_embed, kwargs=None):
        """Evaluate the current embeddings at ``max_N`` and update the best-so-far record.

        The new result replaces the stored best when it is at least as good on
        more metrics than it is worse on.

        Args:
            epoch: zero-based epoch index (reported as ``epoch + 1``).
            user_embed: user embedding tensor passed to :meth:`test`.
            item_embed: item embedding tensor passed to :meth:`test`.
            kwargs: optional object forwarded to ``self.save``.

        Returns:
            ``(measure, performance_ep)`` where ``measure`` is the list of stripped
            ``'Name:value'`` strings for the current epoch and ``performance_ep``
            holds the zero-based epoch, the *best-so-far* metrics (as strings) and
            the measuring time in seconds.
        """
        print('Evaluating the model...')
        s_test = time.time()
        rec_list = self.test(user_embed, item_embed)
        e_test = time.time()
        print("Test time: %f s" % (e_test - s_test))

        s_measure = time.time()
        measure = ranking_evaluation(self.data.test_set, rec_list, [self.max_N])
        e_measure = time.time()
        print("Measure time: %f s" % (e_measure - s_measure))

        performance = self._parse_measure(measure)
        if len(self.bestPerformance) > 0:
            # Vote per metric: +1 when the stored best is strictly better, -1 otherwise.
            count = 0
            for k in self.bestPerformance[1]:
                if self.bestPerformance[1][k] > performance[k]:
                    count += 1
                else:
                    count -= 1
            if count < 0:
                self.bestPerformance[1] = performance
                self.bestPerformance[0] = epoch + 1
                self._save_best(kwargs)
        else:
            # First evaluation: whatever we measured is the best so far.
            self.bestPerformance.append(epoch + 1)
            self.bestPerformance.append(performance)
            self._save_best(kwargs)

        print('-' * 120)
        print('Real-Time Ranking Performance ' + ' (Top-' + str(self.max_N) + ' Item Recommendation)')
        measure = [m.strip() for m in measure[1:]]
        print('*Current Performance*')
        print('Epoch:', str(epoch + 1) + ',', '  |  '.join(measure))

        best = self.bestPerformance[1]
        bp = '  |  '.join(
            name + ':' + str(best[name])
            for name in ('Hit Ratio', 'Precision', 'Recall', 'NDCG')
        )
        print('*Best Performance* ')
        # Label fixed: it previously read 'Epoch:fast_evaluation' (stray paste).
        print('Epoch:', str(self.bestPerformance[0]) + ',', bp)
        print('-' * 120)
        performance_ep = {
            'Epoch': epoch,
            'Hit Ratio': str(best['Hit Ratio']),
            'Precision': str(best['Precision']),
            'Recall': str(best['Recall']),
            'NDCG': str(best['NDCG']),
            'Measure Time': e_measure - s_measure
        }
        return measure, performance_ep

    def _result_dir(self, hyper_params):
        """Return (and create if missing) the results directory for one hyper-parameter set.

        ``hyper_params`` values are read by *position*, not by key: positions 0-7
        form the directory suffix and position 9 is the sub-directory directly
        under ``results/``. The dict must therefore keep its insertion order.
        NOTE(review): the suffix contains ':' characters, which some file systems
        reject; kept unchanged so existing result paths stay valid.
        """
        hp = list(hyper_params.values())
        suffix = '-p:' + str(hp[0]) + '-drop:' + str(hp[1]) + '-layers:' + str(hp[2]) + '-input_dim:' + str(hp[3]) + '-hyper_dim:' + str(hp[4]) + '-hyperedge_num:' + str(hp[5]) + 'lr:' + str(hp[6]) + 'reg:' + str(hp[7])
        out_dir = 'results/' + str(hp[9]) + '/' + self.config['model.name'] + '/' + self.config['model.name'] + suffix
        # Every save_* method may be the first one called, so always ensure the directory exists.
        os.makedirs(out_dir, exist_ok=True)
        return out_dir

    def save_model(self, model, hyper_params):
        """Save ``model.state_dict()`` as ``<model.name>@<YYYY-MM-DD>-weight.pth`` in the results directory."""
        out_dir = self._result_dir(hyper_params)
        current_time = strftime("%Y-%m-%d", localtime(time.time()))
        file_name = self.config['model.name'] + '@' + current_time + '-weight' + '.pth'
        weight_file = out_dir + '/' + file_name
        torch.save(model.state_dict(), weight_file)

    def save_loss(self, train_losses, rec_losses, reg_losses, hyper_params):
        """Write the three ``[epoch, loss]`` series to ``train_loss.csv``, ``rec_loss.csv`` and ``reg_loss.csv``."""
        out_dir = self._result_dir(hyper_params)

        df_train_loss = pd.DataFrame(train_losses, columns = ['ep', 'loss'])
        df_rec_loss = pd.DataFrame(rec_losses, columns = ['ep', 'loss'])
        df_reg_loss = pd.DataFrame(reg_losses, columns = ['ep', 'loss'])

        df_train_loss.to_csv(out_dir + '/train_loss.csv')
        df_rec_loss.to_csv(out_dir + '/rec_loss.csv')
        df_reg_loss.to_csv(out_dir + '/reg_loss.csv')

    def save_perfomance_training(self, log_train, hyper_params):
        """Write the per-epoch training-time metric log to ``train_performance.csv``."""
        out_dir = self._result_dir(hyper_params)

        df_train_log = pd.DataFrame(log_train)
        df_train_log.to_csv(out_dir + '/train_performance.csv')

    def save_performance_test(self, log_test, hyper_params):
        """Write the final test metrics to ``test_performance.csv``."""
        out_dir = self._result_dir(hyper_params)

        df_test_log = pd.DataFrame(log_test)
        df_test_log.to_csv(out_dir + '/test_performance.csv')

## Model

In [4]:
class LGCN_Encoder(nn.Module):
    """Embedding propagation encoder over a normalised user-item adjacency matrix.

    ``forward`` concatenates the user and item embedding tables, multiplies the
    result by the sparse adjacency ``n_layers`` times and returns the mean of
    the input and every propagated layer, split back into users and items.

    Args:
        data: dataset object exposing ``norm_adj``, ``n_users`` and ``n_items``.
        emb_size: width of every embedding vector.
        n_layers: number of propagation steps.
    """

    def __init__(self, data, emb_size, n_layers):
        super(LGCN_Encoder, self).__init__()
        # Previously every tensor was moved with ``.cuda()``, which raises on
        # CPU-only machines; pick the device the same way the notebook does.
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.data = data
        self.latent_size = emb_size
        self.layers = n_layers
        self.norm_adj = data.norm_adj
        self.embedding_dict = self._init_model()
        self.sparse_norm_adj = TorchGraphInterface.convert_sparse_mat_to_tensor(self.norm_adj).to(self.device)
        # ``ln`` and ``non_linear`` are not used by ``forward``; they are kept so the
        # module's state_dict layout matches previously saved weights.
        self.ln = torch.nn.LayerNorm(self.latent_size).to(self.device)
        self.non_linear = nn.LeakyReLU(0.1)

    def _init_model(self):
        """Create Xavier-uniform user and item embedding tables on ``self.device``."""
        initializer = nn.init.xavier_uniform_
        # Initialise on the CPU first (as before) and then move, so the random
        # stream consumed by the initialiser does not depend on the device.
        embedding_dict = nn.ParameterDict({
            'user_emb': nn.Parameter(initializer(torch.empty(self.data.n_users, self.latent_size)).to(self.device)),
            'item_emb': nn.Parameter(initializer(torch.empty(self.data.n_items, self.latent_size)).to(self.device)),
        })
        return embedding_dict

    def forward(self):
        """Return ``(user_embeddings, item_embeddings)`` averaged over all propagation layers."""
        ego_embeddings = torch.cat([self.embedding_dict['user_emb'], self.embedding_dict['item_emb']], 0)
        all_embeddings = [ego_embeddings]
        for _ in range(self.layers):
            ego_embeddings = torch.sparse.mm(self.sparse_norm_adj, ego_embeddings)
            all_embeddings.append(ego_embeddings)
        # Stack to (nodes, layers + 1, dim) and average over the layer axis.
        all_embeddings = torch.stack(all_embeddings, dim=1)
        all_embeddings = torch.mean(all_embeddings, dim=1)
        # Rows are ordered users first, then items (the order of the ``cat`` above).
        user_all_embeddings = all_embeddings[:self.data.n_users]
        item_all_embeddings = all_embeddings[self.data.n_users:]
        return user_all_embeddings, item_all_embeddings

## Util

In [5]:
def calculate_loss(anchor_emb, pos_emb, neg_emb, batch_size, reg_weight=None):
    """Compute the BPR ranking loss and the scaled embedding regularisation loss.

    Args:
        anchor_emb: embeddings of the sampled users.
        pos_emb: embeddings of the interacted (positive) items.
        neg_emb: embeddings of the sampled negative items.
        batch_size: divisor applied to the regularisation term.
        reg_weight: regularisation coefficient. When omitted, the notebook-level
            global ``reg`` is used, which was the only option before.

    Returns:
        ``(rec_loss, reg_loss)`` as returned by ``bpr_loss`` and by
        ``reg_weight * EmbLoss()(...) / batch_size`` respectively.
    """
    if reg_weight is None:
        # Backward-compatible default: fall back to the global set in the config cell.
        reg_weight = reg
    calc_reg_loss = EmbLoss()
    rec_loss = bpr_loss(anchor_emb, pos_emb, neg_emb)
    reg_loss = reg_weight * calc_reg_loss(anchor_emb, pos_emb, neg_emb) / batch_size
    return rec_loss, reg_loss

In [6]:
def predict(u, rec, user_emb, item_emb):
    """Return the scores of every item for user ``u`` as a NumPy array.

    ``rec.data.get_user_id`` maps ``u`` to its row in ``user_emb``; that row is
    multiplied with the transposed ``item_emb`` and the result is moved to the CPU.
    """
    row = rec.data.get_user_id(u)
    user_vector = user_emb[row]
    items_t = torch.transpose(item_emb, 0, 1)
    all_scores = torch.matmul(user_vector, items_t)
    return all_scores.cpu().numpy()

## Train

In [7]:
def train(hyper_params):
    """Train the notebook-level model and log losses and ranking metrics per epoch.

    Besides its argument, this function reads names defined in other cells:
    ``maxEpoch``, ``batchSize``, ``rec``, ``train_model``, ``optimizer`` and
    ``scheduler``.

    Args:
        hyper_params: dict forwarded to ``rec.save_model``, ``rec.save_loss`` and
            ``rec.save_perfomance_training`` to name the output directory.

    Returns:
        ``(user_emb, item_emb)`` computed by ``train_model`` in eval mode after
        the *last* epoch (not necessarily the best-scoring one).
    """
    performance_dict = {'Epoch':[],
                        'Hit Ratio': [],
                        'Precision':[],
                        'Recall': [],
                        'NDCG': [],
                        'Measure Time': []}
    total_train_losses = []
    total_rec_losses = []
    total_reg_losses = []
    # Pre-bind so the return below is valid even when no epoch runs.
    user_emb = item_emb = None

    for ep in range(maxEpoch):
        train_losses = []
        rec_losses = []
        reg_losses = []

        # Evaluation at the end of the previous epoch left the model in eval mode.
        train_model.train()
        for batch in next_batch_pairwise(rec.data, batchSize):
            user_idx, pos_idx, neg_idx = batch
            user_emb, item_emb = train_model()

            anchor_emb = user_emb[user_idx]
            pos_emb = item_emb[pos_idx]
            neg_emb = item_emb[neg_idx]

            loss_rec, loss_reg = calculate_loss(anchor_emb, pos_emb, neg_emb, batchSize)
            batch_loss = loss_rec + loss_reg

            train_losses.append(batch_loss.item())
            rec_losses.append(loss_rec.item())
            reg_losses.append(loss_reg.item())

            optimizer.zero_grad()
            batch_loss.backward()
            # Clip the global gradient norm to 4 before the optimizer step.
            torch.nn.utils.clip_grad_norm_(train_model.parameters(), 4)
            optimizer.step()

        # Epoch-level means of the per-batch losses.
        batch_train_loss = np.mean(train_losses)
        batch_rec_loss = np.mean(rec_losses)
        batch_reg_loss = np.mean(reg_losses)

        total_train_losses.append([ep, batch_train_loss])
        total_rec_losses.append([ep, batch_rec_loss])
        total_reg_losses.append([ep, batch_reg_loss])
        # The scheduler is stepped on the mean *training* loss.
        scheduler.step(batch_train_loss)

        # Evaluation
        train_model.eval()
        with torch.no_grad():
            user_emb, item_emb = train_model()
            measure, performance_ep = rec.fast_evaluation(ep, user_emb, item_emb)
            for key, value in performance_ep.items():
                if key in performance_dict:
                    performance_dict[key].append(value)

    if user_emb is None:
        # maxEpoch == 0 used to end in a NameError at the return statement;
        # return the embeddings of the untrained model instead.
        train_model.eval()
        with torch.no_grad():
            user_emb, item_emb = train_model()

    rec.save_model(train_model, hyper_params)
    rec.save_loss(total_train_losses, total_rec_losses, total_reg_losses, hyper_params)
    rec.save_perfomance_training(performance_dict, hyper_params)

    return user_emb, item_emb

## Test

In [8]:
def test(rec, user_emb, item_emb, hyper_params):
    """Rank items for every test user, print the metrics and save them to disk.

    The per-user scoring loop used to be a line-for-line copy of
    ``GraphRecommender.test``; it now delegates to ``rec.test`` so the two
    cannot drift apart.

    Args:
        rec: recommender exposing ``test``, ``data.test_set``, ``topN`` and
            ``save_performance_test`` (a ``GraphRecommender`` in this notebook).
        user_emb: user embedding tensor.
        item_emb: item embedding tensor.
        hyper_params: dict forwarded to ``rec.save_performance_test``.
    """
    rec_list = rec.test(user_emb, item_emb)
    # Evaluate at every cut-off in rec.topN.
    result = ranking_evaluation(rec.data.test_set, rec_list, rec.topN)
    print(result)
    rec.save_performance_test(result, hyper_params)

## Configuration and data loading

In [9]:
# Run-level settings read as globals by calculate_loss() and train().
dataset = 'lastfm'
reg = 0.001
maxEpoch = 500
batchSize = 2048

# NOTE: GraphRecommender's save_* methods read these values by POSITION
# (0-7 for the directory suffix, 9 for the dataset folder), so keep the order.
args = dict(
    p=0.1,
    drop_rate=0,
    n_layers=2,          # number of propagation layers given to LGCN_Encoder
    input_dim=8,
    hyper_dim=128,       # passed to LGCN_Encoder as the embedding size
    hyperedge_num=128,
    lr=0.001,            # Adam learning rate
    reg=reg,
    leaky=0.5,
    dataset=dataset,
)

model = 'LightGCN'
config = ModelConf(f'./conf/{model}.conf')

# Training and test interactions live under ./dataset/<dataset>/.
training_data = FileIO.load_data_set(f"./dataset/{dataset}/{config['training.set']}", config['model.type'])
test_data = FileIO.load_data_set(f"./dataset/{dataset}/{config['test.set']}", config['model.type'])
knowledge_set = ''

# NOTE(review): `data` is not used by the visible cells (they use rec.data) — confirm before removing.
data = Interaction(config, training_data, test_data)
rec = GraphRecommender(config, training_data, test_data, knowledge_set, **args)

parameter ss_rate is not found in the configuration file!


In [10]:
# Encoder: embedding width comes from args['hyper_dim'], depth from args['n_layers'].
train_model = LGCN_Encoder(
    rec.data,
    args['hyper_dim'],
    args['n_layers'],
)

# Adam over all encoder parameters.
optimizer = torch.optim.Adam(train_model.parameters(), lr=args['lr'])

# Multiply the learning rate by 0.9 after 5 epochs without improvement in the monitored value.
scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.9, patience=5)

  i = torch.LongTensor([coo.row, coo.col])


In [11]:
# Run the training loop, then rank the test set with the embeddings it returns.
user_emb, item_emb = train(args)
test(rec, user_emb, item_emb, args)

Evaluating the model...
Progress: [++++++++++++++++++++++++++++++++++++++++++++++++++]100%
Test time: 3.284163 s
Measure time: 0.014926 s
------------------------------------------------------------------------------------------------------------------------
Real-Time Ranking Performance  (Top-20 Item Recommendation)
*Current Performance*
Epoch: 1, Hit Ratio:0.13575  |  Precision:0.0836  |  Recall:0.13607  |  NDCG:0.15898
*Best Performance* 
Epoch:fast_evaluation 1, Hit Ratio:0.13575  |  Precision:0.0836  |  Recall:0.13607  |  NDCG:0.15898
------------------------------------------------------------------------------------------------------------------------
Evaluating the model...
Progress: [++++++++++++++++++++++++++++++++++++++++++++++++++]100%
Test time: 0.180216 s
Measure time: 0.015509 s
------------------------------------------------------------------------------------------------------------------------
Real-Time Ranking Performance  (Top-20 Item Recommendation)
*Current Perfo