In [1]:
import math
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.parameter import Parameter
from torch.nn.init import xavier_normal_, xavier_uniform_
import argparse
import numpy as np
import time
import sys
from os.path import abspath

from base.graph_recommender import GraphRecommender
from util.sampler import next_batch_pairwise_kg, next_batch_pairwise
from util.conf import OptionConf
import torch
import torch.nn as nn 
import torch.nn.functional as F
from scipy.sparse import coo_matrix
from util.loss_torch import bpr_loss, l2_reg_loss, EmbLoss, contrastLoss
from util.init import *
from base.torch_interface import TorchGraphInterface
import os
import numpy as np 
import time 
from torch.optim.lr_scheduler import ReduceLROnPlateau

from data.loader import FileIO
from util.conf import ModelConf
from base.recommender import Recommender
from data.ui_graph import Interaction
# from data.knowledge import Knowledge
from util.algorithm import find_k_largest
from time import strftime, localtime
from data.loader import FileIO
from util.evaluation import ranking_evaluation


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [3]:
def innerProduct(usrEmbeds, itmEmbeds):
	return torch.sum(usrEmbeds * itmEmbeds, dim=-1)

def pairPredict(ancEmbeds, posEmbeds, negEmbeds):
	return innerProduct(ancEmbeds, posEmbeds) - innerProduct(ancEmbeds, negEmbeds)

def calcRegLoss(model):
	ret = 0
	for W in model.parameters():
		ret += W.norm(2).square()
	# ret += (model.usrStruct + model.itmStruct)
	return ret

def contrastLoss(embeds1, embeds2, nodes, temp):
	embeds1 = F.normalize(embeds1 + 1e-8, p=2)
	embeds2 = F.normalize(embeds2 + 1e-8, p=2)
	pckEmbeds1 = embeds1[nodes]
	pckEmbeds2 = embeds2[nodes]
	nume = torch.exp(torch.sum(pckEmbeds1 * pckEmbeds2, dim=-1) / temp)
	deno = torch.exp(pckEmbeds1 @ pckEmbeds2.T / temp).sum(-1) + 1e-8
	return -torch.log(nume / deno).mean()


In [4]:
class GraphRecommender(Recommender):
    def __init__(self, conf, training_set, test_set, **kwargs):
        super(GraphRecommender, self).__init__(conf, training_set, test_set, **kwargs)
        self.data = Interaction(conf, training_set, test_set)
        self.bestPerformance = []
        top = self.ranking['-topN'].split(',')
        self.topN = [int(num) for num in top]
        self.max_N = max(self.topN)

    def print_model_info(self):
        super(GraphRecommender, self).print_model_info()
        # # print dataset statistics
        print('Training Set Size: (user number: %d, item number %d, interaction number: %d)' % (self.data.training_size()))
        print('Test Set Size: (user number: %d, item number %d, interaction number: %d)' % (self.data.test_size()))
        print('=' * 80)

    def build(self):
        pass

    def train(self):
        pass

    def predict(self, u):
        pass

    def test(self, user_emb, item_emb):
        def process_bar(num, total):
            rate = float(num) / total
            ratenum = int(50 * rate)
            r = '\rProgress: [{}{}]{}%'.format('+' * ratenum, ' ' * (50 - ratenum), ratenum*2)
            sys.stdout.write(r)
            sys.stdout.flush()

        # predict
        rec_list = {}
        user_count = len(self.data.test_set)
        for i, user in enumerate(self.data.test_set):
            # s_find_candidates = time.time()
            
            
            # candidates = predict(user)
            user_id  = self.data.get_user_id(user)
            score = torch.matmul(user_emb[user_id], item_emb.transpose(0, 1))
            candidates = score.cpu().numpy()
            
            # e_find_candidates = time.time()
            # print("Calculate candidates time: %f s" % (e_find_candidates - s_find_candidates))
            # predictedItems = denormalize(predictedItems, self.data.rScale[-1], self.data.rScale[0])
            rated_list, li = self.data.user_rated(user)
            for item in rated_list:
                candidates[self.data.item[item]] = -10e8
            
            # s_find_k_largest = time.time()
            ids, scores = find_k_largest(self.max_N, candidates)
            # e_find_k_largest = time.time()
            # print("Find k largest candidates: %f s" % (e_find_k_largest - s_find_k_largest))
            item_names = [self.data.id2item[iid] for iid in ids]
            rec_list[user] = list(zip(item_names, scores))
            if i % 1000 == 0:
                process_bar(i, user_count)
        process_bar(user_count, user_count)
        print('')
        return rec_list

    def evaluate(self, rec_list):
        self.recOutput.append('userId: recommendations in (itemId, ranking score) pairs, * means the item is hit.\n')
        for user in self.data.test_set:
            line = str(user) + ':'
            for item in rec_list[user]:
                line += ' (' + str(item[0]) + ',' + str(item[1]) + ')'
                if item[0] in self.data.test_set[user]:
                    line += '*'
            line += '\n'
            self.recOutput.append(line)
        current_time = strftime("%Y-%m-%d %H-%M-%S", localtime(time.time()))
        # output prediction result
        out_dir = self.output['-dir']
        file_name = self.config['model.name'] + '@' + current_time + '-top-' + str(self.max_N) + 'items' + '.txt'
        FileIO.write_file(out_dir, file_name, self.recOutput)
        print('The result has been output to ', abspath(out_dir), '.')
        file_name = self.config['model.name'] + '@' + current_time + '-performance' + '.txt'
        self.result = ranking_evaluation(self.data.test_set, rec_list, self.topN)
        self.model_log.add('###Evaluation Results###')
        self.model_log.add(self.result)
        FileIO.write_file(out_dir, file_name, self.result)
        print('The result of %s:\n%s' % (self.model_name, ''.join(self.result)))

    def fast_evaluation(self, epoch, user_embed, item_embed, kwargs=None):
        print('Evaluating the model...')
        s_test = time.time()
        rec_list = self.test(user_embed, item_embed)
        e_test = time.time() 
        print("Test time: %f s" % (e_test - s_test))
        
        s_measure = time.time()
        measure = ranking_evaluation(self.data.test_set, rec_list, [self.max_N])
        e_measure = time.time()
        print("Measure time: %f s" % (e_measure - s_measure))
        
        if len(self.bestPerformance) > 0:
            count = 0
            performance = {}
            for m in measure[1:]:
                k, v = m.strip().split(':')
                performance[k] = float(v)
            for k in self.bestPerformance[1]:
                if self.bestPerformance[1][k] > performance[k]:
                    count += 1
                else:
                    count -= 1
            if count < 0:
                self.bestPerformance[1] = performance
                self.bestPerformance[0] = epoch + 1
                try:
                    self.save(kwargs)
                except:
                    self.save()
        else:
            self.bestPerformance.append(epoch + 1)
            performance = {}
            for m in measure[1:]:
                k, v = m.strip().split(':')
                performance[k] = float(v)
            self.bestPerformance.append(performance)
            try:
                self.save(kwargs)
            except:
                self.save()
        print('-' * 120)
        print('Real-Time Ranking Performance ' + ' (Top-' + str(self.max_N) + ' Item Recommendation)')
        measure = [m.strip() for m in measure[1:]]
        print('*Current Performance*')
        print('Epoch:', str(epoch + 1) + ',', '  |  '.join(measure))
        bp = ''
        # for k in self.bestPerformance[1]:
        #     bp+=k+':'+str(self.bestPerformance[1][k])+' | '
        bp += 'Hit Ratio' + ':' + str(self.bestPerformance[1]['Hit Ratio']) + '  |  '
        bp += 'Precision' + ':' + str(self.bestPerformance[1]['Precision']) + '  |  '
        bp += 'Recall' + ':' + str(self.bestPerformance[1]['Recall']) + '  |  '
        # bp += 'F1' + ':' + str(self.bestPerformance[1]['F1']) + ' | '
        bp += 'NDCG' + ':' + str(self.bestPerformance[1]['NDCG'])
        print('*Best Performance* ')
        print('Epoch:fast_evaluation', str(self.bestPerformance[0]) + ',', bp)
        print('-' * 120)
        return measure

In [5]:
class HCCFModel(nn.Module):
    def __init__(self, config, data):
        super(HCCFModel, self).__init__()
        self.data = data
        self._parse_args(config)

        init = nn.init.xavier_uniform_
        adj = self.data.bi_interaction_mat
        self.adj  = TorchGraphInterface.convert_sparse_mat_to_tensor(adj).to(device) 
        
        # init embedding
        self.user_embedding = nn.Parameter(init(torch.zeros(self.data.user_num, self.input_dim)))
        self.item_embedding = nn.Parameter(init(torch.zeros(self.data.item_num, self.input_dim)))

        self.uHyper = nn.Parameter(init(torch.zeros(self.input_dim, self.hyper_dim)))
        self.iHyper = nn.Parameter(init(torch.zeros(self.input_dim, self.hyper_dim)))

        self.edgeDropper = SpAdjDropEdge()
        self.gcnLayer = GCNLayer(self.leaky)
        self.hgnnLayer = HGNNLayer(self.leaky, self.hyper_dim)
        
    def _parse_args(self, config):
        self.gnn_layer = int(config['gnn_layer'])
        self.input_dim = int(config['embedding.size'])
        self.hyper_dim = int(config['hyper.size'])
        self.drop_rate = float(config['dropout'])
        self.leaky = float(config['leaky'])
        self.temp = float(config['temp'])

    def calculate_loss(self, ancs, poss, negs, keep_rate):
        uEmbeds, iEmbeds, gcnEmbedsLst, hyperEmbedsLst = self.forward(keep_rate)

        ancEmbeds = uEmbeds[ancs]
        posEmbeds = iEmbeds[poss]
        negEmbeds = iEmbeds[negs]
        scoreDiff = pairPredict(ancEmbeds, posEmbeds, negEmbeds)
        bprLoss = - (scoreDiff).sigmoid().log().mean()

        sslLoss = 0
        for i in range(self.gnn_layer):
            embeds1 = gcnEmbedsLst[i].detach()
            embeds2 = hyperEmbedsLst[i]
            sslLoss += contrastLoss(embeds1[:self.data.user_num], embeds2[:self.data.user_num], torch.unique(ancs), self.temp) + contrastLoss(embeds1[self.data.user_num:], embeds2[self.data.user_num:], torch.unique(poss), self.temp)
        return bprLoss, sslLoss

        
    def forward(self, keep_rate):
        uEmbed = self.user_embedding   
        iEmbed = self.item_embedding      
        
        # print("uEmbed: ")
        # print(uEmbed)  
        embeds = torch.cat((uEmbed, iEmbed), dim=0)
        lats = [embeds]
        gnnLats = []
        hyperLats = []
        uuHyper = uEmbed @ self.uHyper
        iiHyper = iEmbed @ self.iHyper

        for i in range(self.gnn_layer):
            dropped_edge = self.edgeDropper(self.adj, keep_rate)
            temEmbeds = self.gcnLayer(dropped_edge.to_dense(), lats[-1])
            # if torch.any(torch.isnan(lats[-1])):
            #     import pdb; pdb.set_trace()
            hyperULat = self.hgnnLayer(F.dropout(uuHyper, p=1-keep_rate), lats[-1][:self.data.user_num])
            hyperILat = self.hgnnLayer(F.dropout(iiHyper, p=1-keep_rate), lats[-1][self.data.user_num:])
            gnnLats.append(temEmbeds)
            hyperLats.append(torch.cat([hyperULat, hyperILat], dim=0))
            lats.append(temEmbeds + hyperLats[-1])
        embeds = sum(lats)
        user_embed =  embeds[:self.data.user_num]
        item_embed = embeds[self.data.user_num:]
        # print(user_embed)
        # print(item_embed)
        return user_embed, item_embed, gnnLats, hyperLats

class GCNLayer(nn.Module):
	def __init__(self, leaky):
		super(GCNLayer, self).__init__()
		self.act = nn.LeakyReLU(negative_slope=leaky)

	def forward(self, adj, embeds):
		return self.act(torch.spmm(adj, embeds))

class HGNNLayer(nn.Module):
    def __init__(self, leaky,  hyper_dim):
        super(HGNNLayer, self).__init__()
        self.hyper_dim = hyper_dim
        self.act = nn.LeakyReLU(negative_slope=leaky)
        self.fc1 = nn.Linear(hyper_dim, hyper_dim ,bias=False) 
        self.fc2 = nn.Linear(hyper_dim, hyper_dim ,bias=False)  
        self.fc3 = nn.Linear(hyper_dim, hyper_dim ,bias=False)  

    def forward(self, adj, embeds):
        
        lat1 = self.act(adj.T @ embeds)
        lat2 = self.act(self.fc1(lat1.T).T) +  lat1
        lat3 = self.act(self.fc2(lat2.T).T) + lat2
        lat4 = self.act(self.fc3(lat3.T).T) + lat3 
        ret = self.act(adj @ lat4)
        return ret

class SpAdjDropEdge(nn.Module):
	def __init__(self):
		super(SpAdjDropEdge, self).__init__()

	def forward(self, adj, keepRate):
		if keepRate == 1.0:
			return adj
		vals = adj._values()
		idxs = adj._indices()
		edgeNum = vals.size()
		mask = ((torch.rand(edgeNum) + keepRate).floor()).type(torch.bool)
		newVals = vals[mask] / keepRate
		newIdxs = idxs[:, mask]
		return torch.sparse.FloatTensor(newIdxs, newVals, adj.shape)


In [6]:
class HCCF(GraphRecommender):
    def __init__(self, conf, training_set, test_set, **kwargs):
        GraphRecommender.__init__(self, conf, training_set, test_set, **kwargs)
        # config = OptionConf(self.config['HGNN'])

        self.reg_loss = EmbLoss() 
        self.model = HCCFModel(self.config, self.data )

        self._parse_config(self.config)
        self.model.to(device)

        self.optimizer = torch.optim.Adam(self.model.parameters(), lr=self.lRate)
        self.scheduler = ReduceLROnPlateau(self.optimizer, 'min', factor=self.lr_decay,patience=7)

    def _parse_config(self, config):
        self.lRate = float(config['learnRate'])
        self.lr_decay = float(config['learnRateDecay'])
        self.maxEpoch = int(config['num.max.epoch'])
        self.batchSize = int(config['batch_size'])
        self.reg = float(config['reg.lambda'])
        self.embeddingSize = int(config['embedding.size'])
        self.hyperDim = int(config['hyper.size'])
        self.dropRate = float(config['dropout'])
        self.negSlove = float(config['leaky'])
        self.nLayers = int(config['gnn_layer'])

    def train(self):
        model = self.model 

        for ep in range(self.maxEpoch):
            for n, batch in enumerate(next_batch_pairwise(self.data, self.batchSize)):
                user_idx, pos_idx, neg_idx = batch
                model.train()
                # s_model = time.time()
                rec_user_emb, rec_item_emb, _, _ = model(keep_rate=1- self.dropRate)
                bpr_loss, ssl_loss = model.calculate_loss(user_idx, pos_idx, neg_idx, keep_rate= 1-self.dropRate)
                batch_loss = bpr_loss + ssl_loss 

                self.optimizer.zero_grad()
                batch_loss.backward()

                torch.nn.utils.clip_grad_norm_(model.parameters(), 2)
                self.optimizer.step()
            model.eval()
            with torch.no_grad():
                self.user_emb, self.item_emb, _, _ = model(keep_rate=1)
            
            s_eval = time.time()
            self.fast_evaluation(ep)
            e_eval = time.time()
            print("Eval time: %f s" % (e_eval - s_eval))

        self.user_emb, self.item_emb = self.best_user_emb, self.best_item_emb

    def save(self):
        with torch.no_grad():
            self.best_user_emb, self.best_item_emb, _,_ = self.model(keep_rate=1)

    def predict(self, u):
        user_id  = self.data.get_user_id(u)
        score = torch.matmul(self.user_emb[user_id], self.item_emb.transpose(0, 1))
        return score.cpu().numpy()

In [7]:
model = 'HCCF'
config = ModelConf('./conf/' + model + '.conf')

lRate = float(config['learnRate'])
lr_decay = float(config['learnRateDecay'])
maxEpoch = int(config['num.max.epoch'])
batchSize = int(config['batch_size'])
reg = float(config['reg.lambda'])
embeddingSize = int(config['embedding.size'])
hyperDim = int(config['hyper.size'])
dropRate = float(config['dropout'])
negSlove = float(config['leaky'])
nLayers = int(config['gnn_layer'])

# ss_rate = float(config['ss_rate'])

training_data = FileIO.load_data_set(config['training.set'], config['model.type'])
test_data = FileIO.load_data_set(config['test.set'], config['model.type'])

rec = GraphRecommender(config, training_data, test_data, **args)
train_model = HCCF(rec.data, embeddingSize, args['n_layers'])
optimizer  = torch.optim.Adam(train_model.parameters(), lr=lRate)

NameError: name 'args' is not defined

In [None]:
for ep in range(maxEpoch):
    for n, batch in enumerate(next_batch_pairwise(rec.data, batchSize)):
        user_idx, pos_idx, neg_idx = batch
        train_model.train()
        # s_model = time.time()
        rec_user_emb, rec_item_emb, _, _ = train_model(keep_rate=1- dropRate)
        bpr_loss, ssl_loss = train_model.calculate_loss(user_idx, pos_idx, neg_idx, keep_rate= 1- dropRate)
        batch_loss = bpr_loss + ssl_loss 

        optimizer.zero_grad()
        batch_loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), 2)
        optimizer.step()
    
    train_model.eval()
    with torch.no_grad():
        user_emb, item_emb, _, _ = train_model(keep_rate=1)

    s_eval = time.time()
    rec.fast_evaluation(ep)
    e_eval = time.time()
    print("Eval time: %f s" % (e_eval - s_eval))