In [1]:
import json
import math
import os
import pickle
import random
import time
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as Data
from torch.autograd import Variable
from tqdm import tqdm

warnings.filterwarnings("ignore")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
class Load_data():
    """Load rating files and build (user, positive item, negative item) training triples.

    Attributes set by ``__init__``:
        train_rating, test_rating: tensors returned by ``load_rating``.
        test_negative: dict mapping a user id to the value parsed by
            ``load_test_negative``.
        train_negative: dict mapping a user id to its sampled negative item
            ids (assigned inside ``get_train_group``).
        train_group: integer tensor with one ``[user, positive, negative]``
            row per training record.
    """

    # Cache file for the sampled training negatives and the number of
    # negatives drawn per user when that cache has to be (re)built.
    NEG_CACHE_PATH = 'data/train_neg_dict.json'
    NEG_SAMPLE_COUNT = 100

    def __init__(self, train_rating_path, test_rating_path, test_negative_path):
        self.train_rating = self.load_rating(train_rating_path)
        self.test_rating = self.load_rating(test_rating_path)
        self.test_negative = self.load_test_negative(test_negative_path)
        self.train_group = self.get_train_group()

    def load_test_negative(self, path):
        """Read a two-column, tab-separated file into ``{int(col 0): eval(col 1)}``."""
        test_negative = {}
        # ndmin=2 keeps a single-row file two-dimensional.
        data = np.loadtxt(path, delimiter='\t', dtype='str', ndmin=2)
        for row in data:
            # SECURITY NOTE(review): eval() executes whatever expression the
            # file contains. Only load files you trust; if the column always
            # holds a plain list literal, ast.literal_eval would be safer.
            test_negative[int(row[0])] = eval(row[1])
        return test_negative

    def load_rating(self, path):
        """Load a tab-separated rating file and print basic statistics.

        Column 0 is read as the user id and column 1 as the item id.

        Returns:
            A tensor holding every column except the last two.
        """
        # ndmin=2 keeps a single-row file two-dimensional.
        rating = np.loadtxt(path, delimiter='\t', ndmin=2)
        record_count = rating.shape[0]
        user_count = int(rating[:, 0].max()) + 1
        item_count = int(rating[:, 1].max()) + 1
        print('Loaded:', path)
        print('Num of users:', user_count)
        print('Num of items:', item_count)
        print('Data sparsity:', record_count / (user_count * item_count))
        # Drop the trailing two columns.
        # NOTE(review): presumably these are rating and timestamp, leaving
        # only (user, item) -- confirm against the data files.
        return torch.from_numpy(rating[:, :-2])

    def _sample_groups(self, neg):
        """Pair each training record with one negative drawn at random from ``neg[user]``."""
        groups = []
        # One bulk tolist() avoids indexing the tensor element by element.
        for user, item in self.train_rating[:, :2].tolist():
            user, item = int(user), int(item)
            groups.append([user, item, int(random.choice(neg[user]))])
        return torch.tensor(groups)

    def get_train_group(self):
        """Load (or compute and cache) per-user negatives, then build the training triples.

        Side effects: stores the negatives in ``self.train_negative`` and
        writes ``NEG_CACHE_PATH`` when it does not exist yet. An existing
        cache file is reused as-is, so delete it after changing the training
        data.
        """
        cache_path = self.NEG_CACHE_PATH
        if os.path.exists(cache_path):
            neg = self.load_neg_dict_from_json(cache_path)
        else:
            neg = self.get_negative(self.train_rating, self.NEG_SAMPLE_COUNT)
            self.save_neg_dict_to_json(neg, cache_path)

        # Keep the negatives so resample_train_group() can draw again later.
        self.train_negative = neg
        return self._sample_groups(neg)

    def resample_train_group(self):
        """Build a fresh set of training triples from the stored negatives."""
        return self._sample_groups(self.train_negative)

    def get_negative(self, data, sample_count):
        """Sample up to ``sample_count`` items per user that the user never interacted with.

        Args:
            data: 2-D tensor/array whose column 0 is the user id and column 1
                the item id. Rows may appear in any order.
            sample_count: maximum number of negatives kept per user; users
                with fewer candidates keep all of them.

        Returns:
            dict mapping every user id in ``range(max user id + 1)`` to a list
            of item ids. Users without any record draw from all items.
        """
        print('Calculating negative samples...')
        if len(data) == 0:
            return {}
        user_count = int(data[:, 0].max()) + 1
        item_count = int(data[:, 1].max()) + 1

        # Collect each user's interacted items first, so the result does not
        # depend on the rows being grouped by user.
        seen = [set() for _ in range(user_count)]
        for user, item in tqdm(data[:, :2].tolist()):
            seen[int(user)].add(int(item))

        all_items = set(range(item_count))
        neg = {}
        for user in range(user_count):
            # Sorting gives random.sample a stable population order, so a
            # seeded run is reproducible.
            candidates = sorted(all_items - seen[user])
            neg[user] = random.sample(candidates, min(sample_count, len(candidates)))
        return neg

    def load_test_negative_from_file(self, path):
        """Parse rows of the form ``(user,item)<TAB>neg<TAB>neg...`` from ``path``.

        Returns:
            dict mapping a user id to a flat list holding that row's negative
            item ids followed by the row's own item id. Rows repeating a user
            are appended to the same list.
        """
        result = {}
        neg = np.loadtxt(path, delimiter='\t', dtype='str', ndmin=2)
        print('Loaded:', path)
        print('1000 negative test cases for each user')
        for row in tqdm(neg):
            # The first field looks like "(u,i)"; strip the parentheses.
            ui = tuple(map(int, row[0][1:-1].split(',')))
            candidates = result.setdefault(ui[0], [])
            candidates.extend(map(int, row[1:]))
            candidates.append(ui[1])
        return result

    def save_neg_dict_to_json(self, neg, path):
        """Write the negative-sample dict to ``path`` as JSON (keys become strings)."""
        print('Saving negative samples to file:', path)
        with open(path, "w") as f:
            json.dump(neg, f, sort_keys=True)

    def save_neg_dict_to_pickle(self, neg, path):
        """Write the negative-sample dict to ``path`` with pickle."""
        print('Saving negative samples to file:', path)
        with open(path, "wb") as f:
            pickle.dump(neg, f)

    def load_neg_dict_from_json(self, path):
        """Read a dict written by ``save_neg_dict_to_json``, restoring integer keys."""
        print('Loading negative samples from file', path)
        with open(path, 'r') as f:
            d = json.load(f)
        return {int(key): value for key, value in d.items()}

    def load_neg_dict_from_pickle(self, path):
        """Read a dict written by ``save_neg_dict_to_pickle``."""
        print('Loading negative samples from file', path)
        with open(path, 'rb') as f:
            # SECURITY NOTE(review): pickle.load can execute arbitrary code;
            # only open files produced by this notebook.
            return pickle.load(f)

In [3]:
class Config():
    """Run-wide settings: debug switch, device, optimiser/schedule values and data paths."""

    def __init__(self):
        # Input files.
        self.train_path = 'data/train.rating'
        self.test_path = 'data/test.rating'
        self.test_negative_path = 'data/test.negative'

        # Debug mode shortens the run to two epochs.
        self.DEBUG = True
        if self.DEBUG:
            self.epoches = 2
        else:
            self.epoches = 50

        # Optimisation settings.
        self.lr = 0.03
        self.batch_size = 512

        # Use the first CUDA device when one is available, otherwise the CPU.
        device_name = "cuda:0" if torch.cuda.is_available() else "cpu"
        self.device = torch.device(device_name)


args = Config()

In [4]:
# Read the rating files named in the config and build the training triples.
print('Data loading...')
data = Load_data(
    train_rating_path=args.train_path,
    test_rating_path=args.test_path,
    test_negative_path=args.test_negative_path,
)

Data loading...
Loaded: data/train.rating
Num of users: 943
Num of items: 1152
Data sparsity: 0.08930017968657948
Loaded: data/test.rating
Num of users: 943
Num of items: 1148
Data sparsity: 0.0008710801393728223
Loading negative samples from file data/train_neg_dict.json


In [18]:
class ConvNCF(nn.Module):
    """Scores (user, item) pairs from their embeddings.

    Two scoring paths share the same embedding tables:
      * pre-training path: inner product of the two embeddings;
      * ConvNCF path: outer product of the embeddings, a stack of stride-2
        convolutions that reduces the map to 1 x 1, and a linear layer.

    Args:
        user_count: number of rows in the user embedding table.
        item_count: number of rows in the item embedding table.
        embedding_size: embedding width; must be a power of two so the
            stride-2 convolutions reduce the interaction map to exactly 1 x 1.
        channel_size: number of channels in every convolution.
        device: optional device to move the whole module to. When omitted the
            module stays where PyTorch creates it and the caller is expected
            to call ``.to(device)``.
    """

    def __init__(self, user_count, item_count, embedding_size=64, channel_size=32, device=None):
        super(ConvNCF, self).__init__()

        self.user_count = user_count
        self.item_count = item_count

        self.embedding_size = embedding_size

        self.P = nn.Embedding(self.user_count, self.embedding_size)
        self.Q = nn.Embedding(self.item_count, self.embedding_size)

        # CNN definition and parameters
        self.channel_size = channel_size
        self.kernel_size = 2
        self.strides = 2

        # Each kernel-2 / stride-2 convolution halves both spatial dimensions,
        # so log2(embedding_size) layers are needed to reach 1 x 1.
        layer_count = self.embedding_size.bit_length() - 1
        if self.embedding_size < 2 or (1 << layer_count) != self.embedding_size:
            raise ValueError('embedding_size must be a power of two >= 2')

        # With the defaults the map goes
        # batch * 1 * 64 * 64 -> 32 * 32 * 32 -> ... -> batch * 32 * 1 * 1.
        layers = []
        in_channels = 1
        for _ in range(layer_count):
            layers.append(nn.Conv2d(in_channels, self.channel_size, self.kernel_size, stride=self.strides))
            layers.append(nn.ReLU())
            in_channels = self.channel_size
        self.cnn = nn.Sequential(*layers)

        # Fully connected output layer
        self.fc = nn.Linear(self.channel_size, 1)

        if device is not None:
            self.to(device)

    def _as_index(self, ids):
        """Return ``ids`` as a long tensor on the same device as the embeddings."""
        device = self.P.weight.device
        if torch.is_tensor(ids):
            # Stay on the tensor path: no Python-list round trip.
            return ids.to(device=device, dtype=torch.long)
        return torch.tensor(list(map(int, ids)), dtype=torch.long, device=device)

    def forward(self, user_ids, item_ids, is_pretrain):
        """Score each (user, item) pair.

        Args:
            user_ids: tensor or iterable of user ids.
            item_ids: tensor or iterable of item ids, same length as ``user_ids``.
            is_pretrain: when true use the inner-product path, otherwise the
                outer-product + CNN path.

        Returns:
            1-D tensor with one score per pair.
        """
        user_embeddings = self.P(self._as_index(user_ids))
        item_embeddings = self.Q(self._as_index(item_ids))

        if is_pretrain:
            # Inner product
            return torch.sum(torch.mul(user_embeddings, item_embeddings), dim=1)

        # Outer product: (batch, E, 1) x (batch, 1, E) -> (batch, E, E)
        interaction_map = torch.bmm(user_embeddings.unsqueeze(2), item_embeddings.unsqueeze(1))
        interaction_map = interaction_map.view((-1, 1, self.embedding_size, self.embedding_size))

        # CNN output: batch * channel_size * 1 * 1
        feature_map = self.cnn(interaction_map)
        feature_vec = feature_map.view((-1, self.channel_size))
        prediction = self.fc(feature_vec)
        return prediction.view((-1))

In [19]:
# BPR (Bayesian Personalized Ranking) loss
class BPRLoss(nn.Module):
    """Pairwise ranking loss: ``sum(log(1 + exp(-(pos - neg))))`` over the batch."""

    def __init__(self):
        super(BPRLoss, self).__init__()
        # Not used by forward(); kept so code that reads ``.sigmoid`` still works.
        self.sigmoid = nn.Sigmoid()

    def forward(self, pos_preds, neg_preds):
        """Return the summed loss for matching positive/negative score tensors.

        Args:
            pos_preds: scores of the positive items.
            neg_preds: scores of the negative items, same shape as ``pos_preds``.

        Returns:
            0-d tensor holding the sum of the per-pair losses.
        """
        distance = pos_preds - neg_preds
        # softplus(x) = log(1 + exp(x)), computed without overflowing: the
        # literal formula returns inf once exp(-distance) exceeds the float range.
        return torch.sum(nn.functional.softplus(-distance))

In [20]:
# Size the embedding tables from every id the model will be asked to embed:
# training users, positive and negative training items, and the test set.
# Sizing from the training positives alone raises an index error as soon as a
# sampled negative or a test candidate has a larger id.
num_users = max(
    int(data.train_group[:, 0].max()),
    int(data.test_rating[:, 0].max()),
    max(data.test_negative, default=0),
) + 1
num_items = max(
    int(data.train_group[:, 1:].max()),
    int(data.test_rating[:, 1].max()),
    max((max(items) for items in data.test_negative.values() if len(items)), default=0),
) + 1

model = ConvNCF(num_users, num_items)
model = model.to(args.device)
optimizer = optim.Adagrad(model.parameters(), lr = args.lr, weight_decay=1e-2)
# Multiplies the learning rate by 0.1 every 50 calls to scheduler.step().
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=50, gamma=0.1, last_epoch=-1)
bpr_loss = BPRLoss().to(args.device)

In [21]:
# Mini-batch iterator over the [user, positive item, negative item] rows.
train_loader = Data.DataLoader(
    dataset=data.train_group,
    batch_size=args.batch_size,
    shuffle=True,
    num_workers=4,
)

In [22]:
# Evaluation metrics: hit ratio (HR) and NDCG
def scoreK(topi, k, target=99):
    """Return ``(hr, ndcg)`` for one ranked list of candidate indices.

    Args:
        topi: ranked indices, best first (tensor, array or plain sequence).
        k: cut-off; only the first ``k`` entries are considered.
        target: index whose presence counts as a hit. Defaults to 99.
            NOTE(review): presumably the held-out positive item sits at
            position 99 of each candidate list -- confirm against the test file.

    Returns:
        ``hr`` is 1.0 when ``target`` is among the first ``k`` entries, else
        0.0. ``ndcg`` is ``log(2) / log(rank + 2)`` for the 0-based rank of
        the hit, or 0.0 on a miss.
    """
    ranked = topi.tolist() if hasattr(topi, 'tolist') else list(topi)
    top_k = ranked[:k]
    if target not in top_k:
        return 0.0, 0.0
    rank = top_k.index(target)
    return 1.0, math.log(2) / math.log(rank + 2)

In [23]:
# Per-epoch metric history and the best result seen so far.
hr10 = []
ndcg10 = []
best_hr, best_ndcg = 0, 0
model_path = ''


# Model evaluation
def evaluate(epoch, is_pretrain=False):
    """Score every user's test candidates, log HR@10 / NDCG@10 and checkpoint the best model.

    Reads the globals ``model``, ``data``, ``args`` and ``Time`` (the run
    timestamp assigned by the training cell, which must run first).
    Appends to ``hr10`` / ``ndcg10`` and updates ``best_hr``, ``best_ndcg``
    and ``model_path`` when the hit ratio improves.

    Args:
        epoch: epoch number used in the checkpoint file name.
        is_pretrain: forwarded to the model to choose the scoring path;
            the default (False) scores with the CNN path.
    """
    global best_hr, best_ndcg, model_path
    was_training = model.training
    model.eval()
    hrs = []
    ndcgs = []
    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        for u, candidates in data.test_negative.items():
            if len(candidates) == 0:
                continue
            item_ids = torch.tensor(candidates).to(args.device)
            user_ids = torch.tensor([u] * len(item_ids)).to(args.device)
            predictions = model(user_ids, item_ids, is_pretrain)
            # topk raises when asked for more entries than there are candidates.
            topv, topi = torch.topk(predictions, min(10, len(item_ids)), dim=0)
            hr, ndcg = scoreK(topi, 10)
            hrs.append(hr)
            ndcgs.append(ndcg)
    # Hand the model back in the mode it arrived in.
    model.train(was_training)

    if not hrs:
        return
    mean_hr = sum(hrs) / len(hrs)
    mean_ndcg = sum(ndcgs) / len(ndcgs)

    os.makedirs('./ConvNCF-work', exist_ok=True)
    with open('./ConvNCF-work/{}.log'.format(Time), 'a+') as f:
        f.write('HR@10:{:.4f},\tNDCG@10:{:.4f}\n'.format(mean_hr, mean_ndcg))

    hr10.append(mean_hr)
    ndcg10.append(mean_ndcg)
    if mean_hr > best_hr:
        best_hr, best_ndcg = mean_hr, mean_ndcg
        os.makedirs('./ConvNCF-temp', exist_ok=True)
        model_path = './ConvNCF-temp/{}_epoch{}.model'.format(Time, epoch)
        torch.save(model.state_dict(), model_path)

In [26]:
losses = []
accuracies = []
# Number of initial epochs trained with the inner-product (BPR) scorer before
# switching to the ConvNCF path. Set args.pretrain_epochs to override.
pretrain_epochs = getattr(args, 'pretrain_epochs', 10)
# Run timestamp; also read by evaluate() for its log and checkpoint names.
Time = time.strftime("%Y-%m-%d-%H-%M-%S", time.localtime())
log_path = './ConvNCF-work/{}.log'.format(Time)
os.makedirs(os.path.dirname(log_path), exist_ok=True)
print('train...')
for epoch in range(args.epoches):
    # evaluate() switches the model to eval mode, so re-enter train mode here.
    model.train()
    is_pretrain = epoch < pretrain_epochs

    t1 = time.time()
    total_loss = 0
    batch_count = 0

    for train_data in train_loader:
        # Plain tensors carry autograd state; the Variable wrapper is not needed.
        user_ids = train_data[:, 0].to(args.device)
        pos_item_ids = train_data[:, 1].to(args.device)
        neg_item_ids = train_data[:, 2].to(args.device)
        optimizer.zero_grad()
        # Pre-train with the BPR inner product first, then train ConvNCF.
        pos_preds = model(user_ids, pos_item_ids, is_pretrain)
        neg_preds = model(user_ids, neg_item_ids, is_pretrain)

        loss = bpr_loss(pos_preds, neg_preds)
        total_loss += loss.item()
        loss.backward()
        optimizer.step()
        batch_count += 1
    t2 = time.time()
    # max(..., 1) keeps an empty loader from raising.
    losses.append(total_loss / max(batch_count, 1))
    with open(log_path, 'a+') as f:
        f.write('Epoch:{},\tTrain loss:{:.4f},\tTime:{:.4f}\n'.format(epoch, losses[-1],  t2-t1))
    # NOTE: evaluate() scores with the CNN path by default, so metrics logged
    # during the pre-training epochs come from a CNN head that has not been
    # trained yet.
    evaluate(epoch)
    scheduler.step()

print('Done...')
print('best_hr:{}\tbest_ndcg:{}'.format(best_hr, best_ndcg))
print("The best model is saved to {}".format(model_path))


train...
Done...
best_hr:0.08059384941675504	best_ndcg:0.037491862987662786
The best model is saved to ./ConvNCF-temp/2023-02-13-20-43-05_epoch0.model
