In [9]:
import numpy as np
import json
import pandas as pd
import torch
import pickle
from collections import defaultdict
from tqdm import tqdm
# import warnings
# warnings.filterwarnings("ignore")

In [3]:
n_users = 6611
n_items = 79937
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

In [5]:
# load data
# to do: str_id -> int
cite_file = 'data/paper_file_ann.txt'
coauthor_file = 'data/author_file_ann.txt'
author_train_file = 'data/bipartite_train_ann.txt'
author_test_file = 'data/bipartite_test_ann.txt'

import random as rd

def generate_test(all_user_ratings):
    ratings_test = {}
    for user in all_user_ratings:
        ratings_test[user] = rd.sample(all_user_ratings[user], 1)[0]
    return ratings_test

def load_data(cite_file, coauthor_file, author_train_file, author_test_file, n_users):
    citation, author_train, coauthor = defaultdict(list), defaultdict(list), defaultdict(list)
    user_ratings_train = defaultdict(list)
    test_ratings = defaultdict(list)
    
    train_users = set()

    with open(cite_file, 'r') as f:
        line = f.readlines()
    for l in line:
        src, tgt = l.strip().split(' ')
        src, tgt = int(src), int(tgt)
        citation['src'].append(src)
        citation['tgt'].append(tgt)

    with open(coauthor_file, 'r') as f:
        line = f.readlines()
    for l in line:
        src, tgt = l.strip().split(' ')
        src, tgt = int(src), int(tgt)
        coauthor['src'].append(src)
        coauthor['tgt'].append(tgt)
    
    with open(author_train_file, 'r') as f:
        line = f.readlines()
        train_interacts = len(line)
    for l in line:
        src, tgt = l.strip().split(' ')
        src, tgt = int(src), int(tgt)
        user_ratings_train[src].append(tgt)
        author_train['src'].append(src)
        author_train['tgt'].append(tgt+n_users)
        train_users.add(src)
    
    with open(author_test_file, 'r') as f:
        line = f.readlines()
    for l in line:
        src, tgt = l.strip().split(' ')
        src, tgt = int(src), int(tgt)
        test_ratings[src].append(tgt)
        
    user_ratings_test = generate_test(user_ratings_train)
            
    return citation, author_train, coauthor, user_ratings_train, user_ratings_test, train_interacts, list(train_users), test_ratings

citation, author_train, coauthor, user_ratings_train, user_ratings_test, train_interacts, train_users, test_ratings = \
            load_data(cite_file, coauthor_file, author_train_file, author_test_file, n_users)

In [11]:
# load pickle 
feature_file = 'data/feature.pkl'

def load_item_feature(feature_file):
    with open(feature_file, 'rb') as f:
        feature_matrix = pickle.load(f)
    # feature_matrix_shape: 79937*512
    print(feature_matrix.shape)
    return feature_matrix
# torch.Size([79937, 512])
item_feature = load_item_feature(feature_file)
item_feature_dim = 512

torch.Size([79937, 512])


In [19]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class BPR(nn.Module):
    def __init__(self, num_users, num_items, item_feature_dim):
        super(BPR, self).__init__()
        self.user_embedding = nn.Embedding(num_users, item_feature_dim)
        self.item_embedding = nn.Embedding(num_items, item_feature_dim)
        # 初始化项目嵌入为预训练特征
        self.item_embedding.weight.data = item_feature.clone().detach()

    def forward(self, user_indices, item_indices_pos, item_indices_neg):
        user_emb = self.user_embedding(user_indices)
        item_emb_pos = self.item_embedding(item_indices_pos)
        item_emb_neg = self.item_embedding(item_indices_neg)

        # 计算BPR损失
        pos_scores = torch.sum(user_emb * item_emb_pos, dim=1)
        neg_scores = torch.sum(user_emb * item_emb_neg, dim=1)
        loss = -F.logsigmoid(pos_scores - neg_scores).mean()
        return loss

    def predict(self, users, items):
        user_emb = self.user_embedding(users)
        item_emb = self.item_embedding(items)
        scores = torch.sum(user_emb * item_emb, dim=1)
        return scores

In [20]:
def generate_train_batch(user_ratings_train, n, batch_size, train_users, test_ratings):
    t = []
    user_pos_neg = []
    for b in range(batch_size):
        u = rd.sample(train_users, 1)[0]
        i = rd.sample(user_ratings_train[u], 1)[0]
        j = rd.randint(0, n - 1)
        # one negative sample
        while j in user_ratings_train[u] and j in test_ratings[u]:
            j = rd.randint(0, n - 1)
        t.append([u, i, 1])
        t.append([u, j, 0])
        user_pos_neg.append([u, i, j])
    train_batch = np.asarray(t)
    user_pos_neg = np.asarray(user_pos_neg)
    return train_batch, user_pos_neg

def generate_test_batch(user_ratings, user_ratings_test, n, train_users, test_ratings):
    t = []
    for u in train_users:
        i = user_ratings_test[u]
        rated = user_ratings[u]
        for j in range(10):
            k = np.random.randint(0, n)
            while k in rated and k in test_ratings[u]:
                k = np.random.randint(0, n)
            t.append([u, i, 1])
            t.append([u, k, 0])
    test_batch = np.asarray(t)
    return test_batch

In [36]:
from sklearn.metrics import precision_score, recall_score, f1_score
import numpy as np

def evaluation(pred, labels):
    auc = cal_auc(pred, labels)
    pred = (pred > 0.5).astype(int)
    labels = (labels > 0.5).astype(int)
    precision = precision_score(pred, labels, average='binary')
    recall = recall_score(pred, labels, average='binary')
    f1score = f1_score(pred, labels, average='binary')
    return auc, precision, recall, f1score

def cal_auc(pred, labels):
    P_ind = []  # 正样本下标
    F_ind = []  # 负样本下标

    #  计数过程
    for ind, val in enumerate(labels):
        if val > 0.5:
            P_ind.append(ind)
        else:
            F_ind.append(ind)

    new_data = [[p, l] for p, l in zip(pred, labels)]
    new_data.sort(key=lambda x:x[0])

    # 求正样本rank之和
    rank_sum = 0
    for ind, [prob, label] in enumerate(new_data):
        if label>0.5:
            rank_sum+=ind
    auc = (rank_sum - len(P_ind)*(1+len(P_ind))/2) / (len(P_ind)*len(F_ind))
    return auc

In [38]:
# params
batch_size = 65536
# emb_dim = 64
lr = 0.001
num_epoches = 15

In [39]:
model = BPR(n_users, n_items, item_feature_dim).to(device)
opt = torch.optim.Adam(lr=lr, params=model.parameters(), weight_decay=0.0001)

In [40]:
########################### START TRAINING & TESTING & EVALUATION#####################################
all_loss = []
performance = []
best_auc = 0
best_precision = 0
model_save_path = 'bpr_model/'
pre_loss = 0
for epoch in range(1, num_epoches+1):
    model.train()
    train_loss = 0
    n_batches = train_interacts // batch_size + 1
    pbar = tqdm(total=n_batches*batch_size*2)
    for i in range(n_batches):
        _, uij = generate_train_batch(user_ratings_train, n_items, batch_size, train_users, test_ratings)
        users, pos, neg = uij[:, 0], uij[:, 1], uij[:, 2]
        users, pos, neg = torch.tensor(users).to(device), torch.tensor(pos).to(device), torch.tensor(neg).to(device)
        loss = model(users, pos, neg)
        train_loss += loss.item()
        opt.zero_grad()
        loss.backward()
        opt.step()
        pbar.update(batch_size*2)
    train_loss /= n_batches
    all_loss.append(train_loss)
    print("epoch {}, train loss: {:4f}".format(epoch, train_loss))
    
    model.eval()
    with torch.no_grad():
        t_uij = generate_test_batch(user_ratings_train, user_ratings_test, n_items, train_users, test_ratings)
        users, items, labels = t_uij[:, 0], t_uij[:, 1], t_uij[:, 2]
        users, items = torch.tensor(users).to(device), torch.tensor(items).to(device)
        pred = model.predict(users, items)
        pred = pred.cpu().numpy()
        auc, precision, recall, f1score = evaluation(pred, labels)
        performance.append([auc, precision, recall, f1score])
        print("Evaluation: auc:{}, precision:{}, recall:{}, f1:{}".format(auc, precision, recall, f1score)) 
        if auc > best_auc and train_loss < pre_loss:
            best_auc = auc
            best_precision = precision
            state = {'net': model.state_dict(), 'opt':opt.state_dict(), 'epoch':epoch}
            torch.save(state, model_save_path+f"model_{epoch}.pth")
        pre_loss = train_loss

 24%|██▍       | 655360/2752512 [00:10<00:33, 63088.84it/s] 


epoch 1, train loss: 0.546612
Evaluation: auc:0.696197595679215, precision:0.5720775287704422, recall:0.6449414409514486, f1:0.6234426310665193


100%|██████████| 2752512/2752512 [00:17<00:00, 159525.04it/s]
100%|██████████| 2752512/2752512 [00:16<00:00, 168108.91it/s]

epoch 2, train loss: 0.380174
Evaluation: auc:0.8147068632302945, precision:0.7162325863113265, recall:0.742218988241909, f1:0.7477728857235454


100%|██████████| 2752512/2752512 [00:18<00:00, 151799.06it/s]


epoch 3, train loss: 0.312187
Evaluation: auc:0.8844355576580976, precision:0.7933070866141733, recall:0.8242362256680963, f1:0.8272540660034738


100%|██████████| 2752512/2752512 [00:18<00:00, 152288.15it/s]
100%|██████████| 2752512/2752512 [00:16<00:00, 162946.98it/s]

epoch 4, train loss: 0.279779
Evaluation: auc:0.9245370126982857, precision:0.8464566929133859, recall:0.8906119730971566, f1:0.8865416944206553


100%|██████████| 2752512/2752512 [00:18<00:00, 151953.59it/s]


epoch 5, train loss: 0.261954
Evaluation: auc:0.9459634351517223, precision:0.8812840702604482, recall:0.9267452908541244, f1:0.9220386242296541


100%|██████████| 2752512/2752512 [00:17<00:00, 154202.16it/s]
100%|██████████| 2752512/2752512 [00:16<00:00, 169534.62it/s]

epoch 6, train loss: 0.251740
Evaluation: auc:0.9577115029673846, precision:0.8980920654149, recall:0.9390683086268942, f1:0.9368336255508695


100%|██████████| 2752512/2752512 [00:17<00:00, 155079.11it/s]


epoch 7, train loss: 0.245674
Evaluation: auc:0.9655536973352054, precision:0.9059660811629315, recall:0.9462196287871295, f1:0.9443909522832384


100%|██████████| 2752512/2752512 [00:17<00:00, 153515.08it/s]
100%|██████████| 2752512/2752512 [00:16<00:00, 167956.58it/s]

epoch 8, train loss: 0.241588
Evaluation: auc:0.9688324124725173, precision:0.9142943670502726, recall:0.9489443943984931, f1:0.9501553955702426


100%|██████████| 2752512/2752512 [00:18<00:00, 151608.41it/s]


epoch 9, train loss: 0.240479
Evaluation: auc:0.9715279394020326, precision:0.9145972138098122, recall:0.950634891997835, f1:0.95109872373259


100%|██████████| 2752512/2752512 [00:17<00:00, 154263.60it/s]
100%|██████████| 2752512/2752512 [00:16<00:00, 159571.05it/s]

epoch 10, train loss: 0.240380
Evaluation: auc:0.9721389694258679, precision:0.9139915202907329, recall:0.9514911790794703, f1:0.9511653193400464


100%|██████████| 2752512/2752512 [00:18<00:00, 152144.76it/s]


epoch 11, train loss: 0.241036
Evaluation: auc:0.9726445985510314, precision:0.911114476075106, recall:0.9519876022157742, f1:0.9498326703289766


100%|██████████| 2752512/2752512 [00:17<00:00, 156781.14it/s]
100%|██████████| 2752512/2752512 [00:16<00:00, 160589.11it/s]

epoch 12, train loss: 0.242625
Evaluation: auc:0.9734461159306934, precision:0.9109630526953362, recall:0.9519372124849543, f1:0.9497272849260789


100%|██████████| 2752512/2752512 [00:17<00:00, 156712.27it/s]


epoch 13, train loss: 0.244652
Evaluation: auc:0.9748404881721006, precision:0.9099030890369473, recall:0.9536994592449273, f1:0.9499569207420699


100%|██████████| 2752512/2752512 [00:17<00:00, 155623.05it/s]
100%|██████████| 2752512/2752512 [00:16<00:00, 168373.24it/s]

epoch 14, train loss: 0.246887
Evaluation: auc:0.9752957953918866, precision:0.912931556632344, recall:0.9535728411338167, f1:0.9515467171717171


100%|██████████| 2752512/2752512 [00:17<00:00, 158986.31it/s]


epoch 15, train loss: 0.249456
Evaluation: auc:0.9745688019778407, precision:0.9082374318594791, recall:0.9532601387715899, f1:0.9488479517824515


