In [2]:
import numpy as np
import json
import pandas as pd
import torch
import pickle
from collections import defaultdict
from tqdm import tqdm
# import warnings
# warnings.filterwarnings("ignore")

In [3]:
n_users = 6611
n_items = 79937
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

In [4]:
# load data
# to do: str_id -> int
cite_file = 'data/paper_file_ann.txt'
coauthor_file = 'data/author_file_ann.txt'
author_train_file = 'data/bipartite_train_ann.txt'
author_test_file = 'data/bipartite_test_ann.txt'

import random as rd

def generate_test(all_user_ratings):
    ratings_test = {}
    for user in all_user_ratings:
        ratings_test[user] = rd.sample(all_user_ratings[user], 1)[0]
    return ratings_test

def load_data(cite_file, coauthor_file, author_train_file, author_test_file, n_users):
    citation, author_train, coauthor = defaultdict(list), defaultdict(list), defaultdict(list)
    user_ratings_train = defaultdict(list)
    test_ratings = defaultdict(list)
    
    train_users = set()

    with open(cite_file, 'r') as f:
        line = f.readlines()
    for l in line:
        src, tgt = l.strip().split(' ')
        src, tgt = int(src), int(tgt)
        citation['src'].append(src)
        citation['tgt'].append(tgt)

    with open(coauthor_file, 'r') as f:
        line = f.readlines()
    for l in line:
        src, tgt = l.strip().split(' ')
        src, tgt = int(src), int(tgt)
        coauthor['src'].append(src)
        coauthor['tgt'].append(tgt)
    
    with open(author_train_file, 'r') as f:
        line = f.readlines()
        train_interacts = len(line)
    for l in line:
        src, tgt = l.strip().split(' ')
        src, tgt = int(src), int(tgt)
        user_ratings_train[src].append(tgt)
        author_train['src'].append(src)
        author_train['tgt'].append(tgt+n_users)
        train_users.add(src)
    
    with open(author_test_file, 'r') as f:
        line = f.readlines()
    for l in line:
        src, tgt = l.strip().split(' ')
        src, tgt = int(src), int(tgt)
        test_ratings[src].append(tgt)
        
    user_ratings_test = generate_test(user_ratings_train)
            
    return citation, author_train, coauthor, user_ratings_train, user_ratings_test, train_interacts, list(train_users), test_ratings

citation, author_train, coauthor, user_ratings_train, user_ratings_test, train_interacts, train_users, test_ratings = \
            load_data(cite_file, coauthor_file, author_train_file, author_test_file, n_users)

In [5]:
# load pickle 
feature_file = 'data/feature.pkl'

def load_item_feature(feature_file):
    with open(feature_file, 'rb') as f:
        feature_matrix = pickle.load(f)
    # feature_matrix_shape: 79937*512
    print(feature_matrix.shape)
    return feature_matrix
# torch.Size([79937, 512])
item_feature = load_item_feature(feature_file)
item_feature_dim = 512

torch.Size([79937, 512])


In [27]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class LatentFactorizationModel(nn.Module):
    def __init__(self, num_users, num_items, num_factors):
        super(LatentFactorizationModel, self).__init__()
        self.user_factors = nn.Embedding(num_users, num_factors)
        self.item_factors = nn.Embedding(num_items, num_factors)
        self.item_factors.weight.data = item_feature.clone().detach()
        
        self.user_biases = nn.Embedding(num_users, 1)
        self.item_biases = nn.Embedding(num_items, 1)

    def forward(self, user_indices, item_indices):
        user_embedding = self.user_factors(user_indices)
        item_embedding = self.item_factors(item_indices)

        interaction = torch.sum(user_embedding * item_embedding, dim=1)

        user_bias = self.user_biases(user_indices).squeeze()
        item_bias = self.item_biases(item_indices).squeeze()
        prediction = interaction + user_bias + item_bias
        # print(prediction.shape)
        prediction = torch.sigmoid(prediction)
        return prediction

    def loss(self, predictions, ratings):
        return F.mse_loss(predictions, ratings)

    def predict(self, users, items):
        user_embedding = self.user_factors(users)
        item_embedding = self.item_factors(items)

        interaction = torch.sum(user_embedding * item_embedding, dim=1)

        user_bias = self.user_biases(users).squeeze()
        item_bias = self.item_biases(items).squeeze()
        prediction = interaction + user_bias + item_bias
        prediction = torch.sigmoid(prediction)
        return prediction

In [7]:
def generate_train_batch(user_ratings_train, n, batch_size, train_users, test_ratings):
    t = []
    user_pos_neg = []
    for b in range(batch_size):
        u = rd.sample(train_users, 1)[0]
        i = rd.sample(user_ratings_train[u], 1)[0]
        j = rd.randint(0, n - 1)
        # one negative sample
        while j in user_ratings_train[u] and j in test_ratings[u]:
            j = rd.randint(0, n - 1)
        t.append([u, i, 1])
        t.append([u, j, 0])
        user_pos_neg.append([u, i, j])
    train_batch = np.asarray(t)
    user_pos_neg = np.asarray(user_pos_neg)
    return train_batch, user_pos_neg

def generate_test_batch(user_ratings, user_ratings_test, n, train_users, test_ratings):
    t = []
    for u in train_users:
        i = user_ratings_test[u]
        rated = user_ratings[u]
        for j in range(10):
            k = np.random.randint(0, n)
            while k in rated and k in test_ratings[u]:
                k = np.random.randint(0, n)
            t.append([u, i, 1])
            t.append([u, k, 0])
    test_batch = np.asarray(t)
    return test_batch

In [8]:
from sklearn.metrics import precision_score, recall_score, f1_score
import numpy as np

def evaluation(pred, labels):
    auc = cal_auc(pred, labels)
    pred = (pred > 0.5).astype(int)
    labels = (labels > 0.5).astype(int)
    precision = precision_score(pred, labels, average='binary')
    recall = recall_score(pred, labels, average='binary')
    f1score = f1_score(pred, labels, average='binary')
    return auc, precision, recall, f1score

def cal_auc(pred, labels):
    P_ind = []  # 正样本下标
    F_ind = []  # 负样本下标

    #  计数过程
    for ind, val in enumerate(labels):
        if val > 0.5:
            P_ind.append(ind)
        else:
            F_ind.append(ind)

    new_data = [[p, l] for p, l in zip(pred, labels)]
    new_data.sort(key=lambda x:x[0])

    # 求正样本rank之和
    rank_sum = 0
    for ind, [prob, label] in enumerate(new_data):
        if label>0.5:
            rank_sum+=ind
    auc = (rank_sum - len(P_ind)*(1+len(P_ind))/2) / (len(P_ind)*len(F_ind))
    return auc

In [20]:
# params
batch_size = 65536
# emb_dim = 64
lr = 0.0005
num_epoches = 20

In [28]:
########################### START TRAINING & TESTING & EVALUATION#####################################
model = LatentFactorizationModel(n_users, n_items, item_feature_dim).to(device)
opt = torch.optim.Adam(lr=lr, params=model.parameters(), weight_decay=0.001)

all_loss = []
performance = []
best_auc = 0
best_precision = 0
model_save_path = 'model/'
pre_loss = 0
for epoch in range(1, num_epoches+1):
    model.train()
    train_loss = 0
    n_batches = train_interacts // batch_size + 1
    pbar = tqdm(total=n_batches*batch_size*2)
    for i in range(n_batches):
        uij, _  = generate_train_batch(user_ratings_train, n_items, batch_size, train_users, test_ratings)
        users, items, labels = uij[:, 0], uij[:, 1], uij[:, 2]
        users, items, labels = torch.tensor(users).to(device), torch.tensor(items).to(device), torch.tensor(labels).float().to(device)
        predictions = model(users, items).float()
        loss = model.loss(predictions, labels)
        train_loss += loss.item()
        opt.zero_grad()
        loss.backward()
        opt.step()
        pbar.update(batch_size*2)
    train_loss /= n_batches
    all_loss.append(train_loss)
    print("epoch {}, train loss: {:4f}".format(epoch, train_loss))
    
    model.eval()
    with torch.no_grad():
        t_uij = generate_test_batch(user_ratings_train, user_ratings_test, n_items, train_users, test_ratings)
        users, items, labels = t_uij[:, 0], t_uij[:, 1], t_uij[:, 2]
        users, items = torch.tensor(users).to(device), torch.tensor(items).to(device)
        pred = model.predict(users, items)
        pred = pred.cpu().numpy()
        auc, precision, recall, f1score = evaluation(pred, labels)
        performance.append([auc, precision, recall, f1score])
        print("Evaluation: auc:{}, precision:{}, recall:{}, f1:{}".format(auc, precision, recall, f1score)) 
        if auc > best_auc and train_loss < pre_loss:
            best_auc = auc
            best_precision = precision
            state = {'net': model.state_dict(), 'opt':opt.state_dict(), 'epoch':epoch}
            torch.save(state, model_save_path+f"model_{epoch}.pth")
        pre_loss = train_loss

 14%|█▍        | 393216/2752512 [01:21<08:06, 4852.75it/s]  
100%|██████████| 2752512/2752512 [00:17<00:00, 162174.56it/s]

epoch 1, train loss: 0.329619
Evaluation: auc:0.5090032503289859, precision:0.5143852210781344, recall:0.507249622959877, f1:0.510792502762971


100%|██████████| 2752512/2752512 [00:18<00:00, 150027.15it/s]


epoch 2, train loss: 0.320518
Evaluation: auc:0.5175242983622061, precision:0.5248334342822532, recall:0.5133902121104397, f1:0.519048760033545


100%|██████████| 2752512/2752512 [00:18<00:00, 145751.75it/s]
100%|██████████| 2752512/2752512 [00:17<00:00, 154719.01it/s]

epoch 3, train loss: 0.313353
Evaluation: auc:0.5217536056373888, precision:0.5345245305875227, recall:0.5174966648586047, f1:0.5258727924143223


100%|██████████| 2752512/2752512 [00:19<00:00, 144258.78it/s]


epoch 4, train loss: 0.307805
Evaluation: auc:0.5283427651174829, precision:0.5469412477286493, recall:0.525022893440121, f1:0.5357579892759405


100%|██████████| 2752512/2752512 [00:19<00:00, 142759.50it/s]
100%|██████████| 2752512/2752512 [00:17<00:00, 152173.51it/s]

epoch 5, train loss: 0.303566
Evaluation: auc:0.5321726920820705, precision:0.5560266505148395, recall:0.52766960295449, f1:0.5414771177255602


100%|██████████| 2752512/2752512 [00:18<00:00, 145064.45it/s]


epoch 6, train loss: 0.300016
Evaluation: auc:0.534914475598182, precision:0.5596608116293156, recall:0.5295204802361065, f1:0.5441736172969472


100%|██████████| 2752512/2752512 [00:19<00:00, 142371.34it/s]
100%|██████████| 2752512/2752512 [00:17<00:00, 153025.21it/s]

epoch 7, train loss: 0.297362
Evaluation: auc:0.5392658249813541, precision:0.5675348273773471, recall:0.5341542320464036, f1:0.5503388225274765


100%|██████████| 2752512/2752512 [00:19<00:00, 144797.73it/s]


epoch 8, train loss: 0.295386
Evaluation: auc:0.5407238454861525, precision:0.5716232586311326, recall:0.5356433395765935, f1:0.5530487268891559


100%|██████████| 2752512/2752512 [00:19<00:00, 144572.57it/s]
100%|██████████| 2752512/2752512 [00:16<00:00, 165331.90it/s]

epoch 9, train loss: 0.293410
Evaluation: auc:0.5434759865081861, precision:0.5755602665051484, recall:0.5358728905556104, f1:0.5550079943929737


100%|██████████| 2752512/2752512 [00:17<00:00, 153355.98it/s]


epoch 10, train loss: 0.292165
Evaluation: auc:0.5458823846641776, precision:0.5773773470623864, recall:0.5382172348083845, f1:0.5571099828323045


100%|██████████| 2752512/2752512 [00:18<00:00, 147031.14it/s]
100%|██████████| 2752512/2752512 [00:17<00:00, 160827.18it/s]

epoch 11, train loss: 0.290887
Evaluation: auc:0.5446493457336027, precision:0.5790430042398547, recall:0.5383413343094055, f1:0.5579508728925463


100%|██████████| 2752512/2752512 [00:17<00:00, 154549.29it/s]


epoch 12, train loss: 0.290161
Evaluation: auc:0.5463627595154599, precision:0.5820714718352513, recall:0.540563344630226, f1:0.5605500506740745


100%|██████████| 2752512/2752512 [00:19<00:00, 143335.08it/s]
100%|██████████| 2752512/2752512 [07:37<00:00, 134986.20it/s]

epoch 13, train loss: 0.289157
Evaluation: auc:0.5490499696236079, precision:0.5841913991520291, recall:0.5419300463548251, f1:0.5622677257159513


100%|██████████| 2752512/2752512 [07:38<00:00, 6003.79it/s]  


epoch 14, train loss: 0.288547
Evaluation: auc:0.5484480456386949, precision:0.5811629315566323, recall:0.5406852248394004, f1:0.5601938346567025


100%|██████████| 2752512/2752512 [00:19<00:00, 143408.64it/s]
100%|██████████| 2752512/2752512 [00:18<00:00, 148444.81it/s]

epoch 15, train loss: 0.287700
Evaluation: auc:0.5476251797178151, precision:0.5835857056329498, recall:0.5404495800086944, f1:0.5611899440120567


100%|██████████| 2752512/2752512 [00:18<00:00, 146115.05it/s]


epoch 16, train loss: 0.287100
Evaluation: auc:0.5479603541307674, precision:0.5858570563294972, recall:0.5416643333146666, f1:0.5628946372974074


100%|██████████| 2752512/2752512 [00:19<00:00, 144765.06it/s]
100%|██████████| 2752512/2752512 [00:17<00:00, 153886.44it/s]

epoch 17, train loss: 0.286508
Evaluation: auc:0.5474690237842014, precision:0.5847970926711085, recall:0.5407978939408791, f1:0.5619375350119677


100%|██████████| 2752512/2752512 [00:18<00:00, 147158.00it/s]


epoch 18, train loss: 0.286222
Evaluation: auc:0.5476188347938826, precision:0.5866141732283464, recall:0.5413790212135611, f1:0.563089579790404


100%|██████████| 2752512/2752512 [00:18<00:00, 145102.10it/s]
100%|██████████| 2752512/2752512 [00:18<00:00, 142919.38it/s]

epoch 19, train loss: 0.285640
Evaluation: auc:0.547759486569269, precision:0.5852513628104179, recall:0.5408166120952621, f1:0.562157287681992


100%|██████████| 2752512/2752512 [00:19<00:00, 143088.79it/s]


epoch 20, train loss: 0.284912
Evaluation: auc:0.548850863012377, precision:0.5844942459115687, recall:0.5431341372469008, f1:0.5630556710354535


