In [1]:
import numpy as np
import json
import pandas as pd
import torch
import pickle
from collections import defaultdict
from tqdm import tqdm
# import warnings
# warnings.filterwarnings("ignore")

In [2]:
n_users = 6611
n_items = 79937
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

In [3]:
# load data
# to do: str_id -> int
cite_file = 'data/paper_file_ann.txt'
coauthor_file = 'data/author_file_ann.txt'
author_train_file = 'data/bipartite_train_ann.txt'
author_test_file = 'data/bipartite_test_ann.txt'

import random as rd

def generate_test(all_user_ratings):
    ratings_test = {}
    for user in all_user_ratings:
        ratings_test[user] = rd.sample(all_user_ratings[user], 1)[0]
    return ratings_test

def load_data(cite_file, coauthor_file, author_train_file, author_test_file, n_users):
    citation, author_train, coauthor = defaultdict(list), defaultdict(list), defaultdict(list)
    user_ratings_train = defaultdict(list)
    test_ratings = defaultdict(list)
    
    train_users = set()

    with open(cite_file, 'r') as f:
        line = f.readlines()
    for l in line:
        src, tgt = l.strip().split(' ')
        src, tgt = int(src), int(tgt)
        citation['src'].append(src)
        citation['tgt'].append(tgt)

    with open(coauthor_file, 'r') as f:
        line = f.readlines()
    for l in line:
        src, tgt = l.strip().split(' ')
        src, tgt = int(src), int(tgt)
        coauthor['src'].append(src)
        coauthor['tgt'].append(tgt)
    
    with open(author_train_file, 'r') as f:
        line = f.readlines()
        train_interacts = len(line)
    for l in line:
        src, tgt = l.strip().split(' ')
        src, tgt = int(src), int(tgt)
        user_ratings_train[src].append(tgt)
        author_train['src'].append(src)
        author_train['tgt'].append(tgt+n_users)
        train_users.add(src)
    
    with open(author_test_file, 'r') as f:
        line = f.readlines()
    for l in line:
        src, tgt = l.strip().split(' ')
        src, tgt = int(src), int(tgt)
        test_ratings[src].append(tgt)
        
    user_ratings_test = generate_test(user_ratings_train)
            
    return citation, author_train, coauthor, user_ratings_train, user_ratings_test, train_interacts, list(train_users), test_ratings

citation, author_train, coauthor, user_ratings_train, user_ratings_test, train_interacts, train_users, test_ratings = \
            load_data(cite_file, coauthor_file, author_train_file, author_test_file, n_users)

In [4]:
# load pickle 
feature_file = 'data/feature.pkl'

def load_item_feature(feature_file):
    with open(feature_file, 'rb') as f:
        feature_matrix = pickle.load(f)
    # feature_matrix_shape: 79937*512
    print(feature_matrix.shape)
    return feature_matrix
# torch.Size([79937, 512])
item_feature = load_item_feature(feature_file)
item_feature_dim = 512

torch.Size([79937, 512])


In [5]:
import scipy.sparse as sp

def create_adj_mat(adj_mat):    
    def mean_adj_single(adj):
        # D^-1 * A
        rowsum = np.array(adj.sum(1))

        d_inv = np.power(rowsum, -1).flatten()
        d_inv[np.isinf(d_inv)] = 0.
        d_mat_inv = sp.diags(d_inv)

        norm_adj = d_mat_inv.dot(adj)
        # norm_adj = adj.dot(d_mat_inv)
        print('generate single-normalized adjacency matrix.')
        return norm_adj.tocoo()

    def normalized_adj_single(adj):
        # D^-1/2 * A * D^-1/2
        rowsum = np.array(adj.sum(1))

        d_inv_sqrt = np.power(rowsum, -0.5).flatten()
        d_inv_sqrt[np.isinf(d_inv_sqrt)] = 0.
        d_mat_inv_sqrt = sp.diags(d_inv_sqrt)

        # bi_lap = adj.dot(d_mat_inv_sqrt).transpose().dot(d_mat_inv_sqrt)
        bi_lap = d_mat_inv_sqrt.dot(adj).dot(d_mat_inv_sqrt)
        return bi_lap.tocoo()

    def check_adj_if_equal(adj):
        dense_A = np.array(adj.todense())
        degree = np.sum(dense_A, axis=1, keepdims=False)

        temp = np.dot(np.diag(np.power(degree, -1)), dense_A)
        print('check normalized adjacency matrix whether equal to this laplacian matrix.')
        return temp

    norm_adj_mat = mean_adj_single(adj_mat + sp.eye(adj_mat.shape[0]))
    # norm_adj_mat = normalized_adj_single(adj_mat + sp.eye(adj_mat.shape[0]))
    mean_adj_mat = mean_adj_single(adj_mat)

    print('already normalize adjacency matrix')
    return adj_mat.tocsr(), norm_adj_mat.tocsr(), mean_adj_mat.tocsr()

def normalize_adj(mx):
    """Row-normalize sparse matrix"""
    rowsum = np.array(mx.sum(1))
    r_inv_sqrt = np.power(rowsum, -0.5).flatten()
    r_inv_sqrt[np.isinf(r_inv_sqrt)] = 0.
    r_mat_inv_sqrt = sp.diags(r_inv_sqrt)
    return mx.dot(r_mat_inv_sqrt).transpose().dot(r_mat_inv_sqrt)

# info graph          
R = sp.dok_matrix((n_users, n_items), dtype=np.float32)
for user in train_users:
    for item in user_ratings_train[user]:
        R[user, item] = 1

          
adj_mat = sp.dok_matrix((n_users + n_items, n_users + n_items), dtype=np.float32)
adj_mat = adj_mat.tolil()
R = R.tolil()

adj_mat[:n_users, n_users:] = R
adj_mat[n_users:, :n_users] = R.T
adj_mat = adj_mat.todok()

_, info_norm_adj, _ = create_adj_mat(adj_mat)

generate single-normalized adjacency matrix.


  d_inv = np.power(rowsum, -1).flatten()


generate single-normalized adjacency matrix.
already normalize adjacency matrix


In [6]:
import torch
import torch.nn as nn
import torch.sparse as sparse
import torch.nn.functional as F

class LightGCN(nn.Module):
    def __init__(self, num_users, num_items, embedding_dim, num_layers, graph, keep_prob=0.6):
        super(LightGCN, self).__init__()
        self.n_users = num_users
        self.n_items = num_items
        self.embedding_dim = embedding_dim
        self.n_layers = num_layers

        self.user_embedding = nn.Embedding(num_users, embedding_dim)
        self.item_embedding = nn.Embedding(num_items, embedding_dim)
        # self.item_embedding.weight.data = item_feature.clone().detach()
        
        nn.init.normal_(self.user_embedding.weight, std=0.1)
        nn.init.normal_(self.item_embedding.weight, std=0.1)

        self.graph = graph
        self.graph = self._convert_sp_mat_to_sp_tensor(self.graph)
        self.graph = self.graph.coalesce().to(device)
    
        self.keep_prob = keep_prob
    
    def _convert_sp_mat_to_sp_tensor(self, X):
        coo = X.tocoo().astype(np.float32)
        row = torch.Tensor(coo.row).long()
        col = torch.Tensor(coo.col).long()
        index = torch.stack([row, col])
        data = torch.FloatTensor(coo.data)
        return torch.sparse.FloatTensor(index, data, torch.Size(coo.shape))
    
    def __dropout_x(self, x, keep_prob):
        size = x.size()
        index = x.indices().t()
        values = x.values()
        random_index = torch.rand(len(values)) + keep_prob
        random_index = random_index.int().bool()
        index = index[random_index]
        values = values[random_index]/keep_prob
        g = torch.sparse.FloatTensor(index.t(), values, size)
        return g
    
    def __dropout(self, keep_prob):
        graph = self.__dropout_x(self.graph, keep_prob)
        return graph
    
    def forward(self, users, items):
        user_embedding = self.user_embedding.weight
        item_embedding = self.item_embedding.weight
        all_emb = torch.cat([user_embedding, item_embedding])
        embs = [all_emb]
        if self.training:
            g_droped = self.__dropout(self.keep_prob)
        else:
            g_droped = self.graph
        for layer in range(self.n_layers):
            all_emb = torch.sparse.mm(g_droped, all_emb)
            embs.append(all_emb)
        embs = torch.stack(embs, dim=1)
        light_out = torch.mean(embs, dim=1)
        users_embeddings, items_embeddings = torch.split(light_out, [self.n_users, self.n_items])
        items_emb = torch.index_select(item_embedding, 0, items)
        users_emb = torch.index_select(user_embedding, 0, users)
        # users_emb = users_embeddings[users]
        # items_emb = items_embeddings[items]
        scores = torch.mul(users_emb, items_emb).sum(dim=1)
        scores = torch.sigmoid(scores)
        return scores

    def loss(self, predictions, ratings):
        return F.mse_loss(predictions, ratings)

In [7]:
def generate_train_batch(user_ratings_train, n, batch_size, train_users, test_ratings):
    t = []
    user_pos_neg = []
    for b in range(batch_size):
        u = rd.sample(train_users, 1)[0]
        i = rd.sample(user_ratings_train[u], 1)[0]
        j = rd.randint(0, n - 1)
        # one negative sample
        while j in user_ratings_train[u] and j in test_ratings[u]:
            j = rd.randint(0, n - 1)
        t.append([u, i, 1])
        t.append([u, j, 0])
        user_pos_neg.append([u, i, j])
    train_batch = np.asarray(t)
    user_pos_neg = np.asarray(user_pos_neg)
    return train_batch, user_pos_neg

def generate_test_batch(user_ratings, user_ratings_test, n, train_users, test_ratings):
    t = []
    for u in train_users:
        i = user_ratings_test[u]
        rated = user_ratings[u]
        for j in range(10):
            k = np.random.randint(0, n)
            while k in rated and k in test_ratings[u]:
                k = np.random.randint(0, n)
            t.append([u, i, 1])
            t.append([u, k, 0])
    test_batch = np.asarray(t)
    return test_batch

In [8]:
from sklearn.metrics import precision_score, recall_score, f1_score
import numpy as np

def evaluation(pred, labels):
    auc = cal_auc(pred, labels)
    pred = (pred > 0.5).astype(int)
    labels = (labels > 0.5).astype(int)
    precision = precision_score(pred, labels, average='binary')
    recall = recall_score(pred, labels, average='binary')
    f1score = f1_score(pred, labels, average='binary')
    return auc, precision, recall, f1score

def cal_auc(pred, labels):
    P_ind = []  # 正样本下标
    F_ind = []  # 负样本下标

    #  计数过程
    for ind, val in enumerate(labels):
        if val > 0.5:
            P_ind.append(ind)
        else:
            F_ind.append(ind)

    new_data = [[p, l] for p, l in zip(pred, labels)]
    new_data.sort(key=lambda x:x[0])

    # 求正样本rank之和
    rank_sum = 0
    for ind, [prob, label] in enumerate(new_data):
        if label>0.5:
            rank_sum+=ind
    auc = (rank_sum - len(P_ind)*(1+len(P_ind))/2) / (len(P_ind)*len(F_ind))
    return auc

In [9]:
# params
batch_size = 65536
# emb_dim = 64
lr = 0.001
num_epoches = 50

In [10]:
########################### START TRAINING & TESTING & EVALUATION#####################################
model = LightGCN(n_users, n_items, 128, 4, info_norm_adj).to(device)
opt = torch.optim.Adam(lr=lr, params=model.parameters(), weight_decay=0.001)
# info_norm_adj = info_norm_adj.to(device)

all_loss = []
performance = []
best_auc = 0
best_precision = 0
model_save_path = 'model/'
pre_loss = 0
for epoch in range(1, num_epoches+1):
    model.train()
    train_loss = 0
    n_batches = train_interacts // batch_size + 1
    pbar = tqdm(total=n_batches*batch_size*2)
    for i in range(n_batches):
        uij, _  = generate_train_batch(user_ratings_train, n_items, batch_size, train_users, test_ratings)
        users, items, labels = uij[:, 0], uij[:, 1], uij[:, 2]
        users, items, labels = torch.tensor(users).to(device), torch.tensor(items).to(device), torch.tensor(labels).float().to(device)
        predictions = model(users, items).float()
        loss = model.loss(predictions, labels)
        train_loss += loss.item()
        opt.zero_grad()
        loss.backward()
        opt.step()
        pbar.update(batch_size*2)
    train_loss /= n_batches
    all_loss.append(train_loss)
    print("epoch {}, train loss: {:4f}".format(epoch, train_loss))
    
    model.eval()
    with torch.no_grad():
        t_uij = generate_test_batch(user_ratings_train, user_ratings_test, n_items, train_users, test_ratings)
        users, items, labels = t_uij[:, 0], t_uij[:, 1], t_uij[:, 2]
        users, items = torch.tensor(users).to(device), torch.tensor(items).to(device)
        pred = model(users, items)
        pred = pred.cpu().numpy()
        auc, precision, recall, f1score = evaluation(pred, labels)
        performance.append([auc, precision, recall, f1score])
        print("Evaluation: auc:{}, precision:{}, recall:{}, f1:{}".format(auc, precision, recall, f1score)) 
        if auc > best_auc and train_loss < pre_loss:
            best_auc = auc
            best_precision = precision
            state = {'net': model.state_dict(), 'opt':opt.state_dict(), 'epoch':epoch}
            torch.save(state, model_save_path+f"model_{epoch}.pth")
        pre_loss = train_loss

100%|██████████| 2752512/2752512 [00:20<00:00, 136716.29it/s]

epoch 1, train loss: 0.250575
Evaluation: auc:0.4959705216008065, precision:0.49772864930345245, recall:0.49719411293128224, f1:0.4974612375237417


100%|██████████| 2752512/2752512 [00:21<00:00, 130049.98it/s]


epoch 2, train loss: 0.250279
Evaluation: auc:0.49725144897775003, precision:0.5012113870381587, recall:0.5008700915487629, f1:0.5010406811731315


100%|██████████| 2752512/2752512 [00:20<00:00, 135356.18it/s]
100%|██████████| 2752512/2752512 [00:19<00:00, 144412.19it/s]

epoch 3, train loss: 0.250133
Evaluation: auc:0.49853810632177475, precision:0.5025741974560872, recall:0.5010945874537631, f1:0.5018333018333019


100%|██████████| 2752512/2752512 [00:20<00:00, 136289.59it/s]


epoch 4, train loss: 0.250057
Evaluation: auc:0.4984744484281868, precision:0.5016656571774682, recall:0.5005060958107352, f1:0.5010852056597068


100%|██████████| 2752512/2752512 [00:20<00:00, 134217.29it/s]
100%|██████████| 2752512/2752512 [00:19<00:00, 149399.88it/s]

epoch 5, train loss: 0.250023
Evaluation: auc:0.5002043160891055, precision:0.49939430648092065, recall:0.4991297767688233, f1:0.49926200658517206


100%|██████████| 2752512/2752512 [00:20<00:00, 135196.58it/s]


epoch 6, train loss: 0.250002
Evaluation: auc:0.5006940503452013, precision:0.49818291944276194, recall:0.4992033988316516, f1:0.49869263708363326


100%|██████████| 2752512/2752512 [00:19<00:00, 139389.46it/s]
100%|██████████| 2752512/2752512 [00:18<00:00, 153223.71it/s]

epoch 7, train loss: 0.249993
Evaluation: auc:0.5018966401986058, precision:0.4974258025439128, recall:0.500030443253775, f1:0.49872472217161595


100%|██████████| 2752512/2752512 [00:19<00:00, 139756.56it/s]


epoch 8, train loss: 0.249991
Evaluation: auc:0.5039348507099382, precision:0.5003028467595396, recall:0.5014722399295753, f1:0.500886860815912


100%|██████████| 2752512/2752512 [00:19<00:00, 142541.62it/s]
100%|██████████| 2752512/2752512 [00:18<00:00, 153145.01it/s]

epoch 9, train loss: 0.249993
Evaluation: auc:0.5043090657113267, precision:0.5013628104179285, recall:0.5011806732865101, f1:0.5012717253073337


100%|██████████| 2752512/2752512 [00:19<00:00, 140467.52it/s]


epoch 10, train loss: 0.249994
Evaluation: auc:0.503715156572325, precision:0.5043912780133253, recall:0.50271657108361, f1:0.5035525321239607


100%|██████████| 2752512/2752512 [00:19<00:00, 141525.27it/s]
100%|██████████| 2752512/2752512 [00:18<00:00, 144506.01it/s]

epoch 11, train loss: 0.249995
Evaluation: auc:0.503995595698292, precision:0.5040884312537856, recall:0.5013705232085304, f1:0.5027258037723312


100%|██████████| 2752512/2752512 [00:19<00:00, 142641.45it/s]


epoch 12, train loss: 0.249996
Evaluation: auc:0.505904888599718, precision:0.5066626287098728, recall:0.5031730277602334, f1:0.5049117988803212


100%|██████████| 2752512/2752512 [00:19<00:00, 143827.26it/s]
100%|██████████| 2752512/2752512 [00:18<00:00, 154714.88it/s]

epoch 13, train loss: 0.249997
Evaluation: auc:0.5066531977664547, precision:0.5105996365838885, recall:0.5046393295420533, f1:0.5076019870540418


100%|██████████| 2752512/2752512 [00:19<00:00, 144093.11it/s]


epoch 14, train loss: 0.249998
Evaluation: auc:0.5080290589353368, precision:0.5169594185342217, recall:0.5077636980189184, f1:0.512320297728023


100%|██████████| 2752512/2752512 [00:18<00:00, 146187.73it/s]
100%|██████████| 2752512/2752512 [00:18<00:00, 158000.85it/s]

epoch 15, train loss: 0.249998
Evaluation: auc:0.5125648545601825, precision:0.523167777104785, recall:0.5127329929953698, f1:0.5178978294759563


100%|██████████| 2752512/2752512 [00:19<00:00, 142928.83it/s]


epoch 16, train loss: 0.249999
Evaluation: auc:0.5152881480984856, precision:0.529678982434888, recall:0.5156173995076724, f1:0.5225536110426424


100%|██████████| 2752512/2752512 [00:19<00:00, 143622.19it/s]
100%|██████████| 2752512/2752512 [00:18<00:00, 155008.66it/s]

epoch 17, train loss: 0.249999
Evaluation: auc:0.5175756252474043, precision:0.5420956995760146, recall:0.5204471775190079, f1:0.53105090226735


100%|██████████| 2752512/2752512 [00:19<00:00, 144505.50it/s]


epoch 18, train loss: 0.249999
Evaluation: auc:0.5217135332421552, precision:0.5519382192610539, recall:0.5255417621869458, f1:0.5384166562036087


100%|██████████| 2752512/2752512 [00:19<00:00, 142872.58it/s]
100%|██████████| 2752512/2752512 [00:18<00:00, 157618.01it/s]

epoch 19, train loss: 0.250000
Evaluation: auc:0.5275412660810529, precision:0.5649606299212598, recall:0.5327412399691578, f1:0.5483780883930801


100%|██████████| 2752512/2752512 [00:19<00:00, 143402.37it/s]


epoch 20, train loss: 0.250000
Evaluation: auc:0.53383958421763, precision:0.5772259236826166, recall:0.5387224420576597, f1:0.5573099415204679


100%|██████████| 2752512/2752512 [00:18<00:00, 145601.55it/s]
100%|██████████| 2752512/2752512 [00:18<00:00, 148418.55it/s]

epoch 21, train loss: 0.250000
Evaluation: auc:0.5432780550102521, precision:0.5858570563294972, recall:0.5462066239376571, f1:0.5653374636527025


100%|██████████| 2752512/2752512 [00:19<00:00, 142767.21it/s]


epoch 22, train loss: 0.250000
Evaluation: auc:0.5524727171939551, precision:0.594488188976378, recall:0.5556105914153493, f1:0.5743922868157512


100%|██████████| 2752512/2752512 [00:19<00:00, 142445.99it/s]
100%|██████████| 2752512/2752512 [00:18<00:00, 153963.91it/s]

epoch 23, train loss: 0.250000
Evaluation: auc:0.5596831857864899, precision:0.5967595396729255, recall:0.5647183572871738, f1:0.5802969954427324


100%|██████████| 2752512/2752512 [00:19<00:00, 144837.09it/s]


epoch 24, train loss: 0.250000
Evaluation: auc:0.5758899760107212, precision:0.597213809812235, recall:0.5861546235472461, f1:0.5916325397896884


100%|██████████| 2752512/2752512 [00:19<00:00, 143421.37it/s]
100%|██████████| 2752512/2752512 [00:18<00:00, 139890.09it/s]

epoch 25, train loss: 0.250000
Evaluation: auc:0.5821431516827531, precision:0.5817686250757117, recall:0.6008194414036844, f1:0.5911405843706244


100%|██████████| 2752512/2752512 [00:19<00:00, 142685.32it/s]


epoch 26, train loss: 0.250000
Evaluation: auc:0.5890135275832682, precision:0.5589036947304664, recall:0.6211399626407283, f1:0.5883806381164168


100%|██████████| 2752512/2752512 [00:19<00:00, 142981.51it/s]
100%|██████████| 2752512/2752512 [00:18<00:00, 154281.83it/s]

epoch 27, train loss: 0.250000
Evaluation: auc:0.5940610551265482, precision:0.5087825560266506, recall:0.64345627944387, f1:0.5682490825145022


100%|██████████| 2752512/2752512 [00:19<00:00, 139894.74it/s]


KeyboardInterrupt: 

