#### Pytorch geometric 설치

Install Pytorch Geometric

In [1]:
# # If you get a TORCH or CUDA version error, visit the below website and enter the appropriate version manually.
# # https://pytorch-geometric.com/whl/
# import torch

# def format_pytorch_version(version):
#     return version.split('+')[0]

# TORCH_version = torch.__version__
# TORCH = format_pytorch_version(TORCH_version)

# def format_cuda_version(version):
#     return 'cu' + version.replace('.', '')

# CUDA_version = torch.version.cuda
# CUDA = format_cuda_version(CUDA_version)

# !pip install torch-scatter     -f https://pytorch-geometric.com/whl/torch-{TORCH}+{CUDA}.html
# !pip install torch-sparse      -f https://pytorch-geometric.com/whl/torch-{TORCH}+{CUDA}.html
# !pip install torch-cluster     -f https://pytorch-geometric.com/whl/torch-{TORCH}+{CUDA}.html
# !pip install torch-spline-conv -f https://pytorch-geometric.com/whl/torch-{TORCH}+{CUDA}.html
# !pip install torch-geometric 

#### 라이브러리 불러오기 

In [49]:
import os
import random
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
# from sklearn.model_selection import train_test_split

import torch
from torch import nn, optim, Tensor
from torch_sparse import SparseTensor, matmul

from torch_geometric.utils import negative_sampling, structured_negative_sampling
from torch_geometric.data import DataLoader
import torch.nn.functional as F
from torch_geometric.nn.conv.gcn_conv import gcn_norm
from torch_geometric.nn.conv import MessagePassing
from torch_geometric.typing import Adj

import warnings; warnings.filterwarnings('ignore')

#### 데이터 불러오기 및 전처리

In [50]:
data_path = '../data/gowalla'
# saved_path = './saved'

In [51]:
train_file = data_path + '/gw_train.txt'
test_file = data_path + '/gw_test.txt'

In [52]:
user_list = data_path + '/gw_user_list.txt'
item_list = data_path + '/gw_item_list.txt'

In [53]:
train_edge_index = []
with open(train_file) as f:
    for row in f.readlines():
        if len(row) > 0:
            all = row.strip('\n').split(' ')
            id = int(all[0])
            items = [int(i) for i in all[1:]]
            for item in items:
                train_edge_index.append([id, item])
train_edge_index = torch.LongTensor(np.array(train_edge_index).T)
print(train_edge_index.shape)
train_edge_index

torch.Size([2, 810128])


tensor([[    0,     0,     0,  ..., 29857, 29857, 29857],
        [    0,     1,     2,  ...,  1853,   691,   674]])

In [54]:
test_edge_index = []
with open(test_file) as f:
    for row in f.readlines():
        if len(row) > 0:
            all = row.strip('\n').split(' ')
            id = int(all[0])
            items = [int(i) for i in all[1:]]
            for item in items:
                test_edge_index.append([id, item])
test_edge_index = torch.LongTensor(np.array(test_edge_index).T)
print(test_edge_index.shape)
test_edge_index

torch.Size([2, 217242])


tensor([[    0,     0,     0,  ..., 29856, 29857, 29857],
        [ 7580,  3730,  5983,  ...,  9923,   145,  5317]])

In [55]:
edge_index = torch.cat((train_edge_index, test_edge_index), 1)
edge_index

tensor([[    0,     0,     0,  ..., 29856, 29857, 29857],
        [    0,     1,     2,  ...,  9923,   145,  5317]])

In [56]:
num_users = len(np.unique(edge_index[0]))
num_items = len(np.unique(edge_index[1]))
print(num_users, num_items)

29858 40981


In [57]:
print(train_edge_index.shape)
print(test_edge_index.shape)

torch.Size([2, 810128])
torch.Size([2, 217242])


In [58]:
train_sparse_edge_index = SparseTensor(row=train_edge_index[0], col=train_edge_index[1]+num_users, 
                                       sparse_sizes=(num_users + num_items, num_users + num_items))
test_sparse_edge_index = SparseTensor(row=test_edge_index[0], col=test_edge_index[1]+num_users, 
                                     sparse_sizes=(num_users + num_items, num_users + num_items))

In [59]:
print(train_sparse_edge_index)
print(test_sparse_edge_index)

SparseTensor(row=tensor([    0,     0,     0,  ..., 29857, 29857, 29857]),
             col=tensor([29858, 29859, 29860,  ..., 31711, 41615, 43040]),
             size=(70839, 70839), nnz=810128, density=0.02%)
SparseTensor(row=tensor([    0,     0,     0,  ..., 29856, 29857, 29857]),
             col=tensor([31071, 31073, 31075,  ..., 45074, 30003, 35175]),
             size=(70839, 70839), nnz=217242, density=0.00%)


#### 모델 정의

In [60]:
embedding_dim =128
num_layers = 3

In [61]:
class LightGCN(MessagePassing):
    def __init__(self, num_users, num_items, embedding_dim=embedding_dim, K=num_layers, add_self_loops=False):
        
        super().__init__()
        self.num_users, self.num_items = num_users, num_items
        self.embedding_dim, self.K = embedding_dim, K
        self.add_self_loops = add_self_loops

        self.users_emb = nn.Embedding(num_embeddings=self.num_users, embedding_dim=self.embedding_dim) # e_u^0
        self.items_emb = nn.Embedding(num_embeddings=self.num_items, embedding_dim=self.embedding_dim) # e_i^0
        
        nn.init.normal_(self.users_emb.weight, std=0.1)
        nn.init.normal_(self.items_emb.weight, std=0.1)

    def forward(self, edge_index: SparseTensor):
        
        edge_index_norm = gcn_norm(edge_index, add_self_loops=self.add_self_loops)
        emb_0 = torch.cat([self.users_emb.weight, self.items_emb.weight])
        embs = [emb_0]
        emb_k = emb_0

        for i in range(self.K):
            emb_k = self.propagate(edge_index_norm, x=emb_k)
            embs.append(emb_k)

        embs = torch.stack(embs, dim=1)
        emb_final = torch.mean(embs, dim=1)
        users_emb_final, items_emb_final = torch.split(emb_final, [self.num_users, self.num_items]) 
        return users_emb_final, self.users_emb.weight, items_emb_final, self.items_emb.weight

    def message(self, x_j: Tensor) -> Tensor:
        return x_j

    def message_and_aggregate(self, adj_t: SparseTensor, x: Tensor) -> Tensor:
        return matmul(adj_t, x)

#### 손실 함수 정의

In [62]:
def bpr_loss(users_emb_final, users_emb_0, pos_items_emb_final, pos_items_emb_0, neg_items_emb_final, neg_items_emb_0, lambda_):
    reg_loss = lambda_ * (users_emb_0.norm(2).pow(2) +
                          pos_items_emb_0.norm(2).pow(2) +
                          neg_items_emb_0.norm(2).pow(2)) # L2 loss

    pos_scores = torch.mul(users_emb_final, pos_items_emb_final)
    pos_scores = torch.sum(pos_scores, dim=-1)
    neg_scores = torch.mul(users_emb_final, neg_items_emb_final)
    neg_scores = torch.sum(neg_scores, dim=-1)
    
    loss = -F.logsigmoid(pos_scores - neg_scores).sum() + reg_loss
    
    return loss

#### 평가 지표 정의
- 이미 사용한 item은 검증할 때 제외해 주어야 함(exclude_edge_index).
- 해당 user-item 쌍이 추천되지 않도록 score matrix에 인위적으로 낮은 값(-(1 << 10))을 부여.
- r 변수 계산에 시간이 오래 걸려, 추후 개선이 필요해 보임.
- torch.matmul이 시간이 오래 걸릴 경우 임베딩을 cpu로 detach 후 np.matmul로 대체. 

In [63]:
def RecallPrecision_at_K(groundTruth, r, k):
    num_correct_pred = torch.sum(r, dim=-1)
    user_num_liked = torch.Tensor([len(groundTruth[i]) for i in range(len(groundTruth))])
    recall = torch.mean(num_correct_pred / user_num_liked)
    precision = torch.mean(num_correct_pred) / k
    
    return recall.item(), precision.item()

In [64]:
def NDCG_at_K(groundTruth, r, k):
    assert len(r) == len(groundTruth)
    test_matrix = torch.zeros((len(r), k))
    for i, items in enumerate(groundTruth):
        length = min(len(items), k)
        test_matrix[i, :length] = 1
    max_r = test_matrix
    idcg = torch.sum(max_r * 1. / torch.log2(torch.arange(2, k + 2)), axis=1)
    dcg = r * (1. / torch.log2(torch.arange(2, k + 2)))
    dcg = torch.sum(dcg, axis=1)
    idcg[idcg == 0.] = 1.
    ndcg = dcg / idcg
    ndcg[torch.isnan(ndcg)] = 0.
    
    return torch.mean(ndcg).item()

In [65]:
def get_user_positive_items(edge_index):
    user_pos_items = {}
    for i in range(edge_index.shape[1]):
        user = edge_index[0][i].item()
        item = edge_index[1][i].item()
        if user not in user_pos_items:
            user_pos_items[user] = []
        user_pos_items[user].append(item)
        
    return user_pos_items

In [66]:
def get_metrics(model, edge_index, sparse_edge_index, exclude_edge_index, k):
    
    user_embedding, _, item_embedding, _  = model.forward(sparse_edge_index)
    
    user_embedding = np.array(user_embedding.cpu().detach().numpy())
    item_embedding = np.array(item_embedding.cpu().detach().numpy())
                  
    rating = torch.tensor(np.matmul(user_embedding, item_embedding.T))
    
    user_pos_items = get_user_positive_items(exclude_edge_index)
    exclude_users = []
    exclude_items = []
    for user, items in user_pos_items.items():
        exclude_users.extend([user] * len(items))
        exclude_items.extend(items)
    rating[exclude_users, exclude_items] = -(1 << 10)
    
    _, top_K_items = torch.topk(rating, k=k)
    
    users = edge_index[0].unique()
    test_user_pos_items = get_user_positive_items(edge_index)
    test_user_pos_items_list = [test_user_pos_items[user.item()] for user in users]
    
    r = []
    for user in users:
        ground_truth_items = test_user_pos_items[user.item()]
        label = list(map(lambda x: x in ground_truth_items, top_K_items[user]))
        r.append(label)
    r = torch.Tensor(np.array(r).astype('float'))
    
    recall, precision = RecallPrecision_at_K(test_user_pos_items_list, r, k)
    ndcg = NDCG_at_K(test_user_pos_items_list, r, k)

    return recall, precision, ndcg

In [67]:
def evaluation(model, edge_index, sparse_edge_index, exclude_edge_indices, k):
    
    users_emb_final, users_emb_0, items_emb_final, items_emb_0 = model.forward(sparse_edge_index)
    neg_edges = negative_sampling(edge_index, num_nodes=[num_users, num_items], num_neg_samples=edge_index.shape[1])

    user_indices, pos_item_indices, neg_item_indices = edge_index[0], edge_index[1], neg_edges[1]
    users_emb_final, users_emb_0 = users_emb_final[user_indices], users_emb_0[user_indices]

    pos_items_emb_final, pos_items_emb_0 = items_emb_final[pos_item_indices], items_emb_0[pos_item_indices]
    neg_items_emb_final, neg_items_emb_0 = items_emb_final[neg_item_indices], items_emb_0[neg_item_indices]

    recall, precision, ndcg = get_metrics(model, edge_index, sparse_edge_index, exclude_edge_indices, k)

    return recall, precision, ndcg

#### 학습 파라미터 설정
- batch_size는 메모리가 허용하는 한 크게 하는 것이 학습 속도를 빠르게 할 수 있음.
- evaluation 시간이 오래 걸리므로, 모델이 정상 작동하는 것을 확인했으면 check_step은 적당히 큰 값을 주는 것이 좋음.

In [68]:
epochs = 150
check_step = 10
batch_size = 2**14
lr = 0.01
K = 25
lambda_ = 1e-5

In [69]:
model = LightGCN(num_users, num_items)
optimizer = optim.Adam(model.parameters(), lr=lr)

In [70]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device {device}.")

model = model.to(device)

train_edge_index = train_edge_index.to(device)
train_sparse_edge_index = train_sparse_edge_index.to(device)

test_edge_index = test_edge_index.to(device)
test_sparse_edge_index = test_sparse_edge_index.to(device)

Using device cuda.


#### 학습 수행

In [71]:
%%time
max_score = 0
for epoch in tqdm(range(1, epochs+1)):
    
    model.train()
    trn_loader = DataLoader(train_edge_index.T, batch_size, shuffle=True)
    trn_loss = 0
    
    for batch_pos_edges in trn_loader:
        
        users_emb_final, users_emb_0, items_emb_final, items_emb_0 = model.forward(train_sparse_edge_index)
        
        batch_pos_edges = batch_pos_edges.T
        batch_neg_edges = negative_sampling(edge_index, num_nodes=[num_users, num_items], num_neg_samples=batch_pos_edges.shape[1])
        
        user_indices, pos_item_indices, neg_item_indices = batch_pos_edges[0].to(device), batch_pos_edges[1].to(device), batch_neg_edges[1].to(device)
        
        users_emb_final, users_emb_0 = users_emb_final[user_indices], users_emb_0[user_indices]
        pos_items_emb_final, pos_items_emb_0 = items_emb_final[pos_item_indices], items_emb_0[pos_item_indices]
        neg_items_emb_final, neg_items_emb_0 = items_emb_final[neg_item_indices], items_emb_0[neg_item_indices]
        
        loss = bpr_loss(users_emb_final, users_emb_0, pos_items_emb_final, pos_items_emb_0, neg_items_emb_final, neg_items_emb_0, lambda_)
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0) # optional
        
        trn_loss += loss.item()
        
    trn_loss = trn_loss / len(trn_loader)
    
    if epoch != 0 and epoch % check_step == 0:
        model.eval()
        recall, precision, ndcg = evaluation(model, test_edge_index, test_sparse_edge_index, train_edge_index, K)
        score = 0.75 * recall + 0.25 * ndcg
        
        if score > max_score:
            max_score = score
            # torch.save(model.state_dict(), os.path.join(saved_path, 'LightGCN_best_score.pt'))
        model = model.to(device) # in case np.matmul is used in 'get_metrics' function instead of torch.matmul.
            
        print(f'[{epoch:03d}/{epochs}] | loss: {trn_loss:.6f} | recall@{K}: {recall:.6f} | '
              f'precision@{K}: {precision:.6f} | ndcg@{K}: {ndcg:.6f} | score: {score:.6f}')

  0%|          | 0/150 [00:00<?, ?it/s]

[010/150] | loss: 1035.721968 | recall@25: 0.152889 | precision@25: 0.037989 | ndcg@25: 0.123450 | score: 0.145529
[020/150] | loss: 548.336195 | recall@25: 0.159274 | precision@25: 0.039405 | ndcg@25: 0.126927 | score: 0.151187
[030/150] | loss: 402.808467 | recall@25: 0.158863 | precision@25: 0.039169 | ndcg@25: 0.124941 | score: 0.150382
[040/150] | loss: 337.066671 | recall@25: 0.159686 | precision@25: 0.039080 | ndcg@25: 0.123266 | score: 0.150581
[050/150] | loss: 295.500232 | recall@25: 0.158011 | precision@25: 0.038664 | ndcg@25: 0.121444 | score: 0.148869
[060/150] | loss: 273.508620 | recall@25: 0.158883 | precision@25: 0.038755 | ndcg@25: 0.120868 | score: 0.149379
[070/150] | loss: 259.865804 | recall@25: 0.158319 | precision@25: 0.038449 | ndcg@25: 0.119739 | score: 0.148674
[080/150] | loss: 248.414511 | recall@25: 0.157474 | precision@25: 0.038181 | ndcg@25: 0.117385 | score: 0.147452
[090/150] | loss: 240.233991 | recall@25: 0.156215 | precision@25: 0.037862 | ndcg@25: 

#### 추천 수행

In [72]:
# model.load_state_dict(torch.load(os.path.join(saved_path, 'LightGCN_best_score.pt')))

In [73]:
edge_index = edge_index.to(device)

In [74]:
model.eval()
user_pos_items = get_user_positive_items(edge_index)

In [75]:
user_to_idx = {}
with open(user_list) as f:
    for row in f.readlines():
        user_id, idx = row.strip('\n').split(' ')
        user_to_idx[int(user_id)] = int(idx)

In [76]:
idx_to_item = {}
with open(item_list) as f:
    for row in f.readlines():
        item_id, idx = row.strip('\n').split(' ')
        idx_to_item[int(idx)] = int(item_id)

In [77]:
user_id = 5258
rec_k = 10

In [78]:
def make_predictions(user_id, num_recs):
    user = user_to_idx[user_id]
    # e_u = model.users_emb.weight[user]
    # i_u = model.items_emb.weight
    e_u = np.array(model.users_emb.weight.cpu().detach().numpy())[user] # detach if error occured
    i_u = np.array(model.items_emb.weight.cpu().detach().numpy())
    # scores = model.items_emb.weight @ e_u
    scores = torch.tensor(i_u @ e_u)

    exclude_items = user_pos_items[user]
    scores[exclude_items] = -(1 << 10)
    values, indices = torch.topk(scores, k=num_recs)
    rec_item_list = [str(idx_to_item[i]) for i in indices.cpu().numpy()]
    return rec_item_list

In [79]:
rec_item_list = ', '.join(make_predictions(user_id, rec_k))

In [80]:
print(f'{rec_k} Recommended items for user {user_id}:\n\n{rec_item_list}')

10 Recommended items for user 5258:

207515, 57155, 9116, 21557, 15400, 960336, 24860, 19855, 10082, 9310
