#REQUISITOS NECESSÁRIOS


In [None]:
# @title BIBLIOTECAS

import torch
from torch.utils.data import DataLoader, Dataset
from torch.utils.tensorboard import SummaryWriter
from IPython import embed
from sklearn.metrics import roc_auc_score
import pandas as pd
import numpy as np
import csv
import os
import scipy.sparse as sp
from tqdm import tqdm, trange
import gzip
import json
from statistics import mean
import math

def format_pytorch_version(version):
    """Return the release part of a torch version string.

    Everything from the first ``'+'`` onward (the local build tag, e.g.
    ``'+cu118'``) is dropped; a string without ``'+'`` is returned unchanged.
    """
    release, _, _ = version.partition('+')
    return release

def format_cuda_version(version):
    """Return the wheel tag for a CUDA version string.

    ``'11.8'`` becomes ``'cu118'``. ``torch.version.cuda`` is ``None`` on
    CPU-only builds of torch; in that case ``'cpu'`` is returned instead of
    failing on ``None.replace``.
    """
    if version is None:
        return 'cpu'
    return 'cu' + version.replace('.', '')

# Version tags of the installed torch build; TORCH and CUDA are interpolated
# into the wheel-index URLs of the PyTorch Geometric install cell.
TORCH_version = torch.__version__
TORCH = format_pytorch_version(TORCH_version)
# torch.version.cuda is None on CPU-only builds, so fall back to the 'cpu' tag
# instead of passing None to the formatter.
CUDA_version = torch.version.cuda
CUDA = format_cuda_version(CUDA_version) if CUDA_version is not None else 'cpu'
# Run on the GPU when one is visible to torch, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# @title TENSORBOARD

# Load the TensorBoard notebook extension (IPython magic, not plain Python).
%load_ext tensorboard

# Root directory for the TensorBoard event files; created if it is missing.
logs_base_dir = "runs"
os.makedirs(logs_base_dir, exist_ok=True)

# One writer per model so each model logs into its own sub-directory
# (runs/runs_FM/ and runs/runs_FM_with_GCN/).
tb_fm = SummaryWriter(log_dir=f'{logs_base_dir}/{logs_base_dir}_FM/')
tb_gcn = SummaryWriter(log_dir=f'{logs_base_dir}/{logs_base_dir}_FM_with_GCN/')

In [None]:
# @title FUNÇÕES

def parse(path):
  """Yield one decoded JSON object per line of a gzip-compressed file.

  The file is opened in binary mode and closed automatically once the
  generator is exhausted or closed, so the handle is not leaked.
  """
  with gzip.open(path, 'rb') as g:
    for l in g:
      yield json.loads(l)

def getDF(path):
  """Load every record produced by ``parse(path)`` into a DataFrame.

  Records are keyed by their position in the file (0, 1, 2, ...), which
  becomes the index of the returned frame.
  """
  records = dict(enumerate(parse(path)))
  return pd.DataFrame.from_dict(records, orient='index')


def convert_to_numeric(values):
    """Map each value to the rank of that value among the sorted unique values.

    The ranks are returned as strings, zero-padded to the number of digits in
    the count of unique values, in the same order as ``values``.
    """
    ordered = sorted(set(values))
    width = len(str(len(ordered)))
    codes = {}
    for position, label in enumerate(ordered):
        codes[label] = str(position).zfill(width)
    return [codes[label] for label in values]

def split_train_test(data, n_users):
    '''
    Split the interactions into train and test sets, user by user.

    For every user id in ``range(n_users)`` the rows whose first column equals
    that id are sorted by the last column (the timestamp). The row with the
    largest timestamp goes to the test set and the rest to the train set; a
    user with a single row contributes it to the train set only. The last
    column is dropped from every returned row.

    User ids that have no rows at all are skipped (previously they raised an
    IndexError when the last row was taken from an empty selection).

    Returns a tuple ``(train, test)`` of 2-D arrays.
    '''
    train_x, test_x = [], []
    for u in trange(n_users, desc='separando treino/teste e removendo timestamp...'):
        user_data = data[data[:, 0] == u]
        if len(user_data) == 0:
            # Nothing to split for an id that never appears in the data.
            continue
        sorted_data = user_data[user_data[:, -1].argsort()]
        if len(sorted_data) == 1:
            train_x.append(sorted_data[0][:-1])
        else:
            train_x.append(sorted_data[:-1][:, :-1])
            test_x.append(sorted_data[-1][:-1])
    return np.vstack(train_x), np.stack(test_x)

def build_adj_mx(n_feat, data):
    '''
    Build a symmetric ``n_feat x n_feat`` adjacency matrix from the rows of ``data``.

    For each row, the entries linking column 0 and column 1 are set to 1.0 in
    both directions. When ``data`` has more than two columns, every further
    column value is linked (both directions) to column 0 and to column 1.
    Returns a float32 ``scipy.sparse.dok_matrix``.
    '''
    adjacency = sp.dok_matrix((n_feat, n_feat), dtype=np.float32)
    for row in tqdm(data, desc=f"Construindo matrix de adjacência..."):
        first, second = row[0], row[1]
        adjacency[first, second] = 1.0
        adjacency[second, first] = 1.0
        if data.shape[1] <= 2:
            continue
        for extra in row[2:]:
            adjacency[first, extra] = 1.0
            adjacency[second, extra] = 1.0
            adjacency[extra, first] = 1.0
            adjacency[extra, second] = 1.0
    return adjacency

def ng_sample(data, dims, num_ng=4):
    '''
    Add randomly sampled negative interactions to ``data``.

    An adjacency matrix is built from ``data`` first. Each original row is then
    emitted with a trailing label 1, followed by ``num_ng`` copies in which
    column 1 is replaced by a random id drawn from ``[dims[0], dims[1])`` that
    is not linked to the row's column-0 id in the adjacency matrix; these
    copies get a trailing label 0.

    Returns ``(interactions, rating_mat)``: the stacked labelled rows and the
    adjacency matrix.
    '''
    rating_mat = build_adj_mx(dims[-1], data)
    interactions = []
    min_item, max_item = dims[0], dims[1]
    # Iterate over `data` directly (not enumerate(...)) so tqdm can read its
    # length and display a total / ETA.
    for x in tqdm(data, desc='performando amostragem negativa...'):
        interactions.append(np.append(x, 1))
        for _ in range(num_ng):
            j = np.random.randint(min_item, max_item)
            # NOTE(review): this retry loop never ends if x[0] is linked to
            # every id in [min_item, max_item) — confirm that cannot happen.
            while (x[0], j) in rating_mat or j == int(x[1]):
                j = np.random.randint(min_item, max_item)
            interactions.append(np.concatenate([[x[0], j], x[2:], [0]]))
    return np.vstack(interactions), rating_mat

def build_test_set(itemsnoninteracted, gt_test_interactions):
    '''
    Build one candidate array per test interaction.

    The i-th test row is paired with the i-th entry of ``itemsnoninteracted``.
    Each resulting array has the test row itself as row 0, followed by one copy
    of that row per candidate with column 1 replaced by the candidate id. The
    test row's own column-1 id is removed from the candidates first.
    Returns a list of 2-D arrays.
    '''
    per_row_candidates = []
    pairs = zip(gt_test_interactions, itemsnoninteracted)
    for positive, candidates in tqdm(pairs, desc="CONSTRUINDO CONJUNTO DE TESTE..."):
        # Drop the ground-truth id so it only appears in row 0.
        hits = np.flatnonzero(candidates == positive[1])
        candidates = np.delete(candidates, hits)
        block = np.vstack([positive] * (len(candidates) + 1))
        block[1:, 1] = candidates
        per_row_candidates.append(block)
    return per_row_candidates

#PREPARANDO OS DADOS

In [None]:
# @title BAIXANDO OS DADOS

# Download eight gzip-compressed Amazon review category files with wget
# (IPython shell escapes); wget saves them into the current working directory.
# NOTE(review): the DATASET cell reads the files from /content/, so this
# presumably assumes the notebook runs with /content as working directory
# (e.g. Colab) — confirm.
!wget https://datarepo.eng.ucsd.edu/mcauley_group/data/amazon_v2/categoryFilesSmall/AMAZON_FASHION_5.json.gz
!wget https://datarepo.eng.ucsd.edu/mcauley_group/data/amazon_v2/categoryFilesSmall/All_Beauty_5.json.gz
!wget https://datarepo.eng.ucsd.edu/mcauley_group/data/amazon_v2/categoryFilesSmall/Appliances_5.json.gz
!wget https://datarepo.eng.ucsd.edu/mcauley_group/data/amazon_v2/categoryFilesSmall/Gift_Cards_5.json.gz
!wget https://datarepo.eng.ucsd.edu/mcauley_group/data/amazon_v2/categoryFilesSmall/Industrial_and_Scientific_5.json.gz
!wget https://datarepo.eng.ucsd.edu/mcauley_group/data/amazon_v2/categoryFilesSmall/Luxury_Beauty_5.json.gz
!wget https://datarepo.eng.ucsd.edu/mcauley_group/data/amazon_v2/categoryFilesSmall/Magazine_Subscriptions_5.json.gz
!wget https://datarepo.eng.ucsd.edu/mcauley_group/data/amazon_v2/categoryFilesSmall/Software_5.json.gz

In [None]:
# @title DATASET

# Paths of the downloaded review files, one per product category.
file_paths = ['/content/AMAZON_FASHION_5.json.gz', '/content/All_Beauty_5.json.gz', '/content/Appliances_5.json.gz',
              '/content/Gift_Cards_5.json.gz', '/content/Industrial_and_Scientific_5.json.gz', '/content/Luxury_Beauty_5.json.gz', '/content/Magazine_Subscriptions_5.json.gz',
              '/content/Software_5.json.gz']

dataframes = []

# Load each file into its own DataFrame.
for file_path in file_paths:
    a = getDF(file_path)
    dataframes.append(a)

# Stack all categories into a single frame with a fresh 0..n-1 index.
df = pd.concat(dataframes, ignore_index=True)

# Keep only the four columns used below and give them short names.
df = df[['reviewerID', 'asin', 'overall', 'unixReviewTime']]
df.rename(columns={'reviewerID': 'user', 'asin': 'item', 'overall': 'rating', 'unixReviewTime': 'timestamp'}, inplace=True)
# convert_to_numeric returns zero-padded strings, hence the cast to int afterwards.
df['user'] = convert_to_numeric(df['user'])
df['item'] = convert_to_numeric(df['item'])
df['user'] = df['user'].astype(int)
df['item'] = df['item'].astype(int)
# Keep only the rows of users whose numeric id lies in 0..5000 (both ends included).
df = df[df['user'].between(0, 5000)]
# Without drop=True the previous index is kept as an extra 'index' column;
# it is not selected below.
df.reset_index(inplace=True)
# Interaction matrix with columns (user, item, timestamp); the rating is not used.
data = df[['user', 'item', 'timestamp']].astype('int32').to_numpy()

# Re-base every id column (all columns except the last one, the timestamp) so
# that it starts at 0 and then shift it past the previous column's range:
# user ids keep starting at 0 and item ids start right after the largest user id.
add_dims=0
for i in range(data.shape[1] - 1):
    data[:, i] -= np.min(data[:, i])
    data[:, i] += add_dims
    add_dims = np.max(data[:, i]) + 1
# Per-column maximum + 1: dims[0] is the end of the user id range and dims[1]
# the end of the item id range (i.e. the total number of ids).
dims = np.max(data, axis=0) + 1

In [None]:
# @title SEPARANDO TREINO / TESTE

# Per-user split; dims[0] is passed as the number of user ids to iterate over.
train_x, test_x = split_train_test(data, dims[0])

# Sanity check: every interaction ended up in exactly one of the two sets.
assert train_x.shape[0] + test_x.shape[0] == len(data)

In [None]:
# @title AMOSTRAGEM NEGATIVA

# Keep only the first two columns of the training rows and the matching two
# entries of dims before sampling.
train_x = train_x[:, :2]
dims = dims[:2]

# train_x now holds labelled positive and negative rows; rating_mat is the
# adjacency matrix built from the positive training rows.
train_x, rating_mat = ng_sample(train_x, dims)

In [None]:
# @title CLASSE DO DATASET

class PointData(Dataset):
    """Dataset over labelled interaction rows.

    Each row of ``data`` holds the features followed by the label in the last
    position; ``dims`` is stored unchanged on the instance.
    """

    def __init__(self, data, dims):
        super().__init__()
        self.interactions = data
        self.dims = dims

    def __len__(self):
        """Return the number of interaction rows."""
        return len(self.interactions)

    def __getitem__(self, index):
        """Return ``(features, label)`` for the row at ``index``."""
        row = self.interactions[index]
        features, label = row[:-1], row[-1]
        return features, label

# Wrap the labelled training rows; dims is kept on the dataset for the model cells.
train_dataset = PointData(train_x, dims)

In [None]:
# @title PREPARANDO O CONJUNTO DE TESTE PARA INFERÊNCIA

# Work row by row on a CSR copy of the adjacency matrix instead of densifying
# it: `rating_mat.A` is no longer available in recent SciPy releases, and the
# dense n x n array plus the array of all its zero positions is very
# memory-hungry.
rating_csr = rating_mat.tocsr()
rating_csr.eliminate_zeros()  # stored entries are then exactly the non-zero ones

# Candidate ids are the columns from dims[0] up to the matrix size (the item range).
candidate_items = np.arange(dims[0], rating_csr.shape[0])

# One array of non-interacted item ids per TEST ROW (not per user id), so the
# positional pairing done by build_test_set stays aligned even when some users
# have no test row (split_train_test gives none to single-interaction users).
items2compute = []
for user in tqdm(test_x[:, 0]):
    seen = rating_csr.indices[rating_csr.indptr[user]:rating_csr.indptr[user + 1]]
    items2compute.append(np.setdiff1d(candidate_items, seen))

test_x = build_test_set(items2compute, test_x)

#MODELO BASE

In [None]:
# @title FM

class FM_operation(torch.nn.Module):
    """Second-order interaction term of a factorization machine.

    Computes ``0.5 * ((sum over dim 1)^2 - sum over dim 1 of squares)``; with
    ``reduce_sum=True`` the result is additionally summed over dim 1 of the
    remaining tensor, keeping that dimension with size 1.
    """

    def __init__(self, reduce_sum=True):
        super().__init__()
        self.reduce_sum = reduce_sum

    def forward(self, x):
        """Return the interaction term for ``x``.

        Presumably ``x`` is ``(batch, num_fields, embed_dim)`` — TODO confirm.
        """
        summed_then_squared = x.sum(dim=1).pow(2)
        squared_then_summed = x.pow(2).sum(dim=1)
        interaction = summed_then_squared - squared_then_summed
        if not self.reduce_sum:
            return 0.5 * interaction
        return 0.5 * interaction.sum(dim=1, keepdim=True)

class FactorizationMachineModel(torch.nn.Module):
    """Factorization machine over index-encoded interactions.

    The score is the sum of a linear layer applied to the float-cast input
    indices and the FM interaction term of their embeddings. All fields share
    one embedding table with ``field_dims[-1]`` rows.
    """

    def __init__(self, field_dims, embed_dim):
        super().__init__()
        self.linear = torch.nn.Linear(len(field_dims), 1)
        self.embedding = torch.nn.Embedding(field_dims[-1], embed_dim)
        self.fm = FM_operation(reduce_sum=True)

        torch.nn.init.xavier_uniform_(self.embedding.weight.data)

    def forward(self, interaction_pairs):
        """Return one raw score (logit) per row of ``interaction_pairs``.

        Assumes ``interaction_pairs`` is a long tensor of shape
        ``(batch, len(field_dims))`` — TODO confirm.
        """
        out = self.linear(interaction_pairs.float()) + self.fm(self.embedding(interaction_pairs))
        return out.squeeze(1)

    def predict(self, interactions, device):
        """Score a NumPy array of interactions for inference.

        The array is converted to a long tensor on ``device`` and scored under
        ``torch.no_grad()`` so no autograd graph is built for evaluation.
        """
        test_interactions = torch.from_numpy(interactions).to(dtype=torch.long, device=device)
        with torch.no_grad():
            output_scores = self.forward(test_interactions)
        return output_scores

#PIPELINE

In [None]:
# @title MÉTRICAS

def getHitRatio(recommend_list, gt_item):
    """Return 1 if ``gt_item`` occurs in ``recommend_list``, otherwise 0."""
    return int(gt_item in recommend_list)

def getNDCG(recommend_list, gt_item):
    """Return the NDCG contribution of ``gt_item`` in ``recommend_list``.

    The value is ``log(2) / log(rank + 2)`` where ``rank`` is the 0-based
    position of the first occurrence of ``gt_item``; 0 is returned when the
    item is absent.

    The first matching position is taken as a plain integer: passing the whole
    index array to ``math.log`` relies on array-to-scalar conversion, which is
    deprecated in NumPy and fails outright when the item appears more than once.
    """
    positions = np.where(np.asarray(recommend_list) == gt_item)[0]
    if len(positions) > 0:
        return math.log(2) / math.log(int(positions[0]) + 2)
    else:
        return 0

In [None]:
# @title TREINO E TESTE

def train_one_epoch(model, optimizer, data_loader, criterion, device):
    """Run one optimisation pass over ``data_loader`` and return the mean batch loss.

    For every ``(interactions, targets)`` batch the tensors are moved to
    ``device``, the model output is compared with the float-cast targets by
    ``criterion``, gradients are reset on the model, back-propagated, and the
    optimizer takes one step.
    """
    model.train()
    batch_losses = []

    for batch_inputs, batch_targets in data_loader:
        batch_inputs = batch_inputs.to(device)
        batch_targets = batch_targets.to(device)

        loss = criterion(model(batch_inputs), batch_targets.float())

        model.zero_grad()
        loss.backward()
        optimizer.step()

        batch_losses.append(loss.item())

    return mean(batch_losses)


def test(model, test_x, device, topk=10):
    """Evaluate ``model`` on the per-row candidate sets and return ``(HR, NDCG)``.

    Each element of ``test_x`` is a 2-D array whose first row is the
    ground-truth interaction; column 1 holds the candidate item id. The
    candidates are scored with ``model.predict``, the ``topk`` best are kept
    and hit ratio / NDCG are computed for the ground-truth item. The two
    returned values are the means over all elements of ``test_x``.

    Scoring runs under ``torch.no_grad()`` so evaluation does not build an
    autograd graph, and ``topk`` is capped at the number of candidates so a
    short candidate list does not make ``torch.topk`` fail.
    """
    model.eval()

    HR, NDCG = [], []
    with torch.no_grad():
        for user_test in test_x:
            gt_item = user_test[0][1]
            predictions = model.predict(user_test, device)
            k = min(topk, predictions.shape[0])
            _, indices = torch.topk(predictions, k)
            recommend_list = user_test[indices.cpu().detach().numpy()][:, 1]

            HR.append(getHitRatio(recommend_list, gt_item))
            NDCG.append(getNDCG(recommend_list, gt_item))
    return mean(HR), mean(NDCG)


def train_and_test(topk, model, optimizer, criterion, data_loader, device, test_x, tb, epochs=10):
    """Train ``model`` for ``epochs`` epochs, evaluating and logging after each one.

    After every epoch the mean training loss, HR@topk and NDCG@topk are written
    to the TensorBoard writer ``tb`` with the epoch index as step.

    ``epochs`` defaults to 10, the number of epochs that used to be hard-coded.
    The metric tags are f-strings so ``topk`` is actually substituted
    (previously the literal text ``{topk}`` ended up in the tag).
    """
    for epoch_i in trange(epochs):
        train_loss = train_one_epoch(model, optimizer, data_loader, criterion, device)
        hr, ndcg = test(model, test_x, device, topk=topk)
        tb.add_scalar('train/loss', train_loss, epoch_i)
        tb.add_scalar(f'eval/HR@{topk}', hr, epoch_i)
        tb.add_scalar(f'eval/NDCG@{topk}', ndcg, epoch_i)




In [None]:
# @title OPTIMIZER

# Reuse the field dimensions stored on the training dataset.
dims = train_dataset.dims
# Base FM with 32-dimensional embeddings, moved to the selected device.
model = FactorizationMachineModel(dims, 32).to(device)
# The model returns raw scores; BCEWithLogitsLoss applies the sigmoid itself.
criterion = torch.nn.BCEWithLogitsLoss(reduction='mean')
optimizer = torch.optim.Adam(params=model.parameters(), lr=0.001)

# Shuffled mini-batches of 256 rows, loaded in the main process (num_workers=0).
data_loader = DataLoader(train_dataset, batch_size=256, shuffle=True, num_workers=0)

In [None]:
# @title TREINAMENTO DO MODELO BASE
# Train and evaluate the base FM; the first argument (10) is topk, and metrics
# are logged through the tb_fm writer.
train_and_test(10, model, optimizer, criterion, data_loader, device, test_x, tb_fm)

#MODELO GCN

In [None]:
# @title PYTORCH GEOMETRIC
# Install PyTorch Geometric and its companion packages (IPython shell escapes).
# {TORCH} and {CUDA} are filled in from the notebook globals of the same name
# so that pip looks up the wheel index matching the installed torch build.
!pip install torch-scatter -f https://pytorch-geometric.com/whl/torch-{TORCH}+{CUDA}.html
!pip install torch-sparse -f https://pytorch-geometric.com/whl/torch-{TORCH}+{CUDA}.html
!pip install torch-cluster -f https://pytorch-geometric.com/whl/torch-{TORCH}+{CUDA}.html
!pip install torch-spline-conv -f https://pytorch-geometric.com/whl/torch-{TORCH}+{CUDA}.html
!pip install torch-geometric

In [None]:
# @title MATRIZ ESPARSA

from torch_geometric.utils import from_scipy_sparse_matrix
from scipy.sparse import identity

# Convert the SciPy adjacency matrix into edge-index and edge-attribute tensors.
edge_idx, edge_attr = from_scipy_sparse_matrix(rating_mat)

def sparse_mx_to_torch_sparse_tensor(sparse_mx):
    """Convert a SciPy sparse matrix into a float32 torch sparse COO tensor.

    The matrix is converted to COO format; its row/column indices become the
    int64 index tensor and its data the float32 values. The tensor is built
    with ``torch.sparse_coo_tensor`` rather than the deprecated
    ``torch.sparse.FloatTensor`` constructor.
    """
    coo = sparse_mx.tocoo().astype(np.float32)
    indices = torch.from_numpy(
        np.vstack((coo.row, coo.col)).astype(np.int64))
    values = torch.from_numpy(coo.data)
    shape = torch.Size(coo.shape)
    return torch.sparse_coo_tensor(indices, values, shape)

# Node feature matrix: a sparse identity with one row per row of rating_mat.
X = sparse_mx_to_torch_sparse_tensor(identity(rating_mat.shape[0]))


In [None]:
# @title CAMADA GCN

from torch_geometric.nn import GCNConv, GATConv

class GCELayer(torch.nn.Module):
    """Embedding layer backed by a graph convolution.

    Holds the node features and the graph passed at construction and applies
    either a ``GATConv`` (``attention=True``, 8 heads, dropout 0.4) or a
    ``GCNConv`` to them on every forward pass.
    """

    def __init__(self, field_dims, embed_dim, features, train_mat, attention=False):
        super().__init__()

        # Stored as plain attributes, so the caller is responsible for placing
        # them on the right device.
        self.A = train_mat
        self.features = features
        self.GCN_module = (
            GATConv(int(field_dims), embed_dim, heads=8, dropout=0.4)
            if attention
            else GCNConv(field_dims, embed_dim)
        )

    def forward(self, x):
        """
        Run the convolution over the stored features and graph, then select
        the output rows indexed by ``x``.

        :param x: Long tensor of size ``(batch_size, num_fields)``
        """
        node_embeddings = self.GCN_module(self.features, self.A)
        return node_embeddings[x]


In [None]:
# @title FM + GCN

class FactorizationMachineModel_withGCN(torch.nn.Module):
    """Factorization machine whose embeddings come from a graph convolution.

    Same scoring as the base FM (linear layer on the float-cast input indices
    plus the FM interaction term), but the embeddings are produced by a
    ``GCELayer`` built from the node features ``X`` and the graph ``A``.
    """

    def __init__(self, field_dims, embed_dim, X, A, attention=False):
        super().__init__()
        self.linear = torch.nn.Linear(len(field_dims), 1)
        self.embedding = GCELayer(field_dims[-1], embed_dim, X, A, attention=attention)
        self.fm = FM_operation(reduce_sum=True)

    def forward(self, interaction_pairs):
        """Return one raw score (logit) per row of ``interaction_pairs``.

        Assumes ``interaction_pairs`` is a long tensor of shape
        ``(batch, len(field_dims))`` — TODO confirm.
        """
        out = self.linear(interaction_pairs.float()) + self.fm(self.embedding(interaction_pairs))
        return out.squeeze(1)

    def predict(self, interactions, device):
        """Score a NumPy array of interactions for inference.

        The array is converted to a long tensor on ``device`` and scored under
        ``torch.no_grad()`` so no autograd graph is built for evaluation.
        """
        test_interactions = torch.from_numpy(interactions).to(dtype=torch.long, device=device)
        with torch.no_grad():
            output_scores = self.forward(test_interactions)
        return output_scores



In [None]:
# @title OTIMIZADOR
# FM + GCN model with 32-dimensional embeddings. The feature matrix and the
# edge index are moved to the device explicitly before being handed over.
model_gcn = FactorizationMachineModel_withGCN(train_dataset.dims,
                                              32,
                                              X.to(device),
                                              edge_idx.to(device),
                                              ).to(device)

# Same loss as the base model; note the learning rate here is 0.01
# (the base model cell uses 0.001).
gcn_criterion = torch.nn.BCEWithLogitsLoss(reduction='mean')
gcn_optimizer = torch.optim.Adam(params=model_gcn.parameters(), lr=0.01)



In [None]:
# @title TREINAMENTO DO MODELO GCN
# Train and evaluate the FM + GCN model on the same data loader and test set
# as the base model; the first argument (10) is topk, metrics go to tb_gcn.
train_and_test(10, model_gcn, gcn_optimizer, gcn_criterion, data_loader, device, test_x, tb_gcn)

#RESULTADOS

In [None]:
# @title VISUALIZANDO RESULTADOS COM TENSORBOARD
# Open TensorBoard inline (IPython magic) on the "runs" directory that both
# writers log into.
%tensorboard --logdir runs