In [111]:
import os
import pandas as pd
import numpy as np
import random
import scipy.sparse as sp
import argparse

from sklearn.preprocessing import LabelEncoder

import torch
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

# 전처리

In [112]:
path = '../Data/Movie_Lens_100k'

In [113]:
train_df = pd.read_csv(os.path.join(path, 'ua.base'), sep = '\t', names = ['user_id', 'movie_id', 'rating', 'timestamp'])
test_df = pd.read_csv(os.path.join(path, 'ua.test'), sep = '\t', names = ['user_id', 'movie_id', 'rating', 'timestamp'])

In [114]:
test_only_movie = list(set(test_df['movie_id'].unique().flatten()) - set(train_df['movie_id'].unique().flatten()))
test_df = test_df[~test_df['movie_id'].isin(test_only_movie)]

### Label Encoder

In [115]:
user_le = LabelEncoder()
movie_le = LabelEncoder()

user_le.fit(train_df['user_id'])
movie_le.fit(train_df['movie_id'])

train_df['user_id'] = user_le.transform(train_df['user_id'])
train_df['movie_id'] = movie_le.transform(train_df['movie_id'])

test_df['user_id'] = user_le.transform(test_df['user_id'])
test_df['movie_id'] = movie_le.transform(test_df['movie_id'])

In [116]:
train_df.head()

Unnamed: 0,user_id,movie_id,rating,timestamp
0,0,0,5,874965758
1,0,1,3,876893171
2,0,2,4,878542960
3,0,3,3,876893119
4,0,4,3,889751712


# NGCF

In [265]:
class NGCF(nn.Module):
    def __init__(self, n_user, n_item, norm_adj, args):
        super(NGCF, self).__init__()
        self.n_user = n_user
        self.n_item = n_item
        self.device = args.device
        self.emb_size = args.embed_size
        self.batch_size = args.batch_size
        
        self.node_dropout = args.node_dropout
        self.mess_dropout = args.mess_dropout
        self.batch_size = args.batch_size
        
        self.norm_adj = norm_adj
        
        self.layers = eval(args.layer_size)
        self.decay = eval(args.regs)[0]
        
        # Init the weight of user-item.
        self.embedding_dict, self.weight_dict = self.init_weight()
        
        # Get sparse adj
        self.sparse_norm_adj = self._convert_sp_mat_to_sp_tensor(self.norm_adj).to(self.device)
        
    def init_weight(self):
        #xavier init
        initializer = nn.init.xavier_uniform_
        
        embedding_dict = nn.ParameterDict({
            'user_emb' : nn.Parameter(initializer(torch.empty(self.n_user, self.emb_size))),
            'item_emb' : nn.Parameter(initializer(torch.empty(self.n_item, self.emb_size)))
        })
        
        weight_dict = nn.ParameterDict()
        layers = [self.emb_size] + self.layers
        for k in range(len(self.layers)):
            weight_dict.update({'W_gu_%d'%k: nn.Parameter(initializer(torch.empty(layers[k], layers[k + 1])))})
            weight_dict.update({'b_gu_%d'%k: nn.Parameter(initializer(torch.empty(1, layers[k + 1])))})
            weight_dict.update({'W_bi_%d'%k: nn.Parameter(initializer(torch.empty(layers[k], layers[k + 1])))})
            weight_dict.update({'b_bi_%d'%k: nn.Parameter(initializer(torch.empty(1, layers[k + 1])))})
            
        return embedding_dict, weight_dict
    
    def _convert_sp_mat_to_sp_tensor(self, X):
        coo = X.tocoo()
        i = torch.LongTensor([coo.row, coo.col])
        v = torch.from_numpy(coo.data).float()
        return torch.sparse.FloatTensor(i, v, coo.shape)
    
    def sparse_dropout(self, x, rate, noise_shape):       
        rate = rate[0]
        
        random_tensor = 1 - rate
        random_tensor += torch.rand(noise_shape).to(x.device)
        dropout_mask = torch.floor(random_tensor).type(torch.bool)
        i = x._indices()
        v = x._values()
        
        i = i[:, dropout_mask]
        v = v[dropout_mask]
        
        out = torch.sparse.FloatTensor(i, v, x.shape).to(x.device)
        return out * (1. / (1 - rate))
    
    def loss(self, users_embedding, items_embedding, labels):
        scores = torch.sum(torch.mul(users_embedding, items_embedding), axis = 1)           
        loss_value = nn.MSELoss()(scores, labels)
        return loss_value
        
    def rating(self, user_embedding, item_embedding):
        return torch.matmul(user_embedding, item_embedding.t()).item()
    
    def forward(self, users, items, drop_flag = True):
        A_hat = self.sparse_dropout(self.sparse_norm_adj, 
                                    self.node_dropout, 
                                    self.sparse_norm_adj._nnz()) if drop_flag else self.sparse_norm_adj
        
        ego_embeddings = torch.cat([self.embedding_dict['user_emb'], 
                                    self.embedding_dict['item_emb']], 0)
        
        all_embeddings = [ego_embeddings]
        
        for k in range(len(self.layers)):
            side_embeddings = torch.sparse.mm(A_hat, ego_embeddings)
            
            sum_embeddings = torch.matmul(side_embeddings, self.weight_dict['W_gu_%d' %k]) + self.weight_dict['b_gu_%d' %k]
            bi_embeddings = torch.mul(ego_embeddings, side_embeddings)
            bi_embeddings = torch.matmul(bi_embeddings, self.weight_dict['W_bi_%d' %k]) + self.weight_dict['b_bi_%d' %k]
            
            ego_embeddings = nn.LeakyReLU(negative_slope = 0.2)(sum_embeddings + bi_embeddings)
            
            ego_embeddings = nn.Dropout(self.mess_dropout[k])(ego_embeddings)
            
            norm_embeddings = F.normalize(ego_embeddings, p = 2, dim = 1)
            all_embeddings += [norm_embeddings]
            
        all_embeddings = torch.cat(all_embeddings, 1)
        u_g_embeddings = all_embeddings[:self.n_user, :]
        i_g_embeddings = all_embeddings[self.n_user:, :]
        
        u_g_embeddings = u_g_embeddings[users, :]
        i_g_embeddings = i_g_embeddings[items, :]
        
        return u_g_embeddings, i_g_embeddings                        

### Graph class

In [266]:
class Graph(object):
    def __init__(self, train_df, test_df, user_le, item_le, batch_size):
#         self.path = path
        self.batch_size = batch_size
       
        self.user_le = user_le
        self.item_le = item_le
        
        self.n_users = len(train_df['user_id'].unique())
        self.n_items = len(train_df['movie_id'].unique())
        
        self.n_train = len(train_df)
        self.n_test = len(test_df)
        
        self.R = sp.dok_matrix((self.n_users, self.n_items), dtype = np.float32)
        
        self.train_items, self.test_set = {}, {}
        self.exist_users = list(train_df['user_id'].unique())
        
        for i, row in train_df.iterrows():
            
            # 여기에 rating 값을 넣어야 하는지 생갈할 필요가 있다.
            self.R[row['user_id'], row['movie_id']] = 1
        
        for i, row in enumerate(train_df.groupby(['user_id'])['movie_id'].unique()):
            self.train_items[i] = row
            
        for i, row in enumerate(test_df.groupby(['user_id'])['movie_id'].unique()):
            self.test_set[i] = row
        
    def get_adj_mat(self):
        adj_mat, norm_adj_mat, mean_adj_mat = self.create_adj_mat()
        return adj_mat, norm_adj_mat, mean_adj_mat
    
    def create_adj_mat(self):
        adj_mat = sp.dok_matrix((self.n_users + self.n_items, self.n_users + self.n_items), dtype = np.float32)
        adj_mat = adj_mat.tolil()
        R = self.R.tolil()
        
        adj_mat[:self.n_users, self.n_users:] = R
        adj_mat[self.n_users:, :self.n_users] = R.T
        adj_mat = adj_mat.todok()
        
        def mean_adj_single(adj):
            rowsum = np.array(adj.sum(1))
            
            d_inv = np.power(rowsum, -1).flatten()
            d_inv[np.isinf(d_inv)] = 0.
            d_mat_inv = sp.diags(d_inv)
            
            norm_adj = d_mat_inv.dot(adj)
            return norm_adj.tocoo()
        
        def normalized_adj_single(adj):
            rowsum = np.array(adj.sum(1))
            
            d_inv_sqrt = np.power(rowsum, -0.5).flatten()
            d_inv_sqrt[np.isinf(d_inv_sqrt)] = 0.
            d_mat_inv_sqrt = sp.diags(d_inv_sqrt)
            
            bi_lap = d_mat_inv_sqrt.dot(adj).dot(d_mat_inv_sqrt)
            return bi_lap.tocoo()
        
        def check_adj_if_equal(adj):
            dense_A = np.array(adj.todense())
            degree = np.sum(dense_A, axis = 1, keepdims = False)
            
            temp = np.dot(np.diag(np.power(degree, -1)), dense_A)
            return temp
        
        norm_adj_mat = mean_adj_single(adj_mat + sp.eye(adj_mat.shape[0]))
        mean_adj_mat = mean_adj_single(adj_mat)
        return adj_mat.tocsr(), norm_adj_mat.tocsr(), mean_adj_mat.tocsr()
    
    
    def get_num_users_items(self):
        return self.n_users, self.n_items        

In [267]:
class CustomDataset(Dataset):
    def __init__(self, df):
        super(Dataset, self).__init__()
        self.df = df
        
    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        
        user = torch.tensor(row[0], dtype = torch.long)
        item = torch.tensor(row[1], dtype = torch.long)
        label = torch.tensor(row[2], dtype = torch.float)
        
        return user, item, label

# Main

### argparse

In [268]:
parser = argparse.ArgumentParser(description="Run NGCF.")
parser.add_argument('--weights_path', nargs='?', default='model/',
                    help='Store model path.')
parser.add_argument('--data_path', nargs='?', default='../Data/',
                    help='Input data path.')
parser.add_argument('--proj_path', nargs='?', default='',
                    help='Project path.')

parser.add_argument('--dataset', nargs='?', default='gowalla',
                    help='Choose a dataset from {gowalla, yelp2018, amazon-book}')
parser.add_argument('--pretrain', type=int, default=0,
                    help='0: No pretrain, -1: Pretrain with the learned embeddings, 1:Pretrain with stored models.')
parser.add_argument('--verbose', type=int, default=1,
                    help='Interval of evaluation.')
parser.add_argument('--epoch', type=int, default=100,
                    help='Number of epoch.')

parser.add_argument('--embed_size', type=int, default=64,
                    help='Embedding size.')
parser.add_argument('--layer_size', nargs='?', default='[64,64,64]',
                    help='Output sizes of every layer')
parser.add_argument('--batch_size', type=int, default=1024,
                    help='Batch size.')

parser.add_argument('--regs', nargs='?', default='[1e-5]',
                    help='Regularizations.')
parser.add_argument('--lr', type=float, default=0.001,
                    help='Learning rate.')

parser.add_argument('--model_type', nargs='?', default='ngcf',
                    help='Specify the name of model (ngcf).')
parser.add_argument('--adj_type', nargs='?', default='norm',
                    help='Specify the type of the adjacency (laplacian) matrix from {plain, norm, mean}.')

parser.add_argument('--gpu_id', type=int, default=6)

parser.add_argument('--node_dropout_flag', type=int, default=1,
                    help='0: Disable node dropout, 1: Activate node dropout')
parser.add_argument('--node_dropout', nargs='?', default='[0.1]',
                    help='Keep probability w.r.t. node dropout (i.e., 1-dropout_ratio) for each deep layer. 1: no dropout.')
parser.add_argument('--mess_dropout', nargs='?', default='[0.1,0.1,0.1]',
                    help='Keep probability w.r.t. message dropout (i.e., 1-dropout_ratio) for each deep layer. 1: no dropout.')

parser.add_argument('--Ks', nargs='?', default='[20, 40, 60, 80, 100]',
                    help='Output sizes of every layer')

parser.add_argument('--save_flag', type=int, default=0,
                    help='0: Disable model saver, 1: Activate model saver')

parser.add_argument('--test_flag', nargs='?', default='part',
                    help='Specify the test type from {part, full}, indicating whether the reference is done in mini-batch')

parser.add_argument('--report', type=int, default=0,
                    help='0: Disable performance report w.r.t. sparsity levels, 1: Show performance report w.r.t. sparsity levels')

_StoreAction(option_strings=['--report'], dest='report', nargs=None, const=None, default=0, type=<class 'int'>, choices=None, help='0: Disable performance report w.r.t. sparsity levels, 1: Show performance report w.r.t. sparsity levels', metavar=None)

In [271]:
args = parser.parse_args('')
args.device = torch.device('cpu')

data_generator = Graph(train_df, test_df, user_le, movie_le, args.batch_size)

plain_adj, norm_adj, mean_adj = data_generator.get_adj_mat()

args.node_dropout = eval(args.node_dropout)
args.mess_dropout = eval(args.mess_dropout)


model = NGCF(data_generator.n_users,
            data_generator.n_items,
            norm_adj,
            args).to(args.device)


optimizer = optim.Adam(model.parameters(), lr = args.lr)

train_dataset = CustomDataset(train_df)
test_dataset = CustomDataset(test_df)

train_dataloader = DataLoader(train_dataset, batch_size = args.batch_size, shuffle = True)
test_dataloader = DataLoader(test_dataset, batch_size = args.batch_size, shuffle = True)

train_loss_loger, test_loss_loger = [], []

for epoch in range(args.epoch):
    train_loss = 0
    
    model.train()
    for users, items, labels in train_dataloader:
        users_embedding, items_embedding = model(users, items)
                
        batch_loss = model.loss(users_embedding, items_embedding, labels)
        optimizer.zero_grad()
        batch_loss.backward()
        optimizer.step()
        train_loss += batch_loss.item()/len(train_dataloader)
    
    train_loss_loger.append(train_loss)
    
    test_loss = 0
    with torch.no_grad():
        model.eval()
        for users, items, labels in test_dataloader:
            users_embedding, items_embedding = model(users, items)
            
            batch_loss = model.loss(users_embedding, items_embedding, labels)
            test_loss += batch_loss.item() / len(test_dataloader)

        test_loss_loger.append(test_loss)
        
    print('epoch : {}, train_loss : {}, test_loss : {}'.format(epoch, round(train_loss, 4), round(test_loss, 4)))

epoch : 0, train_loss : 1.9681, test_loss : 2.048
epoch : 1, train_loss : 1.8566, test_loss : 1.863
epoch : 2, train_loss : 1.3797, test_loss : 1.3629
epoch : 3, train_loss : 0.9798, test_loss : 1.0979
epoch : 4, train_loss : 0.8807, test_loss : 1.0355
epoch : 5, train_loss : 0.8261, test_loss : 0.9748
epoch : 6, train_loss : 0.7727, test_loss : 0.9554
epoch : 7, train_loss : 0.731, test_loss : 0.9175
epoch : 8, train_loss : 0.6983, test_loss : 0.918
epoch : 9, train_loss : 0.6659, test_loss : 0.9037
epoch : 10, train_loss : 0.6331, test_loss : 0.8987
epoch : 11, train_loss : 0.6022, test_loss : 0.8863
epoch : 12, train_loss : 0.5739, test_loss : 0.8918
epoch : 13, train_loss : 0.5458, test_loss : 0.8927
epoch : 14, train_loss : 0.5177, test_loss : 0.8775
epoch : 15, train_loss : 0.4927, test_loss : 0.8921
epoch : 16, train_loss : 0.4681, test_loss : 0.8939
epoch : 17, train_loss : 0.4439, test_loss : 0.8943
epoch : 18, train_loss : 0.4229, test_loss : 0.8949
epoch : 19, train_loss : 0

KeyboardInterrupt: 