In [1]:
import os
import pandas as pd
import numpy as np
import scipy.sparse as sp
import argparse

from sklearn.preprocessing import LabelEncoder

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

# 전처리

In [2]:
path = '../Data/Movie_Lens_100k'

In [8]:
train_df = pd.read_csv(os.path.join(path, 'ua.base'), sep = '\t', names = ['user_id', 'movie_id', 'rating', 'timestamp'])
test_df = pd.read_csv(os.path.join(path, 'ua.test'), sep = '\t', names = ['user_id', 'movie_id', 'rating', 'timestamp'])

In [9]:
test_only_movie = list(set(test_df['movie_id'].unique().flatten()) - set(train_df['movie_id'].unique().flatten()))
test_df = test_df[~test_df['movie_id'].isin(test_only_movie)]

### Label Encoder

In [10]:
user_le = LabelEncoder()
movie_le = LabelEncoder()

user_le.fit(train_df['user_id'])
movie_le.fit(train_df['movie_id'])

train_df['movie_id'] = movie_le.transform(train_df['movie_id'])
train_df['user_id'] = user_le.transform(train_df['user_id'])

test_df['movie_id'] = movie_le.transform(test_df['movie_id'])
test_df['user_id'] = user_le.transform(test_df['user_id'])

In [43]:
train_df[['user_id', 'movie_id']].values

array([[   0,    0],
       [   0,    1],
       [   0,    2],
       ...,
       [ 942, 1187],
       [ 942, 1227],
       [ 942, 1329]])

# LightGCN

In [146]:
class LightGCN(nn.Module):
    def __init__(self, n_user, n_item, args, train_df):
        super(LightGCN, self).__init__()
        self.n_user = n_user
        self.n_item = n_item
        self.device = args.device
        self.emb_size = args.embed_size
        self.batch_size = args.batch_size
        self.num_layers = args.num_layers
        self.node_dropout = args.node_dropout
        
        self.split = args.split
        self.num_folds = args.num_folds
        self.reg = args.reg
        
        self.make_train_matrix(train_df)
        
        self.Graph = self.getSparseGraph()
        self.data_loader = None # 이거는 추후에 확인할 필요 있음
        
        self.build_graph()
    
    def make_train_matrix(self, train_df):
        rows, cols = train_df['user_id'], train_df['movie_id']
        values = train_df['rating']
        
        sp_data = sp.csr_matrix((values, (rows, cols)), dtype = 'float64', shape = (self.n_user, self.n_item))
        
        self.train_matrix = sp_data
    
    def build_graph(self):
        self.user_embedding = nn.Embedding(self.n_user, self.emb_size)
        self.item_embedding = nn.Embedding(self.n_item, self.emb_size)
        
        # weight initalization
        nn.init.normal_(self.user_embedding.weight, 0, 0.01)
        nn.init.normal_(self.item_embedding.weight, 0, 0.01)
        
        self.to(self.device)
        
    def lightgcn_embedding(self, graph):
        users_emb = self.user_embedding.weight
        items_emb = self.item_embedding.weight
        all_emb = torch.cat([users_emb, items_emb], dim = 0)
        
        embs = [all_emb]
        
        if self.node_dropout > 0 :
            if self.training:
                g_droped = self.__dropout(graph, self.node_dropout)
                
            else:
                g_droped = graph
                
        else:
            g_droped = graph
            
        ego_emb = all_emb
        for k in range(self.num_layers):
            if self.split:
                tmp_emb = []
                for f in range(len(g_droped)):
                    tmp_emb.append(torch.sparse.mm(g_droped[f], ego_emb))
                side_emb = torch.cat(temp_emb, dim = 0)
                all_emb = side_emb
                
            else:
                all_emb = torch.sparse.mm(g_droped, all_emb)
            embs.append(all_emb)
            
        embs = torch.stack(embs, dim = 1)
        lightgcn_out = torch.mean(embs, dim = 1)
        users, items = torch.split(lightgcn_out, [self.n_user, self.n_item])
        
        return users, items
        
    def _split_A_hat(self, A):
        A_fold = []
        fold_len = (self.n_user + self.n_item) // self.num_folds
        
        for i_fold in range(self.num_folds):
            start = i_fold * fold_len
            if i_fold == self.num_folds -1:
                end = self.n_user + self.n_item
            else:
                end = (i_fold + 1) * fold_len
            A__fold.append(self._convert_sp_mat_to_sp_tensor(A[start:end]).coalesce().to(self.device))
        return A_fold
    
    def _convert_sp_mat_to_sp_tensor(self, X):
        coo = X.tocoo().astype(np.float32)
        row = torch.Tensor(coo.row).long()
        col = torch.Tensor(coo.col).long()
        index = torch.stack([row, col])
        data = torch.FloatTensor(coo.data)
        return torch.sparse.FloatTensor(index, data, torch.Size(coo.shape))
    
    def getSparseGraph(self):
        n_users, n_items = self.train_matrix.shape
        
        adj_mat = sp.dok_matrix((n_users + n_items, n_users + n_items), dtype = np.float32)
        adj_mat = adj_mat.tolil()
        R = rating_matrix.tolil()
        adj_mat[:n_users, n_users:] = R
        adj_mat[n_users:, :n_users] = R.T
        adj_mat = adj_mat.todok()
        
        rowsum = np.array(adj_mat.sum(axis = 1))
        d_inv = np.power(rowsum, -0.5).flatten()
        d_inv[np.isinf(d_inv)] = 0.
        d_mat = sp.diags(d_inv)
        
        norm_adj = d_mat.dot(adj_mat)
        norm_adj = norm_adj.dot(d_mat)
        norm_adj = norm_adj.tocsr()
        
        if self.split == True:
            Graph = self._split_A_hat(norm_adj)
        
        else:
            Graph = self._convert_sp_mat_to_sp_tensor(norm_adj)
            Graph = Graph.coalesce().to(self.device)
            
        return Graph
    
    def predict_batch_users(self, user_ids):
        user_embeddings = F.embedding(user_ids, self.user_embedding_pred)
        item_embeddings = self.item_Embedding_pred
        return np.matmul(user_embeddings, item_embeddings.T)
    
    def forward(self, user, item):
        u_embedding, i_embedding = self.lightgcn_embedding(self.Graph)
        
        user_latent = F.embedding(user, u_embedding)
        item_latent = F.embedding(item, i_embedding)
        
        score = torch.mul(user_latent, item_latent).sum(1)
        
        return score

In [147]:
class CustomDataset(Dataset):
    def __init__(self, df):
        super(Dataset, self).__init__()
        self.df = df
        
    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        
        user = torch.tensor(row[0], dtype = torch.long)
        item = torch.tensor(row[1], dtype = torch.long)
        label = torch.tensor(row[2], dtype = torch.float)
        
        return user, item, label

# Main

### argparse

In [148]:
parser = argparse.ArgumentParser(description = 'Run LightGCN')
parser.add_argument('--embed_size', default = 64)
parser.add_argument('--num_layers', default = 2)
parser.add_argument('--node_dropout', default = 0.3)
parser.add_argument('--split', default = False)
parser.add_argument('--num_folds', default = 100)
parser.add_argument('--reg', default = 1e-3)
parser.add_argument('--epochs', default = 40)
parser.add_argument('--batch_size', default = 1024)
parser.add_argument('--lr', default = 1e-3)

_StoreAction(option_strings=['--lr'], dest='lr', nargs=None, const=None, default=0.001, type=None, choices=None, help=None, metavar=None)

In [149]:
args = parser.parse_args('')
args.device = 'cuda' if torch.cuda.is_available() else 'cpu'

n_user = train_df['user_id'].nunique()
n_item = train_df['movie_id'].nunique()

model = LightGCN(n_user, n_item, args, train_df)
optimizer = optim.Adam(model.parameters(), lr = args.lr)

train_dataset = CustomDataset(train_df)
test_dataset = CustomDataset(test_df)

train_dataloader = DataLoader(train_dataset, batch_size = args.batch_size, shuffle = True)
test_dataloader = DataLoader(test_dataset, batch_size = args.batch_size, shuffle = True)

train_loss_loger, test_loss_loger = [], []
criterion = nn.MSELoss()

for epoch in range(args.epochs):
    train_loss = 0
    
    model.train()
    for users, items, labels in train_dataloader:
        hat_labels = model(users, items)
        batch_loss = criterion(hat_labels, labels)
        optimizer.zero_grad()
        batch_loss.backward()
        optimizer.step()
        train_loss += batch_loss.item() / len(train_dataloader)
        
    train_loss_loger.append(train_loss)
    
    test_loss = 0
    with torch.no_grad():
        model.eval()
        for users, items, labels in test_dataloader:
            hat_labels = model(users, items)
            batch_loss = criterion(hat_labels, labels)
            test_loss += batch_loss.item() / len(test_dataloader)
            
        test_loss_loger.append(test_loss)
        
    print('epoch : {}, train_loss : {}, test_loss : {}'.format(epoch, round(train_loss, 4), round(test_loss, 4)))     

<class 'torch.Tensor'>
tensor(indices=tensor([[   0,    0,    0,  ..., 2620, 2621, 2622],
                       [ 943,  944,  945,  ...,  862,  895,  915]]),
       values=tensor([0.0042, 0.0050, 0.0081,  ..., 0.0819, 0.0534, 0.0538]),
       size=(2623, 2623), nnz=181140, layout=torch.sparse_coo)
tensor([0.0042, 0.0050, 0.0081,  ..., 0.0819, 0.0534, 0.0538])


UnboundLocalError: local variable 'g_droped' referenced before assignment

In [99]:
args = parser.parse_args('')
args.device = 'cuda' if torch.cuda.is_available() else 'cpu'

n_user = train_df['user_id'].nunique()
n_item = train_df['movie_id'].nunique()

model = LightGCN(n_user, n_item, args, train_df)
optimizer = optim.Adam(model.parameters(), lr = args.lr)

train_dataset = CustomDataset(train_df)
test_dataset = CustomDataset(test_df)

train_dataloader = DataLoader(train_dataset, batch_size = args.batch_size, shuffle = True)
test_dataloader = DataLoader(test_dataset, batch_size = args.batch_size, shuffle = True)

train_loss_loger, test_loss_loger = [], []
criterion = nn.MSELoss()

for epoch in range(args.epochs):
    train_loss = 0
    
    model.train()
    for users, items, labels in train_dataloader:
        hat_labels = model(users, items)
        batch_loss = criterion(hat_labels, labels)
        optimizer.zero_grad()
        batch_loss.backward()
        optimizer.step()
        train_loss += batch_loss.item() / len(train_dataloader)
        
    train_loss_loger.append(train_loss)
    
    test_loss = 0
    with torch.no_grad():
        model.eval()
        for users, items, labels in test_dataloader:
            hat_labels = model(users, items)
            batch_loss = criterion(hat_labels, labels)
            test_loss += batch_loss.item() / len(test_dataloader)
            
        test_loss_loger.append(test_loss)
        
    print('epoch : {}, train_loss : {}, test_loss : {}'.format(epoch, round(train_loss, 4), round(test_loss, 4)))     

epoch : 0, train_loss : 11.7921, test_loss : 9.1175
epoch : 1, train_loss : 3.9549, test_loss : 3.2602
epoch : 2, train_loss : 2.1307, test_loss : 2.8058
epoch : 3, train_loss : 1.9053, test_loss : 2.5529
epoch : 4, train_loss : 1.7366, test_loss : 2.3437
epoch : 5, train_loss : 1.5994, test_loss : 2.2105
epoch : 6, train_loss : 1.488, test_loss : 2.0135
epoch : 7, train_loss : 1.3955, test_loss : 1.9432
epoch : 8, train_loss : 1.3179, test_loss : 1.7821
epoch : 9, train_loss : 1.2539, test_loss : 1.7169
epoch : 10, train_loss : 1.1986, test_loss : 1.6017
epoch : 11, train_loss : 1.1533, test_loss : 1.5254
epoch : 12, train_loss : 1.1138, test_loss : 1.4765
epoch : 13, train_loss : 1.081, test_loss : 1.4023
epoch : 14, train_loss : 1.052, test_loss : 1.3585
epoch : 15, train_loss : 1.0271, test_loss : 1.3191
epoch : 16, train_loss : 1.0067, test_loss : 1.2871
epoch : 17, train_loss : 0.9875, test_loss : 1.221
epoch : 18, train_loss : 0.9727, test_loss : 1.2023
epoch : 19, train_loss : 