# Config

In [7]:
# from config import *
# service_config = ServiceConfig()
# model_config = ModelConfig()
from arguments import args

# Data

In [8]:
from dataset import data_source_factory
data = data_source_factory(args)

# Preprocessing

In [9]:
# make interaction data
from dataset import preprocessor_factory
preprocessor_factory(data, args)

AttributeError: 'DataFrame' object has no attribute 'game_name'

In [11]:
import pandas as pd
data = pd.read_pickle('data/steam_games_interaction_data.pkl')

# Dataset

In [60]:
import numpy as np 
from collections import defaultdict
import torch
import scipy.sparse as sp

from dataset.base import BaseDataset
class EASEDataset(BaseDataset):
    """
    MatrixDataSet 생성
    """
    def __init__(self, data, args):
        self.args = args
        self.data = self.load(f'data/{args.game_name}_interaction_data.pkl')
        self.data['user_idx'] = self.data['user_id'].map(self.encode(self.data['user_id']))
        self.data['item_idx'] = self.data['item_id'].map(self.encode(self.data['item_id']))

        self.num_users = self.data['user_id'].nunique()
        self.num_items = self.data['item_id'].nunique()
        args.num_users = self.num_users
        args.num_items = self.num_items
        self.users = self.data['user_idx'].unique()
        
        self.train_data = {}
        self.valid_data = {}
        self.user_item_dict = self._make_user_item_dict()

    def encode(self, feature) -> dict:
        return {v:i for i, v in enumerate(feature.unique())}
    
    def decode(self, feature) -> dict:
        return {i:v for i, v in enumerate(feature.unique())}

    def get_data(self):
        return self.data
    
    def get_feature_names(self):
        return super().get_feature_names()
    
    def load(self, path):
        data = pd.read_pickle(path)
        return data
    
    def _make_user_item_dict(self) -> dict:
        user_item_dict = defaultdict(list)
        for user, item in zip(self.data['user_idx'], self.data['item_idx']):
            user_item_dict[user].append(item)
        return user_item_dict
    
    def train_valid_split(self, valid_sample=5):
        assert self.user_item_dict is not None, 'user_item_dict is None. Run make_user_item_dict() first.'
        for user in self.user_item_dict:
            total = self.user_item_dict[user]
            valid = np.random.choice(self.user_item_dict[user], valid_sample, replace=True)
            train = np.setdiff1d(total, valid)
            self.train_data[user] = list(train)
            self.valid_data[user] = list(valid)
        return None
    
    def get_train_valid_data(self):
        return self.user_train, self.user_valid

    def make_matrix(self, user_list, train = True):
        """
        user_item_dict를 바탕으로 행렬 생성
        """
        mat = torch.zeros(size = (user_list.size(0), self.num_item))
        for idx, user in enumerate(user_list):
            if train:
                mat[idx, self.train_data[user.item()]] = 1
            else:
                mat[idx, self.train_data[user.item()] + self.valid_data[user.item()]] = 1
                # 왜 train의 index를 더해줄까?
        return mat

    def make_sparse_matrix(self):
        X = sp.dok_matrix((self.num_users, self.num_items), dtype=np.float32)
        for user in self.train_data.keys():
            item_list = self.train_data[user]
            X[user, item_list] = 1.0
                
        return X.tocsr()
    
    def __len__(self):
        return len(self.users)
    
    def __getitem__(self, idx):
        return self.users[idx]

In [62]:
dataset = EASEDataset(data, args)

In [63]:
dataset.train_valid_split(valid_sample=5)

In [56]:
dataset.make_sparse_matrix()

<12393x5155 sparse matrix of type '<class 'numpy.float32'>'
	with 0 stored elements in Compressed Sparse Row format>

In [6]:
from dataset.base import BaseDataset

class EASEDataset(BaseDataset):
    def __init__(self, data, model_config):
        self.data = data
        self.model_config = model_config

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        user_id, item_id, label = self.data[idx]
        return user_id, item_id, label

In [7]:
# make model input 
# input : interaction data
# output : train, test data  e.g user : [item1, item2, item3, ...]
from dataset import dataset_factory
dataset = dataset_factory(service_config.game_name)

# Dataloader

In [8]:
from dataloader import dataloader_factory
dataloader = dataloader_factory(dataset, batch_size=32, shuffle=True, num_workers=0)

In [75]:
import torch
import torch.utils.data as data_utils
from abc import ABC, abstractmethod
class BaseDataLoader(ABC):

    @abstractmethod
    def __init__(self, dataset, args):
        pass

    @abstractmethod
    def __iter__(self):
        pass

    @abstractmethod
    def __len__(self):
        pass

class PytorchDataLoader(BaseDataLoader):

    def __init__(self, dataset, args):
        self.dataset = dataset
        self.batch_size = args.batch_size
        # self.shuffle = args.shuffle
        # self.num_workers = args.num_workers
        self.data_loader = data_utils.DataLoader(dataset, batch_size=self.batch_size, shuffle=True, num_workers=1)

    def __iter__(self):
        return iter(self.data_loader)

    def __len__(self):
        return len(self.data_loader)

class MatrixDataLoader(BaseDataLoader):

    def __init__(self, dataset, args):
        self.dataset = dataset

    def __iter__(self):
        pass

    def __len__(self):
        pass

In [76]:
dataloader = MatrixDataLoader(dataset, args)

# Model

In [87]:
class EASE():
    def __init__(self, X):
        self.X = self._convert_sp_mat_to_sp_tensor(X)
    
    def _convert_sp_mat_to_sp_tensor(self, X):
        """
        Convert scipy sparse matrix to PyTorch sparse matrix

        Arguments:
        ----------
        X = Adjacency matrix, scipy sparse matrix
        """
        coo = X.tocoo().astype(np.float32) # tocoo : COOrdinate format으로 변환 / COOrdinate format : 희소행렬을 나타내기 위한 방법
        i = torch.LongTensor(np.mat([coo.row, coo.col])) # row index, col index
        v = torch.FloatTensor(coo.data)
        res = torch.sparse.FloatTensor(i, v, coo.shape).to('cuda')
        return res

    def fit(self, reg):
        '''
        진짜 정말 간단한 식으로 모델을 만듬
        '''
        G = self.X.to_dense().t() @ self.X.to_dense() # X^T * X
        diagIndices = torch.eye(G.shape[0]) == 1 # 대각선 index
        G[diagIndices] += reg  # regularization

        P = G.inverse() # inverse matrix
        B = P / (-1 * P.diag()) # B = -P / P_ii
        B[diagIndices] = 0 # 대각선은 0으로 만들어줌 / 왜냐하면 자기 자신과의 similarity는 0이기 때문

        self.pred = self.X.to_dense() @ B  # X * B

In [88]:
model = EASE(dataset.make_sparse_matrix())

In [9]:
from model import model_factory
model = model_factory(model_name='AutoRec', input_dim=dataset.num_items, hidden_dim=64)

# Runner

In [94]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np

from tqdm import tqdm
from abc import ABC, abstractmethod

from runner.metric import get_ndcg, get_hit

class BaseRunner(ABC):
    """
    Abstract class for training and testing PyTorch models.
    하위 클래스에서 필수적으로 구현해야하는 메소드는 아래와 같습니다.
    - train : epochs만큼 모델을 학습할 수 있는 기능
    - train_one_epoch : 1 epoch만큼 모델을 학습할 수 있는 기능
    - evaluate : 모델 성능을 모니터링하는 기능
    - inference : 학습된 모델을 사용하여 추론을 수행하는 기능
    - save : 모델의 checkpoint를 저장하는 기능
    - load : 모델의 checkpoint를 불러오는 기능
    """
    def __init__(self, model, dataloader, args):
        self.model = model
        self.dataloader = dataloader
        self.args = args 

    @abstractmethod
    def train(self, train_loader, epoch):
        pass

    @abstractmethod
    def train_one_epoch(self, train_loader, epoch):
        pass

    @abstractmethod
    def evaluate(self, val_loader):
        pass

    @abstractmethod
    def inference(self, user_ids):
        pass

    @abstractmethod
    def save(self, path):
        pass

    @abstractmethod
    def load(self, path):
        pass


class EASERunner(BaseRunner):

    def __init__(self, model, dataloader, args):
        super().__init__(model, dataloader, args)
        self.model = None
        self.dataloader = dataloader
        self.dataset = dataloader.dataset
        # self.lr = args.lr
        self.device = args.device
        self.topk = args.topk
        self.num_epochs = args.num_epochs
        self.reg = args.reg

        # self.optimizer = self._create_optimizer(args.optimizer)
        # self.criterion = self._create_criterion(args.criterion)

    def train(self):
        X = self.dataset.make_sparse_matrix()
        for reg in self.reg:
            self.train_one_epoch(X, reg)
            NDCG, HIT = self.evaluate()
            print(f'NDCG:{NDCG} / HIT: {HIT}')
        return None
        
    def train_one_epoch(self, X, reg):
        self.model = EASE()
        self.model.X = self.model._convert_sp_mat_to_sp_tensor(X)
        self.model.fit(reg)
        return None
    
    def evaluate(self):
        NDCG = 0.0 
        HIT = 0.0 
        pred = self.model.pred.cpu()
        X = self.dataset.make_sparse_matrix().toarray()
        mat = torch.from_numpy(X)

        pred[mat == 1] = -1
        pred = pred.argsort(dim = 1)

        for user, rec1 in tqdm(enumerate(pred)):
            uv = dataset.valid_data[user]

            # ranking
            up = rec1[-5:].cpu().numpy().tolist()[::-1]

            NDCG += get_ndcg(pred_list = up, true_list = uv)
            HIT += get_hit(pred_list = up, true_list = uv)

        NDCG /= len(dataset.train_data)
        HIT /= len(dataset.train_data)

        return NDCG, HIT
    
    def inference(self, user_ids):

        user_list = [user_ids]
        mat = self.dataset.get_matrix(user_list, trainYn=False).to(self.device)
        recon_mat = self.model(mat)
        recon_mat = recon_mat.softmax(dim = 1)
        recon_mat[mat == 1] = -1.
        rec_list = recon_mat.argsort(dim = 1)
        rec_list = rec_list[0].cpu().numpy().tolist()
        rec_list = rec_list[-self.args.topk:]
        return rec_list

    def _create_optimizer(self, optimizer):
        if optimizer == 'adam':
            return optim.Adam(self.model.parameters(), lr=self.lr)
        elif optimizer == 'sgd':
            return optim.SGD(self.model.parameters(), lr=self.lr)
        else:
            raise ValueError('Invalid optimizer')

    def _create_criterion(self, criterion):
        if criterion == 'mse':
            return nn.MSELoss()
        elif criterion == 'ce':
            return nn.CrossEntropyLoss()
        else:
            raise ValueError('Invalid criterion')

    def save(self, path):
        torch.save(self.model.state_dict(), path)
        print(f'Model saved to {path}')
        return None

    def load(self, path):
        self.model.load_state_dict(torch.load(path))
        print(f'Model loaded from {path}')
        return None

In [95]:
args.num_epochs =10

In [96]:
runner = EASERunner(model, dataloader, args)

In [97]:
runner.train()

12393it [00:00, 102420.03it/s]


NDCG:0.24149737904033453 / HIT: 0.07087872185911483


12393it [00:00, 97582.24it/s] 


NDCG:0.27899916360934285 / HIT: 0.08143306705398337


12393it [00:00, 102421.65it/s]


NDCG:0.3370102134099699 / HIT: 0.09700637456628933


12393it [00:00, 87273.35it/s]


NDCG:0.3776437045632483 / HIT: 0.10886790930364154


12393it [00:00, 87891.96it/s]


NDCG:0.3836400787497139 / HIT: 0.11056241426612042


12393it [00:00, 99145.89it/s] 


NDCG:0.3762844409608466 / HIT: 0.10831921245864842


12393it [00:00, 94603.88it/s] 

NDCG:0.3699566464715221 / HIT: 0.10662470749616958





In [13]:
import torch
import torch.optim as optim
from torch import nn
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.005)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
lr = 0.005

In [14]:
from runner import runner_factory
runner = runner_factory(model, dataloader, optimizer, criterion, lr, device, dataset, scheduler=None)

# Test

In [None]:
runner.train(10)