# Auto Encoder

## Import

In [1]:
import torch
from torch import nn
import numpy as np
from torch.utils.tensorboard import SummaryWriter
from torch.utils.data import Dataset, DataLoader
import pandas as pd
from tqdm import tqdm, trange
import scipy.sparse as sp
from sklearn.metrics import mean_squared_error
import requests
from bs4 import BeautifulSoup
import bottleneck as bn

## Config

In [4]:
# Hyper-parameters and file locations shared by the rest of the notebook.
cfg = dict(
    K=1024,
    lr=1e-4,
    n_epochs=500,
    data_dir="./dataset",
    data_file="user_problem_mat.csv",
    batch_size=64,
    test_batch_size=32,
    topks=[10, 20, 50, 100],
)
# The checkpoint name encodes the latent size and learning rate.
cfg["model_path"] = f"./saved_model/auto_encoder_K_{cfg['K']}_lr_{cfg['lr']}.pt"

# Device preference: CUDA first, then Apple MPS, otherwise CPU.
if torch.cuda.is_available():
    cfg["device"] = "cuda"
elif torch.backends.mps.is_available():
    cfg["device"] = "mps"
else:
    cfg["device"] = "cpu"
print(f'Using {cfg["device"]} device')

Using cuda device


## Utils

In [6]:
#########################################
################# Test ##################
#########################################

def getLabel(test_data, pred_data):
    """Flag which predicted items are true positives, user by user.

    Arguments
    - test_data : per-user containers of ground-truth item ids
    - pred_data : per-user ranked predictions, indexed in step with test_data

    Returns a float array whose entry [u, j] is 1.0 when pred_data[u][j]
    is contained in test_data[u] and 0.0 otherwise.
    """
    hit_rows = []
    for user, truth in enumerate(test_data):
        # Membership is tested with `in`, so truth can be any container.
        hits = [candidate in truth for candidate in pred_data[user]]
        hit_rows.append(np.array(hits).astype("float"))
    return np.array(hit_rows).astype('float')

def NDCGatK_r(test_data,r,k):
    """
    Normalized Discounted Cumulative Gain at k, summed over the batch.

    Relevance is binary (rel_i is 1 or 0), so the gain 2^{rel_i} - 1 is
    also 1 or 0.

    Arguments
    - test_data : per-user ground-truth item collections (only len() is used)
    - r         : 2-D hit matrix, one row per user, ranked best-first
    - k         : cut-off rank

    Returns the sum (not the mean) of the per-user NDCG@k values.
    """
    assert len(r) == len(test_data)
    top_hits = r[:, :k]
    discount = 1./np.log2(np.arange(2, k + 2))

    # Ideal ranking: every one of the user's items (capped at k) comes first.
    ideal = np.zeros((len(top_hits), k))
    for row, items in enumerate(test_data):
        ideal[row, :min(k, len(items))] = 1

    idcg = np.sum(ideal * discount, axis=1)
    dcg = np.sum(top_hits * discount, axis=1)

    # Users without ground truth have idcg 0; divide by 1 so they score 0.
    idcg[idcg == 0.] = 1.
    per_user = dcg/idcg
    per_user[np.isnan(per_user)] = 0.
    return np.sum(per_user)

def RecallPrecision_ATk(test_data, r, k):
    """
    Batch-summed recall@k and precision@k.

    Arguments
    - test_data : per-user ground-truth item collections (only len() is used)
    - r         : 2-D hit matrix, one row per user; columns must already be
                  in ranked order
    - k         : cut-off rank

    Returns a dict with 'recall' (sum over users of hits / #ground-truth)
    and 'precision' (total hits / k). Both are sums over the batch; the
    caller divides by the number of users.
    """
    hits_per_user = r[:, :k].sum(1)
    truth_sizes = np.array([len(items) for items in test_data])
    return {
        'recall': np.sum(hits_per_user/truth_sizes),
        'precision': np.sum(hits_per_user)/k,
    }

def test_one_batch(X, cfg):
    """Compute batch-summed ranking metrics for every cut-off in cfg['topks'].

    Arguments
    - X   : pair (ranked item-index tensor, per-user ground-truth list)
    - cfg : configuration dict; only cfg['topks'] is read

    Returns a dict of arrays ('recall', 'precision', 'ndcg'), each with one
    entry per cut-off, in the order of cfg['topks'].
    """
    ranked_items, truth = X[0].numpy(), X[1]
    hits = getLabel(truth, ranked_items)

    recall_vals, precision_vals, ndcg_vals = [], [], []
    for top_k in cfg['topks']:
        rp = RecallPrecision_ATk(truth, hits, top_k)
        precision_vals.append(rp['precision'])
        recall_vals.append(rp['recall'])
        ndcg_vals.append(NDCGatK_r(truth, hits, top_k))

    return {
        'recall': np.array(recall_vals),
        'precision': np.array(precision_vals),
        'ndcg': np.array(ndcg_vals),
    }

# Browser-like User-Agent sent with every profile request.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'}

def add_to_user_problem_mat(idx, id, user_problem_mat : np.ndarray):
    """Fetch a user's profile page and mark the listed problems in a matrix row.

    Every link found inside a ``div.problem-list`` element is parsed as a
    problem number; the column ``problem number - 1000`` of row ``idx`` is
    set to 1. Numbers that fall outside the matrix are reported and skipped.

    Arguments
    - idx              : row of user_problem_mat to fill
    - id               : user handle inserted into the profile URL
    - user_problem_mat : 2-D array, modified in place

    Returns None.
    """
    # A timeout keeps a stalled connection from hanging the notebook forever.
    data = requests.get(f'https://www.acmicpc.net/user/{id}', headers=headers, timeout=10)
    soup = BeautifulSoup(data.text, 'html.parser')

    for problem_list in soup.select('div.problem-list'):
        for link in problem_list.select('a'):
            problem_num = int(link.text) - 1000

            # A negative offset would silently wrap round to the end of the
            # row, so treat it as out of range explicitly.
            if problem_num < 0:
                print("범위를 벗어난 문제 번호 : " + str(problem_num))
                continue
            try:
                user_problem_mat[idx,problem_num] = 1
            except IndexError:
                # Only an out-of-bounds index is expected here; anything else
                # should surface instead of being swallowed.
                print("범위를 벗어난 문제 번호 : " + str(problem_num))


## Model

In [11]:
class BaseModel(nn.Module):
    """Common parent for rating models; subclasses provide getUsersRating."""

    def __init__(self):
        super().__init__()

    def getUsersRating(self, user_row):
        """Return predicted ratings for ``user_row``; must be overridden."""
        raise NotImplementedError

class AutoEncoder(BaseModel):
    """Two-layer linear auto-encoder over a user's item-interaction row.

    The encoder maps item_n inputs to K latent units and the decoder maps
    them back to item_n outputs; no activation is applied in between.
    """

    def __init__(self, item_n, cfg):
        """
        Arguments
        - item_n : number of items (input and output width)
        - cfg : configuration dict
            - K (int)       : number of latent dimensions
            - device : device the layers are created on
        """
        super().__init__()
        self.item_n = item_n
        self.K = cfg["K"]
        self.device = cfg["device"]

        # Encoder (item_n -> K) and decoder (K -> item_n).
        self.I_1 = nn.Linear(self.item_n, self.K, bias=True, device = self.device)
        self.I_2 = nn.Linear(self.K, self.item_n, bias=True, device = self.device)

        # Weights start from a narrow normal distribution; biases keep the
        # nn.Linear defaults.
        nn.init.normal_(self.I_1.weight, std=1./self.K)
        nn.init.normal_(self.I_2.weight, std=1./self.K)

    def forward(self, x):
        """Encode ``x`` and decode it back to one score per item."""
        user_emb = self.I_1(x)
        rating = self.I_2(user_emb)

        return rating

    def getUsersRating(self, user_row : np.array):
        """Score items for NumPy input and return a NumPy array.

        The input is moved to the device the model was built on (rather than
        unconditionally to CUDA), and no autograd graph is recorded.
        """
        with torch.no_grad():
            rating = self.forward(torch.Tensor(user_row).to(self.device))
        return rating.cpu().numpy()

class EASE():
    """
    Embarrassingly Shallow Autoencoders model class

    Learns an item-item weight matrix B in closed form; scores are
    ``user_row @ B``.
    """

    def __init__(self, lambda_):
        """
        :param lambda_: value added to the diagonal of the Gram matrix
        """
        self.B = None
        self.lambda_ = lambda_

    def train(self, interaction_matrix):
        """
        train pass
        :param interaction_matrix: interaction_matrix (users x items)
        """
        G = interaction_matrix.T @ interaction_matrix
        # An integer Gram matrix cannot absorb a fractional lambda in place,
        # so promote it to float before regularising.
        if not np.issubdtype(G.dtype, np.floating):
            G = G.astype(np.float64)
        G[np.diag_indices_from(G)] += self.lambda_
        P = np.linalg.inv(G)

        # B_ij = -P_ij / P_jj, with the diagonal forced to zero so an item
        # never predicts itself.
        self.B = P / -np.diag(P)
        np.fill_diagonal(self.B, 0)

    def forward(self, user_row):
        """
        forward pass
        :param user_row: interaction row(s) with one column per item
        :raises RuntimeError: if the model has not been trained yet
        """
        if self.B is None:
            raise RuntimeError("EASE.train() must be called before forward()")
        return user_row @ self.B

    def getUsersRating(self, user_row : np.array):
        """Return item scores for ``user_row`` (same as forward)."""
        return self.forward(user_row)

## DataSet

In [8]:
# Inherits from torch's Dataset
class AEDataset(Dataset):
    """Dataset that serves one user's interaction row per sample."""

    def __init__(self, sparse_matrix, test=False, device=None):
        """
        Arguments
        - sparse_matrix : user-item DataFrame; missing values become 0
        - test   : when True, keep only users whose number of non-zero
                   items is at least max(cfg['topks'])
        - device : device for the returned tensors; defaults to
                   cfg['device'] when omitted
        """
        self.sparse_matrix = sparse_matrix.fillna(0).to_numpy()
        # Fall back to the global config only when no device is given.
        self.device = cfg["device"] if device is None else device

        if test:
            self.maxK = max(cfg['topks'])
            # Keep users with at least maxK interactions.
            enough_items = np.count_nonzero(self.sparse_matrix, axis=1) >= self.maxK
            self.sparse_matrix = self.sparse_matrix[enough_items]
            print("complete making test dict")

    # Return the total number of samples
    def __len__(self):
        return len(self.sparse_matrix)

    # Return the row at the given index as a float tensor on self.device
    def __getitem__(self, idx):
        x = torch.tensor(self.sparse_matrix[idx], dtype=torch.float32, device=self.device)
        return x

In [11]:
# Read the train and test matrices from CSV files named
# "<data_dir>/train_<data_file>" and "<data_dir>/test_<data_file>";
# the first CSV column is used as the index.
train_data_path = f'{cfg["data_dir"]}/train_{cfg["data_file"]}'
train_s_mat = pd.read_csv(train_data_path, index_col = 0)

test_data_path = f'{cfg["data_dir"]}/test_{cfg["data_file"]}'
test_s_mat = pd.read_csv(test_data_path, index_col = 0)

In [226]:
# Wrap both matrices; test=True keeps only users with at least
# max(cfg['topks']) non-zero items.
train_dataset = AEDataset(train_s_mat)
test_dataset = AEDataset(test_s_mat, test=True)

complete making test dict


In [45]:
# Batch the datasets; no shuffle argument is passed for either loader.
train_dataloader = DataLoader(train_dataset, batch_size=cfg["batch_size"])
test_dataloader = DataLoader(test_dataset, batch_size=cfg["test_batch_size"])

## Train & Test

In [17]:
def train_loop(dataloader, model, loss_fn, optimizer):
    """Run one optimisation pass over ``dataloader``.

    Each batch serves as both the input and the reconstruction target.

    Arguments
    - dataloader : iterable of input batches
    - model      : module called as model(batch)
    - loss_fn    : callable used as loss_fn(prediction, batch)
    - optimizer  : optimizer over the model's parameters

    Returns the loss of the final batch as a detached tensor.
    Raises ValueError if the dataloader yields no batches.
    """
    # test_loop switches the model to eval mode; switch back before training.
    model.train()

    last_loss = None
    for X in dataloader:
        # Compute prediction and loss
        pred = model(X)
        loss = loss_fn(pred, X)

        # Backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Detach so the returned value does not keep the graph alive.
        last_loss = loss.detach()

    if last_loss is None:
        raise ValueError("dataloader yielded no batches")
    return last_loss

def make_test_list(data):
    """Return, for every row of ``data``, a 1-D tensor of its non-zero indices."""
    return [row.nonzero().squeeze(1) for row in data]

def test_loop(dataloader, model, cfg, loss_fn=None):
    """Evaluate a torch model with top-k ranking metrics.

    Each batch is fed to the model, the max(cfg['topks']) highest-scoring
    item indices are taken, and they are compared with the non-zero entries
    of the same batch.

    Arguments
    - dataloader : DataLoader yielding interaction batches
    - model      : torch module called as model(batch)
    - cfg        : configuration dict; cfg['topks'] lists the cut-offs
    - loss_fn    : optional callable loss_fn(prediction, batch)

    Returns a dict with 'precision', 'recall' and 'ndcg' arrays (one entry
    per cut-off, averaged over users) and 'test_loss' (mean over batches,
    0 when loss_fn is None). An empty dataset yields all-zero results.
    """
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    metric_names = ('recall', 'precision', 'ndcg')

    model.eval()
    max_K = max(cfg['topks'])

    results = {'precision': np.zeros(len(cfg['topks'])),
               'recall': np.zeros(len(cfg['topks'])),
               'ndcg': np.zeros(len(cfg['topks'])),
               'test_loss' : 0}

    # Nothing to average over; avoid dividing by zero below.
    if size == 0 or num_batches == 0:
        return results

    # A plain `if` (not an assert) so the warning survives `python -O`.
    if num_batches > size / 10:
        print(f"test_u_batch_size is too big for this dataset, try a small one {size // 10}")

    rating_list = []
    groundTrue_list = []
    with torch.no_grad():
        for X in dataloader:
            rating = model(X)
            # rating_K holds the indices of the max_K highest ratings
            _, rating_K = torch.topk(rating, k=max_K)

            rating_list.append(rating_K.cpu())
            groundTrue_list.append(make_test_list(X))

            if loss_fn is not None:
                results['test_loss'] += loss_fn(rating, X).item()

    # Per-batch metrics are sums over users; accumulate, then average.
    for batch in zip(rating_list, groundTrue_list):
        batch_result = test_one_batch(batch, cfg)
        for name in metric_names:
            results[name] += batch_result[name]
    for name in metric_names:
        results[name] /= float(size)

    results['test_loss'] /= num_batches
    return results
    
def test_loop_np(dataloader, model, cfg, loss_fn=None):
    """Evaluate a NumPy-based model with top-k ranking metrics.

    Each batch is converted to NumPy and scored via model.getUsersRating;
    the scores are then ranked and compared with the non-zero entries of
    the same batch.

    Arguments
    - dataloader : DataLoader yielding interaction batches (tensors)
    - model      : object exposing getUsersRating(ndarray) -> ndarray
    - cfg        : configuration dict; cfg['topks'] lists the cut-offs
    - loss_fn    : optional callable loss_fn(prediction, batch)

    Returns a dict with 'precision', 'recall' and 'ndcg' arrays (one entry
    per cut-off, averaged over users) and 'test_loss' (mean over batches,
    0 when loss_fn is None). An empty dataset yields all-zero results.
    """
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    metric_names = ('recall', 'precision', 'ndcg')

    max_K = max(cfg['topks'])

    results = {'precision': np.zeros(len(cfg['topks'])),
               'recall': np.zeros(len(cfg['topks'])),
               'ndcg': np.zeros(len(cfg['topks'])),
               'test_loss' : 0}

    # Nothing to average over; avoid dividing by zero below.
    if size == 0 or num_batches == 0:
        return results

    # A plain `if` (not an assert) so the warning survives `python -O`.
    if num_batches > size / 10:
        print(f"test_u_batch_size is too big for this dataset, try a small one {size // 10}")

    rating_list = []
    groundTrue_list = []
    with torch.no_grad():
        for X in dataloader:
            # Put the scores on the batch's own device instead of assuming CUDA.
            rating = torch.tensor(model.getUsersRating(X.cpu().numpy()), device=X.device)
            # rating_K holds the indices of the max_K highest ratings
            _, rating_K = torch.topk(rating, k=max_K)

            rating_list.append(rating_K.cpu())
            groundTrue_list.append(make_test_list(X))

            if loss_fn is not None:
                results['test_loss'] += loss_fn(rating, X).item()

    # Per-batch metrics are sums over users; accumulate, then average.
    for batch in zip(rating_list, groundTrue_list):
        batch_result = test_one_batch(batch, cfg)
        for name in metric_names:
            results[name] += batch_result[name]
    for name in metric_names:
        results[name] /= float(size)

    results['test_loss'] /= num_batches
    return results

def save_result_to_tensor_board(result, epoch, div="Test"):
    """Write precision, recall and ndcg for every cut-off to TensorBoard.

    Arguments
    - result : dict with 'precision', 'recall' and 'ndcg' sequences aligned
               with cfg["topks"]
    - epoch  : step value recorded with each scalar
    - div    : label placed in brackets at the start of each tag
    """
    # (tag prefix, key in result), in the order the scalars are written.
    tagged_metrics = (("Precision", 'precision'), ("Recall", 'recall'), ("ndcg", 'ndcg'))
    for pos, top_k in enumerate(cfg["topks"]):
        for tag, key in tagged_metrics:
            writer.add_scalar(f"[{div}]/{tag}_@{top_k}", result[key][pos], epoch)

In [238]:
# Build the auto-encoder with one input/output unit per column of the
# training matrix, an Adam optimizer with light weight decay, and a
# mean-reduced MSE reconstruction loss.
RecModel = AutoEncoder(train_s_mat.shape[1], cfg)
optim = torch.optim.Adam(RecModel.parameters(), lr=cfg["lr"], betas=(0.9, 0.999), weight_decay=1e-6)
loss_fn = torch.nn.MSELoss(size_average=None, reduce=None, reduction='mean')
#torch.nn.BCELoss(weight=None, size_average=None, reduce=None, reduction='mean')

In [239]:
# Best test recall seen so far; the training cell saves a checkpoint whenever it improves.
max_recall = 0

In [240]:
# SummaryWriter() is created without arguments, so it uses its default log directory.
writer = SummaryWriter()
# First epoch index; raise it to continue numbering from an earlier run.
st_num = 0

for i in range(st_num,cfg["n_epochs"]+st_num):
    # Every 20 epochs: evaluate on both splits, log to TensorBoard, print.
    if i % 20 == 0:
        train_res = test_loop(train_dataloader, RecModel, cfg, loss_fn)
        test_res = test_loop(test_dataloader, RecModel, cfg, loss_fn)
        save_result_to_tensor_board(train_res, i, div="Train")
        save_result_to_tensor_board(test_res, i)    
        writer.add_scalar(f"Loss/test", test_res['test_loss'], i)
        print("TEST [train_data]")
        print(train_res)
        print("TEST [test_data]")
        print(test_res)
        print("==========================")
        # Checkpoint when the largest recall across the cut-offs improves.
        if max_recall < max(test_res["recall"]) :
            max_recall = max(test_res["recall"])
            torch.save(RecModel.state_dict(), cfg["model_path"])
    # One training pass; train_loop returns the loss of its last batch.
    loss = train_loop(train_dataloader, RecModel, loss_fn, optim)  
    writer.add_scalar(f"Loss/train", loss, i)
    if i % 5 == 0:
        print(f"loss: {loss:>7f}  [{i:>5d}/{cfg['n_epochs']+st_num:>5d}]")
            
# Push pending events to disk and release the writer.
writer.flush()
writer.close()

TEST [train_data]
{'precision': array([0.00417143, 0.00405714, 0.00426286, 0.00545714]), 'recall': array([0.00014064, 0.00027028, 0.00069549, 0.00175348]), 'ndcg': array([0.00395303, 0.00393439, 0.00414827, 0.00511296]), 'test_loss': 0.010870212841440331}
TEST [test_data]
{'precision': array([0.00453333, 0.00366667, 0.00437333, 0.00532   ]), 'recall': array([0.00015044, 0.00024111, 0.00070448, 0.00177643]), 'ndcg': array([0.00429021, 0.00373487, 0.00422753, 0.00501043]), 'test_loss': 0.010464023620365782}
loss: 0.007082  [    0/  500]
loss: 0.005522  [    5/  500]
loss: 0.004476  [   10/  500]
loss: 0.003583  [   15/  500]
TEST [train_data]
{'precision': array([0.99845714, 0.99685714, 0.99113143, 0.97121143]), 'recall': array([0.03938244, 0.07861227, 0.19514111, 0.38068008]), 'ndcg': array([0.99878006, 0.99755435, 0.99306297, 0.97733818]), 'test_loss': 0.003034477871419354}
TEST [test_data]
{'precision': array([0.9936    , 0.9912    , 0.97976   , 0.94942667]), 'recall': array([0.040345

loss: 0.000238  [  260/  500]
loss: 0.000226  [  265/  500]
loss: 0.000224  [  270/  500]
loss: 0.000225  [  275/  500]
TEST [train_data]
{'precision': array([1., 1., 1., 1.]), 'recall': array([0.03945172, 0.07890343, 0.19725858, 0.39451717]), 'ndcg': array([1., 1., 1., 1.]), 'test_loss': 0.0003778457943223078}
TEST [test_data]
{'precision': array([1.        , 1.        , 1.        , 0.99981333]), 'recall': array([0.04062342, 0.08124685, 0.20311712, 0.40613336]), 'ndcg': array([1.        , 1.        , 1.        , 0.99986059]), 'test_loss': 0.0018347507278296225}
loss: 0.000228  [  280/  500]
loss: 0.000224  [  285/  500]
loss: 0.000223  [  290/  500]
loss: 0.000223  [  295/  500]
TEST [train_data]
{'precision': array([1., 1., 1., 1.]), 'recall': array([0.03945172, 0.07890343, 0.19725858, 0.39451717]), 'ndcg': array([1., 1., 1., 1.]), 'test_loss': 0.00037137606149454684}
TEST [test_data]
{'precision': array([1.        , 1.        , 1.        , 0.99981333]), 'recall': array([0.04062342, 

In [15]:
# Load the model
# NOTE(review): this cell needs num_problem, test_loop and loss_fn to be
# defined by other cells first; the recorded run below failed with a
# NameError for test_loop.
RecModel = AutoEncoder(num_problem, cfg)
RecModel.load_state_dict(torch.load(cfg["model_path"]))
test_loop(test_dataloader, RecModel, cfg, loss_fn)

NameError: name 'test_loop' is not defined

In [18]:
# Load the model
# Problem numbers run from st_pb_num to ed_pb_num inclusive; each one gets
# its own input/output unit.
st_pb_num = 1000
ed_pb_num = 27981
num_problem = ed_pb_num - st_pb_num + 1  # 1000 ~ 27981

RecModel = AutoEncoder(num_problem, cfg)
# map_location lets a checkpoint saved on one device (e.g. CUDA) load on
# whichever device this session selected.
RecModel.load_state_dict(torch.load(cfg["model_path"], map_location=cfg["device"]))

<All keys matched successfully>

In [19]:
# Rebuild the test split: read the CSV, filter users (test=True) and batch it.
test_data_path = f'{cfg["data_dir"]}/test_{cfg["data_file"]}'
test_s_mat = pd.read_csv(test_data_path, index_col = 0)
test_dataset = AEDataset(test_s_mat, test=True)
test_dataloader = DataLoader(test_dataset, batch_size=cfg["test_batch_size"])

complete making test dict


In [20]:
# Evaluate the loaded model; no loss_fn is passed, so 'test_loss' stays 0.
test_loop(test_dataloader, RecModel, cfg)

{'precision': array([1.        , 1.        , 1.        , 0.99985333]),
 'recall': array([0.04062342, 0.08124685, 0.20311712, 0.4061518 ]),
 'ndcg': array([1.        , 1.        , 1.        , 0.99989041]),
 'test_loss': 0.0}

# Tensorboard 시각화

In [2]:
# Load the TensorBoard notebook extension.
%load_ext tensorboard

In [8]:
# Reload the TensorBoard notebook extension.
%reload_ext tensorboard

In [3]:
%tensorboard --logdir logs

Reusing TensorBoard on port 6006 (pid 288), started 1 day, 1:58:31 ago. (Use '!kill 288' to kill it.)

In [None]:
!tensorboard dev upload --logdir runs \
--name "My latest experiment" \ # 선택 사항
--description "Simple comparison of several hyperparameters" # 선택 사항

# 유저에 따른 문제 추천

In [112]:
# Load the model
# AutoEncoder takes the number of items, not the DataFrame itself.
RecModel = AutoEncoder(train_s_mat.shape[1], cfg)
RecModel.load_state_dict(torch.load(cfg["model_path"]))
# Inference only from here on.
RecModel.eval()

MFGDTorch(
  (I_1): Linear(in_features=26982, out_features=128, bias=True)
  (I_2): Linear(in_features=128, out_features=26982, bias=True)
)

In [116]:
# Inclusive range of problem numbers covered by the matrix columns.
st_pb_num = 1000
ed_pb_num = 27981
num_problem = ed_pb_num - st_pb_num + 1  # 1000 ~ 27981

# How many problems to recommend, and for which user handle.
NUM_TOP_PROBLEMS = 10
user_id = 'faang12594'

In [117]:
# Build a single-row interaction matrix for the user from their profile page.
user_problem = np.zeros([1, num_problem])
add_to_user_problem_mat(0, user_id, user_problem)

# Predicted score for every problem column.
result = RecModel.getUsersRating(user_problem)

범위를 벗어난 문제 번호 : 26982
범위를 벗어난 문제 번호 : 26983


In [118]:
# Recommend problems similar in type to the ones the user solved - old version
# Already-solved problems get -inf so they are never recommended.
result[user_problem.nonzero()] = -np.inf
top_problems_by_user = bn.argpartition(-result, NUM_TOP_PROBLEMS, axis=1)[:, :NUM_TOP_PROBLEMS] # pick the 10 problems with the largest values
# Convert column offsets back to problem numbers.
top_problems_by_user += 1000
print(top_problems_by_user)

[[16234 11657  1707  1976 17609 21758  1655  4149  5615  2436]]


# 모델 비교

In [25]:
import pickle

# Load the model
# NOTE(review): pickle.load can execute arbitrary code; only open model
# files that come from a trusted source.
with open('./saved_model/ease_model_1682656807.5593908.p', 'rb') as file:
    ease = pickle.load(file)

MFGDTorch(
  (I_1): Linear(in_features=26982, out_features=128, bias=True)
  (I_2): Linear(in_features=128, out_features=26982, bias=True)
)

In [243]:
# AutoEncoder takes the number of items, not the DataFrame itself.
ae = AutoEncoder(train_s_mat.shape[1], cfg)
ae.load_state_dict(torch.load(cfg["model_path"]))

<All keys matched successfully>

In [244]:
print("Test - AE")
# test_loop's signature is (dataloader, model, cfg, loss_fn): cfg comes
# before loss_fn.
test_loop(test_dataloader, ae, cfg, loss_fn)

Test - AE


{'precision': array([1.        , 1.        , 1.        , 0.99985333]),
 'recall': array([0.04062342, 0.08124685, 0.20311712, 0.4061518 ]),
 'ndcg': array([1.        , 1.        , 1.        , 0.99989041]),
 'test_loss': 0.0017784015681753134}

In [245]:
print("Test - EASE")
# test_loop_np's signature is (dataloader, model, cfg, loss_fn): cfg comes
# before loss_fn.
test_loop_np(test_dataloader, ease, cfg, loss_fn)

Test - EASE


{'precision': array([0.9904    , 0.98726667, 0.97461333, 0.9416    ]),
 'recall': array([0.04021442, 0.08015774, 0.19754144, 0.38002484]),
 'ndcg': array([0.99190783, 0.98921548, 0.97900032, 0.9523618 ]),
 'test_loss': 0.00385346341856047}