# [[Pytorch] Neural Collaborative Filtering](https://github.com/yihong-chen/neural-collaborative-filtering/blob/master/src/train.py)

## Matrix Factorization의 문제점

- MF(Matrix Factorization)는 두 벡터(user의 latent vector, item의 latent vector)의 내적에 의해 ŷ(predicted rating)가 계산된다.    
- 단순 선형 결합을 통해 ŷ을 구하기 때문에 user-item의 복잡한 상호작용을 온전히 표현할 수 없다. 아래 그림은 이런 MF의 표현력의 한계를 나타낸 그림이다.    

따라서 이런 단점을 해결하기 위해 선형결합 대신 Deep Neural Networks를 사용한다.

### Input Layer    
유저와 아이템의 ID를 one-hot-encoding하여 sparse한 vector로 변환하여 Input으로 사용한다.   

### Embedding Layer    
Sparse한 one-hot-encoding 데이터를 Dense한 벡터로 바꿔준다. 이렇게 얻어진 dense vector는 Latent vector로도 볼 수 있다.

### Output Layer
NCF layer와 hidden vector를 input으로 받아 predictive score $\hat{y}_{ui}$를 예측하며, 실제 Target $y_{ui}$와의 비교를 통해 학습이 진행됩니다.  

### Multi-Layer Perceptron Layer
임베딩이 완료된 User, Item latent vector는 여러 층의 신경망을 거치게 됩니다. 이 다층 신경망 구조를 MLP Layers라고 합니다.    
우선 User latent vector와 Item latent vector를 concatenate한 벡터를 시작으로 각각의 층을 거치며 인공신경망을 통해 복잡한 비선형의 데이터 관계를 학습할 수 있게 됩니다.

### Generalized Matrix Factorization
논문은 Matrix Factorization이 NCF의 특별한 케이스가 됨을 보여주며, 이를 GMF(Generalized Matrix Factorization)라고 명명하였다.

# Dataset - MovieLens

In [1]:
# Mount the Colab drive at /content/data so the dataset files can be read from there.
from google.colab import drive
drive.mount('/content/data')

Mounted at /content/data


In [2]:
import numpy as np
import pandas as pd

In [9]:
import easydict
import json

# Configuration container: the dataset directory and the ratings CSV path derived from it.
args = easydict.EasyDict()
args.default_path = '/content/data/MyDrive/dev/2.deep learning/5.  추천/data/data/Movie/ml-1m/'
# The directory string ends with '/', so plain concatenation yields the file path.
args.ratings = args.default_path+'ratings.csv'

In [10]:
# Read the raw ratings CSV, print its (rows, columns) shape and preview the first rows.
ratings = pd.read_csv(args.ratings)

print(f'{ratings.shape}')
ratings.head()

(100004, 4)


Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [11]:
# Summary statistics (count, mean, std, quartiles, ...) of the ratings columns.
ratings.describe()

Unnamed: 0,userId,movieId,rating,timestamp
count,100004.0,100004.0,100004.0,100004.0
mean,347.01131,12548.664363,3.543608,1129639000.0
std,195.163838,26369.198969,1.058064,191685800.0
min,1.0,1.0,0.5,789652000.0
25%,182.0,1028.0,3.0,965847800.0
50%,367.0,2406.5,4.0,1110422000.0
75%,520.0,5418.0,4.0,1296192000.0
max,671.0,163949.0,5.0,1476641000.0


## Load Data

In [12]:
import random

class LoadData(object):
  """Load a ratings CSV and prepare implicit-feedback, leave-one-out splits.

  The CSV is expected to contain the columns ``userId``, ``movieId``,
  ``rating`` and ``timestamp``.

  Attributes set by the constructor:
    ratings: DataFrame with contiguous ``uid`` / ``iid`` ids, binarized
      ``rating``, ``timestamp`` and a per-user ``rank_latest`` column.
    user_pool: set of all ``uid`` values.
    item_pool: set of all ``iid`` values.
    negatives: DataFrame with, per user, the set of never-interacted items
      (``negative_iid``) and a random sample of them (``negative_samples``).
    train_ratings / test_ratings: ``uid``, ``iid``, ``rating`` frames; the
      test frame holds each user's most recent interaction.

  Args:
    args: object exposing ``ratings``, the path of the ratings CSV.

  Raises:
    ValueError: if a user has fewer un-interacted items than the number of
      negatives to sample, or if some user has a single interaction and
      therefore cannot appear in both splits.
  """

  def __init__(self, args):
    self.ratings = self.__load_ratings(args.ratings)
    self.__binarize()

    self.user_pool = set(self.ratings['uid'].unique())
    self.item_pool = set(self.ratings['iid'].unique())

    self.negatives = self.__sample_negative()
    self.train_ratings, self.test_ratings = self.__split_loo()

  def __load_ratings(self, ratings_path):
    """Read the CSV and replace raw user / movie ids with contiguous ids."""
    df_ratings = pd.read_csv(ratings_path).drop_duplicates().reset_index(drop=True)
    # factorize numbers the values 0..n-1 in order of first appearance, which
    # gives dense ids suitable as embedding indices.
    df_ratings['uid'] = pd.factorize(df_ratings['userId'])[0]
    df_ratings['iid'] = pd.factorize(df_ratings['movieId'])[0]

    # Return an independent frame so later column assignments do not write
    # into a slice of the raw table.
    return df_ratings[['uid', 'iid', 'rating', 'timestamp']].copy()

  def __binarize(self):
    """Turn explicit ratings into implicit feedback: every rating > 0 becomes 1.0."""
    # .loc writes in place; chained indexing may silently modify a copy.
    self.ratings.loc[self.ratings['rating'] > 0, 'rating'] = 1.0

  def __sample_negative(self, num_ng=99):
    """Return, per user, all negative items and ``num_ng`` sampled negatives.

    A negative item is one the user never interacted with.
    """
    interact_status = self.ratings.groupby('uid')['iid'].apply(set).reset_index().rename(
        columns={'iid': 'interacted_iid'}
    )
    interact_status['negative_iid'] = interact_status['interacted_iid'].map(
        lambda seen: self.item_pool - seen
    )
    # random.sample rejects sets on Python 3.11+; sorting also makes the draw
    # reproducible for a fixed random seed.
    interact_status['negative_samples'] = interact_status['negative_iid'].map(
        lambda pool: random.sample(sorted(pool), num_ng)
    )
    return interact_status[['uid', 'negative_iid', 'negative_samples']]

  def __split_loo(self):
    """Leave-one-out split: the latest interaction of each user is the test row."""
    # method='first' breaks timestamp ties so exactly one row per user gets rank 1.
    self.ratings['rank_latest'] = self.ratings.groupby('uid')['timestamp'].rank(
        method='first', ascending=False
    )
    test = self.ratings[self.ratings['rank_latest'] == 1]
    train = self.ratings[self.ratings['rank_latest'] > 1]

    # Explicit check instead of assert, which is stripped under ``python -O``.
    if train['uid'].nunique() != test['uid'].nunique():
      raise ValueError(
          'every user needs at least two interactions for a leave-one-out split'
      )
    return train[['uid', 'iid', 'rating']], test[['uid', 'iid', 'rating']]

In [None]:
# Build the processed ratings, negative samples and train/test split from the configured CSV.
loaddata = LoadData(args)

In [None]:
# Preview the first rows of the processed ratings frame.
loaddata.ratings.head()

In [None]:
# Count how often each rating value occurs in the training split.
loaddata.train_ratings['rating'].value_counts()

In [None]:
# Number of distinct users in the training split.
loaddata.train_ratings['uid'].nunique()

In [None]:
# Preview the per-user negative-item table.
loaddata.negatives.head()

## Dataset

In [None]:
import torch 
from torch.utils.data import Dataset

In [13]:
class RatingDataset(Dataset):
  """Dataset of (user, item) pairs with implicit-feedback labels.

  The rows of ``data`` are kept as they are, and ``num_ng`` negative items
  with label 0.0 are sampled for every row of ``negative``.

  Args:
    data: DataFrame with columns ``uid``, ``iid`` and ``rating``.
    negative: DataFrame with a ``uid`` column and a ``negative_iid`` column
      holding, per user, the set of items the user never interacted with.
      The frame is not modified.
    num_ng: number of negative items sampled per row of ``negative``.

  Attributes:
    features: int64 array of shape (n, 2) holding ``[uid, iid]`` rows.
    ratings: float32 tensor of length n with the labels.
    field_dims: int64 array ``[num_users, num_items]``, i.e. the largest id
      seen per column plus one; the item count also covers every item id
      present in ``negative['negative_iid']``.

  Raises:
    ValueError: if a user's negative pool holds fewer than ``num_ng`` items.
  """

  def __init__(self, data, negative, num_ng=4):

    concat_ratings = self.__add_negative_items(data, negative, num_ng)

    # np.long was removed in NumPy 1.24; int64 is the dtype torch uses for indices.
    self.features = concat_ratings[['uid', 'iid']].to_numpy().astype(np.int64)
    self.ratings = torch.from_numpy(concat_ratings['rating'].to_numpy().astype(np.float32))
    self.field_dims = self.__infer_field_dims(self.features, negative)

  def __add_negative_items(self, data, negative, num_ng):
    """Return ``data`` followed by ``num_ng`` zero-rated negatives per user."""
    if num_ng <= 0:
      # Exploding empty samples would produce NaN item ids.
      return data[['uid', 'iid', 'rating']].copy()

    # random.sample rejects sets on Python 3.11+, hence the sorted list.
    sampled = negative['negative_iid'].map(lambda pool: random.sample(sorted(pool), num_ng))
    # Build a fresh frame instead of adding a column to the caller's frame.
    negative_rows = pd.DataFrame({'uid': negative['uid'], 'iid': sampled}).explode('iid')
    negative_rows['rating'] = 0.0
    return pd.concat([data, negative_rows], axis=0)

  @staticmethod
  def __infer_field_dims(features, negative):
    """Compute ``[num_users, num_items]`` from the ids that can occur."""
    dims = features.max(axis=0) + 1
    # Items that were not sampled here can still be sampled by another
    # dataset built from the same pools, so size the item field for them too.
    pool_max = max((max(pool, default=-1) for pool in negative['negative_iid']), default=-1)
    dims[1] = max(int(dims[1]), int(pool_max) + 1)
    return dims

  def __len__(self):
    return len(self.features)

  def __getitem__(self, index):
    return self.features[index], self.ratings[index]

NameError: ignored

In [None]:
# Training set: observed interactions plus the default number of sampled negatives per user.
train_dataset = RatingDataset(loaddata.train_ratings, loaddata.negatives)
# Evaluation set: each user's held-out item is ranked against 99 sampled negatives.
# With the default of 4 negatives every user would have only 5 candidates, so a
# top-10 hit rate would always be 1.
test_dataset = RatingDataset(loaddata.test_ratings, loaddata.negatives, num_ng=99)

len(train_dataset), len(test_dataset)

In [None]:
# Pull the first (features, label) sample out of the training dataset and display it.
feature, rating = next(iter(train_dataset))
feature, rating

In [None]:
# Inspect the field_dims attribute of the training dataset.
train_dataset.field_dims

## Data Loader

In [None]:
from torch.utils.data import DataLoader

In [None]:
# Training batches of 1024 samples, reshuffled every epoch.
train_loader = DataLoader(
        train_dataset, batch_size=1024, shuffle=True
    )

# Evaluation batches keep the dataset order (shuffle=False).
test_loader = DataLoader(
    test_dataset, batch_size=1024, shuffle=False
)

In [None]:
# Number of batches produced by each loader.
len(train_loader), len(test_loader)

In [None]:
# Fetch the first batch from the training loader.
next(iter(train_loader))

# NCF Model

In [None]:
import torch

In [14]:
class FeaturesEmbedding(torch.nn.Module):
    """Single embedding table shared by several categorical fields.

    Every field owns a contiguous slice of the table; ``offsets`` holds the
    first row of each slice, so a per-field id plus its offset is the row
    index in the shared table.

    :param field_dims: sequence with the number of distinct ids per field
    :param embed_dim: size of each embedding vector
    """

    def __init__(self, field_dims, embed_dim):
        super().__init__()
        self.embedding = torch.nn.Embedding(sum(field_dims), embed_dim)
        # Offsets are row indices, so they must be integers: float32 cannot
        # represent every integer above 2**24 and is not a valid index dtype.
        self.offsets = np.array((0, *np.cumsum(field_dims)[:-1]), dtype=np.int64)
        torch.nn.init.xavier_uniform_(self.embedding.weight.data)

    def forward(self, x):
        """
        :param x: Long tensor of size ``(batch_size, num_fields)``
        :return: Float tensor of size ``(batch_size, num_fields, embed_dim)``
        """
        # new_tensor creates the offsets with the dtype and device of x.
        x = x + x.new_tensor(self.offsets).unsqueeze(0)
        return self.embedding(x)

NameError: ignored

In [None]:
class MultiLayerPerceptron(torch.nn.Module):
  # NOTE(review): despite its name, this class builds an embedding table, not
  # a stack of linear layers. A second class named MultiLayerPerceptron is
  # defined later in this file and rebinds the name, so this definition is
  # shadowed once that later cell runs.

  def __init__(self, input_dim, embed_dim):
    super().__init__()
    # NOTE(review): `field_dims` is neither a parameter of this method nor
    # assigned in this class, and `input_dim` is never used. Unless a global
    # named `field_dims` exists, the next line raises NameError.
    self.embedding = torch.nn.Embedding(sum(field_dims), embed_dim)
    self.offsets = np.array((0, *np.cumsum(field_dims)[:-1]), dtype=np.float32)
    torch.nn.init.xavier_uniform_(self.embedding.weight.data)

  def forward(self, x):
    """
    Add the per-field offsets to ``x`` and look the result up in the embedding table.

    :param x: Long tensor of size ``(batch_size, num_fields)``
    """  
    x = x + x.new_tensor(self.offsets).unsqueeze(0)
    return self.embedding(x)

In [None]:
class MultiLayerPerceptron(torch.nn.Module):
    """Feed-forward stack of Linear -> BatchNorm1d -> ReLU -> Dropout stages.

    :param input_dim: width of the input vectors
    :param embed_dims: iterable with the output width of each hidden stage
    :param dropout: dropout probability used in every stage
    :param output_layer: when true, a final ``Linear(last_width, 1)`` is appended
    """

    def __init__(self, input_dim, embed_dims, dropout, output_layer=True):
        super().__init__()
        stack = []
        width = input_dim
        for hidden in embed_dims:
            stack.extend((
                torch.nn.Linear(width, hidden),
                torch.nn.BatchNorm1d(hidden),
                torch.nn.ReLU(),
                torch.nn.Dropout(p=dropout),
            ))
            # The next stage consumes what this one produced.
            width = hidden
        if output_layer:
            stack.append(torch.nn.Linear(width, 1))
        self.mlp = torch.nn.Sequential(*stack)

    def forward(self, x):
        """
        :param x: Float tensor of size ``(batch_size, embed_dim)``
        """
        return self.mlp(x)

In [None]:
class NeuralCollaborativeFiltering(torch.nn.Module):
  """
  A pytorch implementation of Neural Collaborative Filtering.

  The user and item embeddings feed two branches: an MLP over their
  concatenation, and an element-wise product (GMF). Both branch outputs are
  concatenated and mapped to one score by a final linear layer.

  The forward pass returns probabilities (a sigmoid is applied), so it must
  be trained with a loss that expects probabilities, e.g. ``torch.nn.BCELoss``.

  Args:
    field_dims: number of distinct ids per input field.
    user_field_idx: column of the input holding the user id.
    item_field_idx: column of the input holding the item id.
    embed_dim: size of each embedding vector.
    mlp_dims: hidden widths of the MLP branch.
    dropout: dropout probability of the MLP branch.

  Reference:
    X. He et al. "Neural Collaborative Filtering", WWW 2017.
  """

  def __init__(self, field_dims, user_field_idx=0, item_field_idx=1, embed_dim=16, mlp_dims=(16, 16), dropout=0.2):
    super().__init__()
    self.user_field_idx = user_field_idx
    self.item_field_idx = item_field_idx
    self.embed_dim = embed_dim
    self.embedding = FeaturesEmbedding(field_dims, embed_dim)
    self.embed_output_dim = len(field_dims) * embed_dim
    self.embed_mlp = MultiLayerPerceptron(self.embed_output_dim, mlp_dims, dropout, output_layer=False)
    # The final layer sees the MLP output concatenated with the GMF vector.
    self.fc = torch.nn.Linear(mlp_dims[-1] + embed_dim, 1)

  def forward(self, x):
    """
    :param x: Long tensor of size ``(batch_size, num_fields)``
    :return: Float tensor of size ``(batch_size,)`` with values in (0, 1)
    """
    x = self.embedding(x)
    # reshape yields (batch_size, embed_dim) for an integer field index as
    # well as for a one-element index array, and unlike squeeze(1) it does
    # not collapse the embedding axis when embed_dim == 1.
    user_x = x[:, self.user_field_idx].reshape(-1, self.embed_dim)
    item_x = x[:, self.item_field_idx].reshape(-1, self.embed_dim)
    x = self.embed_mlp(x.view(-1, self.embed_output_dim))
    gmf = user_x * item_x
    x = torch.cat([gmf, x], dim=1)
    x = self.fc(x).squeeze(1)
    return torch.sigmoid(x)

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
device

In [None]:
# Build the model sized by the training dataset's field dimensions and move it to the selected device.
model = NeuralCollaborativeFiltering(train_dataset.field_dims).to(device)

In [None]:
# Take one batch from the training loader and show the shapes of its features and labels.
features, label = next(iter(train_loader))
features.shape, label.shape

In [None]:
# The model was moved to `device`, so the batch has to live there as well;
# otherwise the forward pass fails with a device mismatch on GPU runtimes.
pred = model(features.to(device))
pred.shape

# Engine

## Train Step

In [None]:
from tqdm.auto import tqdm

def train_step(model, optimizer, data_loader, criterion, device, log_interval=100):
    """Run one optimisation pass over ``data_loader``.

    Each batch is moved to ``device``, scored by ``model`` and compared with
    its targets through ``criterion``; gradients are then reset, recomputed
    and applied with ``optimizer``. Every ``log_interval`` batches the mean
    loss of that window is shown on the progress bar and the window restarts.
    """
    model.train()
    running_loss = 0
    progress = tqdm(data_loader, smoothing=0, mininterval=1.0)
    for step, (fields, target) in enumerate(progress, start=1):
        fields = fields.to(device)
        target = target.to(device)
        prediction = model(fields)
        loss = criterion(prediction, target.float())
        model.zero_grad()
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
        if step % log_interval != 0:
            continue
        # A full window has been accumulated: report it and start a new one.
        progress.set_postfix(loss=running_loss / log_interval)
        running_loss = 0

## Metrics
평가지표는 hit rate와 nDCG(normalized Discounted Cumulative Gain)입니다.   
hit rate는 ground truth가 예측한 아이템 순위 k 안에 들어가는 비율을 나타낸 것이고, nDCG는 관련성이 높은 결과를 상위권에 노출시켰는지를 평가하는 지표입니다.

In [None]:
def hit(gt_item, pred_items):
  """Return 1 when ``gt_item`` occurs in ``pred_items``, otherwise 0."""
  return int(gt_item in pred_items)

In [None]:
def ndcg(gt_item, pred_items):
    """Return ``1 / log2(position + 2)`` for ``gt_item`` in ``pred_items``.

    ``position`` is the zero-based index of the first occurrence of
    ``gt_item``; 0 is returned when the item is not in the list.
    """
    if gt_item not in pred_items:
        return 0
    position = pred_items.index(gt_item)
    discount = np.log2(position + 2)
    return np.reciprocal(discount)

## Test Step

In [None]:
from sklearn.metrics import roc_auc_score

def test_step(model, data_loader, device, top_k):
  """Evaluate ``model`` on ``data_loader``.

  Scores are collected over all batches and ranked per user, so the result
  does not depend on batch size or on how a user's rows are spread across
  batches. For every positive row (target > 0) of a user, the hit rate and
  NDCG are computed against that user's ``top_k`` highest-scored items.

  Column 0 of each feature row is read as the user id and column 1 as the
  item id.

  Args:
    model: module mapping a feature batch to one score per row.
    data_loader: iterable of ``(fields, target)`` batches.
    device: device the batches are moved to before scoring.
    top_k: length of the per-user recommendation list.

  Returns:
    Tuple ``(HR, NDCG, auc)``: mean hit rate, mean NDCG (both 0.0 when no
    positive row exists) and the ROC AUC over all rows.
  """
  model.eval()
  users, items, targets, predicts = [], [], [], []
  with torch.no_grad():
    for fields, target in tqdm(data_loader, smoothing=0, mininterval=1.0):
      fields, target = fields.to(device), target.to(device)
      y = model(fields)
      # tolist() copies to host memory, so this also works for CUDA tensors.
      users.extend(fields[:, 0].tolist())
      items.extend(fields[:, 1].tolist())
      targets.extend(target.tolist())
      predicts.extend(y.tolist())

  # Group the scored rows by user so that ranking happens among one user's candidates.
  rows_by_user = {}
  for user, item, label, score in zip(users, items, targets, predicts):
    rows_by_user.setdefault(user, []).append((score, item, label))

  HR, NDCG = [], []
  for rows in rows_by_user.values():
    gt_items = [item for _, item, label in rows if label > 0]
    if not gt_items:
      continue
    # Keep the item ids of the top_k highest-scored candidates.
    ranked = sorted(rows, key=lambda row: row[0], reverse=True)[:top_k]
    recommends = [item for _, item, _ in ranked]
    for gt_item in gt_items:
      HR.append(hit(gt_item, recommends))
      NDCG.append(ndcg(gt_item, recommends))

  mean_hr = np.mean(HR) if HR else 0.0
  mean_ndcg = np.mean(NDCG) if NDCG else 0.0
  return mean_hr, mean_ndcg, roc_auc_score(targets, predicts)

# Training
- writer = SummaryWriter():   
writer 초기화 
- writer.add_scalar("data/loss", loss.item(), count):    
매 count마다 loss를 기록
- writer.add_scalar("test/HR", np.mean(HR), epoch):   
매 epoch마다 HR의 평균을 기록
- writer.add_scalar("test/NDCG", np.mean(NDCG), epoch):   
매 epoch마다 NDCG의 평균을 기록 

In [None]:
learning_rate = 0.001
top_k = 10
epochs = 10

device = torch.device(device)
# The model's forward pass already ends with torch.sigmoid, so the loss must
# take probabilities. BCEWithLogitsLoss would apply a second sigmoid.
criterion = torch.nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# Train for one pass, then evaluate, once per epoch.
for epoch in range(epochs):
  train_step(model, optimizer, train_loader, criterion, device)
  HR, NDCG, auc = test_step(model, test_loader, device, top_k)
  print(f'epoch: {epoch} / auc: {auc}')
  print("HR: {:.3f}\tNDCG: {:.3f}".format(HR, NDCG))