# AutoEncoder Meets Collaborative Filtering

- Collaborative Filtering을 위해 user-item matrix 만들기
- AutoEncoder 모델 구조 정의하기

* Training Deep AutoEncoder 논문은 [저자 코드](https://github.com/NVIDIA/DeepRecommender) 참고

## 논문 종류
- AutoRec
- Training Deep AutoEncoder
- Variational AutoEncoder

In [1]:
import torch
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import os
from sklearn.model_selection import train_test_split
import numpy as np

In [2]:
def read_data(data_path, max_rows=10000, test_size=0.2, random_state=1234):
    """Load ratings, split them into train/validation, and build id -> index maps.

    Args:
        data_path: Directory that contains ``ratings.csv``.
        max_rows: Number of leading rows to read from the file; ``None`` reads
            the whole file. Defaults to 10000.
        test_size: Fraction of rows assigned to the validation split.
        random_state: Seed passed to ``train_test_split`` for a reproducible split.

    Returns:
        Tuple ``(train_df, val_df, user_to_index, movie_to_index)``. The two
        dicts map each original ``userId`` / ``movieId`` to a contiguous index
        in order of first appearance.
    """
    ratings_file = os.path.join(data_path, 'ratings.csv')
    # nrows stops parsing after max_rows instead of loading the whole file and slicing it.
    df = pd.read_csv(ratings_file, nrows=max_rows)
    train_df, val_df = train_test_split(
        df, test_size=test_size, random_state=random_state, shuffle=True
    )

    # The maps are built from the unsplit frame so every id in either split has an index.
    user_to_index = {original: idx for idx, original in enumerate(df.userId.unique())}
    movie_to_index = {original: idx for idx, original in enumerate(df.movieId.unique())}

    return train_df, val_df, user_to_index, movie_to_index

In [3]:
class KMRDdataset(Dataset):
    """Dense rating matrix exposed row by row as a torch ``Dataset``.

    With ``item_based=True`` each row is one movie and each column one user;
    with ``item_based=False`` the axes are swapped. Cells without a rating in
    ``df`` are 0 in the dense matrix.
    """

    def __init__(self, df, user_to_index, movie_to_index, item_based=True):
        """Build the dense matrix from a ratings frame.

        Args:
            df: DataFrame with ``userId``, ``movieId`` and ``rating`` columns.
            user_to_index: Mapping from original user id to column/row index.
            movie_to_index: Mapping from original movie id to row/column index.
            item_based: If True rows are movies, otherwise rows are users.

        Raises:
            KeyError: If ``df`` contains an id missing from the index maps.
        """
        # Series.min()/max() are vectorised, unlike the builtins applied to a Series.
        self.min_rating = df.rating.min()
        self.max_rating = df.rating.max()

        self.user = [user_to_index[u] for u in df.userId.values]
        self.movie = [movie_to_index[m] for m in df.movieId.values]
        self.rating = df.rating.values

        # Only the axis order differs between the item-based and user-based layouts.
        if item_based:
            row_ids, col_ids = self.movie, self.user
            shape = (len(movie_to_index), len(user_to_index))
        else:
            row_ids, col_ids = self.user, self.movie
            shape = (len(user_to_index), len(movie_to_index))

        indices = torch.tensor([row_ids, col_ids], dtype=torch.long)
        values = torch.tensor(self.rating, dtype=torch.float32)
        # sparse_coo_tensor replaces the deprecated torch.sparse.FloatTensor constructor.
        self.data = torch.sparse_coo_tensor(indices, values, shape).to_dense()

    def __len__(self):
        """Return the number of rows in the dense matrix."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return row ``idx`` of the dense matrix as a 1-D float tensor."""
        return self.data[idx]

In [4]:
# Directory expected to contain ratings.csv (read_data joins this path with that file name).
data_path = "./movielens_data"
# Split the ratings and build the id -> index maps shared by both datasets.
train_df, val_df, user_to_index, movie_to_index = read_data(data_path=data_path)

In [5]:
# Both datasets use the default item_based=True and the same index maps,
# so their dense matrices have identical shapes.
train_dataset = KMRDdataset(train_df, user_to_index, movie_to_index)
val_dataset = KMRDdataset(val_df, user_to_index, movie_to_index)

In [6]:
# (rows, columns) of the training split.
print(train_df.shape)

(8000, 4)


In [7]:
# Length of one training row; with item_based=True this equals len(user_to_index).
print(train_dataset.data[0].size())

torch.Size([66])


In [8]:
# (rows, columns) of the validation split.
print(val_df.shape)

(2000, 4)


In [9]:
# Length of one validation row; with item_based=True this equals len(user_to_index).
print(val_dataset.data[0].size())

torch.Size([66])


In [10]:
# Number of distinct users, i.e. the number of entries in the id -> index map.
print(len(user_to_index))

66


In [11]:
# Inspect the first row of the dense training matrix; zeros are cells with no rating.
train_dataset.data[0]

tensor([0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 4.5000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 2.5000, 0.0000, 4.5000, 0.0000,
        0.0000, 0.0000, 3.5000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 3.0000,
        0.0000, 0.0000, 0.0000, 5.0000, 3.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 5.0000, 0.0000, 0.0000, 0.0000, 3.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 3.0000, 0.0000, 0.0000, 0.0000, 3.0000,
        0.0000, 0.0000, 5.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 5.0000,
        4.0000, 0.0000, 4.0000])

In [12]:
# Each batch stacks dataset rows; training uses 64 rows per batch, validation 16.
# NOTE(review): shuffle=True on the validation loader only changes batch order — confirm it is intended.
train_dataloader = DataLoader(train_dataset, batch_size=64, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=16, shuffle=True)

In [13]:
# Define AutoEncoder 

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.parallel
import torch.optim as optim
import torch.utils.data
from torch.autograd import Variable
import torch.nn.init as weight_init

In [14]:
class SimpleAutoEncoder(nn.Module):
    """Autoencoder with a single hidden layer: input -> hidden -> input.

    The same activation ``kind`` follows both the encoder and the decoder
    linear layer, so a bounded activation also bounds the reconstruction.
    """

    # Activation name -> module class; a fresh instance is created per call.
    _ACTIVATIONS = {
        'selu': nn.SELU,
        'relu': nn.ReLU,
        'relu6': nn.ReLU6,
        'sigmoid': nn.Sigmoid,
        'tanh': nn.Tanh,
        'elu': nn.ELU,
        'lrelu': nn.LeakyReLU,
        'none': nn.Identity,
    }

    def __init__(self, num_inputs, num_hiddens, kind='sigmoid', dropout=None):
        """Create the encoder and decoder.

        Args:
            num_inputs: Size of each input (and reconstructed output) vector.
            num_hiddens: Size of the hidden code.
            kind: Activation name accepted by :meth:`activation`.
            dropout: Optional dropout probability applied to the hidden code;
                ``None`` adds no dropout layer.

        Raises:
            ValueError: If ``kind`` is not a known activation name.
        """
        super().__init__()
        encoder_layers = [nn.Linear(num_inputs, num_hiddens), self.activation(kind)]
        if dropout is not None:
            # The argument used to be accepted but ignored; apply it to the latent code.
            encoder_layers.append(nn.Dropout(dropout))
        self.encoder = nn.Sequential(*encoder_layers)
        self.decoder = nn.Sequential(nn.Linear(num_hiddens, num_inputs), self.activation(kind))

    def activation(self, kind):
        """Return a new activation module for the given name.

        ``'none'`` yields ``nn.Identity()`` so the result is always a valid
        ``nn.Module`` for ``nn.Sequential``.

        Raises:
            ValueError: If ``kind`` is not a known activation name.
        """
        factory = self._ACTIVATIONS.get(kind)
        if factory is None:
            raise ValueError('Unknown non-linearity type')
        return factory()

    def forward(self, x):
        """Encode ``x`` and decode it back to the input dimension."""
        return self.decoder(self.encoder(x))

In [15]:
class DeepAutoEncoder(nn.Module):
    """Multi-layer autoencoder whose layer widths are derived from ``num_hiddens``.

    The first encoder layer takes ``num_hiddens`` features as input, so inputs
    passed to :meth:`forward` must have ``num_hiddens`` features in the last
    dimension. The decoder mirrors the encoder widths.
    """

    def __init__(self, num_hiddens, num_layers, dropout=None, nn_type='diamond'):
        """Build the encoder/decoder stacks.

        Args:
            num_hiddens: Width of the first (widest) layer.
            num_layers: Number of widths to generate before filtering.
            dropout: Optional dropout probability; ``None`` adds no dropout.
            nn_type: ``'diamond'`` (halving widths) or ``'constant'`` (equal widths).

        Raises:
            ValueError: If ``nn_type`` is not ``'diamond'`` or ``'constant'``.
        """
        super().__init__()
        self.encoder, self.decoder = self.generate_layers(num_hiddens, num_layers, dropout, nn_type)

    def forward(self, x):
        """Encode ``x`` and decode it back to ``num_hiddens`` features."""
        return self.decoder(self.encoder(x))

    def generate_layers(self, num_hiddens, num_layers, dropout=None, nn_type='diamond'):
        """Create the encoder and decoder ``nn.Sequential`` stacks.

        For ``'diamond'`` the widths are ``num_hiddens, num_hiddens // 2,
        num_hiddens // 4, ...`` (``num_layers`` values), keeping only widths
        greater than 10; e.g. ``num_hiddens=50, num_layers=4`` gives
        ``[50, 25, 12]``. For ``'constant'`` every width is ``num_hiddens``.
        Encoder layers use Sigmoid, decoder layers use Identity. If fewer than
        two widths remain, both stacks are empty.

        Returns:
            Tuple ``(encoder, decoder)`` of ``nn.Sequential`` modules.

        Raises:
            ValueError: If ``nn_type`` is not ``'diamond'`` or ``'constant'``.
        """
        if nn_type == 'diamond':
            halved = [num_hiddens // (2 ** depth) for depth in range(num_layers)]
            widths = [width for width in halved if width > 10]
        elif nn_type == 'constant':
            widths = [num_hiddens] * num_layers
        else:
            raise ValueError('Unknown nn_type: {!r}'.format(nn_type))

        # Encoder: consecutive width pairs, narrowing (or constant).
        encoder_modules = []
        for fan_in, fan_out in zip(widths, widths[1:]):
            encoder_modules.append(nn.Linear(fan_in, fan_out, bias=True))
            encoder_modules.append(nn.Sigmoid())

        # Decoder: the same widths in reverse order.
        mirrored = widths[::-1]
        decoder_modules = []
        for fan_in, fan_out in zip(mirrored, mirrored[1:]):
            decoder_modules.append(nn.Linear(fan_in, fan_out, bias=True))
            decoder_modules.append(nn.Identity())

        if dropout is not None:
            encoder_modules = self._with_dropout(encoder_modules, dropout)
            decoder_modules = self._with_dropout(decoder_modules, dropout)

        return nn.Sequential(*encoder_modules), nn.Sequential(*decoder_modules)

    @staticmethod
    def _with_dropout(modules, dropout):
        """Insert a Dropout layer after every (Linear, activation) pair."""
        # NOTE(review): this also places dropout after the decoder's final layer,
        # matching the original comprehension — confirm that is intended.
        with_dropout = []
        for start in range(0, len(modules), 2):
            with_dropout.extend(modules[start:start + 2])
            with_dropout.append(nn.Dropout(dropout))
        return with_dropout

## Train

In [16]:
# Sizes of the two id -> index maps: distinct users and distinct movies.
num_users, num_movies = len(user_to_index), len(movie_to_index)
print(num_users, num_movies)

66 3218


In [17]:
# Input width is the user count: each item-based dataset row holds one movie's ratings across all users.
model = SimpleAutoEncoder(num_inputs=num_users, num_hiddens=100, kind='selu')

In [18]:
# Display the module structure.
model

SimpleAutoEncoder(
  (encoder): Sequential(
    (0): Linear(in_features=66, out_features=100, bias=True)
    (1): SELU()
  )
  (decoder): Sequential(
    (0): Linear(in_features=100, out_features=66, bias=True)
    (1): SELU()
  )
)

In [19]:
# Adam over all model parameters with learning rate 1e-3.
optimizer = optim.Adam(model.parameters(), lr=1e-3)

In [20]:
def weights_init(m):
    """Initialise an ``nn.Linear`` in place; other module types are left untouched.

    Weights get Xavier-uniform initialisation and the bias (when the layer has
    one) is zeroed. Intended to be passed to ``Module.apply``.
    """
    if isinstance(m, nn.Linear):
        torch.nn.init.xavier_uniform_(m.weight)
        # Layers built with bias=False have m.bias set to None.
        if m.bias is not None:
            torch.nn.init.zeros_(m.bias)

# apply() calls weights_init on every submodule and returns the model itself.
model.apply(weights_init)

SimpleAutoEncoder(
  (encoder): Sequential(
    (0): Linear(in_features=66, out_features=100, bias=True)
    (1): SELU()
  )
  (decoder): Sequential(
    (0): Linear(in_features=100, out_features=66, bias=True)
    (1): SELU()
  )
)

In [21]:
# Size of one training row, which is what the model receives per sample.
train_dataset.data[0].size()

torch.Size([66])

In [22]:
# Reference: NVIDIA Recommender System
def MSEloss(inputs, targets, size_average=False):
    """Masked MSE that ignores positions where ``targets`` is 0.

    Predictions at unrated positions (target == 0) are zeroed before the loss
    is computed, so they contribute no error.

    Args:
        inputs: Predicted ratings, same shape as ``targets``.
        targets: Ground-truth ratings where 0 marks a missing rating.
        size_average: If True use ``reduction='mean'`` (averaged over all
            elements, masked ones included); otherwise ``reduction='sum'``.

    Returns:
        Tuple ``(loss, denominator)``. With ``size_average=False`` the
        denominator is the number of non-zero targets (0-dim tensor); with
        ``size_average=True`` it is a one-element tensor holding 1.0.
    """
    mask = (targets != 0).float()
    num_ratings = torch.sum(mask)
    criterion = nn.MSELoss(reduction='mean' if size_average else 'sum')
    loss = criterion(inputs * mask, targets)
    if size_average:
        # Plain tensor on the targets' device replaces the deprecated Variable wrapper.
        return loss, torch.ones(1, device=targets.device)
    return loss, num_ratings

In [23]:
# One pass over the training data, printing the running mean of per-batch RMSE.
model.train()
train_loss = 0
scored_batches = 0
for batch in train_dataloader:
    optimizer.zero_grad()

    pred = model(batch)
    loss, num_ratings = MSEloss(pred, batch)
    if num_ratings == 0:
        # No observed ratings in this batch: sqrt(0 / 0) would be NaN and
        # backpropagating it would corrupt the weights, so skip the batch.
        continue
    loss = torch.sqrt(loss / num_ratings)  # RMSE over observed ratings only
    loss.backward()
    train_loss += loss.item()
    optimizer.step()
    scored_batches += 1

    print(train_loss / scored_batches)

4.024050712585449
4.010781645774841
3.956714391708374
3.8927645683288574
3.880780792236328
3.845703919728597
3.7725095067705428
3.752323567867279
3.7041617499457464
3.64604172706604
3.6220582831989634
3.5898364384969077
3.5707243405855618
3.5313898665564403
3.508512608210246
3.4756292700767517
3.4389257150537826
3.4042499197853937
3.3710322631032845
3.3382113456726072
3.3031790483565557
3.267450896176425
3.238255604453709
3.2088656524817147
3.1777984619140627
3.142748548434331
3.1107908884684243
3.0821524943624223
3.0558417583334037
3.023597478866577
2.9953775790429886
2.966559872031212
2.941505034764608
2.9103665001252117
2.8823345388684953
2.8551771177185907
2.8272660068563513
2.798488560475801
2.7735295540247207
2.7432249635457993
2.7248581211741376
2.6977702492759343
2.674365531566531
2.6482171687212857
2.6214652909172904
2.5964838214542554
2.573346011182095
2.5493355616927147
2.5237045020473245
2.5004734349250795
2.4720824395909027


In [24]:
# One pass over the validation data, printing the running mean of per-batch RMSE.
model.eval()
val_loss = 0
scored_batches = 0
with torch.no_grad():
    for batch in val_dataloader:
        pred = model(batch)
        loss, num_ratings = MSEloss(pred, batch)
        if num_ratings == 0:
            # A batch whose rows hold no ratings gives sqrt(0 / 0) = NaN,
            # which would poison every later running average.
            continue
        loss = torch.sqrt(loss / num_ratings)  # RMSE over observed ratings only
        val_loss += loss.item()
        scored_batches += 1

        print(val_loss / scored_batches)

1.1498407125473022
1.8348398804664612
1.5521109700202942
1.5600093454122543
1.545199453830719
1.4390999674797058
1.4158827747617448
1.3357869535684586
1.3081312312020197
1.4065138220787048
1.3272842981598594
1.2593594590822856
1.2459863424301147
1.2468439851488387
1.2769152561823527
1.2514952756464481
1.2594050835160648
1.2560493979189131
1.264278270696339
1.2664989322423934
1.278817287513188
1.2852870469743556
1.2635453809862551
1.281140349805355
1.2583775877952577
1.2950109495566442
1.2987424113132335
1.31492951299463
1.3304651544011872
1.4231683909893036
1.424634946930793
1.427579464390874
1.3983940879503887
1.3964334393248838
1.379056474140712
1.3729548354943593
1.3812375229758185
1.3845880721744739
1.3926495680442224
1.3938040524721145
1.3990221168936752
1.3872689306735992
1.392296243545621
1.3924040320244702
1.3895098355081346
1.3861660089181818
1.3829819686869358
1.3829962275922298
1.374077400382684
1.3728268909454346
1.3665214823741538
1.3556962517591624
1.3548865093375153
1.36