<a href="https://colab.research.google.com/github/respect5716/Deep-Learning-Paper-Implementation/blob/master/05_Recommender/Collaborative%20Denoising%20Auto%20Encoders%20for%20Top-N%20Recommender%20Systems%20(CDAE).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Collaborative Denoising Auto-Encoders for Top-N Recommender Systems (CDAE)

## 0. Paper

### Info
* Title : Collaborative Denoising Auto-Encoders for Top-N Recommender Systems (CDAE)
* Author : Yao Wu et al.
* Publication : WSDM 2016, [link](https://dl.acm.org/doi/pdf/10.1145/2835776.2835837)

### Summary
* Denoising autoencoder를 추천 시스템에 적용
* corrupted input을 복원하도록 학습
* user embedding으로 user specific 정보 처리

### Differences
* ##

## 1. Setting

In [0]:
# Google Drive
# Mount Google Drive at /content/drive so files stored there are reachable
# from this runtime (only works inside Google Colab).
from google.colab import drive
drive.mount('/content/drive')

In [0]:
# Libraries
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from tqdm import tqdm

import tensorflow as tf

In [3]:
# GPU Setting
# Shell command: show the GPU attached to this runtime.
!nvidia-smi

# Report the TensorFlow version and the GPUs that TensorFlow itself can see.
print(f'tensorflow version : {tf.__version__}')
print(f'available GPU list : {tf.config.list_physical_devices("GPU")}')

Wed Jun  3 08:29:19 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 440.82       Driver Version: 418.67       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   36C    P0    25W / 250W |      0MiB / 16280MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID   Type   Process name                             Usage      |
|  No ru

In [0]:
# Hyperparameters
CONFIG = {
    # root folder (on the mounted Google Drive) that holds the data archive
    'base_dir' : '/content/drive/Shared drives/Yoon/Project/Doing/Deep Learning Paper Implementation',
    'num_neg' : 5, # negatives drawn per positive item of a user
    'model_dim' : 100, # width of the user embedding and of the hidden layer
    'learning_rate' : 1e-3, # step size intended for the optimizer
    'corruption_rate' : 0.2, # probability of dropping each input item when corrupting
    'lambda' : 0.01, # regularization factor (L2 on the hidden layer kernel)
    'batch_size' : 256, # users per batch
    'epoch_size' : 30 # number of training epochs
}

## 2. Data

In [0]:
def load_data(path='data/ratings.dat'):
    """Read a ``::``-separated ratings file and return it in chronological order per user.

    Args:
        path: location of the ratings file. Defaults to ``'data/ratings.dat'``
            (relative to the current working directory), which is the path the
            notebook has always used.

    Returns:
        pandas.DataFrame with columns ``userId``, ``itemId``, ``rating`` and
        ``timestamp``, sorted by ``userId`` and then ``timestamp``.
    """
    # The python engine is required because the separator is longer than one character.
    rating = pd.read_table(path, sep='::', engine='python', header=None)
    rating.columns = ['userId', 'itemId', 'rating', 'timestamp']
    return rating.sort_values(['userId', 'timestamp'])

def get_neg_sample(pos, num_item, num_neg=None):
    """Draw distinct item indices that are NOT in ``pos``.

    Args:
        pos: iterable of positive item indices (each expected in ``[0, num_item)``).
            Duplicates are ignored.
        num_item: total number of items; candidates are ``0 .. num_item - 1``.
        num_neg: how many negatives to draw. When ``None`` it defaults to
            ``CONFIG['num_neg']`` times the number of distinct positives.
            The count is always capped at the number of available negatives.

    Returns:
        list of sampled negative item indices without repetition; an empty
        list when no negative is available or ``num_neg`` is not positive.
    """
    # Sorted, de-duplicated positives: the gap arithmetic below needs both.
    pos = np.unique(np.asarray(pos, dtype=np.int64))

    # Enumerate every non-positive index without building a set difference:
    # the k-th candidate is shifted right by the number of positives before it.
    sample = np.arange(0, num_item - len(pos))
    pos_adj = pos - np.arange(len(pos))
    search = np.searchsorted(pos_adj, sample, side='right')
    neg = sample + search

    if num_neg is None:
        num_neg = CONFIG['num_neg'] * len(pos)
    num_neg = min(num_neg, len(neg))
    if num_neg <= 0:
        return []

    # replace=False: repeated draws would collapse once multi-hot encoded,
    # yielding fewer effective negatives than requested.
    return list(np.random.choice(neg, num_neg, replace=False))

def multi_hot(seq, num_item):
    """Encode item indices as a dense vector of length ``num_item``.

    Positions listed in ``seq`` are set to 1, every other position stays 0.
    """
    encoded = np.zeros(shape=(num_item,))
    encoded[seq] = 1
    return encoded

def corrupt(seq, corruption_rate=None):
    """Randomly drop items from ``seq`` to build a corrupted input.

    Args:
        seq: sequence of item indices.
        corruption_rate: probability of dropping each item. When ``None`` the
            value of ``CONFIG['corruption_rate']`` is used.

    Returns:
        list with the surviving items in their original order. If every item
        was dropped, one randomly chosen item of ``seq`` is returned instead so
        the input is never empty; an empty ``seq`` yields an empty list.
    """
    rate = CONFIG['corruption_rate'] if corruption_rate is None else corruption_rate
    kept = [item for item in seq if np.random.rand() > rate]
    if not kept and len(seq) > 0:
        # Draw a scalar item (not a length-1 array) so the result stays a flat
        # list of indices like the non-fallback path.
        kept = [np.random.choice(seq)]
    return kept

In [0]:
class Dataset(tf.keras.utils.Sequence):
    """Batch generator that turns per-user item lists into dense multi-hot arrays.

    Args:
        train_data: per-user lists of training item indices. It is accessed
            through ``.iloc`` and ``.index``, so a pandas Series indexed by
            user id is expected.
        test_data: held-out item lists, addressed with the same positional
            indices as ``train_data``.
        num_user: number of users (stored; not used by the batching logic).
        num_item: width of every multi-hot vector.
        train: the string ``'train'`` selects shuffled, corrupted training
            batches; any other value selects ordered evaluation batches.
    """

    def __init__(self, train_data, test_data, num_user, num_item, train):
        # Newer Keras versions expect Sequence subclasses to call the base initializer.
        super().__init__()
        self.train_data = train_data
        self.test_data = test_data
        self.num_user = num_user
        self.num_item = num_item
        self.train = train

        self.idx = 0  # cursor used by next()
        self.on_epoch_end()

    def __len__(self):
        """Number of batches per epoch, as a plain Python int."""
        return int(np.ceil(len(self.train_data) / CONFIG['batch_size']))

    def on_epoch_end(self):
        """Rebuild the row order: a new permutation for training, natural order otherwise."""
        if self.train == 'train':
            self.indices = np.random.permutation(len(self.train_data))
        else:
            self.indices = np.arange(len(self.train_data))

    def __getitem__(self, idx):
        """Return batch number ``idx`` as ``(user, x, pos, neg)``.

        Training mode: ``x`` is the corrupted multi-hot input, ``pos`` the
        uncorrupted multi-hot target and ``neg`` a multi-hot mask of sampled
        negatives (all float32). Evaluation mode: ``x`` is the uncorrupted
        float32 multi-hot input, ``pos`` is the matching slice of
        ``test_data`` and ``neg`` is ``None``. ``user`` is always an int32
        array of the batch's index labels.
        """
        batch_size = CONFIG['batch_size']
        batch_idx = self.indices[batch_size*idx : batch_size*(idx+1)]
        batch_data = self.train_data.iloc[batch_idx]
        user = np.array(batch_data.index).astype(np.int32)
        if self.train == 'train':
            x = [corrupt(i) for i in batch_data]
            x = np.stack([multi_hot(i, self.num_item) for i in x])
            pos = np.stack([multi_hot(i, self.num_item) for i in batch_data])
            neg = [get_neg_sample(i, self.num_item) for i in batch_data]
            neg = np.stack([multi_hot(i, self.num_item) for i in neg])
            x, pos, neg = x.astype(np.float32), pos.astype(np.float32), neg.astype(np.float32)
        else:
            # Same dtype as the training inputs so the model sees consistent inputs.
            x = np.stack([multi_hot(i, self.num_item) for i in batch_data]).astype(np.float32)
            pos = self.test_data.iloc[batch_idx]
            neg = None
        return user, x, pos, neg

    def next(self):
        """Return the batch at the internal cursor and advance it.

        When the cursor has reached the end of the epoch, the row order is
        rebuilt via ``on_epoch_end`` and the cursor restarts at 0.
        """
        if self.idx == self.__len__():
            self.on_epoch_end()
            self.idx = 0
        user, x, pos, neg = self.__getitem__(self.idx)
        self.idx += 1
        return user, x, pos, neg

In [0]:
# Build the path of the MovieLens archive under CONFIG['base_dir'] and
# extract it into /content/data/ with the shell's unzip.
data_path = os.path.join(CONFIG['base_dir'], 'data/movielens_10m.zip')
!unzip $"{data_path}" -d '/content/data/'

In [0]:
# Read the ratings table via load_data().
data = load_data()

In [10]:
# Preview the first rows of the ratings table.
data.head()

Unnamed: 0,userId,itemId,rating,timestamp
18,1,588,5.0,838983339
2,1,231,5.0,838983392
4,1,316,5.0,838983392
5,1,329,5.0,838983392
3,1,292,5.0,838983421


In [11]:
# Re-index raw user ids to consecutive integers, numbered in order of first appearance.
user_dict = {raw_id: new_id for new_id, raw_id in enumerate(data['userId'].unique())}
num_user = len(user_dict)

# Same re-indexing for item ids.
item_dict = {raw_id: new_id for new_id, raw_id in enumerate(data['itemId'].unique())}
num_item = len(item_dict)

num_user, num_item

(69878, 10677)

In [0]:
# Keep only ratings above 3 and translate raw ids to the contiguous indices.
# .assign() builds a new frame, so no column is written onto a slice of the
# original table (which would raise pandas' SettingWithCopyWarning).
# NOTE: this cell rebinds `data`; re-run the loading cell before re-running it.
data = data.loc[data['rating'] > 3].assign(
    userId=lambda frame: frame['userId'].map(user_dict),
    itemId=lambda frame: frame['itemId'].map(item_dict),
)

# One list of item indices per user, in the row order of the table.
data = data.groupby('userId')['itemId'].apply(list)
# Drop users with fewer than 5 kept items.
data = data.loc[data.apply(len) >= 5]
# Per-user split: first 80% of each list for training, the remainder for testing.
train_data = data.apply(lambda x : x[:int(len(x)*0.8)])
test_data = data.apply(lambda x : x[int(len(x)*0.8):])

In [13]:
# Preview the per-user training item lists.
train_data.head()

userId
0    [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...
2    [41, 42, 40, 44, 45, 47, 48, 49, 50, 51, 52, 5...
3    [31, 71, 72, 74, 75, 0, 2, 78, 3, 7, 81, 34, 9...
4    [97, 101, 25, 102, 105, 106, 108, 109, 110, 11...
5    [175, 33, 176, 178, 180, 22, 181, 182, 183, 18...
Name: itemId, dtype: object

In [0]:
# Two views over the same split: shuffled/corrupted batches for training,
# ordered batches paired with the held-out items for evaluation.
train_dataset = Dataset(
    train_data=train_data,
    test_data=test_data,
    num_user=num_user,
    num_item=num_item,
    train='train',
)
test_dataset = Dataset(
    train_data=train_data,
    test_data=test_data,
    num_user=num_user,
    num_item=num_item,
    train='test',
)

In [0]:
# Pull one training batch to sanity-check the data pipeline.
user, x, pos, neg = train_dataset.next()

In [32]:
# Inspect the array shapes of the fetched batch.
user.shape, x.shape, pos.shape, neg.shape

((256,), (256, 10677), (256, 10677), (256, 10677))

## 3. Model

In [0]:
class Network(tf.keras.Model):
    """Auto-encoder over item vectors with an additive per-user embedding.

    Args:
        num_user: size of the user embedding table.
        num_item: number of output units (one sigmoid score per item).

    NOTE(review): the L2 ``kernel_regularizer`` on the hidden layer only
    influences training if the training step adds ``self.losses`` to its
    objective. Confirm that the training code does so.
    NOTE(review): the user vector is added after the hidden layer's ReLU;
    the CDAE paper appears to add it before the activation. Confirm the
    ordering is intentional.
    """

    def __init__(self, num_user, num_item):
        super().__init__()
        hidden_dim = CONFIG['model_dim']
        weight_penalty = tf.keras.regularizers.l2(CONFIG['lambda'])

        self.user_embedding = tf.keras.layers.Embedding(num_user, hidden_dim)
        self.latent_layer = tf.keras.layers.Dense(
            hidden_dim,
            activation='relu',
            kernel_regularizer=weight_penalty,
        )
        self.output_layer = tf.keras.layers.Dense(num_item, activation='sigmoid')

    def call(self, user, x):
        """Score every item for each (user, input vector) pair in the batch."""
        user_vec = self.user_embedding(user)
        hidden = self.latent_layer(x)
        hidden = hidden + user_vec
        return self.output_layer(hidden)

@tf.function
def train_step(network, optimizer, user, x, pos, neg):
    """Run one optimization step and return the batch loss.

    The objective is a logistic loss restricted to the entries flagged in
    ``pos`` (target 1) and ``neg`` (target 0), averaged over every element of
    the prediction matrix.

    NOTE(review): regularization losses collected in ``network.losses`` are
    not added to the objective here.
    """
    with tf.GradientTape() as tape:
        scores = network(user, x)

        # Alternative kept for reference (square loss):
        # pos_loss = tf.reduce_mean(tf.square(pos - scores*pos))
        # neg_loss = tf.reduce_mean(tf.square(tf.zeros_like(neg) - scores*neg))
        # loss = tf.reduce_mean(pos_loss + neg_loss)

        # Logistic loss; the 1e-5 offset keeps log() away from zero.
        log_likelihood = pos * tf.math.log(scores + 1e-5) + neg * tf.math.log(1-scores + 1e-5)
        loss = -tf.reduce_mean(log_likelihood)

    variables = network.trainable_variables
    grads = tape.gradient(loss, variables)
    optimizer.apply_gradients(zip(grads, variables))
    return loss

In [0]:
network = Network(num_user, num_item)
# Use the configured learning rate instead of silently relying on Adam's default.
optimizer = tf.keras.optimizers.Adam(learning_rate=CONFIG['learning_rate'])

## 4. Train

In [0]:
for epoch in tqdm(range(CONFIG['epoch_size'])):
    # Fresh permutation per epoch. Batches are indexed directly, so the loop
    # does not depend on the cursor left behind by earlier next() calls.
    train_dataset.on_epoch_end()
    epoch_losses = []
    for step in range(len(train_dataset)):
        user, x, pos, neg = train_dataset[step]
        loss = train_step(network, optimizer, user, x, pos, neg)
        epoch_losses.append(float(loss))

    # Report the mean over all batches rather than only the last batch's loss.
    print(f'EP : {str(epoch).zfill(2)} | loss : {np.mean(epoch_losses):.7f}')

## 5. Test

In [0]:
def precision(true, pred):
    """Fraction of the items in ``pred`` that also occur in ``true``.

    Args:
        true: collection of relevant items (items must be hashable).
        pred: sequence of recommended items.

    Returns:
        float in ``[0, 1]``; ``0.0`` when ``pred`` is empty.
    """
    if len(pred) == 0:
        return 0.0
    relevant = set(true)
    return sum(1 for item in pred if item in relevant) / len(pred)

def recall(true, pred):
    """Number of items in ``pred`` that occur in ``true``, divided by ``len(true)``.

    Args:
        true: collection of relevant items (items must be hashable).
        pred: sequence of recommended items.

    Returns:
        float; ``0.0`` when ``true`` is empty.
    """
    if len(true) == 0:
        return 0.0
    relevant = set(true)
    return sum(1 for item in pred if item in relevant) / len(true)

def average_precision(true, pred):
    """Average precision of the ranked list ``pred`` against the relevant set ``true``.

    Precision is taken at every rank whose item is relevant, summed, and
    divided by ``min(len(true), len(pred))``.

    Args:
        true: collection of relevant items (items must be hashable).
        pred: ranked sequence of recommended items, best first.

    Returns:
        float; ``0.0`` when either ``true`` or ``pred`` is empty (instead of
        dividing by zero).
    """
    denom = min(len(true), len(pred))
    if denom == 0:
        return 0.0

    relevant = set(true)
    hits = 0
    total = 0.0
    # A running hit count gives precision@rank without rescanning the prefix.
    for rank, item in enumerate(pred, start=1):
        if item in relevant:
            hits += 1
            total += hits / rank
    return total / denom

In [153]:
# Collect one average-precision value per user over the evaluation batches.
maps = []
test_dataset.on_epoch_end()
num_batches = len(test_dataset)
for batch_no in range(num_batches):
    user, x, pos, _ = test_dataset.next()
    pred = network(user, x)
    # Zero the scores of items that are part of the input so they drop down the ranking.
    pred = pred * (1-x)
    # Item indices ordered by descending score.
    rank = np.argsort(-pred)
    batch_maps = [
        average_precision(true_items, ranked[:10])
        for true_items, ranked in zip(pos, rank)
    ]
    maps.extend(batch_maps)

print(f'MAP@10 : {np.mean(maps):.3f}')

MAP@10 : 0.054
