In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

# General Stuff:
import numpy as np
import pandas as pd
import os
import sys
from datetime import datetime
import gc
from collections import defaultdict, Counter

# Dimensionality Reduction:
from sklearn.random_projection import johnson_lindenstrauss_min_dim
from sklearn import random_projection

# Neural Networks Stuff
import torch
from torch import nn, optim
from torch.utils import data
import torch.nn.functional as F
from torch.utils.data import DataLoader, random_split
import pytorch_lightning as pl


# Statistics Stuff
from sklearn.model_selection import train_test_split, cross_val_score

# Visualization Stuff
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
pd.set_option('display.max_colwidth', 240)
    
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
        
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device
if torch.cuda.is_available():
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/movielens-1m-dataset/users.dat
/kaggle/input/movielens-1m-dataset/ratings.dat
/kaggle/input/movielens-1m-dataset/README
/kaggle/input/movielens-1m-dataset/movies.dat


# AutoRec - AutoEncoders Meet Collaborative Filtering - PyTorch 

<a id="toc"></a>
## Table of Content
1. [Introduction](#introduction)
1. [Data Preparation](#preparation)
1. [Embeddings](#embeddings)
1. [Classification](#classification)
1. [Affective Space](#affective)
1. [Dimensionality Reduction](#reduction)
1. [Conclusions](#conclusions)

<a id="introduction"></a>

## Introduction

In this notebook we will review a collaborative filtering approach using autoencoders, as suggested by Sedhain et al, in their 2015 paper "AutoRec: Autoencoders Meet Collaborative Filtering".

We will then introduce several improvemnts and asses them.


The notebook is avaible on [Kaggle](https://www.kaggle.com/odedgolden/autorec-pytorch//)

Code is inspired by [gtshs2/Autorec](https://github.com/gtshs2/Autorec)

[Table of content](#toc)

In [2]:
ratings_list = [i.strip().split("::") for i in open('/kaggle/input/movielens-1m-dataset/ratings.dat', 'r').readlines()]
ratings_df = pd.DataFrame(ratings_list, columns = ['UserID', 'MovieID', 'Rating', 'Timestamp'], dtype = int)
ratings_df['Rating']=ratings_df['Rating'].apply(pd.to_numeric, downcast='float')
ratings_df.head(3)

  exec(code_obj, self.user_global_ns, self.user_ns)


Unnamed: 0,UserID,MovieID,Rating,Timestamp
0,1,1193,5.0,978300760
1,1,661,3.0,978302109
2,1,914,3.0,978301968


In [3]:
R_df = ratings_df.pivot(index = 'UserID', columns ='MovieID', values = 'Rating').fillna(0)
R_df.head(3)

MovieID,1,10,100,1000,1002,1003,1004,1005,1006,1007,...,99,990,991,992,993,994,996,997,998,999
UserID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
100,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [4]:
class RatingsDataSet(data.Dataset):
    def __init__(self, ratings, mode="users"):
        self.ratings = ratings
        self.mode = mode

    def __len__(self):
        return len(self.ratings) if self.mode == "users" else len(self.ratings[0])
    
    def __getitem__(self, item):
        return self.ratings[:,0] if self.mode == "users" else self.ratings[0]


In [5]:
user_dataset = RatingsDataSet(R_df.values)
len(user_dataset)

6040

In [6]:
train_size = int(0.8 * len(user_dataset))
validation_size = len(user_dataset) - train_size
train_dataset, validation_dataset = random_split(user_dataset, [train_size, validation_size])


train_loader = DataLoader(dataset = train_dataset,
                                     batch_size = 256,
                                     num_workers=4,
                                     shuffle = True)
validation_loader = DataLoader(dataset = validation_dataset,
                                     batch_size = 256,
                                     num_workers=4)

In [7]:
for batch_ndx, sample in enumerate(train_loader):
    print(batch_ndx)
    print(sample.size())
    break

0
torch.Size([256, 6040])


In [8]:
class AutoRec(pl.LightningModule):

    def __init__(self):
        super().__init__()
        self.encoder = nn.Linear(6040,16)
        self.decoder = nn.Linear(16,6040)

    def training_step(self, batch, batch_idx):
        # --------------------------
        # REPLACE WITH YOUR OWN
        x = batch
        x = x.view(x.size(0), -1)
        z = self.encoder(x)
        x_hat = self.decoder(z)
        loss = F.mse_loss(x_hat, x)
        self.log('train_loss', loss)
        return loss
        # --------------------------

    def validation_step(self, batch, batch_idx):
        # --------------------------
        # REPLACE WITH YOUR OWN
        x = batch
        x = x.view(x.size(0), -1)
        z = self.encoder(x)
        x_hat = self.decoder(z)
        loss = F.mse_loss(x_hat, x)
        self.log('val_loss', loss)
        # --------------------------

    def test_step(self, batch, batch_idx):
        # --------------------------
        # REPLACE WITH YOUR OWN
        x = batch
        x = x.view(x.size(0), -1)
        z = self.encoder(x)
        x_hat = self.decoder(z)
        loss = F.mse_loss(x_hat, x)
        self.log('test_loss', loss)
        # --------------------------

    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters(), lr=1e-3)
        return optimizer

In [9]:
# init model
autorec = AutoRec()

# Initialize a trainer
trainer = pl.Trainer(gpus=0, max_epochs=10, progress_bar_refresh_rate=200)

# Train the model ⚡
trainer.fit(autorec, train_loader, validation_loader)

  f"Setting `Trainer(progress_bar_refresh_rate={progress_bar_refresh_rate})` is deprecated in v1.5 and"


Validation sanity check: 0it [00:00, ?it/s]

  f"The number of training samples ({self.num_training_batches}) is smaller than the logging interval"


Training: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

In [10]:
# Start tensorboard.
%load_ext tensorboard
%tensorboard --logdir lightning_logs/

In [11]:
latent_dims = [10, 20, 40, 80, 100, 200, 300, 400, 500]
lambdas = [0.001, 0.01, 0.1, 1, 100, 1000]