In [1]:
import torch
import pandas as pd
from torch.utils.data import DataLoader, TensorDataset
from torch import nn
import numpy as np

In [2]:
# Column names of the tab-separated ratings file, in file order.
data_cols = ['user_id', 'movie_id', 'rating', 'ts']
# One dict per rating row; every value is still a raw string at this point.
data = []
with open('data/ml-100k/u.data', 'r') as f:
    # Iterate the file object directly instead of a manual readline()/break loop.
    for line_no, line in enumerate(f, start=1):
        # Strip only the line terminator. Slicing with [:-1] would cut a real
        # character off the last field when the final line has no newline.
        line = line.rstrip('\n')
        if not line:
            # Blank lines carry no record; skip them instead of failing on them.
            continue
        fields = line.split('\t')
        if len(fields) < len(data_cols):
            raise ValueError(
                f"u.data line {line_no}: expected at least {len(data_cols)} "
                f"tab-separated fields, got {len(fields)}"
            )
        # zip() stops at the shorter sequence, so extra trailing fields are ignored.
        data.append(dict(zip(data_cols, fields)))

In [3]:
# Column names of the pipe-separated movie file, in file order.
item_cols = ['movie_id', 'movie_title', 'release_date', 'video_release_date', 'IMDb_url', 'unknown', 'Action', 
'Adventure', 'Animation', "Children's", "Comedy", "Crime", "Documentary", "Drama", "Fantasy", "Film-Noir",
"Horror", "Musical", "Mystery", "Romance", "Sci-Fi", "Thriller", "War", "Western"]
# One dict per movie row; every value is still a raw string at this point.
items = []
with open('data/ml-100k/u.item', 'r', encoding='ISO-8859-1') as f:
    # Iterate the file object directly instead of a manual readline()/break loop.
    for line_no, line in enumerate(f, start=1):
        # Strip only the line terminator. Slicing with [:-1] would cut a real
        # character off the last field when the final line has no newline.
        line = line.rstrip('\n')
        if not line:
            # Blank lines carry no record; skip them instead of failing on them.
            continue
        fields = line.split('|')
        if len(fields) < len(item_cols):
            raise ValueError(
                f"u.item line {line_no}: expected at least {len(item_cols)} "
                f"'|'-separated fields, got {len(fields)}"
            )
        # zip() stops at the shorter sequence, so extra trailing fields are ignored.
        items.append(dict(zip(item_cols, fields)))

In [4]:
# Column names of the pipe-separated user file, in file order.
user_cols = ['user_id', 'age', 'gender', 'occupation', 'zipcode']
# One dict per user row; every value is still a raw string at this point.
users = []
with open('data/ml-100k/u.user', 'r', encoding='ISO-8859-1') as f:
    # Iterate the file object directly instead of a manual readline()/break loop.
    for line_no, line in enumerate(f, start=1):
        # Strip only the line terminator. Slicing with [:-1] would cut a real
        # character off the last field when the final line has no newline.
        line = line.rstrip('\n')
        if not line:
            # Blank lines carry no record; skip them instead of failing on them.
            continue
        fields = line.split('|')
        if len(fields) < len(user_cols):
            raise ValueError(
                f"u.user line {line_no}: expected at least {len(user_cols)} "
                f"'|'-separated fields, got {len(fields)}"
            )
        # zip() stops at the shorter sequence, so extra trailing fields are ignored.
        users.append(dict(zip(user_cols, fields)))

In [5]:
# Wrap each list of parsed records in its own DataFrame.
df_users = pd.DataFrame(users)
df_items = pd.DataFrame(items)
# Cast the id columns to integers and the rating to float32 in a single pass;
# the 'ts' column is not part of the mapping and keeps its original dtype.
df_data = pd.DataFrame(data).astype(
    {'user_id': int, 'movie_id': int, 'rating': 'float32'}
)

In [6]:
from sklearn.model_selection import train_test_split
# Hold out 40% of the ratings; the remaining 60% is the training split.
df_train, df_holdout = train_test_split(df_data, test_size=0.4, random_state=1)
# The first quarter of the held-out rows becomes validation, the rest is test.
n_val = len(df_holdout) // 4
df_val = df_holdout.iloc[:n_val]
df_test = df_holdout.iloc[n_val:]

# Features are the (user_id, movie_id) pairs; the target is the rating.
feature_cols = ['user_id', 'movie_id']
X_train, X_val, X_test = (
    np.array(part[feature_cols]) for part in (df_train, df_val, df_test)
)
y_train, y_val, y_test = (
    np.array(part['rating']) for part in (df_train, df_val, df_test)
)

# Row counts of the user and item tables.
n_users, n_items = len(df_users), len(df_items)

In [7]:
# Preview the first rows of the training split (rendered as the cell output).
df_train.head()

Unnamed: 0,user_id,movie_id,rating,ts
47622,460,242,4.0,882910838
55696,727,202,4.0,883711354
52647,557,198,5.0,881179513
43018,256,319,2.0,882150053
53422,406,115,4.0,879446108


In [8]:
# Preview the first rows of the movie metadata table (rendered as the cell output).
df_items.head()

Unnamed: 0,movie_id,movie_title,release_date,video_release_date,IMDb_url,unknown,Action,Adventure,Animation,Children's,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [10]:
from collections import OrderedDict

In [16]:
class AutoEncoder(nn.Module):
    """Symmetric fully connected autoencoder.

    The encoder maps ``d_in`` features through 128 -> 64 -> 32 -> 16 units and
    the decoder mirrors it back (16 -> 32 -> 64 -> 128 -> ``d_in``). ReLU is
    applied between linear layers; the bottleneck and the final output have no
    activation.

    Args:
        d_in: Size of the input feature dimension, which is also the size of
            the reconstructed output.
        dropout_prob: Stored on the instance as ``dropout_prob``. No layer in
            this class uses it yet.
    """

    def __init__(self, d_in: int, dropout_prob: float=0.1):
        # nn.Module.__init__ must run before any submodule is assigned;
        # without it the `self.encoder = ...` assignment raises AttributeError.
        super().__init__()
        self.d_in = d_in
        self.dropout_prob = dropout_prob
        self.encoder = nn.Sequential(
            nn.Linear(self.d_in, 128),
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, 16)
        )

        self.decoder = nn.Sequential(
            nn.Linear(16, 32),
            nn.ReLU(),
            nn.Linear(32, 64),
            nn.ReLU(),
            nn.Linear(64, 128),
            nn.ReLU(),
            nn.Linear(128, self.d_in)
        )

    def forward(self, x):
        """Encode ``x`` to the 16-unit code and decode it back to ``d_in`` features.

        Args:
            x: Tensor whose last dimension has size ``d_in``.

        Returns:
            The reconstruction, with the same shape as ``x``.
        """
        return self.decoder(self.encoder(x))
        

In [12]:
# IPython help lookup (`?` suffix): prints the signature and docstring of nn.BatchNorm1d.
# NOTE(review): interactive leftover; it only works under IPython. Consider removing it before sharing.
nn.BatchNorm1d?

[0;31mInit signature:[0m
[0mnn[0m[0;34m.[0m[0mBatchNorm1d[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mnum_features[0m[0;34m:[0m [0mint[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0meps[0m[0;34m:[0m [0mfloat[0m [0;34m=[0m [0;36m1e-05[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mmomentum[0m[0;34m:[0m [0mfloat[0m [0;34m=[0m [0;36m0.1[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0maffine[0m[0;34m:[0m [0mbool[0m [0;34m=[0m [0;32mTrue[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mtrack_running_stats[0m[0;34m:[0m [0mbool[0m [0;34m=[0m [0;32mTrue[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mdevice[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mdtype[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m[0;34m)[0m [0;34m->[0m [0;32mNone[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m     
Applies Batch Normalization over a 2D or 3D input as described in the paper
`Batch Normalization: Accelerating Deep N