In [1]:
import numpy as np
import pandas as pd
import torch

In [2]:
# Import elements from pytorch
import torch.nn as nn
import torch.nn.parallel
import torch.optim as optim
import torch.utils.data
from torch.autograd import Variable

In [6]:
# Import dataset
movies = pd.read_csv("ml-1m/movies.dat", sep = "::", header = None, engine = "python", encoding = "latin-1")

In [7]:
# Import users data
users = pd.read_csv("ml-1m/users.dat", sep = "::", header = None, engine = "python", encoding = "latin-1")

In [8]:
# Import ratings
ratings = movies = pd.read_csv("ml-1m/ratings.dat", sep = "::", header = None, engine = "python", encoding = "latin-1")
ratings.shape

(1000209, 4)

In [10]:
# Import training and test sets
training_set = pd.read_csv("ml-100k/u1.base", delimiter = "\t", header = None)
training_set.head()

Unnamed: 0,0,1,2,3
0,1,1,5,874965758
1,1,2,3,876893171
2,1,3,4,878542960
3,1,4,3,876893119
4,1,5,3,889751712


In [11]:
# Convert to array
training_set = np.array(training_set1, dtype = 'int')

In [12]:
# Same for test data
test_set = pd.read_csv("ml-100k/u1.test", delimiter = "\t", header = None)
test_set = np.array(test_set, dtype = 'int')

In [13]:
# Find the total number of users and movies in the full dataset (training + test)
combined = np.append(training_set, test_set, axis=0)
nb_users = len(np.unique(combined[:,0]))
nb_movies = len(np.unique(combined[:,1]))
print(nb_users)
print(nb_movies)

943
1682


In [14]:
# Convert data into array such that users are rows and movies are columns
# Torch takes input as list of lists
def convert(data):
    # Create list of lists
    # Each element is a list of all ratings
    # where rating at ith index belongs to ith movie
    new_data = []
    
    for user_id in range(1, nb_users + 1):
        movie_id = data[:,1][data[:,0] == user_id] # all movies for this user
        rating = data[:,2][data[:,0] == user_id] # all ratings for this user
        ratings = np.zeros(nb_movies)
        ratings[movie_id - 1] = rating # Indexing starts at zero but first movie id is 1
        new_data.append(ratings)
    
    return new_data

In [15]:
training_set = convert(training_set)
test_set = convert(test_set)

In [16]:
# Convert to tensors for torch inputs
training_set_tensor = torch.FloatTensor(training_set)
test_set_tensor = torch.FloatTensor(test_set)

In [18]:
# Create architecture of AutoEncoder
# Parent class = nn.Module from pytorch
# SAE inherits nn.Module
class SAE(nn.Module):
    # Init function
    # blank space after self means it'll inherit variables from its parent class
    def __init__(self, ):
        # Get all inherited methods and variables from parent class by using super()
        super(SAE, self).__init__()
        # Specify full connection
        self.fc1 = nn.Linear(nb_movies, 20) # nn.Linear as arguments takes no. of nodes in input layer and first hidden layer
        # Build a stacked second hidden layer
        self.fc2 = nn.Linear(20, 10)
        # Build a stacked third hidden layer
        self.fc3 = nn.Linear(10, 20)
        # Build a layer to decode
        self.fc4 = nn.Linear(20, nb_movies)
        # Activation layer
        self.activation = nn.Sigmoid()
        
    # Function to encode and decode in forward pass
    # Takes as argument vector of input values
    # Returns output vector
    def forward(self, x):
        # Step 1 - encoding
        x = self.activation(self.fc1(x))
        x = self.activation(self.fc2(x))
        # Step 2 - decoding
        x = self.activation(self.fc3(x))
        x = self.fc4(x) # No need of activation function
        
        return x

In [19]:
# Create the object
sae = SAE()

In [21]:
# Measuring loss
criterion = nn.MSELoss()
# Optimizer
optimizer = optim.RMSprop(sae.parameters(), lr = 0.01, weight_decay = 0.5)

In [26]:
# Train the SAE
n_epoch = 200

for epoch in range(1, n_epoch + 1):
    train_loss = 0 # Average difference of actual rating and predicted rating
    counter = 0.0
    
    # Loop over all users
    for user in range(nb_users):
        input_ = Variable(training_set_tensor[user]).unsqueeze(0) # Create extra dimension
        target = input_.clone() # target set as input
        
        if torch.sum(target.data > 0) > 0:
            # Take only those observations where none of the ratings given by a user is zero
            # i.e the user has rated at least one movie
                output = sae(input_)
                # Don't compute gradient with target vector
                target.require_grad = False
                output[target == 0] = 0
                loss = criterion(output, target)
                # Compute average of error considering only those movies
                # that got non-zero ratings
                mean_corrector = nb_movies/float(torch.sum(target.data > 0) + 1e-10) # Keep deonminator positive
                loss.backward()
                train_loss += np.sqrt(loss.item() * mean_corrector) # Adjusted loss
                counter += 1
                # Optimizer decides the amount by which updates happen
                optimizer.step()
                
    print("epoch: " + str(epoch) + " loss: " + str(train_loss/counter))

epoch: 1 loss: 1.7560476249194306
epoch: 2 loss: 1.0963511341932897
epoch: 3 loss: 1.0533084924779916
epoch: 4 loss: 1.0385673262838446
epoch: 5 loss: 1.0307154469200652
epoch: 6 loss: 1.0265841390145418
epoch: 7 loss: 1.0239332956573293
epoch: 8 loss: 1.0220311394883255
epoch: 9 loss: 1.0207079674902386
epoch: 10 loss: 1.0194161198519744
epoch: 11 loss: 1.0187327835902087
epoch: 12 loss: 1.0181741513349465
epoch: 13 loss: 1.0177639869003754
epoch: 14 loss: 1.0176088117514397
epoch: 15 loss: 1.0171605387925604
epoch: 16 loss: 1.0167779710636324
epoch: 17 loss: 1.0167184067995754
epoch: 18 loss: 1.0165444675370403
epoch: 19 loss: 1.0164219814100404
epoch: 20 loss: 1.0160958218972944
epoch: 21 loss: 1.0157835409156006
epoch: 22 loss: 1.015929542796188
epoch: 23 loss: 1.0158688907859645
epoch: 24 loss: 1.0155583450894887
epoch: 25 loss: 1.0156341881080129
epoch: 26 loss: 1.0156159906322564
epoch: 27 loss: 1.0153234531233255
epoch: 28 loss: 1.0151503474808523
epoch: 29 loss: 1.012607962543

In [31]:
# Test
test_loss = 0
counter = 0.0
# Loop over all users
for user in range(nb_users):
    input_ = Variable(training_set_tensor[user]).unsqueeze(0) # Create extra dimension
    target = Variable(test_set_tensor[user]).unsqueeze(0) # target set as input
        
    if torch.sum(target.data > 0) > 0:
        # Take only those observations where none of the ratings given by a user is zero
        # i.e the user has rated at least one movie
            output = sae(input_)
            # Don't compute gradient with target vector
            target.require_grad = False
            output[target == 0] = 0
            loss = criterion(output, target)
            # Compute average of error considering only those movies
            # that got non-zero ratings
            mean_corrector = nb_movies/float(torch.sum(target.data > 0) + 1e-10) # Keep deonminator positive
            test_loss += np.sqrt(loss.item() * mean_corrector) # Adjusted loss
            counter += 1
            
print("loss: " + str(train_loss/counter))

loss: 1.874764958247081
