In [None]:
# Importing the libraries

In [1]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.parallel
import torch.optim as optim
import torch.utils.data
from torch.autograd import Variable

In [None]:
# Importing the dataset

In [2]:
movies = pd.read_csv('ml-1m/movies.dat', sep = '::', header = None, engine = 'python', encoding = 'latin-1')
users = pd.read_csv('ml-1m/users.dat', sep = '::', header = None, engine = 'python', encoding = 'latin-1')
ratings = pd.read_csv('ml-1m/ratings.dat', sep = '::', header = None, engine = 'python', encoding = 'latin-1')

In [3]:
movies

Unnamed: 0,0,1,2
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
3878,3948,Meet the Parents (2000),Comedy
3879,3949,Requiem for a Dream (2000),Drama
3880,3950,Tigerland (2000),Drama
3881,3951,Two Family House (2000),Drama


In [4]:
users

Unnamed: 0,0,1,2,3,4
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,02460
4,5,M,25,20,55455
...,...,...,...,...,...
6035,6036,F,25,15,32603
6036,6037,F,45,1,76006
6037,6038,F,56,1,14706
6038,6039,F,45,0,01060


In [5]:
ratings

Unnamed: 0,0,1,2,3
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291
...,...,...,...,...
1000204,6040,1091,1,956716541
1000205,6040,1094,5,956704887
1000206,6040,562,5,956704746
1000207,6040,1096,4,956715648


In [None]:
# Preparing the training and test set

In [2]:
training_set = pd.read_csv('ml-100k/u1.base', delimiter = '\t')
print(training_set)
training_set = np.array(training_set, dtype = 'int')  # converting the dataframe into a numpy array
test_set = pd.read_csv('ml-100k/u1.test', delimiter = '\t')
print(test_set)
test_set = np.array(test_set, dtype = 'int')  # converting the dataframe into a numpy array

         1   1.1  5  874965758
0        1     2  3  876893171
1        1     3  4  878542960
2        1     4  3  876893119
3        1     5  3  889751712
4        1     7  4  875071561
...    ...   ... ..        ...
79994  943  1067  2  875501756
79995  943  1074  4  888640250
79996  943  1188  3  888640250
79997  943  1228  3  888640275
79998  943  1330  3  888692465

[79999 rows x 4 columns]
         1     6  5  887431973
0        1    10  3  875693118
1        1    12  5  878542960
2        1    14  5  874965706
3        1    17  3  875073198
4        1    20  4  887431883
...    ...   ... ..        ...
19994  458   648  4  886395899
19995  458  1101  4  886397931
19996  459   934  3  879563639
19997  460    10  3  882912371
19998  462   682  5  886365231

[19999 rows x 4 columns]


In [None]:
# Getting the total number of users and movies

In [3]:
nb_users = int(max(max(training_set[:, 0]), max(test_set[:, 0])))
nb_movies = int(max(max(training_set[:, 1]), max(test_set[:, 1])))

In [4]:
print("Total users: ",nb_users)
print("Total movies: ",nb_movies)

Total users:  943
Total movies:  1682


In [None]:
# Converting the data into a matrix with 'users' in rows and 'movies' in columns (usual structure for any deep learning model)
# We will create a list of list containing 943 lists of users where each list contains the ratings of 1682 movies

In [5]:
def convert(data):
    new_data = []
    for id_users in range(1, nb_users+1):
        id_movies = data[:, 1][data[:, 0] == id_users]  # extracts all the movie ids of the current user
        id_ratings = data[:, 2][data[:, 0] == id_users] # extracts all the ratings of the current user
        ratings = np.zeros(nb_movies)  # initialising a list of 1682 0s
        ratings[id_movies - 1] = id_ratings  # list belonging to current user gets updated by ratings of movies which are rated by the current user. Movies which are not rated by current user are rated as 0.
        new_data.append(list(ratings))  # adding the list belonging to single user to the list of list. This way 943 lists get added to list of list
    return new_data

In [6]:
training_set = convert(training_set)  
test_set = convert(test_set)

In [None]:
# Converting the data into Torch tensors

In [7]:
training_set = torch.FloatTensor(training_set)
test_set = torch.FloatTensor(test_set)

In [None]:
# Creating the architecture of the Neural Network (Stacked AutoEncoders)

In [8]:
class SAE(nn.Module):  # Here nn.Module is the base class for neural networks. We are creating a subclass SAE that extends this nn.Module class
    def __init__(self, ):
        super(SAE, self).__init__()   # To get all the inherited classes and methods of nn module class
        self.fc1 = nn.Linear(nb_movies, 20)  # Full connection between input layer and 1st hidden layer created using Linear class of nn module. No.of neurons in each hidden layer can be taken anything (Refer PyTorch Documentation for further details)
        self.fc2 = nn.Linear(20, 10)  # Full connection between 1st hidden layer and 2nd hidden layer
        self.fc3 = nn.Linear(10, 20)  # Full connection between 2nd hidden layer and 3rd hidden layer
        self.fc4 = nn.Linear(20, nb_movies)  # Full connection between 3rd hidden layer and output layer (output layer has same dimension as input layer in AutoEncoders)
        self.activation = nn.Sigmoid()   # Taking the activation func as 'sigmoid'. Here we use 'Sigmoid' class of nn module.
    def forward(self, x):   # method for performing operations inside the SAE, i.e. to perform encoding and decoding (Forward Propagation)
        x = self.activation(self.fc1(x))  # 1st encoding for 1st full connection
        x = self.activation(self.fc2(x))  # 2nd encoding for 2nd full connection
        x = self.activation(self.fc3(x))  # 1st decoding for 3rd full connection
        x = self.fc4(x)   # 2nd (Final) decoding for 4th full connection
        return x    # Now 'x' becomes the vector of predicted ratings
    
sae = SAE()
criterion = nn.MSELoss()  # defining the loss function (Mean squared Error) using MSELoss class of nn module
optimizer = optim.RMSprop(sae.parameters(), lr = 0.01, weight_decay = 0.5)   # defining the Optimizer using RMSprop class of optim module
# weight_decay is used to reduce the lr after every few epochs. This improves the model

In [None]:
# Training the SAE

In [9]:
nb_epoch = 200
for epoch in range(1, nb_epoch+1):
    train_loss = 0
    s = 0.
    for id_user in range(nb_users):
        input = Variable(training_set[id_user]).unsqueeze(0)  # adding a second dimension (second dimension is of batch)to the input as PyTorch cannot take single dimension (just like keras)
        target = input.clone()  # creating a backup variable for input vector
        if torch.sum(target.data > 0) > 0:   # considering those users who have rated atleast 1 movie
            output = sae.forward(input)  # calling the forward() to get the predicted ratings
            target.require_grad = False  # for optimizing the code to reduce a lot of computations by not calculating the gradient
            output[target == 0] = 0   # The ratings which are originally 0 (not rated by a user) are taken as 0 in final output
            loss = criterion(output, target)  # calculating the loss by comparing the predicted ratings and actual ratings
            mean_corrector = nb_movies/float(torch.sum(target.data > 0) + 1e-10)
            loss.backward()  # Performing Backpropagation for adjusting the weights. Determines whether we need to increase the weights or decrease the weights
            train_loss += np.sqrt(loss.item() * mean_corrector) # calculating the Root Mean Square Error (RMSE)
            s += 1.
            optimizer.step()  # To apply the optimizer of RMSprop class we use inbuilt step() of the class
            # backwards() decides whether weights are to be increased or decreased whereas optimizers decides by how much amount the weights are to be adjusted
    print('epoch: '+str(epoch)+' train_loss: '+str(train_loss/s))

epoch: 1 train_loss: 1.772211365108864
epoch: 2 train_loss: 1.096502091451791
epoch: 3 train_loss: 1.0535028838367548
epoch: 4 train_loss: 1.0383157640212335
epoch: 5 train_loss: 1.0308870318950867
epoch: 6 train_loss: 1.0267459663600502
epoch: 7 train_loss: 1.0237340680991933
epoch: 8 train_loss: 1.0218786398369761
epoch: 9 train_loss: 1.0206072603270961
epoch: 10 train_loss: 1.0198187932755418
epoch: 11 train_loss: 1.0187255985057282
epoch: 12 train_loss: 1.018535845761225
epoch: 13 train_loss: 1.017964634409972
epoch: 14 train_loss: 1.0175357420590194
epoch: 15 train_loss: 1.017005529882621
epoch: 16 train_loss: 1.01705321570111
epoch: 17 train_loss: 1.0166847821610583
epoch: 18 train_loss: 1.016551206606653
epoch: 19 train_loss: 1.0159693468004354
epoch: 20 train_loss: 1.016239082294117
epoch: 21 train_loss: 1.015988103532113
epoch: 22 train_loss: 1.015824577331143
epoch: 23 train_loss: 1.0157990025342651
epoch: 24 train_loss: 1.0157604455752274
epoch: 25 train_loss: 1.015862452322

epoch: 200 train_loss: 0.9139163664651386


In [None]:
# Testing the SAE

In [11]:
test_loss = 0
s = 0.
for id_user in range(nb_users):
    input = Variable(training_set[id_user]).unsqueeze(0)  # adding a second dimension (second dimension is of batch)to the input as PyTorch cannot take single dimension (just like keras)
    # here we take training set and not test set because we want to predict the ratings of the movies that the user has not watched in the training set and then compare these predicted ratings with the actual ratings of those movies that are present in the test set.
    target = Variable(test_set[id_user]).unsqueeze(0)  # target contains the actual ratings of the movies in the test set that were not watched by the users in the training set
    if torch.sum(target.data > 0) > 0:   # considering those users who have rated atleast 1 movie
        output = sae.forward(input)  # calling the forward() to get the predicted ratings
        target.require_grad = False  # for optimizing the code to reduce a lot of computations by not calculating the gradient
        output[target == 0] = 0   # The ratings which are originally 0 (not rated by a user) are taken as 0 in final output
        loss = criterion(output, target)  # calculating the loss by comparing the predicted ratings and actual ratings
        mean_corrector = nb_movies/float(torch.sum(target.data > 0) + 1e-10)
        test_loss += np.sqrt(loss.item() * mean_corrector) # calculating the Root Mean Square Error (RMSE)
        s += 1.
print('test_loss: '+str(test_loss/s))

test_loss: 0.9496858995110379


In [None]:
# Making Predictions for a given user and for a given movie

In [70]:
user_id = 3
movie_id = 482
input = Variable(training_set[user_id-1]).unsqueeze(0)
predicted_rating = sae.forward(input)
predicted_rating = predicted_rating.data.numpy()
print('Predicted Rating: '+ str(predicted_rating[0, movie_id-1]))

Predicted Rating: 3.6260068
