In [None]:
# AutoEncoders

# Importing the libraries
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.parallel
import torch.optim as optim
import torch.utils.data
from torch.autograd import Variable

In [None]:
# Importing the dataset
# The three files share one layout ('::'-separated, no header row, latin-1),
# so they are read with the same options, in the order movies, users, ratings.
movies, users, ratings = (
    pd.read_csv(dat_path, sep = '::', header = None, engine = 'python', encoding = 'latin-1')
    for dat_path in (
        '/content/drive/My Drive/Colab Notebooks/Boltzman/ml-1m/movies.dat',
        '/content/drive/My Drive/Colab Notebooks/Boltzman/ml-1m/users.dat',
        '/content/drive/My Drive/Colab Notebooks/Boltzman/ml-1m/ratings.dat',
    )
)

# Preparing the training set and the test set
# header = None is required: without it pandas treats the first line of each
# file as column names, so the first rating of u1.base / u1.test is silently
# dropped from the data (the MovieLens split files carry no header line).
training_set = pd.read_csv('/content/drive/My Drive/Colab Notebooks/AutoEncoders/ml-100k/ml-100k/u1.base', delimiter = '\t', header = None)
test_set = pd.read_csv('/content/drive/My Drive/Colab Notebooks/AutoEncoders/ml-100k/ml-100k/u1.test', delimiter = '\t', header = None)
# Plain integer arrays; columns 0, 1, 2 are read below as user id, movie id, rating.
training_set = np.array(training_set, dtype = 'int')
test_set = np.array(test_set, dtype = 'int')

# Getting the number of users and movies
# Largest id seen in either split; ndarray.max() avoids iterating the column
# element by element the way the builtin max() does.
nb_users = int(max(training_set[:, 0].max(), test_set[:, 0].max()))
nb_movies = int(max(training_set[:, 1].max(), test_set[:, 1].max()))

# Converting the data into an array with users in lines and movies in columns
def convert(data):
    """Turn a flat rating table into one dense rating row per user.

    Args:
        data: 2-D integer array whose columns 0, 1 and 2 hold the user id,
            the (1-based) movie id and the rating.

    Returns:
        A list with ``nb_users`` entries (user ids 1..nb_users, in order);
        each entry is a list of ``nb_movies`` floats where position
        ``movie_id - 1`` holds that user's rating and every other position
        is 0.
    """
    user_col, movie_col, rating_col = data[:, 0], data[:, 1], data[:, 2]
    rows = []
    for current_user in range(1, nb_users + 1):
        # Rows of `data` that belong to this user.
        belongs_to_user = user_col == current_user
        row = np.zeros(nb_movies)
        # Movie ids start at 1, array positions at 0.
        row[movie_col[belongs_to_user] - 1] = rating_col[belongs_to_user]
        rows.append(list(row))
    return rows
# Reshape each split into a users x movies rating matrix (list of lists) and
# wrap it directly as a float Torch tensor.
training_set = torch.FloatTensor(convert(training_set))
test_set = torch.FloatTensor(convert(test_set))


In [None]:
# Creating the architecture of the Neural Network
class SAE(nn.Module):
    """Fully connected autoencoder over one user's rating vector.

    Layer widths are nb_movies -> 20 -> 10 -> 20 -> nb_movies. The first
    three layers are followed by a sigmoid; the last layer is left linear.
    """

    def __init__(self):
        super().__init__()
        # Encoder
        self.fc1 = nn.Linear(nb_movies, 20)
        self.fc2 = nn.Linear(20, 10)
        # Decoder
        self.fc3 = nn.Linear(10, 20)
        self.fc4 = nn.Linear(20, nb_movies)
        self.activation = nn.Sigmoid()

    def forward(self, x):
        """Return the reconstruction of ``x`` (last dimension nb_movies)."""
        hidden = x
        for layer in (self.fc1, self.fc2, self.fc3):
            hidden = self.activation(layer(hidden))
        # No activation on the output layer.
        return self.fc4(hidden)
# Seed PyTorch's RNG before building the network so the randomly initialised
# layer weights are the same on every fresh run of the notebook.
torch.manual_seed(0)
sae = SAE()
# Mean squared error between reconstructed and actual ratings.
criterion = nn.MSELoss()
# RMSprop over all network parameters, with L2 weight decay.
optimizer = optim.RMSprop(sae.parameters(), lr = 0.01, weight_decay = 0.5)

In [None]:
# Training the SAE
nb_epoch = 200
for epoch in range(1, nb_epoch + 1):
    train_loss = 0
    s = 0.  # number of users that contributed to train_loss in this epoch
    for id_user in range(nb_users):
        # Batch of one user: shape (1, nb_movies); 0 marks a movie the user did not rate.
        inputs = training_set[id_user].unsqueeze(0)
        # The target is a copy of the input; it is cloned from a tensor that
        # does not track gradients, so no requires_grad handling is needed.
        target = inputs.clone()
        nb_rated = torch.sum(target > 0).item()
        if nb_rated > 0:
            # Clear gradients left over from the previous user; without this
            # PyTorch keeps summing them across every step and epoch.
            optimizer.zero_grad()
            output = sae(inputs)
            # Ignore movies the user did not rate: zero them so they add no error.
            output[target == 0] = 0
            loss = criterion(output, target)
            # MSELoss averages over all nb_movies entries; rescale so the
            # average is taken over the rated movies only.
            mean_corrector = nb_movies/float(nb_rated + 1e-10)
            loss.backward()
            train_loss += np.sqrt(loss.item()*mean_corrector)
            s += 1.
            optimizer.step()
    # Guard against an epoch in which no user had any rating.
    epoch_loss = train_loss/s if s > 0 else float('nan')
    print("epoch: "+ str(epoch) + " loss: "+str(epoch_loss))


epoch: 1 loss: 1.7720670899728646
epoch: 2 loss: 1.096816029499128
epoch: 3 loss: 1.0533041812493262
epoch: 4 loss: 1.0383477828221173
epoch: 5 loss: 1.0308013534164875
epoch: 6 loss: 1.0265501352810142
epoch: 7 loss: 1.0236692375286607
epoch: 8 loss: 1.0218415573322486
epoch: 9 loss: 1.0208105255667557
epoch: 10 loss: 1.019407587359843
epoch: 11 loss: 1.0187731783478535
epoch: 12 loss: 1.0183718958770325
epoch: 13 loss: 1.0178283026263752
epoch: 14 loss: 1.017302212885839
epoch: 15 loss: 1.0171401558499702
epoch: 16 loss: 1.016689754245841
epoch: 17 loss: 1.0167443227458126
epoch: 18 loss: 1.0165165439840151
epoch: 19 loss: 1.0163418684544616
epoch: 20 loss: 1.0159326713816812
epoch: 21 loss: 1.01609063770526
epoch: 22 loss: 1.0160295767359768
epoch: 23 loss: 1.0158813152917174
epoch: 24 loss: 1.0157681781096197
epoch: 25 loss: 1.0157543645500264
epoch: 26 loss: 1.015608926654231
epoch: 27 loss: 1.015312471004435
epoch: 28 loss: 1.0152465794869103
epoch: 29 loss: 1.0137375591562772
ep

In [None]:
# Testing the SAE
test_loss = 0
s = 0.  # number of users that contributed to test_loss
# Evaluation only: no gradients are needed and the weights must not change.
with torch.no_grad():
    for id_user in range(nb_users):
        # The network sees the user's training ratings and is scored against
        # the same user's ratings in the test set.
        inputs = training_set[id_user].unsqueeze(0)
        target = test_set[id_user].unsqueeze(0)
        nb_rated = torch.sum(target > 0).item()
        if nb_rated > 0:
            output = sae(inputs)
            # Score only the movies that have a rating in the test set.
            output[target == 0] = 0
            loss = criterion(output, target)
            # MSELoss averages over all nb_movies entries; rescale so the
            # average is taken over the rated movies only.
            mean_corrector = nb_movies/float(nb_rated + 1e-10)
            test_loss += np.sqrt(loss.item()*mean_corrector)
            s += 1.
            # No optimizer.step() here: stepping during evaluation would keep
            # updating the model from stale training gradients and weight decay.
# Guard against a test set in which no user had any rating.
print("test_loss: "+str(test_loss/s if s > 0 else float('nan')))

test_loss: 0.9508767464106335


In [None]:
#Making Predictions with visualisation in the form of a table
# Row index into the rating matrices; row 0 holds the first user id (ids start at 1).
user_id = 0
# NOTE(review): `movies` is read from the ml-1m folder, while the rating
# matrices come from ml-100k, and titles are matched by row position rather
# than by movie id. The titles shown may therefore not correspond to the
# rated movies - confirm, and load the matching ml-100k title file if so.
movie_title = movies.iloc[:nb_movies, 1:2]
user_rating = training_set.numpy()[user_id, :].reshape(-1,1)
user_target = test_set.numpy()[user_id, :].reshape(-1,1)

# Inference only, so skip gradient tracking.
user_input = training_set[user_id].unsqueeze(0)
with torch.no_grad():
    predicted = sae(user_input)
predicted = predicted.numpy().reshape(-1,1)

# Join all info in one dataset
result_array = np.hstack([movie_title, user_target, predicted])
# Keep only the movies this user rated in the test set (target > 0).
result_array = result_array[result_array[:, 1] > 0]
result_df = pd.DataFrame(data=result_array, columns=['Movie', 'Target Rating', 'Predicted'])

In [None]:
# Display the table of movie, target rating and predicted rating built above.
result_df

Unnamed: 0,Movie,Target Rating,Predicted
0,GoldenEye (1995),3,3.92705
1,Dracula: Dead and Loving It (1995),5,4.59359
2,Nixon (1995),5,4.01931
3,Sense and Sensibility (1995),3,3.39048
4,Money Train (1995),4,3.55919
...,...,...,...
131,Legends of the Fall (1994),2,2.88937
132,Major Payne (1994),4,4.21844
133,Little Odessa (1994),1,2.57925
134,My Crazy Life (Mi vida loca) (1993),4,2.70674
