In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import pandas as pd
import numpy as  np
import torch
import torch.nn as nn
import torch.nn.parallel
import torch.optim as optim
import torch.utils.data
from torch.autograd import Variable

In [3]:
movies = pd.read_csv('/content/drive/MyDrive/ml-1m/movies.dat',sep='::',header=None,engine='python',encoding='latin-1')
user = pd.read_csv('/content/drive/MyDrive/ml-1m/users.dat',sep='::',header=None,engine='python',encoding='latin-1')
ratings = pd.read_csv('/content/drive/MyDrive/ml-1m/ratings.dat',sep='::',header=None,engine='python',encoding='latin-1')

In [4]:
movies.head()
#movie id , #movie name , #genre
#movie id will be fed for training instead of whole movie name since we are not producing a NLP model.

Unnamed: 0,0,1,2
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [5]:
ratings.head()
#user no. , #movie id , # ratings, #timestamp(don't care->to be removed)

Unnamed: 0,0,1,2,3
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [6]:
#base ->train and test ->test dataset
#always remember we have data in csv but we perform calculation after converting them into numpy arrays.
training_set = pd.read_csv('/content/drive/MyDrive/ml-100k/u1.base',delimiter='\t',header=None)
#delimiter tab should not be replaced by sep delimeter is best to be used here.

In [7]:
training_set.head()
#the same scheme followed user no., movie id, movie rating, timestamp

Unnamed: 0,0,1,2,3
0,1,1,5,874965758
1,1,2,3,876893171
2,1,3,4,878542960
3,1,4,3,876893119
4,1,5,3,889751712


In [8]:
#train test split ratio can be determined via
len(training_set)
#this clearly portrays that out of 100k entries  80k are for training i.e 4:1 train:test

80000

In [9]:
training_set = np.array(training_set,dtype='int')

In [10]:
testing_set = pd.read_csv('/content/drive/MyDrive/ml-100k/u1.test',delimiter='\t',header=None)

In [11]:
testing_set = np.array(testing_set,dtype='int')

 Making 2 matrices for training and testing and we get the rating of  amovie form the user id.
 i.e user id as row index and movie  id as columns and the corresponding rating as data (split is random).

In [12]:
#since the spit is random hence it may happen that the last user may be in test
user_count = max(max(training_set[:,0]),max(testing_set[:,0]))
movie_count = max(max(testing_set[:,1]),max(training_set[:,1]))
print(user_count, movie_count)

943 1682


In [13]:
#Now we create a more comprehensive matrix representation of the datatset.
#since we have to develop a recommender system based on ratings hence movie rating should be the features, target to be predicted is rating.
#creating a function is the best method to tackle the situation where we have to convert multiple instances of data into tensors.
def convert(dataset):
  entry = []
#instead of making 2 D tensor it is easier to get a list(for appending entries)->numpy(for managing the ratings based on movies indexes)->tensor!!
  for i in range(1,user_count+1):
    movie_rating = np.zeros(movie_count)
    movie_id = dataset[:,1][dataset[:,0]==i]
    rating = dataset[:,2][dataset[:,0]==i]
    movie_rating[movie_id - 1] = rating
    entry.append(list(movie_rating))
  return entry
  #the user id 1 has index 0 in this list of lists

In [14]:
training_set = convert(training_set)
testing_set = convert(testing_set)

In [15]:
#we can build the arrays by numpy arrays also but pytorch tensors(there are tnsorflow tensors also!!) provide extra features and enhanced performance.
training_set = torch.FloatTensor(training_set)
#now this provides us a tensor i.e a multidimensional array of same dtype and tensor requires list of list as an argument not numpy arrays.
testing_set = torch.FloatTensor(testing_set)
#point to be noted the thing we are creating is a recommender system and the best recommender system in action is in googlecolab u find that code are already recommended !!!

In [16]:
#since we will use a binary encoding scheme for producing a recommender system hence for the movies which were not seen by a user i.e 0 rated have to be assigned another value so that it does not interfere with the model
training_set[training_set==0] = -1#here just like pandas it has this feature since the entire matrix has same dtype
training_set[training_set==1] = 0
training_set[training_set==2] = 0
training_set[training_set==3] = 1
training_set[training_set==4] = 1
training_set[training_set==5] = 1
#the movie rating scheme followed is 1 for >=3

In [17]:
testing_set[testing_set==0] = -1
testing_set[testing_set==1] = 0
testing_set[testing_set==2] = 0
testing_set[testing_set==3] = 1
testing_set[testing_set==4] = 1
testing_set[testing_set==5] = 1

Our objective is to create a recommender system using restricted boltzmann machine for rating the movies which were not rated. Here we cannot take genre as features instead rating are features with their movie ids since there will be no way to remap the genre to the original movie.

In [46]:
# The parameters involved in RBM are hidden nodes, visible nodes, weights, Bias for probability given hidden node
class RBM():  # keep first letter capital convention
    def __init__(self, visible, hidden):  # it defines the argument we feed in a constructor of the class
        self.W = torch.randn(hidden, visible)  # weights
        self.h_bias = torch.randn(1, hidden)  # tensor needs 2 dimensions necessarily hence written like this.
        self.v_bias = torch.randn(1, visible)

    # probability that a hidden node is activated given the data in visible nodes.
    def sample_hidden(self, v):
        v_dotW = torch.mm(v, self.W.T)
        activation = v_dotW + self.h_bias.expand_as(v_dotW)
        prob_vtoh = torch.sigmoid(activation)
        return prob_vtoh, torch.bernoulli(prob_vtoh)

    # we are constructing a Bernoulli RBM hence the second thing that is returned contains all the activated hidden nodes.
    def sample_visible(self, h):
        h_dotW = torch.mm(h, self.W)  # no transpose
        prob_htov = h_dotW + self.v_bias.expand_as(h_dotW)
        return torch.sigmoid(prob_htov), torch.bernoulli(torch.sigmoid(prob_htov))

    # Now contrastive divergence
    def train(self, v0, vk, ph0, phk):
        self.W += torch.mm(ph0,v0) - torch.mm(phk,vk)
        self.v_bias += torch.sum((v0 - vk), 0)
        self.h_bias += torch.sum((ph0 - phk), 0)


In [47]:
visible = len(training_set[0])#number of movies i.e features
hidden = 100#completely chosen at will since it is based on the hidden patterns it draws from the data.
#number of hidden nodes corresponds to number of features we want to detect!!
batch_size = 100#here the total number of batches is again a hyper-parameter, total 943 examples so 943/64 batch in an epoch
rbm = RBM(visible,hidden)

In [51]:
# training of model
# no gradient descent hence no learning rate used
epochs = 10
s = 0.
for epoch in range(1, epochs + 1):
    training_loss = 0
    for id in range(0, user_count - batch_size, batch_size):
        vk = training_set[id:id + batch_size]
        v0 = training_set[id:id + batch_size]
        ph0, _ = rbm.sample_hidden(v0)
        # this is for getting only the first returning value of the function.
        # this is the gibbs sampling here visible nodes are reconstructed i.e first first hidden then visible
        for k in range(10):
            _, hk = rbm.sample_hidden(vk)
            _, vk = rbm.sample_visible(hk)

        # now update the weights after epoch
            vk[v0 < 0] = v0[v0 < 0]  # this is added so that the training is only done on the rated movies
        phk, _ = rbm.sample_hidden(vk)
        rbm.train(v0, vk, ph0, phk)
        training_loss += torch.mean(torch.abs(v0[v0 > 0] - vk[v0 > 0]))  # this is for rated movies only
        s += 1.
    print(f'train loss is {training_loss / s} for epoch {epoch}')


train loss is 0.6198312044143677 for epoch 1
train loss is 0.16360561549663544 for epoch 2
train loss is 0.08224501460790634 for epoch 3
train loss is 0.05362746864557266 for epoch 4
train loss is 0.041928742080926895 for epoch 5
train loss is 0.03664872422814369 for epoch 6
train loss is 0.026912836357951164 for epoch 7
train loss is 0.01885838434100151 for epoch 8
train loss is 0.017975958064198494 for epoch 9
train loss is 0.0163985975086689 for epoch 10


In [54]:
# testing of model
# no gradient descent hence no learning rate used
s = 0.
testing_loss = 0
for id in range(user_count):
  vk = training_set[id:id + 1]
  v0 = testing_set[id:id + 1]
  # this is for getting only the first returning value of the function.
  # this is the gibbs sampling here visible nodes are reconstructed i.e first first hidden then visible
  if len(v0[v0>=0])>0:
    _, hk = rbm.sample_hidden(vk)
    _, vk = rbm.sample_visible(hk)
          #k can be removed since it is only 1 step no kstep process!!
          # now update the weights after epoch
  #phk,vk are weigths which were updated previously now no training hence not required!!
    testing_loss += torch.mean(torch.abs(v0[v0 > 0] - vk[v0 > 0]))  # this is for rated movies only
    s += 1.
print(f'testing loss is {testing_loss / s}')


testing loss is 0.2122347205877304
