In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.model_selection import train_test_split,KFold
from sklearn.preprocessing import LabelEncoder

### Data preprocessing ###

In [2]:
# Read the ratings.csv file into the dataframe
df = pd.read_csv('ml-1m/ratings.csv')
df.head()

Unnamed: 0,user_id,movie_id,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [3]:
# Since the user_id and movie_id might not be consecutive, we map them into the range
LE = LabelEncoder()
users = LE.fit_transform(df['user_id'].values)
movies = LE.fit_transform(df['movie_id'].values)
ratings = df['rating'].values

In [4]:
# Split the data into 60% as training, 20% as validation, 20% as test
user, user_test, movie, movie_test, rating, rating_test = train_test_split(users, movies, ratings, test_size=0.2)
user_train, user_val, movie_train, movie_val, rating_train, rating_val = train_test_split(user,movie,rating,test_size=0.25)

In [5]:
# Data analysis
print('There are',users.max()+1,'unique users in the dataset')
print('There are',movies.max()+1,'unique movies in the dataset')

There are 6040 unique users in the dataset
There are 3706 unique movies in the dataset


### Model Prediction ###

In [6]:
# Matrix factorization using: https://datajobs.com/data-science-repo/ Recommender-Systems-[Netflix].pdf

In [7]:
# Parameter initialization
rating_mean = np.mean(rating_train)

def initialization(F,U,I):
    bu_0 = np.random.normal(0,0.0001,U)
    bi_0 = np.random.normal(0,0.0001,I)
    pu = np.random.normal(0,1/max(1,np.sqrt(F)),(U,F))
    qi = np.random.normal(0,1/max(1,np.sqrt(F)),(I,F))
    return bu_0,bi_0,pu,qi

bu_0,bi_0,pu,qi=initialization(2,users.max()+1,movies.max()+1)

In [8]:
bu_0.shape,bi_0.shape,pu.shape,qi.shape

((6040,), (3706,), (6040, 2), (3706, 2))

In [9]:
# Matrix decomposition
def model_prediction(U,I,rating_mean,bu_0,bi_0,pu,qi):
    bu = bu_0[U]
    bi = bi_0[I]
    pu = pu[U]
    qi = qi[I]
    r = rating_mean + bu + bi + np.sum(pu*qi,axis=1)
    return r

In [10]:
rating = model_prediction(user_train,movie_train,rating_mean,bu_0,bi_0,pu,qi)
rating[:10]

array([3.4132626 , 3.8881893 , 3.86157201, 3.93129115, 3.42878726,
       3.34759146, 4.00533156, 3.76057187, 4.16789689, 3.40888691])

In [11]:
np.mean(rating_train - rating)

0.0004374309093889802

### Loss Function ###

In [12]:
# As we assume no regularization penalty for the biases

def loss(rating,U,I,rating_mean,bu_0,bi_0,pu,qi):
    L = np.mean((rating-model_prediction(U,I,rating_mean,bu_0,bi_0,pu,qi))**2) # 1/N * sum == mean, do not need to np.sum first
    return L

In [13]:
loss(rating_train,user_train,movie_train,rating_mean,bu_0,bi_0,pu,qi)

1.807685777390463

In [14]:
len(bu_0)

6040

### Learning Algorithm ###

In [15]:
def learning(rating,U,I,rating_mean,bu_0,bi_0,pu,qi,B,lr,p):
    permutation = np.random.permutation(len(rating))
    for i in range(0,len(rating),B):
        ind = permutation[i:i+B]
        user = U[ind]
        item = I[ind]
        rate = rating[ind]
        bu = bu_0[user]
        bi = bi_0[item]
        pun = pu[user]
        qin = qi[item]
        dr = rate-(rating_mean+bu+bi+np.sum(pun*qin,axis=1))
        bu_0[user] += lr*dr
        bi_0[item] += lr*dr
        pu[user] += lr*(dr[:,np.newaxis]*qin-p*pun) # pu remains initialized, but not substituted
        qi[item] += lr*(dr[:,np.newaxis]*pun-p*qin)
    return 

In [16]:
def fitdata(user_train,movie_train,rating_train,user_val,movie_val,rating_val,F,lr,step,B,p):
    train_mean = np.mean(rating_train)
    bu_0,bi_0,pu,qi = initialization(F,users.max()+1,movies.max()+1)
    for k in range(step):  
        train_loss = loss(rating_train,user_train,movie_train,train_mean,bu_0,bi_0,pu,qi)    
        learning(rating_train,user_train,movie_train,train_mean,bu_0,bi_0,pu,qi,B,lr,p)
        if (k % (step//10))==0:
            val_loss = loss(rating_val,user_val,movie_val,train_mean,bu_0,bi_0,pu,qi)
            print("\t",k,train_loss,val_loss)
    train_loss = loss(rating_train,user_train,movie_train,train_mean,bu_0,bi_0,pu,qi)
    val_loss = loss(rating_val,user_val,movie_val,train_mean,bu_0,bi_0,pu,qi)
    print("\tFinal",train_loss,val_loss)
    return val_loss,train_mean,bu_0,bi_0,pu,qi

In [17]:
class RecommenderModel:
    def __init__(self,F,lr,step,B,p):
        self.F = F
        self.lr = lr
        self.step = step
        self.B = B
        self.p = p
    def fit(self,user_train,movie_train,rating_train,user_val,movie_val,rating_val):
        loss,train_mean,bu_0,bi_0,pu,qi = fitdata(user_train,movie_train,rating_train,user_val,movie_val,rating_val,self.F,self.lr,self.step,self.B,self.p)
        self.loss = loss
        self.train_mean = train_mean
        self.bu_0 = bu_0
        self.bi_0 = bi_0
        self.pu = pu
        self.qi = qi
        return loss
    def predict(self,U,I):
        prediction = model_prediction(U,I,self.rating_mean,self.bu_0,self.bi_0,self.pu,self.qi)
        return prediction

### Train the Model

#### Popularity model with F = 0

In [18]:
model=RecommenderModel(F=0,lr=0.05,step=11,B=50,p=0.1)
model.fit(user_train,movie_train,rating_train,user_val,movie_val,rating_val)

	 0 1.2490321035581162 0.8607817836824053
	 1 0.8392866674189221 0.8603226064327695
	 2 0.8348219343375991 0.8580382620467261
	 3 0.8322262735560804 0.8601634941990214
	 4 0.8338020064033137 0.8601042840861302
	 5 0.8338655313544115 0.8599042574348835
	 6 0.8339136386414858 0.8577751990471153
	 7 0.8329964408320699 0.8608780821566315
	 8 0.8348503146502935 0.8624116353444549
	 9 0.8353883809260839 0.8615478484029556
	 10 0.8349077046920185 0.8596321098170543
	Final 0.834702058761575 0.8596321098170543


0.8596321098170543

In [19]:
model=RecommenderModel(F=1,lr=0.05,step=11,B=50,p=0.1)
model.fit(user_train,movie_train,rating_train,user_val,movie_val,rating_val)

	 0 2.2806303748097103 0.8682245022628453
	 1 0.8382091190176469 0.8629120557632577
	 2 0.8337981385846155 0.8596770691591971
	 3 0.8296972503478163 0.8580264653698083
	 4 0.8277831429228325 0.8523534319604142
	 5 0.8203012498284284 0.8451047115741568
	 6 0.809131521888325 0.836454437704149
	 7 0.800790018973115 0.8336673904384306
	 8 0.7948273065419914 0.8312759320293241
	 9 0.7933532185442821 0.8309902471757691
	 10 0.7920080607268396 0.8280964321890085
	Final 0.7878154272870911 0.8280964321890085


0.8280964321890085

In [20]:
model=RecommenderModel(F=1,lr=0.05,step=11,B=30,p=0.1)
model.fit(user_train,movie_train,rating_train,user_val,movie_val,rating_val)

	 0 2.214974372223453 0.8661004909775942
	 1 0.8382758991401661 0.8608651269761817
	 2 0.8310138337854867 0.8616741767254371
	 3 0.8307345391503373 0.8583440543139387
	 4 0.8279023754515824 0.8515615933981234
	 5 0.8205000739213909 0.8454894498455862
	 6 0.8112707274929216 0.8359525510921476
	 7 0.7997378604936579 0.8315504996092236
	 8 0.795446336251474 0.8319011582058352
	 9 0.7922045099843366 0.8271199301012201
	 10 0.789044409467505 0.8284732544930842
	Final 0.7882533930733214 0.8284732544930842


0.8284732544930842

### Tuning the hyper-parameters

In [21]:
Fs = [0,1,5,10]
Bs = [30,50,60]
final = []
best_loss = 10
best_F = 0
best_B = 0
for F in Fs:
    for B in Bs:
        print('F',F, 'Batch size:',B)
        model = RecommenderModel(F=F,lr=0.05,step=11,B=B,p=0.1)
        loss1 = model.fit(user_train,movie_train,rating_train,user_val,movie_val,rating_val) # Need to keep the variable different from the global variable function
        final.append((F,B,loss1))
        if loss1 < best_loss:
            best_loss = loss1
            best_F = F
            best_Batch = B
        print('Best F is:', F,'Best Batch size is:', B, 'corresponding loss is:',loss1)

F 0 Batch size: 30
	 0 1.2490273530956846 0.8600234527227132
	 1 0.838244918541423 0.8583033343705522
	 2 0.8345088932273065 0.8596400223345517
	 3 0.8343270974078659 0.8594536449167575
	 4 0.8329373090540827 0.8604339439861656
	 5 0.8349310899453808 0.8598597410280928
	 6 0.8339769733870228 0.8608774901407457
	 7 0.8344690220770251 0.8578694262900437
	 8 0.8320483805182399 0.86044354045571
	 9 0.8344442373516417 0.8631675506832558
	 10 0.8358015557829376 0.8582362797082744
	Final 0.8325333314296496 0.8582362797082744
Best F is: 0 Best Batch size is: 30 corresponding loss is: 0.8582362797082744
F 0 Batch size: 50
	 0 1.249032955531158 0.8635112505795619
	 1 0.8411186135055914 0.8584509299884575
	 2 0.8352230187773237 0.8616182180214179
	 3 0.8351805823676766 0.8617883987648254
	 4 0.8363214962774471 0.8591501802430792
	 5 0.8343560018467354 0.8593546046669627
	 6 0.8343757836931746 0.859656730562744
	 7 0.8329362151653988 0.8606856310471586
	 8 0.8349376729518714 0.8623093230796273
	 9

In [22]:
final_df = pd.DataFrame(final,columns=['F','Batch size','corresponding loss value'])
final_pivot = final_df.pivot(index='F', columns='Batch size')
final_pivot

Unnamed: 0_level_0,corresponding loss value,corresponding loss value,corresponding loss value
Batch size,30,50,60
F,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
0,0.858236,0.859771,0.861056
1,0.828275,0.830365,0.827603
5,0.808656,0.807743,0.810914
10,0.800038,0.804153,0.799366


### Reading from the above dataframe, we could notice that minimum loss value is achieved at F=10 and Batch size=60. Then, to find the best F, I use 5-fold cross validation to test on F=[9,10,11]

In [23]:
kfold = KFold(5,shuffle=True)
folds = []
for fold in kfold.split(users):
    folds.append(fold)    

In [24]:
def cross_validation(model,users,movies,ratings,folds):
    loss_sum = []
    count = 0
    for train,val in folds:
        users_train = users[train]
        movies_train = movies[train]
        ratings_train = ratings[train]
        users_val = users[val]
        movies_val = movies[val]
        ratings_val = ratings[val]
        loss2 = model.fit(users_train,movies_train,ratings_train,
                                   users_val,movies_val,ratings_val)      
        loss_sum.append(loss2)
        print('---------Fold:', count+1, 'loss:',loss2,'----------')
        count += 1
    loss_sum = np.array(loss_sum)
    return loss_sum.mean()

In [28]:
Fs = [9,10,11]
final1 = []
best_loss = 10
best_F = 0
for F in Fs:
    print('F',F)
    model = RecommenderModel(F=F,lr=0.05,step=11,B=60,p=0.1)
    loss2 = cross_validation(model,users,movies,ratings,folds)
    final1.append((F,loss2))
    if loss1 < best_loss:
        best_loss = loss2
        best_F = F
    print('Best F is:', F,'corresponding loss is:',loss2)

F 9
	 0 1.3601829650084232 0.8656719885684826
	 1 0.8325665828495444 0.8585337065705028
	 2 0.82411262930501 0.838898887287395
	 3 0.8028517352963126 0.8283550602000007
	 4 0.7881228551437421 0.8167946777419491
	 5 0.7721780097412038 0.8094158340990715
	 6 0.7644176527696676 0.8075834351649875
	 7 0.7577637743097725 0.8048614450351074
	 8 0.7517973632387948 0.8011516238002102
	 9 0.7481722491445405 0.8018522888359825
	 10 0.7462540553364917 0.8002203469251667
	Final 0.7416221467559038 0.8002203469251667
---------Fold: 1 loss: 0.8002203469251667 ----------
	 0 1.3594149619578566 0.8570011443443798
	 1 0.8311167722105816 0.844124931995787
	 2 0.8156040060493418 0.8324472975091385
	 3 0.8001751464411078 0.8209430922396026
	 4 0.7860993047711032 0.8143730275959218
	 5 0.7753967575894289 0.8089528185629846
	 6 0.7675515915967703 0.8071812494152841
	 7 0.7623359436903949 0.8006287342156108
	 8 0.7536472211152317 0.800766701470754
	 9 0.7505915810830033 0.7961765459307617
	 10 0.7454094032088

	 7 0.7587982253608634 0.8045505978602636
	 8 0.7509611112633298 0.8041997347644375
	 9 0.7476583243608562 0.8031043124903006
	 10 0.742675440850691 0.7980478179878127
	Final 0.7371756949855092 0.7980478179878127
---------Fold: 5 loss: 0.7980478179878127 ----------
Best F is: 11 corresponding loss is: 0.7944034965386592


In [29]:
final1_df = pd.DataFrame(final1,columns = ['F','loss'])
final1_df.head()

Unnamed: 0,F,loss
0,9,0.796027
1,10,0.794516
2,11,0.794403


### Selecting F = 11 and Batch size = 60 as the optimal tuning parameter, and fit the model to the test data

In [30]:
model = RecommenderModel(F=11,lr=0.05,step=11,B=60,p=0.1)
model.fit(users,movies,ratings,user_test,movie_test,rating_test)

	 0 1.3390383116635098 0.8311222169311986
	 1 0.8325067236173415 0.8106261859832328
	 2 0.809836904018346 0.7967817029119547
	 3 0.7984389232916002 0.783844860568531
	 4 0.7841417983389678 0.7742220535102295
	 5 0.7746864396308372 0.7668519277118556
	 6 0.7674505172452754 0.7614489527790336
	 7 0.7618282938077128 0.7559221944532081
	 8 0.7575015438119574 0.7522868279067578
	 9 0.7537663712957845 0.7492715266409525
	 10 0.7499332060560542 0.7476959671438216
	Final 0.7482201142031257 0.7476959671438216


0.7476959671438216

### The final loss is 0.7476 after fitting the Recommender Model with parameters (F = 11, learning rate = 0.05, step = 11, Batchc size = 60, penalty = 0.1) on the entire dataset, and test on the test data.