### Import dataset 

In [1]:
import pandas as pd
import numpy as np
import time

# Load the train / validation / test splits of the small dataset in one pass.
train_df_small, val_df_small, test_df_small = map(
    pd.read_csv,
    (
        './final_project_movielens/train_df_small.csv',
        './final_project_movielens/val_df_small.csv',
        './final_project_movielens/test_df_small.csv',
    ),
)

In [2]:
# Load the train / validation / test splits of the large dataset in one pass.
train_df_large, val_df_large, test_df_large = map(
    pd.read_csv,
    (
        './final_project_movielens/train_df_large.csv',
        './final_project_movielens/val_df_large.csv',
        './final_project_movielens/test_df_large.csv',
    ),
)

### Load lenskit

In [3]:
# pip install lenskit

In [38]:
# from lenskit.datasets import ML100K
from lenskit import batch, topn, util
from lenskit import crossfold as xf
from lenskit.algorithms import Recommender, als, item_knn as knn
from lenskit import topn

### Small dataset

In [39]:
# Give all three small splits the same column names; the `eval` helper in this
# notebook reads the `user` column of the frame it is given.
for split_df in (train_df_small, val_df_small, test_df_small):
    split_df.columns = ['user', 'item', 'rating', 'timestamp']

### Training helper and hyperparameter tuning

In [40]:
# training function
def eval(aname, algo, train, test, n_recs=100):
    """Fit a copy of ``algo`` on ``train`` and recommend for the users in ``test``.

    NOTE: this helper shadows Python's built-in ``eval``. The name is kept
    because other cells of the notebook call it.

    Parameters
    ----------
    aname : str
        Label written to the ``'Algorithm'`` column of the result.
    algo
        Algorithm object. A clone made with ``util.clone`` and wrapped by
        ``Recommender.adapt`` is fitted, not ``algo`` itself.
    train
        Training data passed to ``fit``.
    test
        Object with a ``user`` column; recommendations are produced for its
        unique values.
    n_recs : int, default 100
        Length of the recommendation list requested per user.

    Returns
    -------
    The object returned by ``batch.recommend`` with an added ``'Algorithm'``
    entry set to ``aname``.
    """
    fittable = Recommender.adapt(util.clone(algo))

    # Measure only the fit step. perf_counter() is monotonic, so the measured
    # interval cannot be distorted by system clock adjustments.
    started = time.perf_counter()
    fittable.fit(train)
    elapsed = time.perf_counter() - started
    print("Total execution time: {} seconds".format(elapsed))

    users = test.user.unique()
    # now we run the recommender
    recs = batch.recommend(fittable, users, n_recs)
    # add the algorithm name for analyzability
    recs['Algorithm'] = aname
    return recs

In [61]:
# Time ALS fitting on growing prefixes of the small training split.
# Despite its name, `test_size` holds fractions of the *training* rows to use.
test_size = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]
n_train_rows = len(train_df_small)

for i in test_size:
    # int() truncates, so the float product is rounded down.
    subset_rows = int(n_train_rows * i)
    algo_als = als.BiasedMF(features=100, reg=0.0001, iterations=15)
    print('test_size: ', subset_rows)
    all_recs = eval('ALS', algo_als, train_df_small.iloc[:subset_rows], val_df_small)

test_size:  8553
Total execution time: 0.19531989097595215 seconds
test_size:  17106
Total execution time: 0.241441011428833 seconds
test_size:  25659
Total execution time: 0.29753899574279785 seconds
test_size:  34212
Total execution time: 0.3342018127441406 seconds
test_size:  42765
Total execution time: 0.425494909286499 seconds
test_size:  51318
Total execution time: 0.49080991744995117 seconds
test_size:  59870
Total execution time: 0.6120648384094238 seconds
test_size:  68424
Total execution time: 0.6114380359649658 seconds
test_size:  76977
Total execution time: 0.6592328548431396 seconds
test_size:  85530
Total execution time: 0.7269601821899414 seconds


In [54]:
import itertools as it

# Grid-search the ALS hyperparameters: fit on the small training split and
# score every combination by mean nDCG on the validation split.
ranks = [50, 100, 200]
reg_params = [0.00001, 0.00005, 0.0001, 0.0005]
max_iters = [5, 10, 15]
params = it.product(ranks, reg_params, max_iters)

# Track the best combination so later cells do not silently evaluate whatever
# configuration happened to be tried last.
best_score, best_params = float('-inf'), None

for rank, regParam, maxIter in params:
    candidate = als.BiasedMF(features=rank, reg=regParam, iterations=maxIter)
    print('Hyperparameters) rank: ', rank, 'regParam: ', regParam, 'maxIter: ', maxIter)

    all_recs = eval('ALS', candidate, train_df_small, val_df_small)
    rla = topn.RecListAnalysis()
    rla.add_metric(topn.ndcg)
    results = rla.compute(all_recs, val_df_small)
    als_result = results.groupby('Algorithm').ndcg.mean()
    # The Series is indexed by algorithm label; look the value up by label
    # instead of relying on deprecated positional `series[0]` access.
    val_score = als_result.loc['ALS']
    print('val_score: ', val_score)
    print()

    if val_score > best_score:
        best_score, best_params = val_score, (rank, regParam, maxIter)

# Rebind `algo_als` to the winning configuration for the cells that follow.
if best_params is not None:
    best_rank, best_reg, best_iter = best_params
    algo_als = als.BiasedMF(features=best_rank, reg=best_reg, iterations=best_iter)
    print('Best hyperparameters) rank: ', best_rank, 'regParam: ', best_reg,
          'maxIter: ', best_iter, 'val_score: ', best_score)

Hyperparameters) rank:  50 regParam:  1e-05 maxIter:  5
Total execution time: 0.19615983963012695 seconds
val_score:  0.05726957549863248

Hyperparameters) rank:  50 regParam:  1e-05 maxIter:  10
Total execution time: 0.26905107498168945 seconds
val_score:  0.05175166653057153

Hyperparameters) rank:  50 regParam:  1e-05 maxIter:  15
Total execution time: 0.3735520839691162 seconds
val_score:  0.05386986055024137

Hyperparameters) rank:  50 regParam:  5e-05 maxIter:  5
Total execution time: 0.15253591537475586 seconds
val_score:  0.05665616716793309

Hyperparameters) rank:  50 regParam:  5e-05 maxIter:  10
Total execution time: 0.2769949436187744 seconds
val_score:  0.05371244997257696

Hyperparameters) rank:  50 regParam:  5e-05 maxIter:  15
Total execution time: 0.37346887588500977 seconds
val_score:  0.05363593369018333

Hyperparameters) rank:  50 regParam:  0.0001 maxIter:  5
Total execution time: 0.15373015403747559 seconds
val_score:  0.05166733517459215

Hyperparameters) rank:  

### Validation dataset

In [32]:
# Fit ALS on the small training split and recommend for the validation users.
all_recs = eval(
    aname='ALS',
    algo=algo_als,
    train=train_df_small,
    test=val_df_small,
)

Total execution time: 0.1879270076751709 seconds


In [30]:
# Score the recommendation lists with nDCG against the validation frame,
# then preview the first three rows of the resulting table.
rla = topn.RecListAnalysis()
rla.add_metric(topn.ndcg)
results = rla.compute(all_recs, val_df_small)
results.head(3)

Unnamed: 0_level_0,Unnamed: 1_level_0,nrecs,ndcg
Algorithm,user,Unnamed: 2_level_1,Unnamed: 3_level_1
ALS,580,100,0.047678
ALS,53,100,0.0
ALS,322,100,0.207109


In [50]:
# Mean nDCG per algorithm label. The Series is indexed by label, so the single
# value is taken with explicit positional access (`.iloc`) rather than the
# deprecated `series[0]` form.
a = results.groupby('Algorithm').ndcg.mean()
a.iloc[0]

0.0766641368587348

### Test dataset

In [35]:
# Fit ALS on the small training split and recommend for the test users.
all_recs = eval(
    aname='ALS',
    algo=algo_als,
    train=train_df_small,
    test=test_df_small,
)

Total execution time: 0.946160078048706 seconds


In [36]:
# Row counts: generated recommendations vs. test ratings.
tuple(len(frame) for frame in (all_recs, test_df_small))

(12200, 7877)

In [37]:
# Score the recommendation lists with nDCG against the test frame,
# then preview the first rows of the resulting table.
rla = topn.RecListAnalysis()
rla.add_metric(topn.ndcg)
results = rla.compute(all_recs, test_df_small)
results.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,nrecs,ndcg
Algorithm,user,Unnamed: 2_level_1,Unnamed: 3_level_1
ALS,496,100,0.387574
ALS,85,100,0.105765
ALS,251,100,0.185802
ALS,588,100,0.26236
ALS,78,100,0.253692


In [38]:
# Mean test nDCG per algorithm label.
results.groupby('Algorithm')['ndcg'].mean()

Algorithm
ALS    0.076414
Name: ndcg, dtype: float64

### Large dataset

In [40]:
# Give all three large splits the same column names; the `eval` helper in this
# notebook reads the `user` column of the frame it is given.
for split_df in (train_df_large, val_df_large, test_df_large):
    split_df.columns = ['user', 'item', 'rating', 'timestamp']

### Training helper

In [41]:
# ALS model for the large dataset: 100 latent features; every other setting is
# left at the library default.
algo_als = als.BiasedMF(features=100)

In [42]:
def eval(aname, algo, train, test, n_recs=100):
    """Fit a copy of ``algo`` on ``train`` and recommend for the users in ``test``.

    NOTE: this cell re-defines the ``eval`` helper that the small-dataset
    section of the notebook also defines, and it shadows Python's built-in
    ``eval``. The name is kept because other cells call it.

    Parameters
    ----------
    aname : str
        Label written to the ``'Algorithm'`` column of the result.
    algo
        Algorithm object. A clone made with ``util.clone`` and wrapped by
        ``Recommender.adapt`` is fitted, not ``algo`` itself.
    train
        Training data passed to ``fit``.
    test
        Object with a ``user`` column; recommendations are produced for its
        unique values.
    n_recs : int, default 100
        Length of the recommendation list requested per user.

    Returns
    -------
    The object returned by ``batch.recommend`` with an added ``'Algorithm'``
    entry set to ``aname``.
    """
    fittable = Recommender.adapt(util.clone(algo))

    # Measure only the fit step. perf_counter() is monotonic, so the measured
    # interval cannot be distorted by system clock adjustments.
    started = time.perf_counter()
    fittable.fit(train)
    elapsed = time.perf_counter() - started
    print("Total execution time: {} seconds".format(elapsed))

    users = test.user.unique()
    # now we run the recommender
    recs = batch.recommend(fittable, users, n_recs)
    # add the algorithm name for analyzability
    recs['Algorithm'] = aname
    return recs

### Validation dataset

In [43]:
# Fit ALS on the large training split and recommend for the validation users.
all_recs = eval(
    aname='ALS',
    algo=algo_als,
    train=train_df_large,
    test=val_df_large,
)

Total execution time: 307.5095829963684 seconds


In [44]:
# Row counts: generated recommendations vs. validation ratings.
tuple(len(frame) for frame in (all_recs, val_df_large))

(5398500, 2223502)

In [45]:
# Score the recommendation lists with nDCG against the large validation frame,
# then preview the first rows of the resulting table.
rla = topn.RecListAnalysis()
rla.add_metric(topn.ndcg)
results = rla.compute(all_recs, val_df_large)
results.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,nrecs,ndcg
Algorithm,user,Unnamed: 2_level_1,Unnamed: 3_level_1
ALS,471,100,0.043053
ALS,833,100,0.0
ALS,2659,100,0.061246
ALS,3997,100,0.0
ALS,4818,100,0.011212


In [46]:
# Mean validation nDCG per algorithm label.
results.groupby('Algorithm')['ndcg'].mean()

Algorithm
ALS    0.024968
Name: ndcg, dtype: float64

### Test dataset

In [47]:
# Fit ALS on the large training split and recommend for the test users.
all_recs = eval(
    aname='ALS',
    algo=algo_als,
    train=train_df_large,
    test=test_df_large,
)

Total execution time: 308.6373801231384 seconds


In [48]:
# Row counts: generated recommendations vs. test ratings.
tuple(len(frame) for frame in (all_recs, test_df_large))

(5400900, 2177556)

In [49]:
# Score the recommendation lists with nDCG against the large test frame,
# then preview the first rows of the resulting table.
rla = topn.RecListAnalysis()
rla.add_metric(topn.ndcg)
results = rla.compute(all_recs, test_df_large)
results.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,nrecs,ndcg
Algorithm,user,Unnamed: 2_level_1,Unnamed: 3_level_1
ALS,1591,100,0.0
ALS,1959,100,0.0
ALS,2142,100,0.0
ALS,3175,100,0.0
ALS,3749,100,0.0


In [50]:
# Mean test nDCG per algorithm label.
results.groupby('Algorithm')['ndcg'].mean()

Algorithm
ALS    0.024963
Name: ndcg, dtype: float64