## Hyperparameter Tuning for kNN Content-Based Filtering

Run:

`pipenv shell`

`pipenv install ipynb`

so that functions can be imported from other `.ipynb` notebook files.


In [1]:
import pandas as pd
import numpy as np
import os
import sys
from knn_preprocessing import knn_preprocessing
#Import will run notebook and perform PCA
from ipynb.fs.full.ContentBasedFiltering import predict_movie_rating, test_predict_mr, get_data

PCA:
Main Variance impacting factors:
[0.1352569  0.09503712 0.07998231 0.06074159 0.04796725 0.04597668
 0.0430597  0.04134133 0.03750373 0.03703565 0.03502601 0.0336219
 0.03295991 0.03155028 0.0294954  0.02847374 0.02764069 0.02351684
 0.02326137 0.01913027 0.01828914 0.01731806 0.0156969  0.01428033
 0.01294528 0.00832974 0.0045619 ]
     weights        features  abs_weights
2   0.460198      imdbRating     0.460198
5   0.428491      Metacritic     0.428491
4   0.422208  RottenTomatoes     0.422208
6   0.352119          Awards     0.352119
3   0.305617       imdbVotes     0.305617
1   0.277951         Runtime     0.277951
14  0.209256               7     0.209256
0  -0.148325            Year     0.148325
11 -0.143027               4     0.143027
25  0.120398              18     0.120398
17 -0.086768              10     0.086768
12  0.083021               5     0.083021
16  0.067442               9     0.067442
18  0.049997              11     0.049997
22 -0.047021              15  

In [2]:
def kreuzvalidiere(n, folds, mean = False, knn_metric = 'cosine', set_k=False, k_neighbors=15, data=get_data()):
    """Repeat ``test_predict_mr`` ``folds`` times and log every RMSE to a CSV file.

    Each repetition calls ``test_predict_mr(n, mean, knn_metric, set_k, k_neighbors)``
    and records the returned RMSE together with the settings that were used.
    The largest and smallest RMSE are printed, and the log (sorted by RMSE,
    ascending, all-empty columns removed) is written to
    ``../../data/tuning/ContentbasedTuning_<lowest RMSE rounded to 4 digits>.csv``.

    Parameters
    ----------
    n :
        Test size; forwarded to ``test_predict_mr`` and logged as ``Testsize``.
    folds : int
        Number of repetitions; must be at least 1.
    mean, knn_metric, set_k, k_neighbors :
        Forwarded unchanged to ``test_predict_mr``. ``k_neighbors`` is only
        logged when ``set_k`` is truthy.
    data :
        Object with a ``.columns`` attribute; only ``str(data.columns)`` is logged.
        NOTE(review): ``data`` is not passed on to ``test_predict_mr``, so it
        presumably has no influence on the measured RMSE -- confirm whether the
        feature-subset experiments need it forwarded.
        The default ``get_data()`` is evaluated once, when this function is defined.

    Raises
    ------
    ValueError
        If ``folds`` is smaller than 1 (there would be no RMSE to report).
    """
    if folds < 1:
        raise ValueError('folds must be at least 1, got ' + str(folds))

    # A list (not a set) keeps the CSV column order identical between runs.
    columns = ['RMSE', 'Testsize', 'Fold', 'mean', 'knn_metric', 'set_k', 'k_neighbors', 'data']
    data_columns = str(data.columns)

    # Collect one record per repetition and build the frame once at the end
    # instead of enlarging it row by row.
    records = []
    for i in range(folds):
        rmse = float(test_predict_mr(n, mean, knn_metric, set_k, k_neighbors))
        record = {
            'RMSE': rmse,
            'Testsize': n,
            'Fold': str(i),
            'mean': mean,
            'knn_metric': knn_metric,
            'set_k': set_k,
            'data': data_columns,
        }
        if set_k:
            record['k_neighbors'] = k_neighbors
        records.append(record)

    # Derive the bounds from the measured values rather than from sentinel
    # start values, which would be wrong for RMSEs outside the assumed range.
    rmse_values = [record['RMSE'] for record in records]
    upper = max(rmse_values)
    lower = min(rmse_values)

    print('RMSE upper Bound: '+str(upper))
    print('RMSE lower Bound: '+str(lower))

    #build output csv
    doku = pd.DataFrame(records, columns=columns)
    doku = doku.sort_values(by=['RMSE'], ascending=True)
    # Removes 'k_neighbors' when it was never filled (set_k falsy).
    doku = doku.dropna(how='all', axis=1)

    # Create the target directory up front so a long run is not lost on write.
    out_dir = '../../data/tuning'
    os.makedirs(out_dir, exist_ok=True)
    doku.to_csv(out_dir+'/ContentbasedTuning_'+str(round(lower,4))+'.csv')

## Tuning for Mean() vs. distance-weighted rating

In [6]:
# Quick run with mean=False on a small test size.
n, folds = 100, 4
print("Testing for n= ", n, ", Mean: False", ", Folds: ", folds, sep="")
kreuzvalidiere(n, folds, mean=False)

Testing for n= 100, Mean: False, Folds: 4
RMSE: 0.8600161357792899
RMSE: 0.7211986368173233
RMSE: 0.8312425713259929
RMSE: 0.834051301360322
RMSE upper Bound: 0.8600161357792899
RMSE lower Bound: 0.7211986368173233


In [4]:
# Full-size run with mean=False.
n, folds = 25000, 10
print("Testing for n= ", n, ", Mean: False", ", Folds: ", folds, sep="")
kreuzvalidiere(n, folds, mean=False)

Testing for n= 25000, Mean: False, Folds: 10
RMSE: 0.7998149236829937
RMSE: 0.8160378639300775
RMSE: 0.8073930425258502
RMSE: 0.8069565348965053
RMSE: 0.810005601122022
RMSE: 0.8112758113273519
RMSE: 0.8099375545148093
RMSE: 0.8048459046791226
RMSE: 0.8032283724130602
RMSE: 0.8078172426787046
RMSE upper Bound: 0.8160378639300775
RMSE lower Bound: 0.7998149236829937


In [7]:
# Full-size run with mean=True, for comparison with the mean=False run.
n, folds = 25000, 10
print("Testing for n= ", n, ", Mean: True", ", Folds: ", folds, sep="")
kreuzvalidiere(n, folds, mean=True)

Testing for n= 25000, Mean: True, Folds: 10
RMSE: 0.8112716543249315
RMSE: 0.8138867705850326
RMSE: 0.8151026948932399
RMSE: 0.8112323118427778
RMSE: 0.8072462801263179
RMSE: 0.8242223840957272
RMSE: 0.8072125771925124
RMSE: 0.8099465640124388
RMSE: 0.8160387040113014
RMSE: 0.8126307130411561
RMSE upper Bound: 0.8242223840957272
RMSE lower Bound: 0.8072125771925124


## Tuning for knn_metric

In [5]:
# Quick run with the 'cosine' metric on a small test size.
n, folds = 100, 4
print("Testing for n= ", n, ", Metric: 'cosine'", ", Folds: ", folds, sep="")
kreuzvalidiere(n, folds, knn_metric='cosine')

Testing for n= 100, Metric: 'cosine', Folds: 4
RMSE: 0.8004467547482733
RMSE: 0.9800473258505924
RMSE: 0.8859917601047976
RMSE: 0.7667533899233383
RMSE upper Bound: 0.9800473258505924
RMSE lower Bound: 0.7667533899233383


In [None]:
# Full-size run with the 'cosine' metric.
n, folds = 25000, 10
print("Testing for n= ", n, ", Metric: 'cosine'", ", Folds: ", folds, sep="")
kreuzvalidiere(n, folds, knn_metric='cosine')

In [None]:
# Full-size run with the 'minkowski' metric.
n, folds = 25000, 10
print("Testing for n= ", n, ", Metric: 'minkowski'", ", Folds: ", folds, sep="")
kreuzvalidiere(n, folds, knn_metric='minkowski')

## Tuning for k_neighbors

In [None]:
# Quick run with set_k=False on a small test size.
n, folds = 100, 4
print("Testing for n= ", n, ", set_k: False", ", Folds: ", folds, sep="")
kreuzvalidiere(n, folds, set_k=False)

In [None]:
# Full-size run with set_k=False.
n, folds = 25000, 10
print("Testing for n= ", n, ", set_k: False", ", Folds: ", folds, sep="")
kreuzvalidiere(n, folds, set_k=False)

In [None]:
# Sweep fixed neighbourhood sizes k = 4..9; every k gets its own run of kreuzvalidiere.
n = 25000
folds = 10
for i in range(4,10):
    # Separators added so the header no longer reads "set_k: True,k= 4Folds: 10".
    print("Testing for n= "+str(n)+", set_k: True, k= "+str(i)+", Folds: "+str(folds))
    kreuzvalidiere(n, folds, set_k=True, k_neighbors=i)

In [None]:
# Sweep fixed neighbourhood sizes k = 10..19; every k gets its own run of kreuzvalidiere.
n = 25000
folds = 10
for i in range(10,20):
    # Separators added so the header no longer reads "set_k: True,k= 10Folds: 10".
    print("Testing for n= "+str(n)+", set_k: True, k= "+str(i)+", Folds: "+str(folds))
    kreuzvalidiere(n, folds, set_k=True, k_neighbors=i)

In [None]:
# Sweep fixed neighbourhood sizes k = 20..29; every k gets its own run of kreuzvalidiere.
n = 25000
folds = 10
for i in range(20,30):
    # Separators added so the header no longer reads "set_k: True,k= 20Folds: 10".
    print("Testing for n= "+str(n)+", set_k: True, k= "+str(i)+", Folds: "+str(folds))
    kreuzvalidiere(n, folds, set_k=True, k_neighbors=i)

In [3]:
# Testing k = sqrt(n). Per the original note, the function under test was
# changed to use int(sqrt()) for this run, so the result depends on that edit.
n, folds = 25000, 10
print("Testing for n= ", n, ", set_k: False, k= sqrt(n)", "Folds: ", folds, sep="")
kreuzvalidiere(n, folds, set_k=False)

Testing for n= 25000, set_k: False, k= sqrt(n)Folds: 10
RMSE: 0.8108819503594531
RMSE: 0.8139425257633317
RMSE: 0.8104578954533992
RMSE: 0.8180258913592309
RMSE: 0.8112767943430366
RMSE: 0.8133678078371134
RMSE: 0.8160648537251654
RMSE: 0.8053924750845168
RMSE: 0.8137311172247427
RMSE: 0.813224486801822
RMSE upper Bound: 0.8180258913592309
RMSE lower Bound: 0.8053924750845168


## Tuning for input features

In [3]:
# Load the data once via get_data(); the feature-tuning cells below select
# column subsets from this frame.
init_data = get_data()

In [5]:
# Top 7 features by PCA weight, genre columns left out.
# NOTE(review): kreuzvalidiere, as defined in this notebook, only logs
# data.columns and does not forward `data` to test_predict_mr -- confirm that
# the selected feature subset actually influences the measured RMSE.
n, folds = 25000, 6
data = init_data[[
    'user_id', 'imdbID', 'rating',
    'imdbRating', 'Metacritic', 'RottenTomatoes', 'Awards', 'imdbVotes', 'Runtime', 'Year',
]]
print("Testing for n= ", n, ", Features: ['imdbRating','Metacritic','RottenTomatoes','Awards','imdbVotes','Runtime','Year'],", " Folds: ", folds, sep="")
kreuzvalidiere(n, folds, set_k=False, data=data)

Testing for n= 25000, Features: ['imdbRating','Metacritic','RottenTomatoes','Awards','imdbVotes','Runtime','Year'], Folds: 6
RMSE: 0.8156907087952867
RMSE: 0.8147850340424856
RMSE: 0.8117248365242943
RMSE: 0.8136016717943373
RMSE: 0.8143924548155408
RMSE: 0.8018403567997943
RMSE upper Bound: 0.8156907087952867
RMSE lower Bound: 0.8018403567997943


In [6]:
# Top 6 features by PCA weight.
n, folds = 25000, 6
data = init_data[[
    'user_id', 'imdbID', 'rating',
    'imdbRating', 'Metacritic', 'RottenTomatoes', 'Awards', 'imdbVotes', 'Runtime',
]]
print("Testing for n= ", n, ", Features: ['imdbRating','Metacritic','RottenTomatoes','Awards','imdbVotes','Runtime'],", " Folds: ", folds, sep="")
kreuzvalidiere(n, folds, set_k=False, data=data)

Testing for n= 25000, Features: ['imdbRating','Metacritic','RottenTomatoes','Awards','imdbVotes','Runtime'], Folds: 6
RMSE: 0.8078299659164158
RMSE: 0.8127006685318212
RMSE: 0.8076608812985635
RMSE: 0.8110006929519168
RMSE: 0.808547387424126
RMSE: 0.8134418866779114
RMSE upper Bound: 0.8134418866779114
RMSE lower Bound: 0.8076608812985635


In [7]:
# Top 5 features by PCA weight.
n, folds = 25000, 6
data = init_data[[
    'user_id', 'imdbID', 'rating',
    'imdbRating', 'Metacritic', 'RottenTomatoes', 'Awards', 'imdbVotes',
]]
print("Testing for n= ", n, ", Features: ['imdbRating','Metacritic','RottenTomatoes','Awards','imdbVotes'],", " Folds: ", folds, sep="")
kreuzvalidiere(n, folds, set_k=False, data=data)

Testing for n= 25000, Features: ['imdbRating','Metacritic','RottenTomatoes','Awards','imdbVotes'], Folds: 6
RMSE: 0.8126204306820846
RMSE: 0.8124221823716061
RMSE: 0.8136117042462693
RMSE: 0.8143337476468395
RMSE: 0.8065899737812474
RMSE: 0.8109207028577036
RMSE upper Bound: 0.8143337476468395
RMSE lower Bound: 0.8065899737812474


In [8]:
# Top 4 features by PCA weight. Open question from the original note: overfitting?
n, folds = 25000, 6
data = init_data[[
    'user_id', 'imdbID', 'rating',
    'imdbRating', 'Metacritic', 'RottenTomatoes', 'Awards',
]]
print("Testing for n= ", n, ", Features: ['imdbRating','Metacritic','RottenTomatoes','Awards'],", " Folds: ", folds, sep="")
kreuzvalidiere(n, folds, set_k=False, data=data)

Testing for n= 25000, Features: ['imdbRating','Metacritic','RottenTomatoes','Awards'], Folds: 6
RMSE: 0.8067276469596564
RMSE: 0.8157612957334184
RMSE: 0.8051272572311773
RMSE: 0.8052960462359586
RMSE: 0.8132469435857358
RMSE: 0.8060394144318678
RMSE upper Bound: 0.8157612957334184
RMSE lower Bound: 0.8051272572311773


In [9]:
# Top 3 features by PCA weight. Open question from the original note: overfitting?
n, folds = 25000, 6
data = init_data[[
    'user_id', 'imdbID', 'rating',
    'imdbRating', 'Metacritic', 'RottenTomatoes',
]]
print("Testing for n= ", n, ", Features: ['imdbRating','Metacritic','RottenTomatoes'],", " Folds: ", folds, sep="")
kreuzvalidiere(n, folds, set_k=False, data=data)

Testing for n= 25000, Features: ['imdbRating','Metacritic','RottenTomatoes'], Folds: 6
RMSE: 0.8186969891387754
RMSE: 0.8117342636926391
RMSE: 0.813796849062147
RMSE: 0.8114068295449239
RMSE: 0.8099268368368102
RMSE: 0.8080272028868583
RMSE upper Bound: 0.8186969891387754
RMSE lower Bound: 0.8080272028868583


In [10]:
# Genres only: remove the seven non-genre feature columns from the full frame.
data = init_data.drop(columns=[
    'imdbRating', 'Metacritic', 'RottenTomatoes', 'Awards', 'imdbVotes', 'Runtime', 'Year',
])
n, folds = 25000, 6
print("Testing for n= ", n, ", Features: ['Genres'],", " Folds: ", folds, sep="")
kreuzvalidiere(n, folds, set_k=False, data=data)

Testing for n= 25000, Features: ['Genres'], Folds: 6
RMSE: 0.8164999159709324
RMSE: 0.8068043484836593
RMSE: 0.8096442497202567
RMSE: 0.8119043229662339
RMSE: 0.8129717864334833
RMSE: 0.8097118192083863
RMSE upper Bound: 0.8164999159709324
RMSE lower Bound: 0.8068043484836593


In [14]:
# All features: pass the complete frame and print it for reference.
n, folds = 25000, 10
print("Testing for n= ", n, ", Features: ['All'],", " Folds: ", folds, sep="")
print(init_data)
kreuzvalidiere(n, folds, set_k=False, data=init_data)

Testing for n= 25000, Features: ['All'], Folds: 10
        user_id    imdbID  rating    Year  Runtime  imdbRating  imdbVotes  \
0        1264.0   47034.0     3.5  1954.0     96.0         7.6    27485.0   
1         981.0   47034.0     3.5  1954.0     96.0         7.6    27485.0   
2         481.0   47034.0     1.0  1954.0     96.0         7.6    27485.0   
3          98.0   47034.0     2.5  1954.0     96.0         7.6    27485.0   
4         249.0   47034.0     4.0  1954.0     96.0         7.6    27485.0   
...         ...       ...     ...     ...      ...         ...        ...   
787536    243.0   47376.0     3.0  1954.0    100.0         6.5     1813.0   
787537    417.0   43132.0     4.0  1950.0     95.0         7.6     7217.0   
787538    379.0   81433.0     3.0  1980.0    106.0         5.2     1048.0   
787539    279.0  295480.0     1.0  2001.0     95.0         6.5      156.0   
787540    312.0  298072.0     4.0  2001.0     70.0         7.0       70.0   

        RottenTomatoes  