## Hyperparameter Tuning for kNN Content-Based Filtering

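Before running this notebook, execute the following commands in a terminal:

<i> '$' pipenv shell </i>

<i> '$' pipenv install ipynb </i>

The `ipynb` package is required to import functions from other `.ipynb` notebook files.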


In [2]:
import pandas as pd
import numpy as np
import os
import sys
from knn_preprocessing import knn_preprocessing
#Import will run notebook and perform PCA
from ipynb.fs.full.ContentBasedFiltering import predict_movie_rating, test_predict_mr, get_data

PCA:
Main Variance impacting factors:
[0.13526531 0.09503161 0.07997255 0.06073348 0.04796419 0.0459749
 0.04306016 0.04134322 0.03749745 0.03703565 0.03502202 0.03362328
 0.03296001 0.031551   0.02949176 0.02847078 0.02764098 0.0235196
 0.02326318 0.01913025 0.01829368 0.01733844 0.01569124 0.01429026
 0.0129433  0.00832911 0.00456256]
14
works
RMSE: 0.8755066980912132
2.3070592880249023
works
RMSE: 0.819408853877708
20.770462036132812
works
RMSE: 0.8206629199854772
21.412940979003906


In [3]:
def kreuzvalidiere(n, folds, mean = False, knn_metric = 'cosine', set_k=False, k_neighbors=15, data=None):
    """Evaluate the content-based kNN predictor repeatedly and log every run.

    Calls ``test_predict_mr`` ``folds`` times with the same settings, prints
    the highest and lowest RMSE observed, and writes one row per run (sorted
    by RMSE, best first) to
    ``../../data/tuning/ContentbasedTuning_<lowest RMSE rounded to 4 digits>.csv``.

    NOTE(review): the runs are presumably independent random test samples
    drawn inside ``test_predict_mr`` rather than disjoint folds -- confirm
    against that function.

    Parameters
    ----------
    n : int
        Forwarded to ``test_predict_mr`` and logged as ``Testsize``.
    folds : int
        Number of evaluation runs; must be at least 1.
    mean, knn_metric, set_k, k_neighbors
        Forwarded unchanged to ``test_predict_mr`` and logged.
        ``k_neighbors`` is only logged when ``set_k`` is truthy.
    data : pandas.DataFrame, optional
        Data handed to ``test_predict_mr``. When omitted, ``get_data()`` is
        called at call time (not once at definition time), so the default is
        never a shared object that an earlier run could have modified.

    Raises
    ------
    ValueError
        If ``folds`` is smaller than 1 (there would be no RMSE to report).
    """
    if folds < 1:
        raise ValueError("folds must be at least 1, got " + str(folds))
    if data is None:
        data = get_data()

    # A list (not a set) keeps the CSV column order deterministic.
    column_order = ['RMSE', 'Testsize', 'Fold', 'mean', 'knn_metric', 'set_k', 'k_neighbors', 'data']
    data_columns = str(data.columns)  # identical for every run, so compute once

    # Collect plain records and build the frame once instead of growing it row by row.
    records = []
    for i in range(folds):
        rmse = test_predict_mr(n, mean, knn_metric, set_k, k_neighbors, data)
        records.append({
            'RMSE': rmse,
            'Testsize': n,
            'Fold': str(i),
            'mean': mean,
            'knn_metric': knn_metric,
            'set_k': set_k,
            # Left empty when k is not fixed; the all-empty column is dropped below.
            'k_neighbors': k_neighbors if set_k else None,
            'data': data_columns,
        })

    # Derive the bounds from the observed values rather than from magic start values.
    rmse_values = [float(record['RMSE']) for record in records]
    upper = max(rmse_values)
    lower = min(rmse_values)

    print('RMSE upper Bound: '+str(upper))
    print('RMSE lower Bound: '+str(lower))

    #build output csv
    doku = pd.DataFrame(records, columns=column_order)
    doku = doku.sort_values(by=['RMSE'], ascending=True)
    doku = doku.dropna(how='all', axis=1)
    output_dir = '../../data/tuning'
    # Make sure a long run is not lost because the target folder is missing.
    os.makedirs(output_dir, exist_ok=True)
    doku.to_csv(output_dir+'/ContentbasedTuning_'+str(round(lower,4))+'.csv')

## Tuning for Mean() vs. distance-weighted rating

In [6]:
# Small, fast run with mean=False.
n, folds = 100, 4
print(f"Testing for n= {n}, Mean: False, Folds: {folds}")
kreuzvalidiere(n, folds, mean=False)

Testing for n= 100, Mean: False, Folds: 4
RMSE: 0.8600161357792899
RMSE: 0.7211986368173233
RMSE: 0.8312425713259929
RMSE: 0.834051301360322
RMSE upper Bound: 0.8600161357792899
RMSE lower Bound: 0.7211986368173233


In [4]:
# Full-size run with mean=False.
n, folds = 25000, 10
print(f"Testing for n= {n}, Mean: False, Folds: {folds}")
kreuzvalidiere(n, folds, mean=False)

Testing for n= 25000, Mean: False, Folds: 10
RMSE: 0.7998149236829937
RMSE: 0.8160378639300775
RMSE: 0.8073930425258502
RMSE: 0.8069565348965053
RMSE: 0.810005601122022
RMSE: 0.8112758113273519
RMSE: 0.8099375545148093
RMSE: 0.8048459046791226
RMSE: 0.8032283724130602
RMSE: 0.8078172426787046
RMSE upper Bound: 0.8160378639300775
RMSE lower Bound: 0.7998149236829937


In [7]:
# Full-size run with mean=True, for comparison with the mean=False run.
n, folds = 25000, 10
print(f"Testing for n= {n}, Mean: True, Folds: {folds}")
kreuzvalidiere(n, folds, mean=True)

Testing for n= 25000, Mean: True, Folds: 10
RMSE: 0.8112716543249315
RMSE: 0.8138867705850326
RMSE: 0.8151026948932399
RMSE: 0.8112323118427778
RMSE: 0.8072462801263179
RMSE: 0.8242223840957272
RMSE: 0.8072125771925124
RMSE: 0.8099465640124388
RMSE: 0.8160387040113014
RMSE: 0.8126307130411561
RMSE upper Bound: 0.8242223840957272
RMSE lower Bound: 0.8072125771925124


## Tuning for knn_metric

In [5]:
# Small, fast run with the cosine metric.
n, folds = 100, 4
print(f"Testing for n= {n}, Metric: 'cosine', Folds: {folds}")
kreuzvalidiere(n, folds, knn_metric='cosine')

Testing for n= 100, Metric: 'cosine', Folds: 4
RMSE: 0.8004467547482733
RMSE: 0.9800473258505924
RMSE: 0.8859917601047976
RMSE: 0.7667533899233383
RMSE upper Bound: 0.9800473258505924
RMSE lower Bound: 0.7667533899233383


In [None]:
# Full-size run with the cosine metric.
n, folds = 25000, 10
print(f"Testing for n= {n}, Metric: 'cosine', Folds: {folds}")
kreuzvalidiere(n, folds, knn_metric='cosine')

In [None]:
# Full-size run with the minkowski metric.
n, folds = 25000, 10
print(f"Testing for n= {n}, Metric: 'minkowski', Folds: {folds}")
kreuzvalidiere(n, folds, knn_metric='minkowski')

## Tuning for k_neighbors

In [None]:
# Small, fast run without a fixed k (set_k=False).
n, folds = 100, 4
print(f"Testing for n= {n}, set_k: False, Folds: {folds}")
kreuzvalidiere(n, folds, set_k=False)

In [None]:
# Full-size run without a fixed k (set_k=False).
n, folds = 25000, 10
print(f"Testing for n= {n}, set_k: False, Folds: {folds}")
kreuzvalidiere(n, folds, set_k=False)

In [None]:
# Sweep fixed neighbourhood sizes k = 4 .. 9.
n = 25000
folds = 10
for i in range(4,10):
    # Separators added: the message used to read "set_k: True,k= 4Folds: 10".
    print(f"Testing for n= {n}, set_k: True, k= {i}, Folds: {folds}")
    kreuzvalidiere(n, folds, set_k=True, k_neighbors=i)

In [None]:
# Sweep fixed neighbourhood sizes k = 10 .. 19.
n = 25000
folds = 10
for i in range(10,20):
    # Separators added: the message used to read "set_k: True,k= 10Folds: 10".
    print(f"Testing for n= {n}, set_k: True, k= {i}, Folds: {folds}")
    kreuzvalidiere(n, folds, set_k=True, k_neighbors=i)

In [None]:
# Sweep fixed neighbourhood sizes k = 20 .. 29.
n = 25000
folds = 10
for i in range(20,30):
    # Separators added: the message used to read "set_k: True,k= 20Folds: 10".
    print(f"Testing for n= {n}, set_k: True, k= {i}, Folds: {folds}")
    kreuzvalidiere(n, folds, set_k=True, k_neighbors=i)

In [3]:
# Testing for k = sqrt(n): for this run the tested function was changed to use int(sqrt()).
n = 25000
folds = 10
# Separator added: the message used to read "k= sqrt(n)Folds: 10".
print(f"Testing for n= {n}, set_k: False, k= sqrt(n), Folds: {folds}")
kreuzvalidiere(n, folds, set_k=False)

Testing for n= 25000, set_k: False, k= sqrt(n)Folds: 10
RMSE: 0.8108819503594531
RMSE: 0.8139425257633317
RMSE: 0.8104578954533992
RMSE: 0.8180258913592309
RMSE: 0.8112767943430366
RMSE: 0.8133678078371134
RMSE: 0.8160648537251654
RMSE: 0.8053924750845168
RMSE: 0.8137311172247427
RMSE: 0.813224486801822
RMSE upper Bound: 0.8180258913592309
RMSE lower Bound: 0.8053924750845168


## Tuning for input features

In [5]:
# Load the full data set once; the feature-subset cells below select columns from it.
init_data = get_data()

In [8]:
# Testing top 7 features of PCA without genres
n, folds = 25000, 6
# user_id / imdbID / rating first, then the feature columns under test.
data = init_data[[
    'user_id', 'imdbID', 'rating',
    'imdbRating', 'Metacritic', 'RottenTomatoes', 'Awards',
    'imdbVotes', 'Runtime', 'Year',
]]
print(f"Testing for n= {n}, Features: ['imdbRating','Metacritic','RottenTomatoes','Awards','imdbVotes','Runtime','Year'], Folds: {folds}")
kreuzvalidiere(n, folds, set_k=False, data=data)

Testing for n= 25000, Features: ['imdbRating','Metacritic','RottenTomatoes','Awards','imdbVotes','Runtime','Year'], Folds: 6
Index(['working with data: user_id', 'working with data: imdbID',
       'working with data: rating', 'working with data: imdbRating',
       'working with data: Metacritic', 'working with data: RottenTomatoes',
       'working with data: Awards', 'working with data: imdbVotes',
       'working with data: Runtime', 'working with data: Year'],
      dtype='object')
RMSE: 0.8081294943256773
Index(['working with data: user_id', 'working with data: imdbID',
       'working with data: rating', 'working with data: imdbRating',
       'working with data: Metacritic', 'working with data: RottenTomatoes',
       'working with data: Awards', 'working with data: imdbVotes',
       'working with data: Runtime', 'working with data: Year'],
      dtype='object')
RMSE: 0.8108374506167532
Index(['working with data: user_id', 'working with data: imdbID',
       'working with data

In [10]:
# Testing top 6 features of PCA
n, folds = 25000, 6
# user_id / imdbID / rating first, then the feature columns under test.
data = init_data[[
    'user_id', 'imdbID', 'rating',
    'imdbRating', 'Metacritic', 'RottenTomatoes', 'Awards',
    'imdbVotes', 'Runtime',
]]
print(f"Testing for n= {n}, Features: ['imdbRating','Metacritic','RottenTomatoes','Awards','imdbVotes','Runtime'], Folds: {folds}")
kreuzvalidiere(n, folds, set_k=False, data=data)

Testing for n= 25000, Features: ['imdbRating','Metacritic','RottenTomatoes','Awards','imdbVotes','Runtime'], Folds: 6
Index(['working with data: user_id', 'working with data: imdbID',
       'working with data: rating', 'working with data: imdbRating',
       'working with data: Metacritic', 'working with data: RottenTomatoes',
       'working with data: Awards', 'working with data: imdbVotes',
       'working with data: Runtime'],
      dtype='object')
RMSE: 0.8042105076268059
Index(['working with data: user_id', 'working with data: imdbID',
       'working with data: rating', 'working with data: imdbRating',
       'working with data: Metacritic', 'working with data: RottenTomatoes',
       'working with data: Awards', 'working with data: imdbVotes',
       'working with data: Runtime'],
      dtype='object')
RMSE: 0.8161203153924569
Index(['working with data: user_id', 'working with data: imdbID',
       'working with data: rating', 'working with data: imdbRating',
       'working w

In [11]:
# Testing top 5 features of PCA
n, folds = 25000, 6
# user_id / imdbID / rating first, then the feature columns under test.
data = init_data[[
    'user_id', 'imdbID', 'rating',
    'imdbRating', 'Metacritic', 'RottenTomatoes', 'Awards',
    'imdbVotes',
]]
print(f"Testing for n= {n}, Features: ['imdbRating','Metacritic','RottenTomatoes','Awards','imdbVotes'], Folds: {folds}")
kreuzvalidiere(n, folds, set_k=False, data=data)

Testing for n= 25000, Features: ['imdbRating','Metacritic','RottenTomatoes','Awards','imdbVotes'], Folds: 6
Index(['working with data: user_id', 'working with data: imdbID',
       'working with data: rating', 'working with data: imdbRating',
       'working with data: Metacritic', 'working with data: RottenTomatoes',
       'working with data: Awards', 'working with data: imdbVotes'],
      dtype='object')
RMSE: 0.8094617085591249
Index(['working with data: user_id', 'working with data: imdbID',
       'working with data: rating', 'working with data: imdbRating',
       'working with data: Metacritic', 'working with data: RottenTomatoes',
       'working with data: Awards', 'working with data: imdbVotes'],
      dtype='object')
RMSE: 0.8051284849885267
Index(['working with data: user_id', 'working with data: imdbID',
       'working with data: rating', 'working with data: imdbRating',
       'working with data: Metacritic', 'working with data: RottenTomatoes',
       'working with dat

In [6]:
# Testing top 4 features of PCA (open question: overfitting?)
n, folds = 25000, 6
# user_id / imdbID / rating first, then the feature columns under test.
data = init_data[[
    'user_id', 'imdbID', 'rating',
    'imdbRating', 'Metacritic', 'RottenTomatoes', 'Awards',
]]
print(f"Testing for n= {n}, Features: ['imdbRating','Metacritic','RottenTomatoes','Awards'], Folds: {folds}")
kreuzvalidiere(n, folds, set_k=False, data=data)

Testing for n= 25000, Features: ['imdbRating','Metacritic','RottenTomatoes','Awards'], Folds: 6
works
RMSE: 0.8307948652454795
works
RMSE: 0.8264566029741917
works
RMSE: 0.8306611990286983
works
RMSE: 0.8273709966180487
works
RMSE: 0.8247370870378248
works
RMSE: 0.8235210820282983
RMSE upper Bound: 0.8307948652454795
RMSE lower Bound: 0.8235210820282983


In [None]:
# Testing top 3 features of PCA (open question: overfitting?)
n, folds = 25000, 6
# user_id / imdbID / rating first, then the feature columns under test.
data = init_data[[
    'user_id', 'imdbID', 'rating',
    'imdbRating', 'Metacritic', 'RottenTomatoes',
]]
print(f"Testing for n= {n}, Features: ['imdbRating','Metacritic','RottenTomatoes'], Folds: {folds}")
kreuzvalidiere(n, folds, set_k=False, data=data)

In [None]:
# Test only genres: remove the seven non-genre feature columns from the full data set.
n, folds = 25000, 6
non_genre_features = [
    'imdbRating', 'Metacritic', 'RottenTomatoes', 'Awards',
    'imdbVotes', 'Runtime', 'Year',
]
data = init_data.drop(columns=non_genre_features)
print(data)
print(f"Testing for n= {n}, Features: ['Genres'], Folds: {folds}")
kreuzvalidiere(n, folds, set_k=False, data=data)

In [9]:
# Test all: run on the unmodified data set with every column.
n, folds = 25000, 6
print(f"Testing for n= {n}, Features: ['All'], Folds: {folds}")
print(init_data)
kreuzvalidiere(n, folds, set_k=False, data=init_data)

Testing for n= 25000, Features: ['All'], Folds: 6
        user_id    imdbID  rating    Year  Runtime  imdbRating  imdbVotes  \
0        1264.0   47034.0     3.5  1954.0     96.0         7.6    27485.0   
1         981.0   47034.0     3.5  1954.0     96.0         7.6    27485.0   
2         481.0   47034.0     1.0  1954.0     96.0         7.6    27485.0   
3          98.0   47034.0     2.5  1954.0     96.0         7.6    27485.0   
4         249.0   47034.0     4.0  1954.0     96.0         7.6    27485.0   
...         ...       ...     ...     ...      ...         ...        ...   
787536    243.0   47376.0     3.0  1954.0    100.0         6.5     1813.0   
787537    417.0   43132.0     4.0  1950.0     95.0         7.6     7217.0   
787538    379.0   81433.0     3.0  1980.0    106.0         5.2     1048.0   
787539    279.0  295480.0     1.0  2001.0     95.0         6.5      156.0   
787540    312.0  298072.0     4.0  2001.0     70.0         7.0       70.0   

        RottenTomatoes  M