## Hyperparameter Tuning for kNN ContentBasedFiltering

Run: 

<i> '$' pipenv shell </i>

<i> '$' pipenv install ipynb </i>

so that functions can be imported from other `.ipynb` files.


In [24]:
import pandas as pd
import numpy as np
import os
import sys
from knn_preprocessing import knn_preprocessing
#Import will run notebook and perform PCA
from ipynb.fs.full.ContentBasedFiltering import predict_movie_rating, test_predict_mr, get_data

In [31]:
def kreuzvalidiere(n, folds, mean=False, knn_metric='cosine', set_k=False, k_neighbors=15, data=None):
    """Repeat ``test_predict_mr`` ``folds`` times and log every run to a CSV file.

    Each repetition calls ``test_predict_mr(n, mean, knn_metric, set_k,
    k_neighbors)`` and treats the returned value as that run's RMSE. The
    smallest and largest RMSE are printed, and one row per run is written to
    ``../../data/tuning/ContentbasedTuning_<lowest RMSE, 4 decimals>.csv``,
    sorted by RMSE in ascending order.

    Args:
        n: Forwarded to ``test_predict_mr``; logged in the ``Testsize`` column.
        folds: Number of repetitions; must be at least 1.
        mean: Forwarded unchanged to ``test_predict_mr`` and logged.
        knn_metric: Forwarded unchanged to ``test_predict_mr`` and logged.
        set_k: Forwarded unchanged to ``test_predict_mr`` and logged.
        k_neighbors: Forwarded unchanged to ``test_predict_mr``; only logged
            when ``set_k`` is truthy.
        data: Frame whose ``columns`` are logged in the ``data`` column.
            Defaults to ``get_data()``, loaded at call time.
            NOTE(review): ``data`` is only logged here; it is not passed on
            to ``test_predict_mr``.

    Raises:
        ValueError: If ``folds`` is smaller than 1.
    """
    if folds < 1:
        raise ValueError('folds must be at least 1, got ' + str(folds))

    # Resolve the default lazily: a call in the signature would run once at
    # definition time and be shared by every later call.
    if data is None:
        data = get_data()
    feature_columns = str(data.columns)

    # Start from +/- infinity so the bounds are correct for any RMSE value.
    upper = float('-inf')
    lower = float('inf')

    records = []
    for i in range(folds):
        rmse = test_predict_mr(n, mean, knn_metric, set_k, k_neighbors)
        records.append({
            'RMSE': rmse,
            'Testsize': n,
            'Fold': str(i),
            'mean': mean,
            'knn_metric': knn_metric,
            'set_k': set_k,
            # Left empty when k is not fixed, so the column is dropped below.
            'k_neighbors': k_neighbors if set_k else None,
            'data': feature_columns,
        })

        if float(rmse) < float(lower):
            lower = rmse

        if float(rmse) > float(upper):
            upper = rmse

    print('RMSE upper Bound: '+str(upper))
    print('RMSE lower Bound: '+str(lower))

    # Build the output CSV. An explicit list keeps the column order stable
    # (a set has no defined order).
    doku = pd.DataFrame(
        records,
        columns=['RMSE', 'Testsize', 'Fold', 'mean', 'knn_metric', 'set_k', 'k_neighbors', 'data'],
    )
    doku = doku.sort_values(by=['RMSE'], ascending=True)
    # Drop columns that are entirely empty (e.g. k_neighbors when set_k is falsy).
    doku = doku.dropna(how='all', axis=1)
    doku.to_csv('../../data/tuning/ContentbasedTuning_'+str(round(lower,4))+'.csv')

## Tuning for Mean() vs. distance-weighted rating

In [32]:
# Quick run with mean=False: test size 100, 4 repetitions.
n, folds = 100, 4
print("Testing for n= " + str(n) + ", Mean: False" + ", Folds: " + str(folds))
kreuzvalidiere(n, folds, mean=False)

Testing for n= 100, Mean: False, Folds: 4
RMSE: 0.804577548248386
RMSE: 0.8327401654270914
RMSE: 0.7110886702295112
RMSE: 0.8678981722445336
RMSE upper Bound: 0.8678981722445336
RMSE lower Bound: 0.7110886702295112


In [10]:
# Full run with mean=False: test size 25000, 10 repetitions.
n, folds = 25000, 10
print("Testing for n= " + str(n) + ", Mean: False" + ", Folds: " + str(folds))
kreuzvalidiere(n, folds, mean=False)

Testing for n= 25000, Mean: False, Folds: 10
RMSE: 0.8093717866581247
RMSE: 0.8109298741225405
RMSE: 0.8122008042821607
RMSE: 0.804284010080862
RMSE: 0.8091955357611241
RMSE: 0.8068427998071319
RMSE: 0.8090052843861771
RMSE: 0.8122322697998022
RMSE: 0.8059915436817127
RMSE: 0.8087654925187531
RMSE upper Bound: 0.8122322697998022
RMSE lower Bound: 0.804284010080862


In [11]:
# Full run with mean=True: test size 25000, 10 repetitions.
n, folds = 25000, 10
print("Testing for n= " + str(n) + ", Mean: True" + ", Folds: " + str(folds))
kreuzvalidiere(n, folds, mean=True)

Testing for n= 25000, Mean: True, Folds: 10
RMSE: 0.8104320862629049
RMSE: 0.812571557942916
RMSE: 0.8105755766394918
RMSE: 0.8158718554745932
RMSE: 0.8194891221568065
RMSE: 0.8190834153694807
RMSE: 0.8160188609612186
RMSE: 0.8127711506527119
RMSE: 0.8084195815842717
RMSE: 0.8103183042621105
RMSE upper Bound: 0.8194891221568065
RMSE lower Bound: 0.8084195815842717


## Tuning for knn_metric

In [12]:
# Quick run with the 'cosine' metric: test size 100, 4 repetitions.
n, folds = 100, 4
print("Testing for n= " + str(n) + ", Metric: 'cosine'" + ", Folds: " + str(folds))
kreuzvalidiere(n, folds, knn_metric='cosine')

Testing for n= 100, Metric: 'cosine', Folds: 4
RMSE: 0.7201151679152322
RMSE: 0.8245112956227939
RMSE: 0.8076094224428513
RMSE: 0.8226267898123077
RMSE upper Bound: 0.8245112956227939
RMSE lower Bound: 0.7201151679152322


In [None]:
# Full run with the 'cosine' metric: test size 25000, 10 repetitions.
n, folds = 25000, 10
print("Testing for n= " + str(n) + ", Metric: 'cosine'" + ", Folds: " + str(folds))
kreuzvalidiere(n, folds, knn_metric='cosine')

Testing for n= 25000, Metric: 'cosine', Folds: 10
RMSE: 0.810439104119757
RMSE: 0.808101040622642
RMSE: 0.8070554252186012
RMSE: 0.808497482150073
RMSE: 0.8004682317644266
RMSE: 0.8075338510177409
RMSE: 0.8156765064661976
RMSE: 0.8083017533747673
RMSE: 0.8102161937950325


In [None]:
# Full run with the 'minkowski' metric: test size 25000, 10 repetitions.
n, folds = 25000, 10
print("Testing for n= " + str(n) + ", Metric: 'minkowski'" + ", Folds: " + str(folds))
kreuzvalidiere(n, folds, knn_metric='minkowski')

## Tuning for k_neighbors

In [None]:
# Quick run with set_k=False: test size 100, 4 repetitions.
n, folds = 100, 4
print("Testing for n= " + str(n) + ", set_k: False" + ", Folds: " + str(folds))
kreuzvalidiere(n, folds, set_k=False)

In [None]:
# Full run with set_k=False: test size 25000, 10 repetitions.
n, folds = 25000, 10
print("Testing for n= " + str(n) + ", set_k: False" + ", Folds: " + str(folds))
kreuzvalidiere(n, folds, set_k=False)

In [None]:
# Sweep a fixed k_neighbors over 4..9 (set_k=True), one run of kreuzvalidiere per k.
n = 25000
folds = 10
for i in range(4, 10):
    # Separators added so the fields no longer run together in the banner.
    print("Testing for n= "+str(n)+", set_k: True, k= "+str(i)+", Folds: "+str(folds))
    kreuzvalidiere(n, folds, set_k=True, k_neighbors=i)

In [None]:
# Sweep a fixed k_neighbors over 10..19 (set_k=True), one run of kreuzvalidiere per k.
n = 25000
folds = 10
for i in range(10, 20):
    # Separators added so the fields no longer run together in the banner.
    print("Testing for n= "+str(n)+", set_k: True, k= "+str(i)+", Folds: "+str(folds))
    kreuzvalidiere(n, folds, set_k=True, k_neighbors=i)

In [None]:
# Sweep a fixed k_neighbors over 20..29 (set_k=True), one run of kreuzvalidiere per k.
n = 25000
folds = 10
for i in range(20, 30):
    # Separators added so the fields no longer run together in the banner.
    print("Testing for n= "+str(n)+", set_k: True, k= "+str(i)+", Folds: "+str(folds))
    kreuzvalidiere(n, folds, set_k=True, k_neighbors=i)

## Tuning for input features

In [None]:
# Load the frame returned by get_data() (imported from the ContentBasedFiltering notebook).
data = get_data()

In [None]:
# Bare expression: the notebook displays `data` as the cell output.
data

In [None]:
#TODO: Rewrite the functions so that the input features can be tuned as a hyperparameter as well
#-> kreuzvalidiere and test_predict_mr need to take a 'data' parameter

In [None]:
# Run kreuzvalidiere on a restricted set of feature columns.
n = 25000
folds = 10
# NOTE(review): the original passed the leftover loop variable `i` here, which
# is undefined unless a sweep cell ran first. 15 is kreuzvalidiere's default;
# replace it with the tuned value.
k_neighbors = 15
# Placeholder: replace with the names of the feature columns to test.
feature_columns = ['Zu testende Feature Spalten eintragen']
# Select with a list so the result stays a DataFrame; kreuzvalidiere reads data.columns.
data = data[feature_columns]
print("Testing for n= "+str(n)+", Features: Tbd, Folds: "+str(folds))
# NOTE(review): kreuzvalidiere only logs data.columns and does not pass `data`
# on to test_predict_mr, so this selection does not yet influence the RMSE.
kreuzvalidiere(n, folds, set_k=True, k_neighbors=k_neighbors, data=data)