# Content-based Filtering

Approach:
- Table mit Features und Ratings erstellen, ohne User - auf Basis des OMDB Datasets

- sklearn-kNN mit cosine-similarity darauf anwenden

- Funktion schreiben, die auf Basis der Ratings der Neighbours das Rating eines Films vorhersagt.


In [1]:
import pandas as pd
import numpy as np
import time
import getpass 

from itertools import product
import knn_Carmen

from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import mean_squared_error

# Keyword arguments carrying a fixed random seed (not referenced elsewhere in the visible code).
kwargs = {'random_state': 42}

In [2]:
# Derive a single rating prediction from the ratings of the nearest neighbors.
def compute_rating(neighbors, distances, mean=False):
    """Predict a rating from the ratings of the neighbors.

    Args:
        neighbors: Ratings of the neighbors; must support ``.mean()`` and
            element-wise multiplication with the weight array.
        distances: Neighbor distances; only the first row ``distances[0]``
            is used.
        mean: If true, return the unweighted mean of ``neighbors``.
            Otherwise each rating is weighted by its inverse distance.

    Returns:
        The predicted rating as a ``float``.
    """
    if mean:
        # Plain average of the k nearest neighbors.
        return float(neighbors.mean())

    # Inverse-distance weights; the small offset keeps the inverse finite
    # when a distance is exactly zero.
    inverse = 1 / (distances[0] + 0.000001)
    weights = inverse / sum(inverse)

    return float(sum(neighbors * weights))

In [3]:
# Predict the rating of one movie from the ratings of its nearest neighbors.
def predict_movie_rating(imdbID, ratings, feature_data, mean, knn_metric, set_k, k_neighbors=10):
    """Predict the rating of ``imdbID`` from the other rated movies.

    The row belonging to ``imdbID`` is held out, a nearest-neighbor model is
    fitted on the remaining rows, and the ratings of the closest neighbors
    are combined by ``compute_rating``.

    Args:
        imdbID: Identifier of the movie to predict; must occur in
            ``ratings['imdbID']``.
        ratings: DataFrame with the columns ``imdbID`` and ``rating``. Its
            index labels are used as row positions into ``feature_data``.
        feature_data: Array of feature rows, indexed with the index labels
            of ``ratings``.
        mean: Forwarded to ``compute_rating`` (unweighted mean vs.
            inverse-distance weighting).
        knn_metric: Distance metric passed to ``NearestNeighbors``.
        set_k: If true, ``k_neighbors`` is used as given; otherwise k is
            derived from ``adjust_k(ratings)``.
        k_neighbors: Number of neighbors when ``set_k`` is true. It is
            capped at the number of rows available for fitting.

    Returns:
        Tuple ``(predicted_rating, real_rating)``; the real rating is the
        first rating stored for ``imdbID`` and is returned for validation.

    Raises:
        ValueError: If ``imdbID`` is not present in ``ratings`` or no other
            rated movie remains to serve as a neighbor.
    """
    # Without an explicit k, choose it from the number of available ratings.
    if not set_k:
        k_neighbors = adjust_k(ratings)

    # Locate the held-out movie; its known rating is only used for validation.
    is_target = ratings['imdbID'] == imdbID
    target_rows = ratings.loc[is_target]
    if target_rows.empty:
        raise ValueError(f'imdbID {imdbID!r} not found in ratings')

    # Use the first match for both the query row and the returned rating, so
    # a duplicate entry cannot break the (1, n_features) query shape.
    target_idx = target_rows.index[0]
    real_rating = target_rows['rating'].values[0]

    # Everything except the target movie is used for fitting.
    train = ratings[~is_target]
    if train.empty:
        raise ValueError('no other ratings left to use as neighbors')

    # A neighbor query cannot ask for more points than were fitted.
    k_neighbors = min(k_neighbors, len(train))

    knn = NearestNeighbors(metric=knn_metric, algorithm='brute', n_neighbors=k_neighbors, n_jobs=-1)
    knn.fit(feature_data[train.index])

    # Single query row, shaped (1, n_features).
    query = feature_data[target_idx].reshape(1, -1)
    distances, indices = knn.kneighbors(query, n_neighbors=k_neighbors)

    # kneighbors returns positions within the fitted rows, so the ratings are
    # selected positionally from the training frame.
    neighbor_ratings = train['rating'].iloc[indices[0]]

    pred = compute_rating(neighbor_ratings, distances, mean)

    return pred, real_rating

In [4]:
# Choose the number of neighbors k from the number of rated movies.
def adjust_k(ratings_k):
    """Return the number of neighbors to use for a given amount of ratings.

    The tiers are upper-inclusive so that every size maps to exactly one
    tier; sizes of exactly 100, 500 or 1500 previously fell through to the
    smallest k.

    Args:
        ratings_k: Any sized collection of ratings; only ``len(ratings_k)``
            is used.

    Returns:
        10 for up to 40 ratings, 15 for 41-100, 20 for 101-500,
        25 for 501-1500 and 30 above 1500.
    """
    r_size = len(ratings_k)

    # (inclusive upper bound on the number of ratings, k for that tier)
    tiers = ((40, 10), (100, 15), (500, 20), (1500, 25))
    for upper_bound, tier_k in tiers:
        if r_size <= upper_bound:
            return tier_k

    return 30

In [5]:
# Load the preprocessed ratings (without the 'Unnamed: 0' column) and the cleaned OMDB table.
ratings = pd.read_csv('../../data/preprocessed/ratings_clean_std_0.csv')
ratings = ratings.drop(columns=['Unnamed: 0'])
omdb = pd.read_csv('../../data/preprocessed/omdb_cleaned.csv')

# Exclude every rating of the movie "nomads".
ratings = ratings.loc[ratings['imdbID'] != 'tt0720339']

# Name of the current OS user; used to build the per-user results file name.
username = getpass.getuser()

In [6]:
def hypersearch(n_samples, params_features, params_knn):
    """Evaluate one feature/kNN configuration and log its RMSE.

    A random sample of ratings is drawn; for each sampled rating the movie is
    predicted from the other ratings of the same user via
    ``predict_movie_rating``. The RMSE over the sample and the runtime are
    merged into ``Hypersearch_Content_<username>.csv`` (sorted by RMSE) and
    the RMSE is printed.

    Args:
        n_samples: Number of ratings to sample as the test set.
        params_features: Keyword arguments for ``knn_Carmen.features``.
        params_knn: Keyword arguments for ``predict_movie_rating``
            (``mean``, ``knn_metric``, ``set_k`` and optionally
            ``k_neighbors``).

    Returns:
        None. Results are written to the CSV file and printed.
    """
    # Build the feature matrix for this configuration.
    features, names = knn_Carmen.features(**params_features)
    print('Used features: '+str(names))

    # Sample the ratings that will be predicted.
    test_set = ratings.sample(n_samples)

    # Attach the imdbID to every feature row and use it as the index.
    # NOTE(review): assumes the feature rows are in the same order as the
    # rows of `omdb` -- confirm against knn_Carmen.features.
    features = omdb[['imdbID']].join(pd.DataFrame(features)).set_index('imdbID')

    predictions = []
    reals = []
    start = time.time()

    for row in test_set.itertuples():
        # Movie and user of the sampled rating.
        imdbID = row.imdbID
        userID = row.user_id

        # All ratings of this user, re-indexed 0..n-1 so that the index
        # labels coincide with the row positions of `features_user`.
        ratings_user = ratings.loc[ratings['user_id'] == userID].reset_index(drop=True)

        # Feature rows of the movies this user rated, in the same order.
        features_user = np.array(features.loc[ratings_user['imdbID']])

        # predict_movie_rating takes no user id; passing one would shift
        # every positional argument and collide with the `mean` keyword.
        pred, real = predict_movie_rating(imdbID, ratings_user, features_user, **params_knn)
        predictions.append(pred)
        reals.append(real)

    # Root of the MSE instead of `squared=False`, which newer scikit-learn
    # releases no longer accept.
    rmse = float(np.sqrt(mean_squared_error(reals, predictions)))
    stop = time.time()
    runtime = stop - start

    # One-row frame describing this run.
    results = pd.DataFrame()
    results['Features'] = [names]
    results['Parameters_knn'] = [list(params_knn.items())]
    results['Sample_size'] = n_samples
    results['RMSE'] = rmse
    results['Runtime'] = runtime

    # Merge with the results of earlier runs, if a log file already exists.
    log_file = 'Hypersearch_Content_'+str(username)+'.csv'
    try:
        previous = pd.read_csv(log_file)
    except FileNotFoundError:
        results_total = results
    else:
        # DataFrame.append was removed in pandas 2.0; concat is the replacement.
        results_total = pd.concat([previous, results])

    results_total = results_total.sort_values('RMSE')
    results_total[['Features', 'Parameters_knn', 'RMSE','Sample_size', 'Runtime']].to_csv(log_file)

    print(rmse)

In [7]:
# Feature options to test: every key maps to the list of values to try for it.
params_f = {
    'threshold_actors': [20, 50],
    'ts_languages': [20],
    'year': [True, False],
    'runtime': [True, False],
    'imdbVotes': [True, False],
    'series': [True, False],
    'awards': [True, False],
    'genres': [True, False],
    'imdb_rating': [True, False],
    'roto_rating': [True],
    'pg_rating': [True],
    'threshold_keywords': [10],
    'threshold_plot': [100],
    'threshold_directors': [5],
}

In [8]:
# kNN options to test: every key maps to the list of values to try for it.
params_k = {
    'mean': [False, True],
    'knn_metric': ['cosine', 'minkowski'],
    'set_k': [False],
}

In [9]:
# Expand each parameter grid into a list with one dict per value combination.
permutations_f = [
    dict(zip(params_f.keys(), combo))
    for combo in product(*params_f.values())
]
permutations_k = [
    dict(zip(params_k.keys(), combo))
    for combo in product(*params_k.values())
]

In [None]:
# Grid search: evaluate every feature configuration with every kNN
# configuration (kNN configurations vary fastest), 100 sampled ratings each.
for features, k_params in product(permutations_f, permutations_k):
    hypersearch(100, features, k_params)