In [28]:
import pandas as pd
import numpy as np

import surprise
from surprise import NormalPredictor
from surprise import SVD
from surprise import SlopeOne
from surprise import Dataset
#from surprise import DatasetAutoFolds
from surprise import Reader
from surprise import KNNBasic, KNNWithMeans, KNNWithZScore, KNNBaseline, BaselineOnly, CoClustering, SVD, SVDpp

from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error as rmse
from sklearn import pipeline

import knn_features
import nltk

from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import mean_squared_error

import matplotlib.pyplot as plt

# set random state for reproducibility
kwargs = dict(random_state=42)

Documentary of scikit-suprise can be found under https://surprise.readthedocs.io/en/stable/

In [29]:
ratings = pd.read_csv('../data/preprocessed/ratings_clean_std_0.csv', sep=',').drop(columns={'Unnamed: 0'})
omdb = pd.read_csv('../data/preprocessed/omdb_cleaned.csv')

In [30]:
#delete ratings of movie "nomads" until last model
ratings = ratings[ratings['imdbID']!='tt0720339']

In [31]:
ratings.reset_index(inplace=True, drop=True)

In [32]:
# build a reader, define the rating scale (minimum and maximum value)
reader = Reader(rating_scale=(0.5, 5))

In [33]:
#Use optimal k based on # rated movies
def adjust_k(ratings_k):
    adjusted_k = 10
    r_size = len(ratings_k)
  
    if 40 < r_size < 100:
        adjusted_k = 15
    elif 100 < r_size < 500:
        adjusted_k = 20
    elif 500 < r_size < 1500:
        adjusted_k = 25
    elif r_size  > 1500:
        adjusted_k = 30

    return adjusted_k

In [34]:
# function that computes a rating based on the neighbors
def compute_rating(neighbors, distances, mean):
    if mean:
        # Mittelwert der k-nächsten Nachbarn
        pred = neighbors.mean()
    else:
        # Gewichtung der Bewertung der Nachbarn je nach Distanz
        pred = sum(neighbors*((1/(distances[0]+0.000001)**1)/(sum((1/(distances[0]+0.000001)**1)))))

    return float(pred)

In [35]:
## function that predicts the rating of a movie from its imdbID and its nearest neighbors
def predict_movie_rating(imdbID, ratings, feature_data, mean, knn_metric, set_k, k_neighbors=10):
    
    #If no explicit number of neighbors is passed -> use variable neighbors function
    if set_k:
        k_neighbors = k_neighbors
    else:    
        k_neighbors = adjust_k(ratings)

  
    # Get real rating -> remove this in the end -> currently done for validation
    #real_ratings = ratings.loc[(ratings['imdbID'] == imdbID)]
    real_idx = ratings.loc[(ratings['imdbID'] == imdbID)].index
    
    #remove real rating for training
    ratings = ratings[ratings['imdbID'] != imdbID] 
    
    # Set algorithm and params
    knn = NearestNeighbors(metric=knn_metric, algorithm='brute', n_neighbors=k_neighbors, n_jobs=-1)
    
    #Select features rated by user for training
    feat = feature_data[ratings['imdbID'].index]

    ratings.reset_index(inplace=True, drop=False)
    
    #train algorithm
    knn.fit(feat)

    #generate input data
    input_data = feature_data[real_idx]
    input_data = input_data.reshape(1, -1)

    #Prediction -> get x nearest neighbors of imdbID
    distances, indices = knn.kneighbors(input_data, n_neighbors=k_neighbors)

    # Zieht indices und ratings der neighbors
    neighbor_ratings = ratings['rating'].loc[indices[0]]

    #Calculate rating
    pred = compute_rating(neighbor_ratings, distances, mean)

    return pred

In [48]:
# function that finds optimal fixed weights for several models
def compare_weights(algos, params_features, params_knn):
    i=0
    rmses = []
    for train, test in KFold(n_splits=1000, **kwargs, shuffle=True).split(ratings):
        preds = []
        # split data
        trainset = ratings.loc[train]
        testset = ratings.loc[test]
        print(trainset.shape[0])
        print(testset.shape[0])
        # convert data to surprise format
        train_surprise = Dataset.load_from_df(trainset, reader).build_full_trainset()
        test_surprise = Dataset.load_from_df(testset, reader).build_full_trainset().build_testset()       
        
        # Collaborative Filtering Models
        for algo in algos:
            algo.fit(train_surprise)
            preds.append([algo.predict(uid, iid).est for uid, iid,_ in test_surprise])

        # Content-Based Models
        # load features
        features, names = knn_features.features(**params_features)
        # add imdbID and set as index
        features = omdb[['imdbID']].join(pd.DataFrame(features)).set_index('imdbID')
        
        # predict ratings
        pred_content = []
        number_of_ratings = []
        for row in testset.itertuples():
            # select user and movie
            imdbID = row.imdbID
            userID = row.user_id
            # select ratings of the user
            ratings_user = ratings.loc[ratings['user_id'] == userID]
            ratings_user.reset_index(inplace=True, drop=True)

            # select features of corresponding movies and convert to array
            features_user = np.array(features.loc[ratings_user['imdbID']])

            # compute predictions
            pred_content.append(predict_movie_rating(imdbID, ratings_user, features_user, **params_knn))
            # store the number of predictions of a user:
            number_of_ratings.append(ratings_user.shape[0])
            
        predictions = [preds[0], preds[1], pred_content]
        # store predictions and real ratings
        predictions = np.asarray(predictions)
        actuals = np.asarray(testset['rating'])
        # calculate rmses
        rmses.append([rmse(actuals, (predictions[0]*(w1/100)+predictions[1]*(w2/100)+predictions[2]*(1-w1/100-w2/100)), squared=False) for w1 in range(0,101) for w2 in range(0, 101-w1)])
        i+=1
        print('Iteration finished: ',i)
        # Early stopping to allow for fast testing
        if i == 5:
            return rmses, predictions, actuals, number_of_ratings            

In [49]:
# define collaborative filtering models: KNN and SVD
algos = (KNNWithMeans(k=115, min_k=5, sim_options={'name':'msd', 'user_based': False}, **kwargs), SVD(lr_all=0.01, reg_all=0.05, n_epochs=23, **kwargs))

In [50]:
# define features for content-based models
params_f = {'threshold_actors': 0, 'ts_languages': 0 , 'year': True, 'runtime': True, 'imdbVotes': True, 'series': False, 'awards': False,'genres': True, 'imdb_rating': True, 'roto_rating': False, 'pg_rating':True, 'threshold_newkeywords': 0, 'threshold_plots': 0, 'threshold_directors': 200}

In [51]:
# set the possible parameters for the kNN
params_k = {'mean':False, 'knn_metric':'cosine', 'set_k':False}

In [52]:
# get information on the prediction models
res, preds, ratings, numbers = compare_weights(algos, params_f, params_k)

786748
788
Computing the msd similarity matrix...
Done computing similarity matrix.
Iteration finished:  1
786748
788
Computing the msd similarity matrix...
Done computing similarity matrix.
Iteration finished:  2
786748
788
Computing the msd similarity matrix...
Done computing similarity matrix.
Iteration finished:  3
786748
788
Computing the msd similarity matrix...
Done computing similarity matrix.
Iteration finished:  4
786748
788
Computing the msd similarity matrix...
Done computing similarity matrix.
Iteration finished:  5


In [41]:
weights = []
weights.append([(w1/100, w2/100, 1-w1/100-w2/100) for w1 in range(0,101) for w2 in range(0, 101-w1)])
results = pd.DataFrame(np.asarray(weights)[0])
results['rmse'] = res[0]

In [55]:
ratings

array([5. , 3. , 1. , 5. , 2. , 3.5, 4. , 1.5, 4. , 5. , 3.5, 3.5, 2.5,
       3. , 2. , 0.5, 4.5, 4. , 4.5, 4. , 3.5, 3. , 1. , 5. , 4. , 4. ,
       4. , 3.5, 3. , 4. , 4.5, 5. , 4. , 4. , 2. , 3.5, 0.5, 2.5, 2. ,
       2. , 4. , 5. , 1. , 5. , 3.5, 3.5, 3. , 3. , 4. , 4.5, 4. , 4. ,
       3. , 2.5, 4. , 4.5, 4. , 3.5, 4.5, 4. , 4. , 4. , 4. , 3. , 2.5,
       4. , 3.5, 4. , 4. , 4.5, 2. , 3. , 4. , 4. , 4.5, 4.5, 3. , 4. ,
       2.5, 5. , 3.5, 3.5, 5. , 3. , 5. , 2.5, 3.5, 4. , 4. , 3. , 4. ,
       3.5, 3. , 3. , 3. , 4. , 3.5, 3.5, 3.5, 2. , 4.5, 3.5, 2. , 4. ,
       5. , 5. , 4. , 4. , 1.5, 5. , 4. , 4.5, 4. , 5. , 4.5, 4. , 2. ,
       4. , 3.5, 3. , 4. , 3. , 3. , 3.5, 3. , 4. , 1. , 1.5, 4. , 3. ,
       4. , 1.5, 4.5, 2. , 3.5, 4. , 3.5, 3. , 4.5, 3. , 3. , 4.5, 2. ,
       3. , 4. , 5. , 3.5, 1. , 4.5, 4. , 3.5, 3.5, 4. , 4. , 3. , 1. ,
       1. , 4. , 4. , 3. , 3. , 2.5, 4. , 4.5, 3. , 3.5, 4. , 2.5, 4. ,
       4. , 2. , 3.5, 3. , 2.5, 4. , 2. , 4.5, 2. , 4. , 4.5, 3.

In [56]:
preds

array([[3.65989919, 3.6227825 , 2.77673348, ..., 3.7183326 , 3.48635338,
        4.25606444],
       [3.55802162, 3.64592692, 2.67616079, ..., 3.59739985, 3.35588398,
        4.2091984 ],
       [4.53261792, 3.7880021 , 3.12604327, ..., 3.5492948 , 4.00214338,
        3.89170986]])

In [42]:
results

Unnamed: 0,0,1,2,rmse
0,0.00,0.00,1.000000e+00,0.838533
1,0.00,0.01,9.900000e-01,0.838294
2,0.00,0.02,9.800000e-01,0.838142
3,0.00,0.03,9.700000e-01,0.838080
4,0.00,0.04,9.600000e-01,0.838106
...,...,...,...,...
5146,0.98,0.01,1.000000e-02,1.195532
5147,0.98,0.02,1.734723e-17,1.201435
5148,0.99,0.00,1.000000e-02,1.195923
5149,0.99,0.01,8.673617e-18,1.201823


In [54]:
results.sort_values(1)

Unnamed: 0,0,1,2,rmse
0,0.00,0.00,1.00,0.838533
2450,0.28,0.00,0.72,0.868785
2523,0.29,0.00,0.71,0.871137
2595,0.30,0.00,0.70,0.873570
2666,0.31,0.00,0.69,0.876085
...,...,...,...,...
98,0.00,0.98,0.02,1.170142
299,0.02,0.98,0.00,1.182243
200,0.01,0.99,0.00,1.182234
99,0.00,0.99,0.01,1.176170
