# Content-based Filtering

Approach:
- Table mit Features und Ratings erstellen, ohne User - auf Basis des OMDB Datasets

- sklearn-kNN mit cosine-similarity darauf anwenden

- Funktion schreiben, die auf Basis der Ratings der Neighbours das Rating eines Films vorhersagt.


Sources:

#### https://heartbeat.fritz.ai/recommender-systems-with-python-part-i-content-based-filtering-5df4940bd831

#### https://www.kaggle.com/johnwill225/movie-recommendations

#### https://towardsdatascience.com/how-we-built-a-content-based-filtering-recommender-system-for-music-with-python-c6c3b1020332


In [1]:
import pandas as pd
import numpy as np
import string
import math
from sklearn.neighbors import NearestNeighbors
from scipy.sparse import csr_matrix
from sklearn import preprocessing
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.decomposition import PCA

from knn_preprocessing import knn_preprocessing

kwargs = dict(random_state=42)

In [2]:
# Load the preprocessed CSV inputs (relative paths).
# NOTE(review): of these frames only `omdb` is referenced again in the visible cells;
# `movies`, `average_ratings` and `ratings` may be leftovers - confirm before removing.
movies = pd.read_csv('../../data/preprocessed/movies_id_updated.csv')
average_ratings = pd.read_csv('../../data/preprocessed/average_ratings.csv')
omdb = pd.read_csv('../../data/preprocessed/omdb_cleaned.csv')
ratings = pd.read_csv('../../data/preprocessed/ratings_clean_std_0.csv')

In [3]:
# List the columns available in the OMDB frame (candidates for the feature list below).
omdb.columns

Index(['Title', 'Year', 'Rated', 'Runtime', 'Writer', 'Plot', 'Language',
       'imdbRating', 'imdbVotes', 'imdbID', 'Rotten Tomatoes', 'Metacritic',
       'Series', 'Released_season', 'Released_month', 'Released_day',
       'PG_Rating', 'Available_languages', 'Oscars_won', 'Oscars_nominated',
       'Golden_globe_won', 'Golden_globe_nominated'],
      dtype='object')

In [4]:
# Build the modelling frame with the project helper `knn_preprocessing`, passing the
# OMDB columns to keep.
# NOTE(review): the helper is not visible here; judging by the displayed result it
# returns one row per (user_id, imdbID, rating) with the movie features joined on,
# drops 'Language' and renames 'Rotten Tomatoes' to 'RottenTomatoes' - confirm.
merged_data = knn_preprocessing(['imdbID', 'Year', 'Runtime', 'Language', 'imdbRating', 'imdbVotes', 'Rotten Tomatoes', 'Metacritic',
       'Series', 'PG_Rating', 'Oscars_won', 'Oscars_nominated',
       'Golden_globe_won', 'Golden_globe_nominated'])

In [5]:
# Display the merged frame; the notebook output truncates it to the first and last rows.
merged_data

Unnamed: 0,user_id,imdbID,rating,Year,Runtime,imdbRating,imdbVotes,RottenTomatoes,Metacritic,Series,PG_Rating,Oscars_won,Oscars_nominated,Golden_globe_won,Golden_globe_nominated
0,1264,47034.0,3.5,1954.0,96.0,7.6,27485.0,9.3,7.800000,0.0,0.0,0.0,0.0,0.0,0.0
1,213,304141.0,2.5,2004.0,142.0,7.9,524511.0,9.0,8.200000,0.0,1.0,0.0,2.0,0.0,0.0
2,593,369436.0,3.0,2008.0,88.0,5.7,61335.0,2.4,4.100000,0.0,2.0,0.0,0.0,0.0,0.0
3,609,1077258.0,4.0,2007.0,105.0,7.1,196149.0,7.4,5.779918,0.0,0.0,0.0,0.0,0.0,0.0
4,1590,52182.0,4.0,1958.0,100.0,7.4,6337.0,6.9,5.779918,0.0,0.0,2.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
787536,1032,83530.0,3.0,1982.0,85.0,6.2,47327.0,4.2,4.800000,0.0,1.0,0.0,0.0,0.0,0.0
787537,99,107798.0,3.0,1993.0,141.0,6.6,73544.0,5.3,5.000000,0.0,2.0,0.0,0.0,0.0,0.0
787538,333,93857.0,3.0,1987.0,88.0,5.1,12882.0,0.7,2.800000,0.0,2.0,0.0,0.0,0.0,0.0
787539,49,144168.0,3.0,1999.0,94.0,6.1,17926.0,7.1,6.500000,0.0,2.0,0.0,0.0,0.0,0.0


In [6]:
# Count missing values per column.
merged_data.isna().sum()

user_id                   0
imdbID                    0
rating                    0
Year                      0
Runtime                   0
imdbRating                0
imdbVotes                 0
RottenTomatoes            0
Metacritic                0
Series                    0
PG_Rating                 0
Oscars_won                0
Oscars_nominated          0
Golden_globe_won          0
Golden_globe_nominated    0
dtype: int64

In [48]:
# function that computes a rating based on the neighbors
def compute_rating(neighbors, distances, mean = False):
    """Predict a rating from the ratings of the nearest neighbours.

    Parameters
    ----------
    neighbors : pandas.Series or 1-D numpy array
        Ratings of the neighbouring movies, in the same order as ``distances[0]``.
    distances : 2-D array-like
        Distances in the layout returned by ``NearestNeighbors.kneighbors``;
        only the first row (``distances[0]``) is used.
    mean : bool, default False
        If true, ignore the distances and return the plain mean of ``neighbors``.

    Returns
    -------
    float
        The predicted rating.

    Notes
    -----
    In the weighted mode every neighbour gets the weight ``2 - d / mean(d)``.
    These weights always sum to the number of neighbours, so dividing by
    ``neighbors.shape[0]`` yields a weighted mean. A neighbour farther away
    than twice the mean distance receives a negative weight.
    """
    if mean:
        return float(neighbors.mean())

    neighbor_distances = distances[0]
    mean_distance = neighbor_distances.mean()

    # When every neighbour sits at distance 0 the ratio d / mean(d) is 0 / 0 and
    # the prediction would be NaN. All neighbours are equally close in that case,
    # so the unweighted mean is the natural result.
    if mean_distance == 0:
        return float(neighbors.mean())

    weights = 1 + (1 - neighbor_distances / mean_distance)
    pred = sum(neighbors * weights) / neighbors.shape[0]

    return float(pred)

In [51]:
## function that predicts the rating of a movie from its imdbID and its k nearest neighbors among the movies the user has rated

def predict_movie_rating(imdbID, userID, user_data, k_neighbors=16):
    """Predict how ``userID`` would rate ``imdbID`` from the user's other ratings.

    The user's other rated movies are standardised, the ``k_neighbors`` movies
    closest to the target movie (cosine distance on the feature columns) are
    looked up, and their ratings are combined with ``compute_rating``.

    Parameters
    ----------
    imdbID : number
        ID of the movie whose rating is predicted. The user must already have
        rated it: its feature row is taken from ``user_data`` and its rating is
        returned for validation.
    userID : number
        ID of the user.
    user_data : pandas.DataFrame
        One row per rating with the columns ``'user_id'``, ``'imdbID'`` and
        ``'rating'``; every other column is used as a feature.
    k_neighbors : int, default 16
        Number of neighbours to use. Capped at the number of other movies the
        user has rated.

    Returns
    -------
    tuple (float, rating) or str
        ``(predicted_rating, real_rating)``. If the user has rated no other
        movie, the message string
        ``"User has not rated other movies. Check input"`` is returned instead.

    Raises
    ------
    ValueError
        If the user has not rated ``imdbID`` (there is then no feature row to
        query with and no real rating to return).
    """
    # Columns that identify a rating rather than describe the movie.
    meta_columns = ['imdbID', 'user_id', 'rating']

    # All ratings given by this user, split into the movie to predict and the rest.
    user_ratings = user_data.loc[user_data['user_id'] == userID]
    is_target = user_ratings['imdbID'] == imdbID
    target_rows = user_ratings.loc[is_target]
    other_rows = user_ratings.loc[~is_target]

    other_features = other_rows.drop(columns=meta_columns)

    # This guard has to run BEFORE scaling: StandardScaler raises on an empty
    # frame, which would make the fallback below unreachable.
    if other_features.to_numpy().size == 0:
        return "User has not rated other movies. Check input"

    if target_rows.empty:
        raise ValueError(
            'User ' + str(userID) + ' has no rating for movie ' + str(imdbID)
        )

    scaler = preprocessing.StandardScaler()
    scaled_features = scaler.fit_transform(other_features)

    # kneighbors() raises if asked for more neighbours than fitted samples.
    n_neighbors = min(k_neighbors, len(other_rows))

    knn = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=n_neighbors, n_jobs=-1)
    # Fit on the plain array of ALL scaled feature columns, so the number of
    # features is not hard-coded and no mixed-type column names reach sklearn.
    knn.fit(scaled_features)

    # The rows were selected by label above, so no positional (.iloc) lookup by
    # label is needed. A duplicated rating yields several rows; the first is used.
    target_features = target_rows.iloc[[0]].drop(columns=meta_columns)
    inputs = scaler.transform(target_features)

    # Positions (into other_rows) and distances of the nearest rated movies.
    distances, indices = knn.kneighbors(inputs, n_neighbors=n_neighbors)
    neighbor_ratings = other_rows['rating'].iloc[indices[0]]

    # Distance-weighted combination of the neighbour ratings.
    pred = compute_rating(neighbor_ratings, distances)

    # Return the prediction together with the real rating (kept for validation).
    return pred, target_rows['rating'].values[0]
    

In [50]:
### Testing function ###
imdbID = 113627.0
# Aufpassen userID und imdbID als float übergeben!!
userID = 123
pred , real = predict_movie_rating(imdbID, userID, merged_data)

In [18]:
### Testing function ###
imdbID = 145653
# Aufpassen userID und imdbID als float übergeben!!
userID = 123
pred , real = predict_movie_rating(imdbID, userID, merged_data)

2.967990241610865
Predicted Rating for 145653: 2.967990241610865
Real Rating of 145653 was: 4.5


In [19]:
## DUPLICATED RATING!!! -> since the rewrite to .values[0] the duplicated rating is no longer displayed -> still needs to be checked!

In [37]:
# Larger Test:
# TODO Refactoring needed #UglyPythonCode

def test_predict_mr(no_test_samples, random_state=None):
    """Estimate the RMSE of ``predict_movie_rating`` on a random sample.

    Draws ``no_test_samples`` rows from the global ``merged_data``, predicts the
    rating of every sampled (user, movie) pair and compares it with the real
    rating.

    Parameters
    ----------
    no_test_samples : int
        Number of rows to sample from ``merged_data``.
    random_state : int or None, default None
        Seed forwarded to ``DataFrame.sample``. ``None`` keeps the sampling
        random on every call; pass an int for a reproducible run.

    Returns
    -------
    float
        Root mean squared error over the sample (also printed).

    Raises
    ------
    ValueError
        If the sample is empty.
    """
    test_set = merged_data.sample(n=no_test_samples, random_state=random_state)

    # Collect results in plain lists; growing a DataFrame row by row is slow.
    predictions = []
    reals = []
    for row in test_set.itertuples():
        pred, real = predict_movie_rating(row.imdbID, row.user_id, merged_data)
        predictions.append(pred)
        reals.append(real)

    if not predictions:
        raise ValueError('no_test_samples must be at least 1')

    # RMSE computed directly, without mean_squared_error's `squared=False`
    # switch, which newer scikit-learn releases no longer accept.
    errors = np.asarray(predictions, dtype=float) - np.asarray(reals, dtype=float)
    rmse = float(np.sqrt(np.mean(errors ** 2)))

    print('RMSE: '+str(rmse))
    return rmse

In [41]:
# Quick check: RMSE on 50 randomly sampled ratings.
n = 50
test_predict_mr(n)

RMSE: 0.7907855647465817


0.7907855647465817

In [42]:
# Repeat the 100-sample evaluation ten times and report the spread of the RMSE.
lower, upper = 10, 0

for i in range(10):
    n = 100
    rmse = test_predict_mr(n)

    # min/max return their first argument on ties, so a bound only moves
    # when the new RMSE is strictly smaller / strictly larger.
    lower = min(lower, rmse, key=float)
    upper = max(upper, rmse, key=float)

print('RMSE upper Bound: ' + str(upper))
print('RMSE lower Bound: ' + str(lower))

RMSE: 0.7176198668540387
RMSE: 0.772245813860549
RMSE: 0.7242469714905326
RMSE: 0.7627589088648674
RMSE: 0.8494626898005517
RMSE: 0.8395358560941572
RMSE: 0.8029933634808523
RMSE: 0.8811928302267125
RMSE: 0.7492685286229224
RMSE: 0.7858373767403665
RMSE upper Bound: 0.8811928302267125
RMSE lower Bound: 0.7176198668540387


In [52]:
# Larger check: RMSE on 1000 randomly sampled ratings.
n = 1000
test_predict_mr(n)

RMSE: 0.8063759602710759


0.8063759602710759

In [None]:
# Tested with (metric='minkowski', p=2)
# NOTE(review): predict_movie_rating as shown hard-codes metric='cosine'; the metric
# has to be changed there by hand before this cell measures the minkowski variant.
n = 50
test_predict_mr(n)

In [150]:
# Tested with (metric='minkowski', p=2)
# NOTE(review): predict_movie_rating as shown hard-codes metric='cosine'; the metric
# has to be changed there by hand before this cell measures the minkowski variant.
n = 50
test_predict_mr(n)

RMSE: 1.0720381692131251


1.0720381692131251

In [None]:
# Tested with minkowski
# Repeat the 10000-sample evaluation ten times and report the spread of the RMSE.
lower, upper = 10, 0

for i in range(10):
    n = 10000
    rmse = test_predict_mr(n)

    # min/max return their first argument on ties, so a bound only moves
    # when the new RMSE is strictly smaller / strictly larger.
    lower = min(lower, rmse, key=float)
    upper = max(upper, rmse, key=float)

print('RMSE upper Bound: ' + str(upper))
print('RMSE lower Bound: ' + str(lower))

RMSE: 0.8098710537656919
RMSE: 0.8129982141799429
RMSE: 0.8090306723256457
RMSE: 0.8089043303750401
RMSE: 0.8164125455805039
RMSE: 0.8253029228451267
