# Content-based Filtering

Approach:
- Table mit Features und Ratings erstellen, ohne User - auf Basis des OMDB Datasets

- sklearn-kNN mit cosine-similarity darauf anwenden

- Funktion schreiben, die auf Basis der Ratings der Nachbarn das Rating eines Films vorhersagt.


Sources:

#### https://heartbeat.fritz.ai/recommender-systems-with-python-part-i-content-based-filtering-5df4940bd831

#### https://www.kaggle.com/johnwill225/movie-recommendations

#### https://towardsdatascience.com/how-we-built-a-content-based-filtering-recommender-system-for-music-with-python-c6c3b1020332


In [1]:
import pandas as pd
import numpy as np
import string
import math
import time
from sklearn.neighbors import NearestNeighbors
from scipy.sparse import csr_matrix
from sklearn import preprocessing
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.decomposition import PCA

from knn_preprocessing import knn_preprocessing

kwargs = dict(random_state=42)

In [2]:
# Load the input CSV files (paths are relative to the notebook's working directory).
movies = pd.read_csv('../../data/preprocessed/movies_id_updated.csv')
omdb = pd.read_csv('../../data/preprocessed/omdb_cleaned.csv')
ratings = pd.read_csv('../../data/preprocessed/ratings_clean_std_0.csv')
genres = pd.read_csv('../../data/raw/genres.csv', sep=',')

In [3]:
# Drop columns that are not referenced anywhere else in this notebook.
# NOTE: this rebinds `movies`, so re-running the cell on the already reduced frame raises a KeyError.
movies = movies.drop(columns={'spanishTitle','imdbPictureURL','rtID','rtPictureURL'})

In [4]:
# Remove the 'tt' substring from the IMDb IDs and cast the remainder to float.
# NOTE(review): presumably done so the IDs match the float imdbID values used in merged_data — confirm.
movies['imdbID'] = movies['imdbID'].str.replace(r'tt', '')
movies['imdbID'] = movies['imdbID'].astype(float)

In [5]:
# Two-column lookup table: the movies `id` column (renamed to movieID) and imdbID.
mapping = movies[['id', 'imdbID']].rename(columns={'id':'movieID'})

In [6]:
# Show which columns the OMDb table provides.
omdb.columns

Index(['0', 'Title', 'Year', 'Rated', 'Runtime', 'Writer', 'Plot', 'Language',
       'imdbRating', 'imdbVotes', 'imdbID', 'Rotten Tomatoes', 'Metacritic',
       'Series', 'Released_season', 'Released_month', 'Released_day',
       'PG_Rating', 'Oscars_won', 'Oscars_nominated', 'Golden_globe_won',
       'Golden_globe_nominated', 'Available_languages'],
      dtype='object')

In [7]:
# Preprocessing was moved to its own script (knn_preprocessing).
# The list passed in names the OMDb columns to use.
# NOTE(review): later cells expect the result to contain `user_id`, `imdbID` and `rating`
# next to the feature columns — confirm against knn_preprocessing.
merged_data = knn_preprocessing(['imdbID', 'Year', 'Runtime', 'Language', 'imdbRating', 'imdbVotes', 'Rotten Tomatoes', 'Metacritic',
       'Series', 'PG_Rating', 'Oscars_won', 'Oscars_nominated',
       'Golden_globe_won', 'Golden_globe_nominated'])

In [8]:
print("PCA:")
# Keep only the feature columns: drop the identifiers and the target rating.
merged_g=merged_data.drop(columns={'user_id','imdbID','rating'})
# Standardize the features, then fit a PCA with default arguments on the scaled matrix.
scaler = preprocessing.StandardScaler()
merged_gt = scaler.fit_transform(merged_g)
pca = PCA()
pca_res = pca.fit(merged_gt)
# Formatted variance ratios of the components that each explain more than 1%;
# `le` is how many there are. Both names are reassigned in a later cell.
top_PCA=["%.2f" % a for a in pca_res.explained_variance_ratio_ if a >0.01]
le=len(top_PCA)
print("Main Variance impacting factors:")
print(pca_res.explained_variance_ratio_)

PCA:
Main Variance impacting factors:
[0.13526531 0.09503161 0.07997255 0.06073348 0.04796419 0.0459749
 0.04306016 0.04134322 0.03749745 0.03703565 0.03502202 0.03362328
 0.03296001 0.031551   0.02949176 0.02847078 0.02764098 0.0235196
 0.02326318 0.01913025 0.01829368 0.01733844 0.01569124 0.01429026
 0.0129433  0.00832911 0.00456256]


In [9]:
# Inspect the standardized feature matrix.
merged_gt

array([[-2.66620598, -0.74159355,  0.5162769 , ..., -0.62603854,
        -0.24730271, -0.15581087],
       [-2.66620598, -0.74159355,  0.5162769 , ..., -0.62603854,
        -0.24730271, -0.15581087],
       [-2.66620598, -0.74159355,  0.5162769 , ..., -0.62603854,
        -0.24730271, -0.15581087],
       ...,
       [-0.85929759, -0.33005849, -2.05875128, ..., -0.62603854,
        -0.24730271, -0.15581087],
       [ 0.60012841, -0.78274706, -0.66394435, ..., -0.62603854,
        -0.24730271, -0.15581087],
       [ 0.60012841, -1.81158471, -0.12748015, ..., -0.62603854,
        -0.24730271, -0.15581087]])

In [10]:
# (rows, feature columns) of the standardized matrix.
merged_gt.shape

(787541, 27)

In [11]:
# Project the standardized features onto the fitted principal components (no truncation yet).
trans_feat = pca.transform(merged_gt)

In [12]:
# Inspect the PCA-transformed features.
trans_feat

array([[ 0.10617641, -0.45172897,  1.1021209 , ..., -0.46553995,
        -0.0110636 , -0.26558938],
       [ 0.10617641, -0.45172897,  1.1021209 , ..., -0.46553995,
        -0.0110636 , -0.26558938],
       [ 0.10617641, -0.45172897,  1.1021209 , ..., -0.46553995,
        -0.0110636 , -0.26558938],
       ...,
       [-2.81759916,  1.62111641, -1.06381306, ..., -0.16357943,
         1.55103453, -0.65513327],
       [-1.17596481, -0.08990454, -1.63835239, ..., -0.36351356,
         0.38577804,  0.04991554],
       [-1.15812093,  0.58849326, -1.2362714 , ..., -0.23219469,
         0.21408446, -0.36017218]])

In [13]:
# Shape of the untruncated PCA projection.
trans_feat.shape

(787541, 27)

In [14]:
# Count the components whose explained-variance ratio exceeds 3% ...
top_PCA=["%.2f" % a for a in pca_res.explained_variance_ratio_ if a > 0.03]
le=len(top_PCA)
print(le)
# ... and keep only that many leading columns of the projection.
# (The ratios printed above are in descending order, so these are the same components.)
trans_feat = pca.transform(merged_gt)[:, :len(top_PCA)]

14


In [15]:
# Inspect the reduced PCA projection.
trans_feat

array([[ 0.10617641, -0.45172897,  1.1021209 , ...,  0.02938981,
         0.30556019,  0.73659346],
       [ 0.10617641, -0.45172897,  1.1021209 , ...,  0.02938981,
         0.30556019,  0.73659346],
       [ 0.10617641, -0.45172897,  1.1021209 , ...,  0.02938981,
         0.30556019,  0.73659346],
       ...,
       [-2.81759916,  1.62111641, -1.06381306, ...,  3.30381686,
         1.4144614 , -0.23187338],
       [-1.17596481, -0.08990454, -1.63835239, ...,  0.03502384,
        -0.70645966, -1.65423871],
       [-1.15812093,  0.58849326, -1.2362714 , ...,  0.13670507,
        -0.2971807 , -2.11149819]])

In [16]:
# Shape of the reduced projection (rows, kept components).
trans_feat.shape

(787541, 14)

In [54]:
# Function that computes a rating based on the neighbors
def compute_rating(neighbors, distances, mean):
    """Predict a rating from the ratings of the nearest neighbors.

    Parameters
    ----------
    neighbors : pandas.Series or array-like
        Ratings of the nearest neighbors, in the same order as ``distances[0]``.
        Must provide ``.mean()`` when ``mean`` is true.
    distances : 2-D array-like
        Neighbor distances; only the first row (``distances[0]``) is used.
    mean : bool
        If true, return the unweighted mean of ``neighbors``. Otherwise weight
        each neighbor by its inverse distance.

    Returns
    -------
    float
        The predicted rating.
    """
    if mean:
        # Unweighted mean of the k nearest neighbors.
        return float(neighbors.mean())

    # Work positionally: the i-th rating belongs to the i-th distance,
    # regardless of the index labels `neighbors` may carry.
    neighbor_ratings = np.asarray(neighbors, dtype=float)
    neighbor_distances = np.asarray(distances[0], dtype=float)

    # If one or more perfect matches (distance 0) were found, use only those.
    exact_match = neighbor_distances == 0
    if exact_match.any():
        return float(neighbor_ratings[exact_match].mean())

    # Otherwise weight every neighbor by its inverse distance. The small
    # constant keeps the division stable for very small distances.
    weights = 1.0 / (neighbor_distances + 0.000001)
    weights = weights / weights.sum()
    return float(np.sum(neighbor_ratings * weights))

In [44]:
# First approach for easy k tuning - use method later to implement in-depth tuning of k

def adjust_k(ratings):
    """Choose the number of neighbors from how many ratings are available.

    Parameters
    ----------
    ratings : sized collection
        Anything supporting ``len()``; only its length is used.

    Returns
    -------
    int
        10 for up to 40 ratings, 15 up to 100, 20 up to 500, 25 up to 1500,
        and 30 above that.
    """
    r_size = len(ratings)
    # Alternative that was considered: int(math.sqrt(r_size))

    # The tiers are checked from the top down so that every size maps to
    # exactly one tier. Sizes of exactly 100, 500 and 1500 previously matched
    # no tier and fell back to 10.
    if r_size > 1500:
        return 30
    if r_size > 500:
        return 25
    if r_size > 100:
        return 20
    if r_size > 40:
        return 15
    return 10

In [45]:
## Function that predicts the rating of a movie from its imdbID and its nearest neighbors

def predict_movie_rating(imdbID, userID, user_data=merged_data, mean=False, knn_metric='cosine', set_k=False, k_neighbors=15):
    """Predict one user's rating for one movie from that user's other ratings.

    The user's remaining rated movies are standardized and searched with a
    brute-force nearest-neighbor model. The ratings of the closest movies are
    then combined by ``compute_rating``.

    Parameters
    ----------
    imdbID : number
        ID of the movie to predict. The user must already have rated it,
        because the real rating is returned for validation.
    userID : number
        ID of the user.
    user_data : pandas.DataFrame
        Table with the columns ``user_id``, ``imdbID`` and ``rating``; all
        other columns are used as numeric features.
    mean : bool
        Passed through to ``compute_rating`` (plain mean vs. distance weighting).
    knn_metric : str
        Distance metric for ``NearestNeighbors``; ``'minkowski'`` is used with ``p=2``.
    set_k : bool
        If true, use ``k_neighbors`` as given. Otherwise derive k from the
        number of ratings of the user via ``adjust_k``.
    k_neighbors : int
        Number of neighbors when ``set_k`` is true.

    Returns
    -------
    tuple of (float, rating) or str
        ``(prediction, real_rating)``, or a message string when the user has
        rated no other movie.

    Raises
    ------
    ValueError
        If the user has no rating for ``imdbID``.
    """
    meta_columns = ['imdbID', 'user_id', 'rating']

    # Select all ratings given by this user.
    user_ratings = user_data.loc[user_data['user_id'] == userID]

    # If no explicit number of neighbors is requested, derive it from the data.
    if not set_k:
        k_neighbors = adjust_k(user_ratings)

    # Split off the real rating (currently kept for validation) from the
    # ratings that may serve as neighbors.
    is_target = user_ratings['imdbID'] == imdbID
    real_ratings = user_ratings.loc[is_target]
    ratings = user_ratings.loc[~is_target]

    # This check has to happen before scaling: the scaler cannot be fitted
    # on an empty frame.
    if ratings.to_numpy().size == 0:
        return "User has not rated other movies. Check input"

    if real_ratings.empty:
        raise ValueError(
            "User {} has no rating for movie {}".format(userID, imdbID)
        )

    # kneighbors() cannot return more neighbors than there are fitted samples.
    k_neighbors = min(k_neighbors, len(ratings))

    # Scaling features -> maybe do outside function in future
    scaler = preprocessing.StandardScaler()
    features = scaler.fit_transform(ratings.drop(columns=meta_columns))

    # Set algorithm and params
    if knn_metric == 'minkowski':
        knn = NearestNeighbors(metric='minkowski', p=2, algorithm='brute', n_neighbors=k_neighbors, n_jobs=-1)
    else:
        knn = NearestNeighbors(metric=knn_metric, algorithm='brute', n_neighbors=k_neighbors, n_jobs=-1)

    knn.fit(csr_matrix(features))

    # The query is the held-out row itself. It is taken by label from the
    # filtered frame, not via positional .iloc on index labels.
    inputs = scaler.transform(real_ratings.drop(columns=meta_columns))

    # Prediction -> get the nearest neighbors of imdbID
    distances, indices = knn.kneighbors(inputs, n_neighbors=k_neighbors)

    # `indices` are row positions within `ratings`, so look the ratings up positionally.
    neighbor_ratings = ratings['rating'].iloc[indices[0]]

    # Compute rating
    pred = compute_rating(neighbor_ratings, distances, mean)

    # Return rating prediction and real rating
    return pred, real_ratings['rating'].values[0]
    

In [46]:
#Keeping this for future testing
### Testing function for ToyStory###
#imdbID = 114709.0
# Careful: pass userID and imdbID as floats!! User: 394, 1171, 3682
#userID = 394
#pred , real = predict_movie_rating(imdbID, userID, merged_data)

In [47]:
def get_data():
    """Return the module-level ``merged_data`` object (the object itself, not a copy)."""
    return merged_data

In [48]:
# Larger Test:

def test_predict_mr(no_test_samples, mean = False, knn_metric = 'cosine', set_k=False, k_neighbors=15, data = merged_data):
    """Evaluate ``predict_movie_rating`` on randomly sampled ratings.

    Parameters
    ----------
    no_test_samples : int
        Number of rows to sample from ``data``.
    mean, knn_metric, set_k, k_neighbors
        Forwarded unchanged to ``predict_movie_rating``.
    data : pandas.DataFrame
        Table to sample from and to predict with; needs the columns
        ``imdbID`` and ``user_id``.

    Returns
    -------
    float
        Root mean squared error between predicted and real ratings.

    Raises
    ------
    ValueError
        If no prediction could be made for any sampled row.
    """
    # Sample no_test_samples random rows from the dataset for testing.
    test_set = data.sample(n=no_test_samples)

    predictions = []
    reals = []
    skipped = 0

    # Iterate over the test set and generate predictions for it.
    # TODO get rid of ugly for-loop
    for row in test_set.itertuples():
        # Forward the caller's k settings; these used to be hard-coded here.
        result = predict_movie_rating(row.imdbID, row.user_id, data, mean, knn_metric,
                                      set_k=set_k, k_neighbors=k_neighbors)
        if isinstance(result, str):
            # predict_movie_rating returns a message string instead of a
            # tuple when the user has rated no other movie.
            skipped += 1
            continue
        pred, real = result
        predictions.append(pred)
        reals.append(real)

    if skipped:
        print('Skipped samples without neighbors: '+str(skipped))
    if not predictions:
        raise ValueError("No predictions could be made for the sampled rows")

    # Take the square root explicitly: the `squared=False` argument of
    # mean_squared_error is not available in newer scikit-learn releases.
    rmse = float(np.sqrt(mean_squared_error(reals, predictions)))
    print('RMSE: '+str(rmse))
    return rmse

In [55]:
# Time one evaluation run on a single random sample; prints the elapsed seconds.
start = time.time()
test_predict_mr(1)
stop = time.time()
print((stop-start))

1.0
RMSE: 0.3612986671149181
0.06125903129577637


In [36]:
# Scratch data: one row of zeros in a nested list.
# NOTE(review): looks like a stand-in for the `distances` array checked in compute_rating — confirm or delete.
test = [[0,0,0]]

In [30]:
# Scratch check of the `0 in row` membership test on the first row of `test`.
# Depends on `test` from the previous cell.
if 0 in test[0]:
    print('0')
else:    
    print(test[0][0])

0
