# Content-based Filtering

Approach:
- Table mit Features und Ratings erstellen, ohne User - auf Basis des OMDB Datasets

- sklearn-kNN mit cosine-similarity darauf anwenden

- Funktion schreiben, die auf Basis der Ratings der Neighbours das Rating eines Films vorhersagt.


Sources:

#### https://heartbeat.fritz.ai/recommender-systems-with-python-part-i-content-based-filtering-5df4940bd831

#### https://www.kaggle.com/johnwill225/movie-recommendations

#### https://towardsdatascience.com/how-we-built-a-content-based-filtering-recommender-system-for-music-with-python-c6c3b1020332


In [1]:
import pandas as pd
import numpy as np
import string
import math
from sklearn.neighbors import NearestNeighbors
from scipy.sparse import csr_matrix
from sklearn import preprocessing
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.decomposition import PCA

from knn_preprocessing import knn_preprocessing

# Shared keyword arguments carrying a fixed random seed
kwargs = {'random_state': 42}

In [2]:
# Read the preprocessed movie, OMDB and rating tables and the raw genre table from CSV.
movies = pd.read_csv('../../data/preprocessed/movies_id_updated.csv')
omdb = pd.read_csv('../../data/preprocessed/omdb_cleaned.csv')
ratings = pd.read_csv('../../data/preprocessed/ratings_clean_std_0.csv')
genres = pd.read_csv('../../data/raw/genres.csv', sep=',')

In [3]:
# Remove the Spanish title, the picture URLs and the Rotten Tomatoes ID from the movie table.
movies = movies.drop(
    columns=['spanishTitle', 'imdbPictureURL', 'rtID', 'rtPictureURL']
)

In [4]:
# Remove the literal 'tt' from the IMDb IDs and store the remaining digits as floats.
movies['imdbID'] = (
    movies['imdbID']
    .str.replace('tt', '', regex=False)
    .astype(float)
)

In [5]:
# Lookup table between the internal movie id (renamed to 'movieID') and the numeric IMDb id.
mapping = movies.loc[:, ['id', 'imdbID']].rename(columns={'id': 'movieID'})

In [6]:
# List the columns of the OMDB table.
omdb.columns

Index(['Title', 'Year', 'Rated', 'Runtime', 'Writer', 'Plot', 'Language',
       'imdbRating', 'imdbVotes', 'imdbID', 'Rotten Tomatoes', 'Metacritic',
       'Series', 'Released_season', 'Released_month', 'Released_day',
       'PG_Rating', 'Available_languages', 'Oscars_won', 'Oscars_nominated',
       'Golden_globe_won', 'Golden_globe_nominated'],
      dtype='object')

In [7]:
# Preprocessing was moved into its own script (knn_preprocessing); only the
# list of OMDB columns to use is passed in here.
merged_data = knn_preprocessing([
    'imdbID',
    'Year',
    'Runtime',
    'Language',
    'imdbRating',
    'imdbVotes',
    'Rotten Tomatoes',
    'Metacritic',
    'Series',
    'PG_Rating',
    'Oscars_won',
    'Oscars_nominated',
    'Golden_globe_won',
    'Golden_globe_nominated',
])

In [8]:
# Display the merged rating/feature table returned by knn_preprocessing.
merged_data

Unnamed: 0,user_id,imdbID,rating,Year,Runtime,imdbRating,imdbVotes,RottenTomatoes,Metacritic,Awards,...,10,11,12,13,14,15,16,17,18,19
0,1264.0,47034.0,3.5,1954.0,96.0,7.6,27485.0,9.300000,7.800000,0.0,...,1,0,0,0,0,1,0,0,0,0
1,981.0,47034.0,3.5,1954.0,96.0,7.6,27485.0,9.300000,7.800000,0.0,...,1,0,0,0,0,1,0,0,0,0
2,481.0,47034.0,1.0,1954.0,96.0,7.6,27485.0,9.300000,7.800000,0.0,...,1,0,0,0,0,1,0,0,0,0
3,98.0,47034.0,2.5,1954.0,96.0,7.6,27485.0,9.300000,7.800000,0.0,...,1,0,0,0,0,1,0,0,0,0
4,249.0,47034.0,4.0,1954.0,96.0,7.6,27485.0,9.300000,7.800000,0.0,...,1,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
787536,243.0,47376.0,3.0,1954.0,100.0,6.5,1813.0,6.203849,5.779918,0.0,...,0,0,0,0,0,0,0,0,1,0
787537,417.0,43132.0,4.0,1950.0,95.0,7.6,7217.0,10.000000,5.779918,0.0,...,0,0,0,0,0,0,0,0,0,0
787538,379.0,81433.0,3.0,1980.0,106.0,5.2,1048.0,1.400000,5.779918,0.0,...,0,0,1,0,0,0,0,0,0,0
787539,279.0,295480.0,1.0,2001.0,95.0,6.5,156.0,5.500000,5.500000,0.0,...,0,0,0,0,0,0,0,0,0,0


In [9]:
# function that computes a rating based on the neighbors
def compute_rating(neighbors, distances, mean = False):
    """Combine the ratings of the nearest neighbours into one predicted rating.

    Parameters
    ----------
    neighbors : pandas.Series or numpy.ndarray
        Ratings of the neighbours, in the same order as ``distances[0]``.
    distances : 2-D array-like
        Distance matrix as returned by ``kneighbors``; only the first row is used.
    mean : bool, default False
        If true, return the unweighted mean of ``neighbors``.

    Returns
    -------
    float
        The predicted rating.

    Notes
    -----
    In the weighted case every rating is multiplied by
    ``1 + (1 - d_i / mean(d))``. These weights add up to the number of
    neighbours, so dividing the weighted sum by the neighbour count yields a
    weighted average in which closer neighbours count more.
    """
    if mean:
        return float(neighbors.mean())

    neighbor_distances = np.asarray(distances[0], dtype=float)
    mean_distance = neighbor_distances.mean()

    # All neighbours at distance 0 (identical feature vectors): the weights
    # would be 0/0 = NaN. Every neighbour is equally close, so the plain mean
    # is the correct prediction.
    if mean_distance == 0:
        return float(neighbors.mean())

    # scaling ratings based on distance
    weights = 1 + (1 - neighbor_distances / mean_distance)
    pred = sum(neighbors * weights) / neighbors.shape[0]

    return float(pred)

In [10]:
# First Approach for easy k tuning - use method later to implement in depth tuning of k

def adjust_k(ratings):
    """Choose the number of neighbours from the number of ratings available.

    Parameters
    ----------
    ratings : sized collection
        The ratings of one user; only ``len(ratings)`` is used.

    Returns
    -------
    int
        10 for up to 40 ratings, 15 for 41-100, 20 for 101-500,
        25 for 501-1500 and 30 for more than 1500 ratings.

    Notes
    -----
    The thresholds are checked from the largest downwards so that every size
    maps to exactly one bucket. Sizes of exactly 100, 500 or 1500 previously
    matched no branch and fell back to k = 10.
    """
    r_size = len(ratings)

    if r_size > 1500:
        return 30
    if r_size > 500:
        return 25
    if r_size > 100:
        return 20
    if r_size > 40:
        return 15
    return 10

In [11]:
## function that predicts the rating of a movie from its imdbID and its nearest neighbors

def predict_movie_rating(imdbID, userID,user_data=merged_data, mean=False, knn_metric='cosine', set_k=False, k_neighbors=15):
    """Predict a user's rating of one movie from the user's most similar rated movies.

    The user's own rating of the movie is held out, a kNN model is fitted on
    the standardised features of the user's remaining rated movies, and the
    ratings of the nearest neighbours are combined with ``compute_rating``.

    Parameters
    ----------
    imdbID
        Value matched against ``user_data['imdbID']``.
    userID
        Value matched against ``user_data['user_id']``.
    user_data : pandas.DataFrame
        Must contain the columns 'user_id', 'imdbID' and 'rating'; every other
        column is treated as a feature (assumed to be numeric - TODO confirm
        against knn_preprocessing). The default is the ``merged_data`` object
        that exists when this function is defined.
    mean : bool, default False
        Forwarded to ``compute_rating``; if true the neighbour ratings are
        averaged without distance weighting.
    knn_metric : str, default 'cosine'
        Metric handed to ``NearestNeighbors``.
    set_k : bool, default False
        If true ``k_neighbors`` is used, otherwise k is chosen by ``adjust_k``.
    k_neighbors : int, default 15
        Number of neighbours when ``set_k`` is true.

    Returns
    -------
    tuple or str
        ``(predicted_rating, real_rating)``, or the message string
        "User has not rated other movies. Check input" when the user has no
        rating besides the requested movie.

    Raises
    ------
    ValueError
        If the user has other ratings but none for ``imdbID``; the real rating
        is still required for validation.
    """
    meta_columns = ['imdbID', 'user_id', 'rating']

    # Select all ratings given by user #userID
    ratings = user_data.loc[user_data['user_id'] == userID]

    # If no explicit number of neighbors is requested -> derive it from the
    # number of ratings of this user
    if not set_k:
        k_neighbors = adjust_k(ratings)

    # Get real rating -> remove this in the end -> currently done for validation
    is_target = ratings['imdbID'] == imdbID
    real_ratings = ratings.loc[is_target]

    # Hold the real rating out of the training data
    ratings = ratings.loc[~is_target]

    # This check has to happen before any fitting: StandardScaler rejects an
    # empty frame, which made this branch unreachable before.
    if ratings.empty:
        return "User has not rated other movies. Check input"

    if real_ratings.empty:
        raise ValueError(
            'User ' + str(userID) + ' has no rating for imdbID ' + str(imdbID)
        )

    # Scaling features -> maybe do outside function in future
    scaler = preprocessing.StandardScaler()
    features = scaler.fit_transform(ratings.drop(columns=meta_columns))

    # The held-out rows are taken from the label-based selection directly
    # instead of feeding index labels into a positional .iloc lookup.
    inputs = scaler.transform(real_ratings.drop(columns=meta_columns))

    # kneighbors raises if more neighbours are requested than samples were
    # fitted, so k is capped for users with only a few ratings.
    k_neighbors = min(k_neighbors, len(ratings))

    # Set algorithm and params; p=2 is the NearestNeighbors default, so
    # 'minkowski' needs no special handling.
    knn = NearestNeighbors(metric=knn_metric, algorithm='brute', n_neighbors=k_neighbors, n_jobs=-1)
    knn.fit(csr_matrix(features))

    # Prediction -> get the k nearest neighbors of imdbID
    distances, indices = knn.kneighbors(inputs, n_neighbors=k_neighbors)

    # Look up the ratings of the neighbours; indices are positions in `ratings`
    neighbor_ratings = ratings['rating'].iloc[indices[0]]

    # Compute the rating of the movie from the ratings of its nearest
    # neighbours; mean=True yields the plain average of the neighbours
    pred = compute_rating(neighbor_ratings, distances, mean)

    # return rating prediction and real rating
    return pred, real_ratings['rating'].values[0]
    

In [12]:
#Keeping this for future testing
### Testing function for ToyStory###
#imdbID = 114709.0
# Caution: pass userID and imdbID as floats!! Users: 394, 1171, 3682
#userID = 394
#pred , real = predict_movie_rating(imdbID, userID, merged_data)

In [13]:
def get_data():
    """Return the module-level ``merged_data`` object (no copy is made)."""
    return merged_data

In [14]:
# Larger Test:

def test_predict_mr(no_test_samples, mean = False, knn_metric = 'cosine', set_k=False, k_neighbors=15, random_state=None):
    """Estimate the RMSE of ``predict_movie_rating`` on a random sample of ratings.

    Parameters
    ----------
    no_test_samples : int
        Number of rows sampled from ``merged_data``.
    mean : bool, default False
        Forwarded to ``predict_movie_rating``.
    knn_metric : str, default 'cosine'
        Forwarded to ``predict_movie_rating``.
    set_k : bool, default False
        Forwarded to ``predict_movie_rating``; previously this argument was
        ignored and ``False`` was always passed.
    k_neighbors : int, default 15
        Forwarded to ``predict_movie_rating``; previously ignored as well.
    random_state : optional
        Passed to ``DataFrame.sample`` to make the test sample reproducible.
        The default ``None`` keeps the sample random on every call.

    Returns
    -------
    float
        Root mean squared error between real and predicted ratings. The value
        is also printed.
    """
    # Sampling #no_test_samples random rows from the dataset for testing
    test_set = merged_data.sample(n=no_test_samples, random_state=random_state)

    # Collect results in plain lists instead of growing DataFrames row by row
    predictions = []
    reals = []

    for row in test_set.itertuples():
        pred, real = predict_movie_rating(
            row.imdbID, row.user_id, merged_data, mean, knn_metric,
            set_k=set_k, k_neighbors=k_neighbors,
        )
        predictions.append(pred)
        reals.append(real)

    # Take the root explicitly: the `squared=False` switch of
    # mean_squared_error is deprecated/removed in recent scikit-learn releases.
    rmse = math.sqrt(mean_squared_error(reals, predictions))
    print('RMSE: '+str(rmse))
    return float(rmse)

In [15]:
#test_predict_mr(50)

In [16]:
print("PCA:")
# Keep only the feature columns, standardise them and fit a full PCA on the result
merged_g = merged_data.drop(columns=['user_id', 'imdbID', 'rating'])
scaler = preprocessing.StandardScaler()
merged_gt = scaler.fit_transform(merged_g)
pca = PCA().fit(merged_gt)
# Formatted variance ratios of all components that explain more than 1 % of the variance
top_PCA = [
    "%.2f" % ratio
    for ratio in pca.explained_variance_ratio_
    if ratio > 0.01
]
print("Main Variance impacting factors:")
print(pca.explained_variance_ratio_)

PCA:
Main Variance impacting factors:
[0.1352569  0.09503712 0.07998231 0.06074159 0.04796725 0.04597668
 0.0430597  0.04134133 0.03750373 0.03703565 0.03502601 0.0336219
 0.03295991 0.03155028 0.0294954  0.02847374 0.02764069 0.02351684
 0.02326137 0.01913027 0.01828914 0.01731806 0.0156969  0.01428033
 0.01294528 0.00832974 0.0045619 ]


In [17]:
# Loadings of the first principal component, ordered by absolute weight
first_comp = pca.components_[0]
first_comps = pd.DataFrame(
    {'weights': first_comp, 'features': merged_g.columns}
)
first_comps['abs_weights'] = first_comps['weights'].abs()
first_comps = first_comps.sort_values(by='abs_weights', ascending=False)
print(first_comps)

     weights        features  abs_weights
2   0.460198      imdbRating     0.460198
5   0.428491      Metacritic     0.428491
4   0.422208  RottenTomatoes     0.422208
6   0.352119          Awards     0.352119
3   0.305617       imdbVotes     0.305617
1   0.277951         Runtime     0.277951
14  0.209256               7     0.209256
0  -0.148325            Year     0.148325
11 -0.143027               4     0.143027
25  0.120398              18     0.120398
17 -0.086768              10     0.086768
12  0.083021               5     0.083021
16  0.067442               9     0.067442
18  0.049997              11     0.049997
22 -0.047021              15     0.047021
20  0.044221              13     0.044221
26  0.042664              19     0.042664
19  0.035038              12     0.035038
13  0.021776               6     0.021776
7  -0.019760               0     0.019760
9   0.019544               2     0.019544
8   0.018165               1     0.018165
21  0.016990              14     0