# Content-based Filtering

Approach:
- Table mit Features und Ratings erstellen, ohne User - auf Basis des OMDB Datasets

- sklearn-kNN mit cosine-similarity darauf anwenden

- Funktion schreiben, die auf Basis der Ratings der Neighbours das Rating eines Filmes vorhersagt.


Sources:

#### https://heartbeat.fritz.ai/recommender-systems-with-python-part-i-content-based-filtering-5df4940bd831

#### https://www.kaggle.com/johnwill225/movie-recommendations

#### https://towardsdatascience.com/how-we-built-a-content-based-filtering-recommender-system-for-music-with-python-c6c3b1020332


In [None]:
import pandas as pd
import numpy as np
import string
import math
from sklearn.neighbors import NearestNeighbors
from scipy.sparse import csr_matrix
from sklearn import preprocessing
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.decomposition import PCA

from knn_preprocessing import knn_preprocessing

# Keyword arguments carrying a fixed random seed. Not referenced in the
# visible cells of this notebook -- presumably meant to be passed on to
# estimators/samplers for reproducibility (TODO confirm).
kwargs = dict(random_state=42)

In [None]:
# Load the input tables. All paths are relative to the notebook's working
# directory: three preprocessed CSVs (movies, OMDB data, ratings) and the raw
# genres CSV.
movies = pd.read_csv('../../data/preprocessed/movies_id_updated.csv')
omdb = pd.read_csv('../../data/preprocessed/omdb_cleaned.csv')
ratings = pd.read_csv('../../data/preprocessed/ratings_clean_std_0.csv')
genres = pd.read_csv('../../data/raw/genres.csv', sep=',')

In [None]:
# Drop the Spanish title, the picture URLs and the Rotten Tomatoes id; none of
# them is referenced in the visible cells below.
movies = movies.drop(columns={'spanishTitle','imdbPictureURL','rtID','rtPictureURL'})

In [None]:
# Remove every 'tt' occurrence from the IMDb ids (the usual 'tt' prefix) and
# convert the remaining digits to float.
# NOTE(review): float presumably matches the imdbID dtype used in merged_data
# (the test cell further down asks for float ids) -- confirm.
movies['imdbID'] = movies['imdbID'].str.replace(r'tt', '')
movies['imdbID'] = movies['imdbID'].astype(float)

In [None]:
# Two-column lookup table: the movies table's 'id' (renamed to 'movieID')
# next to the numeric IMDb id.
mapping = movies[['id', 'imdbID']].rename(columns={'id':'movieID'})

In [None]:
# Notebook display: list the column names of the OMDB table.
omdb.columns

In [None]:
# Preprocessing was moved into its own script (knn_preprocessing).
# The list names the OMDB columns handed to it. The cells below expect the
# result to contain 'user_id', 'imdbID' and 'rating' columns plus the feature
# columns. NOTE(review): the exact output schema is defined in
# knn_preprocessing, which is not visible here -- confirm.
merged_data = knn_preprocessing(['imdbID', 'Year', 'Runtime', 'Language', 'imdbRating', 'imdbVotes', 'Rotten Tomatoes', 'Metacritic',
       'Series', 'PG_Rating', 'Oscars_won', 'Oscars_nominated',
       'Golden_globe_won', 'Golden_globe_nominated'])

In [None]:
# Notebook display: preview the merged table.
merged_data

In [None]:
# function that computes a rating based on the neighbors
def compute_rating(neighbors, distances, mean=False):
    """Predict a rating from the ratings of the nearest neighbours.

    Args:
        neighbors: Ratings of the neighbours (pandas Series or NumPy array),
            in the same order as ``distances[0]``.
        distances: 2-D array of neighbour distances as returned by
            ``NearestNeighbors.kneighbors``; only the first row is used.
        mean: If true, return the unweighted mean of the neighbour ratings.

    Returns:
        The predicted rating as a ``float``.
    """
    if mean:
        return float(neighbors.mean())

    dist = np.asarray(distances[0], dtype=float)
    mean_dist = dist.mean()

    # All neighbours coincide with the query point: the weight formula would
    # compute 0/0 and return NaN. Every neighbour is equally close, so the
    # plain mean is the natural answer.
    if mean_dist == 0:
        return float(neighbors.mean())

    # Weight = 1 + (1 - d / mean(d)): closer than average -> weight > 1,
    # farther than average -> weight < 1. The weights sum to the number of
    # neighbours, so dividing by that count yields a weighted mean.
    # Note that a neighbour farther than twice the mean distance receives a
    # negative weight.
    weights = 2.0 - dist / mean_dist
    weighted_sum = np.sum(np.asarray(neighbors, dtype=float) * weights)

    return float(weighted_sum / len(neighbors))

In [None]:
# First Approach for easy k tuning - use method later to implement in depth tuning of k

def adjust_k(ratings):
    """Pick the number of neighbours from the number of available ratings.

    Tiers (upper bounds inclusive):
        up to 40 ratings   -> 10
        41 .. 100          -> 15
        101 .. 500         -> 20
        501 .. 1500        -> 25
        more than 1500     -> 30

    The previous version used strict comparisons on both sides of every tier,
    so sizes of exactly 100, 500 or 1500 matched no branch and silently fell
    back to 10.

    Args:
        ratings: Any sized collection of ratings (only ``len`` is used).

    Returns:
        The number of neighbours to use as an ``int``.
    """
    r_size = len(ratings)

    # Checked from the largest tier down so each size matches exactly one tier.
    if r_size > 1500:
        return 30
    if r_size > 500:
        return 25
    if r_size > 100:
        return 20
    if r_size > 40:
        return 15
    return 10

In [None]:
## function that predicts the rating of a movie from its imdbID and its nearest neighbors

def predict_movie_rating(imdbID, userID,user_data=merged_data, mean=False, knn_metric='cosine', set_k=False, k_neighbors=15):
    """Predict a user's rating of one movie from the user's other ratings.

    The rating the user actually gave to ``imdbID`` is held out, a kNN model
    is fitted on the standardized features of the user's remaining movies, and
    the ratings of the nearest neighbours are combined by ``compute_rating``.

    Args:
        imdbID: IMDb id of the movie to predict (same dtype as
            ``user_data['imdbID']``).
        userID: Id of the user (matched against ``user_data['user_id']``).
        user_data: Table with 'user_id', 'imdbID' and 'rating' columns; every
            other column is treated as a numeric feature.
        mean: Forwarded to ``compute_rating``; if true, the unweighted mean of
            the neighbour ratings is used.
        knn_metric: Distance metric handed to ``NearestNeighbors``.
        set_k: If true use ``k_neighbors`` as given, otherwise derive k from
            the number of ratings of the user via ``adjust_k``.
        k_neighbors: Number of neighbours when ``set_k`` is true.

    Returns:
        ``(predicted_rating, real_rating)``, or the string
        ``"User has not rated other movies. Check input"`` when the user has
        no ratings besides the held-out one.

    Raises:
        ValueError: If the user has no rating for ``imdbID`` in ``user_data``.
    """
    non_feature_cols = ['imdbID', 'user_id', 'rating']

    # Select all ratings given by this user.
    ratings = user_data.loc[user_data['user_id'] == userID]

    # Unless an explicit k is requested, derive it from the user's rating
    # count (counted before the target movie is removed, as before).
    if not set_k:
        k_neighbors = adjust_k(ratings)

    # Hold out the real rating of the target movie; it is returned for
    # validation and supplies the query features.
    is_target = ratings['imdbID'] == imdbID
    real_ratings = ratings.loc[is_target]
    ratings = ratings.loc[~is_target]

    if real_ratings.empty:
        raise ValueError(
            'No rating of movie ' + str(imdbID) + ' by user ' + str(userID) + ' found'
        )

    # This check must run before scaling: StandardScaler refuses to fit on
    # zero samples, which previously made this branch unreachable.
    if ratings.empty:
        return "User has not rated other movies. Check input"

    # Standardize the features on the user's remaining movies and apply the
    # same scaling to the target movie. The target rows are taken by label
    # from the user's own ratings, so this works for any index on user_data.
    scaler = preprocessing.StandardScaler()
    train_features = scaler.fit_transform(ratings.drop(columns=non_feature_cols))
    query_features = scaler.transform(real_ratings.drop(columns=non_feature_cols))

    # kneighbors raises when asked for more neighbours than fitted samples.
    k_neighbors = min(k_neighbors, len(ratings))

    # p=2 is NearestNeighbors' default, so 'minkowski' needs no special case.
    knn = NearestNeighbors(metric=knn_metric, algorithm='brute', n_neighbors=k_neighbors, n_jobs=-1)
    knn.fit(csr_matrix(train_features))

    # Nearest neighbours of the target movie among the user's other movies.
    distances, indices = knn.kneighbors(query_features, n_neighbors=k_neighbors)

    # The returned indices are positions in the fitted matrix, i.e. row
    # positions in ``ratings``. If several rows matched the target, only the
    # first query row is used.
    neighbor_ratings = ratings['rating'].iloc[indices[0]]

    pred = compute_rating(neighbor_ratings, distances, mean)

    # Return the prediction together with the real rating.
    return pred, real_ratings['rating'].values[0]
    

In [None]:
#Keeping this for future testing
### Testing function for ToyStory###
#imdbID = 114709.0
# Caution: pass userID and imdbID as float!! Users: 394, 1171, 3682
#userID = 394
#pred , real = predict_movie_rating(imdbID, userID, merged_data)

In [None]:
# Larger Test:

def test_predict_mr(no_test_samples, mean = False, knn_metric = 'cosine', set_k=False, k_neighbors=15):
    """Estimate the RMSE of ``predict_movie_rating`` on a random sample.

    Draws ``no_test_samples`` random rows from ``merged_data``, predicts the
    rating of each (user, movie) pair and compares it with the real rating.

    Args:
        no_test_samples: Number of rows to sample from ``merged_data``.
        mean: Forwarded to ``predict_movie_rating``.
        knn_metric: Forwarded to ``predict_movie_rating``.
        set_k: Forwarded to ``predict_movie_rating`` (previously ignored: the
            call hard-coded ``set_k=False``).
        k_neighbors: Forwarded to ``predict_movie_rating`` (previously
            ignored: the call hard-coded ``k_neighbors=15``).

    Returns:
        The root mean squared error as a ``float``; it is also printed.
    """
    # Sample random rows from the dataset for testing.
    test_set = merged_data.sample(n=no_test_samples)

    predictions = []
    reals = []

    # Generate one prediction per sampled (user, movie) pair.
    # TODO get rid of ugly for-loop
    for row in test_set.itertuples():
        pred, real = predict_movie_rating(
            row.imdbID, row.user_id, merged_data, mean, knn_metric,
            set_k=set_k, k_neighbors=k_neighbors,
        )
        predictions.append(pred)
        reals.append(real)

    # The root is taken by hand because the `squared=False` argument of
    # mean_squared_error is deprecated and removed in recent scikit-learn.
    rmse = mean_squared_error(reals, predictions) ** 0.5
    print('RMSE: '+str(rmse))
    return float(rmse)

In [None]:
#test_predict_mr(50)

In [None]:
# Principal component analysis of the feature columns.
print("PCA:")
# Keep only the feature columns (drop the ids and the target rating).
merged_g=merged_data.drop(columns={'user_id','imdbID','rating'})
# Standardize the features before fitting the PCA.
scaler = preprocessing.StandardScaler()
merged_gt = scaler.fit_transform(merged_g)
# No n_components given, so PCA uses its default number of components.
pca = PCA().fit(merged_gt)
# Explained-variance ratios above 1 %, formatted with two decimals.
# NOTE(review): top_PCA is computed but never printed; the print below
# outputs the full, unfiltered ratio array instead -- confirm which was intended.
top_PCA=["%.2f" % a for a in pca.explained_variance_ratio_ if a >0.01]
print("Main Variance impacting factors:")
print(pca.explained_variance_ratio_)

In [None]:
# Rank the features by the magnitude of their weight in the first principal
# component and print the resulting table.
first_comp = pca.components_[0]
first_comps = pd.DataFrame(
    list(zip(first_comp, merged_g.columns)),
    columns=['weights', 'features'],
)
first_comps['abs_weights'] = first_comps['weights'].abs()
first_comps = first_comps.sort_values(by='abs_weights', ascending=False)
print(first_comps)