## TP1 - Q5
#### Mohammed Ramzi Bouthiba, Matricule 2065386
#### Mohamad Charara, Matricule 1720462
#### Vithor Bertalan, Matricule 2135362

##### Question 5 - Je suis un nouvel utilisateur. Vous connaissez ma profession, mon sexe et mon âge. Développez un algorithme bayésien pour recommander 10 films sur la base de ces trois catégories. Effectuez une validation croisée de 5 replis pour calculer la performance prédictive de l'approche (erreur quadratique). Considérez 2 catégories pour la prédiction, j'aime si le vote est 4 ou 5, je n'aime pas autrement et faites une pondération selon la fréquence des votes pour calculer l'erreur quadratique, et prenez comme catégories d'âge deux intervals délimités par 25 ans.

In [31]:
## Imports libraries
from scipy.sparse import csr_matrix
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")

# Loads CSVs into Pandas variables
u = pd.read_csv('u.csv', delimiter= '|')
votes = pd.read_csv('votes.csv', delimiter= '|')
items = pd.read_csv('items.csv', delimiter= '|', header=0)

## Excludes the first value from votes
votes['user.id'] -= 1
votes['item.id'] -= 1

## Adjusts columns in U datset
u.columns = [column.replace(" ", "") for column in u.columns];

## Calculates cross validation based on the number of folds given
def cross_validation(data, number_of_folds):
   
    ## Stores the original dataset
    sample_data = data.sample(frac=1)
    ## Creates variables to store the folds, and to guide the loop
    fold_size = len(sample_data) // number_of_folds
    ini, end, folds = 0, fold_size, []
    ## While it doesn't reach the number of folds, splits data
    for i in range(number_of_folds):
        folds.append(sample_data.iloc[ini:end, :])
        ini = end
        if (i + 1 == number_of_folds - 1):
            end = len(data)
        else:
            end += fold_size
    return folds

#### Part 1 - Calculate MSE with Bayesian approach

In [42]:
def calculate_error(data, folds, profession, sex, age):
    
    overall_error = []
    folds_data = cross_validation(data,folds)
    
    for fold in range(len(folds_data)):
        
        ## Gets the test-rating split
        test_data = folds_data[fold]

        ## Converts the rating data to 0
        rating_data = test_data.copy()
        rating_data['rating'] = np.nan

        ## Inserts that test-rating split back to the set, and builds train data and CSR matrix from it
        train_data = pd.concat(folds_data[:fold] + [rating_data] + folds_data[fold + 1:])              
        train_matrix = csr_matrix((train_data['rating'], (train_data['user.id'], train_data['item.id'])))

        ## The minimum grade that makes one person like a movie
        vote_min = 4

        ## Selects rows based on the parameters
        pro_rows = u[u["job"]==profession].index.tolist()
        sex_rows = u[u["gender"]==sex].index.tolist()

        ## Age must be delimited by 25 years
        age_rows = u[u["age"] > age-25][u["age"] < age+25].index.tolist()

        ## Calculates Bayes Theorem for each parameter, applying Laplace correction
        prob_job = np.sum(train_matrix[pro_rows,:] >= vote_min, axis=0).astype(float)+1 / np.sum(train_matrix >= vote_min, axis=0)+2
        prob_age = np.sum(train_matrix[age_rows,:] >= vote_min, axis=0).astype(float)+1 / np.sum(train_matrix >= vote_min, axis=0)+2
        prob_sex = np.sum(train_matrix[sex_rows,:] >= vote_min, axis=0).astype(float)+1 / np.sum(train_matrix >= vote_min, axis=0)+2
        prob_grade = np.sum(train_matrix >= vote_min, axis=0).astype(float)+1 / np.sum(train_matrix > 0, axis=0)+2
        comb_prob = np.multiply(prob_grade,np.multiply(prob_job,np.multiply(prob_age,prob_sex)))
        
        ## Applies correction for NaN (if a probability is NaN, becomes 0.5, since we do not know the like/dislike option)
        lap_prob = np.array(comb_prob)[0]
        nan_prob = np.nan_to_num(lap_prob)
        prob = np.where(nan_prob != 0, nan_prob, 0.5)
        
        err = np.full(prob.shape, np.nan)
        trans_matrix = train_matrix.transpose()

        ## For each of the probabilities
        for i in range(len(prob)):
            
            line = trans_matrix[i]
            
            ## Calculates the mean of likes (votes 4 and 5)
            mean_likes = np.mean(line[line >= vote_min])
            ## And calculates the mean of dislikes (votes 1, 2 and 3)
            mean_dislikes = np.mean(line[line < vote_min])
    
            ## Calculates prediction based on the equation given in class
            pred = min(max(((1-prob[i]) * mean_dislikes)  + (prob[i] * mean_likes), 1),5)
    
            ## If the probability is higher than 0.5, predicts a like
            if (prob[i] >= 0.5):
                dif = (mean_likes - pred)
        
            ## If the probability is lower than 0.5, predicts a dislike
            else:
                dif = (mean_dislikes - pred)

            ## Calculates fold error
            err[i] =  dif ** 2
            mean_mse = np.mean(err)
            
        overall_error.append(mean_mse)

    return np.mean(overall_error)
    


#### Calculates 5-fold cross validation for different parameters

In [43]:
print(calculate_error(votes,5,"engineer","M",40))
print(calculate_error(votes,5,"administrator","F",20))
print(calculate_error(votes,5,"student","M",25))
print(calculate_error(votes,5,"marketing","F",50))

0.6110706257674501
0.6111023549264666
0.6112938997377138
0.6109275865119641


#### Part 2 - Give 10 movie recommendations

In [49]:
def give_recommendations(data, profession, sex, age):
    
    ## Buils train matrix
    train_matrix = csr_matrix((votes.loc[:, 'rating'], (votes.loc[:, 'user.id'], votes.loc[:, 'item.id'])))
        
    ## The minimum grade that makes one person like a movie
    vote_min = 4

    ## Selects rows based on the parameters
    pro_rows = u[u["job"]==profession].index.tolist()
    sex_rows = u[u["gender"]==sex].index.tolist()

    ## Age must be delimited by 25 years
    age_rows = u[u["age"] > age-25][u["age"] < age+25].index.tolist()

    ## Calculates Bayes Theorem for each parameter, applying Laplace correction
    prob_job = np.sum(train_matrix[pro_rows,:] >= vote_min, axis=0).astype(float) / np.sum(train_matrix >= vote_min, axis=0)
    prob_age = np.sum(train_matrix[age_rows,:] >= vote_min, axis=0).astype(float) / np.sum(train_matrix >= vote_min, axis=0)
    prob_sex = np.sum(train_matrix[sex_rows,:] >= vote_min, axis=0).astype(float) / np.sum(train_matrix >= vote_min, axis=0)
    prob_grade = np.sum(train_matrix >= vote_min, axis=0).astype(float) / np.sum(train_matrix > 0, axis=0)
    comb_prob = np.multiply(prob_grade,np.multiply(prob_job,np.multiply(prob_age,prob_sex)))
    
    prob = np.array(comb_prob)[0]
    nb_nan = np.sum(np.isnan(prob))

    ## Recommends 10 movies
    print("************************************************************************")
    print("Movie recommendations for a/an {}, of gender {}, {} years old".format(profession,sex,age))
    movies = np.argsort(prob)[-(nb_nan + 11):-nb_nan][::-1]
    recom_set = items.loc[movies]
    print(recom_set[" movie title "])

In [50]:
give_recommendations(votes,"engineer","M",40)
give_recommendations(votes,"administrator","F",20)
give_recommendations(votes,"student","M",25)
give_recommendations(votes,"marketing","F",50)

************************************************************************
Movie recommendations for a/an engineer, of gender M, 40 years old
1541    Scarlet Letter, The (1926)
1143        Quiet Room, The (1996)
1448        Pather Panchali (1955)
1409                 Harlem (1993)
1267            Bitter Moon (1992)
1439          Above the Rim (1994)
1301          Late Bloomers (1996)
1533              Twin Town (1997)
1270                  North (1994)
785       Perez Family, The (1995)
1523          Kaspar Hauser (1993)
Name:  movie title , dtype: object
************************************************************************
Movie recommendations for a/an administrator, of gender F, 20 years old
1129                     Jupiter's Wife (1994)
1325                               Boys (1996)
1599                       Guantanamera (1994)
1268              Love in the Afternoon (1957)
1384    Roseanna's Grave (For Roseanna) (1997)
781                       Little Odessa (1994)
1263         