In [1]:
import pandas as pd
import numpy as np
from scipy.stats import rankdata
from ml_metrics import apk
from tqdm import tqdm

# Functions to evaluate performance

In [24]:
def rmse_evaluate(y_true, y_pred):
    """ RMSE-based predictive performance evaluation. """
    rmse = np.sqrt(np.nanmean(np.power(y_true - y_pred, 2)))
    return rmse
    

def map_evaluate(ids, y_pred, y_true, k=20, relevance_cut='median'):
    
    # Movies that we know that the user likes
    if relevance_cut == '2.5':
        relevant_movies = ids[y_true > 2.5]
        
    elif relevance_cut == 'median':
        median = np.median(y_true)
        relevant_movies = ids[(y_true > 2.5) & (y_true > median)]
    
    # K - movies that we recommend to the user
    rank = rankdata(-y_pred, method='min')
    shown_movies = ids[rank <= k]
    shown_movies = shown_movies[np.argsort(-y_pred[rank <= k])]
    
    # AP@K
    apk_v = apk(actual=relevant_movies.tolist(), predicted=shown_movies.tolist(), k=k)
    
    return apk_v

# Explore function used to make predictions

## Weighted mean

This is the most common formula. However, we see that it has some issues when working with only likes/dislikes. For instance, user 300 has only inputed likes, which results in all films having the same score.

In [200]:
all_ratings = pd.read_csv('/Users/irenebonafonte/Documents/MasterDS/AgileDS/GMAM_noGit/data_with_predRatings/binary2.5/ratings_split.csv')
all_ratings[['CF_prediction','CB_prediction']] = 5 * all_ratings[['CF_prediction','CB_prediction']]
all_ratings['mean_pred'] = (all_ratings.CF_prediction + all_ratings.CB_prediction)/2
all_ratings.loc[np.isnan(all_ratings.mean_pred),'mean_pred'] = all_ratings.loc[np.isnan(all_ratings.mean_pred),'CF_prediction']

# We can exclude input movies
all_ratings = all_ratings[~all_ratings.as_input]
all_ratings = all_ratings[~np.isnan(all_ratings.CF_prediction)]

all_ratings.drop(['Unnamed: 0'],axis=1,inplace=True)

all_ratings

Unnamed: 0,userId,movieId,rating,as_input,for_testing,binary_rating,CF_prediction,CB_prediction,mean_pred
2,99191,1,3.5,False,True,1,4.596685,5.000000,4.798343
4,18083,1,5.0,False,False,1,4.695122,5.000000,4.847561
5,151959,1,5.0,False,False,1,4.407449,4.319579,4.363514
6,99178,1,3.0,False,True,1,5.000000,5.000000,5.000000
7,99251,1,5.0,False,True,1,5.000000,5.000000,5.000000
...,...,...,...,...,...,...,...,...,...
24070378,5413,175813,4.0,False,True,1,4.671533,2.702703,3.687118
24070379,118906,175813,4.0,False,False,1,4.347390,5.000000,4.673695
24070380,93118,175813,3.5,False,False,1,5.000000,5.000000,5.000000
24070381,90691,175813,4.0,False,False,1,4.835897,5.000000,4.917949


In [None]:
all_ratings[(~all_ratings.as_input) & (all_ratings.userId == 300)].sort_values('rating', ascending=False)

## Sum likes weight, rest dislikes weight

Because this is not escaled to 0-5, we re-escale, giving 5 to the best rating and 0 to the lowest one.

In [187]:
all_ratings2 = pd.read_csv('/Users/irenebonafonte/Documents/MasterDS/AgileDS/GMAM_noGit/data_with_predRatings/unEscaled/ratings_split.csv')

# We can exclude input movies
all_ratings2 = all_ratings2[~all_ratings2.as_input]
all_ratings2 = all_ratings2[~np.isnan(all_ratings2.CF_prediction)]

all_ratings2.drop(['Unnamed: 0'],axis=1,inplace=True)

all_ratings2.loc[np.isnan(all_ratings2.mean_pred),'mean_pred'] = all_ratings2.loc[np.isnan(all_ratings2.mean_pred),'CF_prediction']

In [189]:
all_ratings2.head(5)

Unnamed: 0,userId,movieId,rating,as_input,for_testing,binary_rating,CF_prediction,CB_prediction,mean_pred
1,18083,1,5.0,False,False,1,3.563536,1.183228,2.373382
2,99178,1,3.0,False,True,1,4.525424,2.420914,3.473169
4,99191,1,3.5,False,True,1,3.283063,1.445668,2.364365
6,120383,1,3.5,False,False,1,3.43832,3.668616,3.553468
7,144678,1,5.0,False,True,1,3.835616,1.402845,2.619231


## Evaluate performance

In [191]:
# Random values as reference
all_ratings['random'] = 5*np.random.rand(all_ratings.shape[0])
all_ratings2['random'] = 5*np.random.rand(all_ratings2.shape[0])
all_ratings2

Unnamed: 0,userId,movieId,rating,as_input,for_testing,binary_rating,CF_prediction,CB_prediction,mean_pred,random
1,18083,1,5.0,False,False,1,3.563536,1.183228,2.373382,0.062352
2,99178,1,3.0,False,True,1,4.525424,2.420914,3.473169,1.252458
4,99191,1,3.5,False,True,1,3.283063,1.445668,2.364365,0.778574
6,120383,1,3.5,False,False,1,3.438320,3.668616,3.553468,3.507148
7,144678,1,5.0,False,True,1,3.835616,1.402845,2.619231,3.719423
...,...,...,...,...,...,...,...,...,...,...
24070378,59398,175813,2.0,False,False,-1,3.169291,2.652680,2.910986,2.108432
24070379,161720,175813,4.5,False,False,1,3.525469,0.275089,1.900279,3.252166
24070380,24356,175813,3.5,False,False,1,3.959732,1.216012,2.587872,4.820303
24070381,41940,175813,3.5,False,False,1,3.534483,2.433939,2.984211,1.331563


In [192]:
to_eval = ['CF_B_RMSE','CF_B_APK','CF_B_invAPK',
          'CB_B_RMSE','CB_B_APK','CB_B_invAPK',
          'MN_B_RMSE','MN_B_APK','MN_B_invAPK',
          'RD_B_RMSE','RD_B_APK','R_B_invAPK',           
          'CF_UE_RMSE','CF_UE_APK','CF_UE_invAPK',
          'CB_UE_RMSE','CB_UE_APK','CB_UE_invAPK',
          'MN_UE_RMSE','MN_UE_APK','MN_UE_invAPK',
          'RD_UE_RMSE','RD_UE_APK','RD_UE_invAPK',]

users = all_ratings.userId.unique()
np.random.shuffle(users)
users = users[0:10000]
performance = np.zeros((len(users), len(to_eval)))
performance.shape

(10000, 24)

In [196]:
import warnings
warnings.filterwarnings('ignore')

i = 0
for user in tqdm(users):
    
    # 1st version
    ratings = all_ratings[(~all_ratings.as_input) & (all_ratings.userId == user)]
    
    # CF
    performance[i,0] = rmse_evaluate(ratings.rating.values, ratings.CF_prediction.values)
    performance[i,1] = map_evaluate(ratings.movieId.values, ratings.CF_prediction.values, ratings.rating.values)
    performance[i,2] = map_evaluate(ratings.movieId.values, ratings.rating.values, ratings.CF_prediction.values)
    
    # CB
    performance[i,3] = rmse_evaluate(ratings.rating.values, ratings.CB_prediction.values)
    performance[i,4] = map_evaluate(ratings.movieId.values, ratings.CB_prediction.values, ratings.rating.values)
    performance[i,5] = map_evaluate(ratings.movieId.values, ratings.rating.values, ratings.CB_prediction.values)
    
    # Mean
    performance[i,6] = rmse_evaluate(ratings.rating.values, ratings.mean_pred.values)
    performance[i,7] = map_evaluate(ratings.movieId.values, ratings.mean_pred.values, ratings.rating.values)
    performance[i,8] = map_evaluate(ratings.movieId.values, ratings.rating.values, ratings.mean_pred.values)
    
    # Random
    performance[i,9] = rmse_evaluate(ratings.rating.values, ratings.random.values)
    performance[i,10] = map_evaluate(ratings.movieId.values, ratings.random.values, ratings.rating.values)
    performance[i,11] = map_evaluate(ratings.movieId.values, ratings.rating.values, ratings.random.values)
    
    # 2nd version
    ratings = all_ratings2[(~all_ratings2.as_input) & (all_ratings2.userId == user)]
    
    # CF
    performance[i,12] = rmse_evaluate(ratings.rating.values, ratings.random.values)
    performance[i,13] = map_evaluate(ratings.movieId.values, ratings.random.values, ratings.rating.values)
    performance[i,14] = map_evaluate(ratings.movieId.values, ratings.rating.values, ratings.random.values)
    
    # CB
    performance[i,15] = rmse_evaluate(ratings.rating.values, ratings.random.values)
    performance[i,16] = map_evaluate(ratings.movieId.values, ratings.random.values, ratings.rating.values)
    performance[i,17] = map_evaluate(ratings.movieId.values, ratings.rating.values, ratings.random.values)
    
    # Mean
    performance[i,18] = rmse_evaluate(ratings.rating.values, ratings.random.values)
    performance[i,19] = map_evaluate(ratings.movieId.values, ratings.random.values, ratings.rating.values)
    performance[i,20] = map_evaluate(ratings.movieId.values, ratings.rating.values, ratings.random.values)
    
    # Random
    performance[i,21] = rmse_evaluate(ratings.rating.values, ratings.random.values)
    performance[i,22] = map_evaluate(ratings.movieId.values, ratings.random.values, ratings.rating.values)
    performance[i,23] = map_evaluate(ratings.movieId.values, ratings.rating.values, ratings.random.values)     
    
    i += 1
    

100%|██████████| 10000/10000 [06:07<00:00, 27.22it/s]


In [198]:
performance_vals = np.nanmean(performance, axis=1)

for measure, value in zip(to_eval, performance_vals):
    print(measure + ': ' + str(value))

CF_B_RMSE: 0.8166093325659108
CF_B_APK: 0.711214025728844
CF_B_invAPK: 0.8219113396776644
CB_B_RMSE: 0.9997190125907296
CB_B_APK: 0.9960082197375236
CB_B_invAPK: 1.0439697086230444
MN_B_RMSE: 1.0191661189430616
MN_B_APK: 0.8609590195134385
MN_B_invAPK: 0.968924860013891
RD_B_RMSE: 0.8115897933121587
RD_B_APK: 0.9170072451863032
R_B_invAPK: 0.792741285495648
CF_UE_RMSE: 0.9113751565040848
CF_UE_APK: 0.8467611060852042
CF_UE_invAPK: 0.8137238492819069
CB_UE_RMSE: 0.6431478711365971
CB_UE_APK: 0.8099076006224705
CB_UE_invAPK: 0.8895888990207309
MN_UE_RMSE: 0.896036421441427
MN_UE_APK: 0.7289962651292171
MN_UE_invAPK: 0.9913243171859474
RD_UE_RMSE: 0.725927224220371
RD_UE_APK: 0.7365396738548534
RD_UE_invAPK: 0.8863702213901089


## Add additional evidence

In [31]:
metadata = pd.read_csv('../data/input/movies.csv')
metadata.drop(['imdbId','tmdbId','directors','title','genres'], axis=1, inplace=True)
metadata.head()

directors = pd.read_csv('../data/input/directors.csv')
directors.drop(['birthYear','deathYear'], axis=1, inplace=True)
directors.head()

metadata = pd.merge(metadata, directors, on='directorId')
metadata.drop(['directorId'], axis=1, inplace=True)
metadata.head(3)

Unnamed: 0,movieId,year,num_ratings,avg_rating,isAdult,runtimeMinutes,no_genres,action,adventure,animation,...,horror,imax,musical,mystery,romance,sci-fi,thriller,war,western,name
0,1,1995.0,57309,3.89,0.0,81.0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,John Lasseter
1,45517,2006.0,8147,3.33,0.0,117.0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,John Lasseter
2,2,1995.0,24228,3.25,0.0,104.0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,Joe Johnston


In [13]:
len(users)

50000

In [32]:
user_input = pd.read_csv('/Users/irenebonafonte/Documents/MasterDS/AgileDS/GMAM_noGit/data_for_model/ratings_split.csv')
user_input = user_input[user_input.as_input]
user_input = user_input.drop(['Unnamed: 0'], axis=1)
user_input = pd.merge(user_input, metadata, on='movieId')
user_input.loc[user_input.binary_rating == 0,'binary_rating'] = -1

# Save important directors
directors = user_input.name.value_counts()[user_input.name.value_counts() >10000].index.values.tolist()

# Subset
users = user_input.userId.unique()[0:50000]
#user_input = user_input[user_input.userId.isin(users)]

In [36]:
all_ratings = pd.read_csv('/Users/irenebonafonte/Documents/MasterDS/AgileDS/GMAM_noGit/data_with_predRatings/unEscaled/ratings_split.csv')
all_ratings = all_ratings[all_ratings.userId.isin(users)]
all_ratings.drop(['Unnamed: 0'], axis=1, inplace=True)
all_ratings = pd.merge(all_ratings, metadata, on='movieId')
all_ratings.loc[all_ratings.binary_rating == 0,'binary_rating'] = -1
all_ratings = all_ratings[~all_ratings.as_input]

input_meta = pd.read_csv('../../GMAM_noGit/data_for_hybrid/input_summary.csv')
input_meta = input_meta.set_index('Unnamed: 0', drop=True)
input_meta = input_meta.loc[users]

In [5]:
# Columns to be filled as user-input stats
review_summary_cols = user_input.columns.values[6:31].tolist()
other_columns = ['total', 'year_SD', 'num_ratings_SD', 'avg_rating_SD', 'runtimeMinutes_SD']
columns = np.array(other_columns + review_summary_cols + directors, dtype='object')
columns = ('L_' + columns).tolist() + ('D_' + columns).tolist()
genres = user_input.columns.values[11:31]

# Data frame to store results
input_meta = pd.DataFrame(np.zeros((len(users),len(columns))),
                          index=users, columns=columns)

In [7]:
for user in tqdm(users):
    for rate in [-1, 1]:
        df = user_input[(user_input.userId == user) & (user_input.binary_rating == rate)]

        stats = [len(df), np.std(df.year), np.std(df.num_ratings), np.std(df.avg_rating), np.std(df.runtimeMinutes), np.mean(df.year), np.mean(df.num_ratings), 
         np.mean(df.avg_rating), np.sum(df.isAdult), np.mean(df.runtimeMinutes)]

        n_genres = [np.sum(df[genre]) for genre in genres]
        n_directors = [np.sum(df.name == director) for director in directors]

        stats = stats + n_genres + n_directors

        if rate == 1:
            input_meta.loc[user][0:69] = stats

        else:
            input_meta.loc[user][69:138] = stats




100%|██████████| 162538/162538 [36:54<00:00, 73.40it/s] 


In [8]:
input_meta.to_csv('../../GMAM_noGit/data_for_hybrid/input_summary.csv')
input_meta.head(5)

Unnamed: 0,L_total,L_year_SD,L_num_ratings_SD,L_avg_rating_SD,L_runtimeMinutes_SD,L_year,L_num_ratings,L_avg_rating,L_isAdult,L_runtimeMinutes,...,D_Jan de Bont,D_Tony Scott,D_Richard Donner,D_Michael Bay,D_Mel Gibson,D_Brian De Palma,D_Andrew Stanton,D_Sam Mendes,D_Andrew Davis,D_Barry Levinson
1,13.0,21.61689,10445.550837,0.208636,26.016153,1984.692308,10497.846154,3.803077,0.0,120.076923,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,12.0,1.656217,11653.647215,0.402054,14.052481,1993.583333,20458.416667,3.521667,0.0,114.166667,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
179,15.0,1.306395,22478.014262,0.305493,18.351627,1993.6,20052.866667,3.732667,0.0,123.533333,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
332,9.0,22.788886,16889.776224,0.209025,14.204851,1980.333333,17107.555556,3.825556,0.0,111.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
523,14.0,9.613978,8285.257345,0.401025,36.04051,1991.0,13545.857143,3.679286,0.0,134.285714,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [37]:
# Info about the movies that we want to predict
# Director to dummy
all_ratings.loc[~all_ratings.name.isin(directors),'name'] = np.nan
director_df = pd.get_dummies(all_ratings.name, prefix='director')

all_ratings = pd.concat([all_ratings, director_df], axis=1)
all_ratings.drop(['movieId','as_input','name','binary_rating','mean_pred'], axis=1, inplace=True)
all_ratings = all_ratings[~np.isnan(all_ratings.CB_prediction)]
all_ratings = all_ratings[~np.isnan(all_ratings.CF_prediction)]

all_ratings = all_ratings.set_index('userId', drop=True)
all_ratings.head(3)

Unnamed: 0_level_0,movieId,rating,as_input,for_testing,binary_rating,CF_prediction,CB_prediction,mean_pred,year,num_ratings,...,director_Roland Emmerich,director_Ron Howard,director_Sam Mendes,director_Stanley Kubrick,director_Steven Spielberg,director_Terry Gilliam,director_Tim Burton,director_Tom Shadyac,director_Tony Scott,director_Wolfgang Petersen
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
120383,1,3.5,False,False,1,3.43832,3.668616,3.553468,1995.0,57309,...,0,0,0,0,0,0,0,0,0,0
152300,1,4.0,False,True,1,3.39939,1.351103,2.375247,1995.0,57309,...,0,0,0,0,0,0,0,0,0,0
10237,1,3.0,False,False,1,3.158458,0.940342,2.0494,1995.0,57309,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# Merge input movies summary-info with output movies info
all_ratings = pd.merge(all_ratings, input_meta, left_index=True, right_index=True)
print('done')
# all_ratings.to_csv('../../GMAM_noGit/data_for_hybrid/ratings_split.csv')

columns = all_ratings.columns[(all_ratings.columns != 'rating') & (all_ratings.columns != 'for_testing')].values
train_y = all_ratings.loc[~all_ratings.for_testing,'rating'].values
train_x = all_ratings[~all_ratings.for_testing].drop(['rating','for_testing'], axis=1, inplace=False).to_numpy()

# test_ids = all_ratings.loc[all_ratings.for_testing,'movieId'].values
test_y = all_ratings.loc[all_ratings.for_testing,'rating'].values
test_x = all_ratings[all_ratings.for_testing].drop(['rating','for_testing'], axis=1, inplace=False).to_numpy()

# when there are no dislikes, we have nans
train_x[np.isnan(train_x)] = 0
test_x[np.isnan(test_x)] = 0


# del all_ratings

train_x.shape, train_y.shape, test_x.shape, test_y.shape

In [11]:
from sklearn.ensemble import GradientBoostingRegressor

regr = GradientBoostingRegressor(random_state=0, n_estimators=100, verbose=True)
print('fiting')
regr.fit(train_x, train_y)
print('predicting')
predictions = regr.predict(test_x)
print('scoring')
regr.score(test_x, test_y)

fiting
      Iter       Train Loss   Remaining Time 
         1           1.0413           63.34m
         2           1.0035           63.13m
         3           0.9720           62.69m
         4           0.9454           62.14m
         5           0.9231           61.67m
         6           0.9043           61.02m
         7           0.8882           60.33m
         8           0.8745           59.54m
         9           0.8628           58.84m
        10           0.8526           58.17m
        20           0.8020           52.30m
        30           0.7859           45.30m
        40           0.7791           38.44m
        50           0.7754           31.76m
        60           0.7730           25.28m
        70           0.7714           18.89m
        80           0.7702           12.52m
        90           0.7692            6.23m
       100           0.7684            0.00s
predicting
scoring


0.2845411319327076

In [13]:
from joblib import dump, load
dump(regr, 'GBR100.joblib') 

['GBR100.joblib']

In [19]:
x = pd.DataFrame({'variable': columns, 'importance': regr.feature_importances_})
x.sort_values('importance', ascending=False).head(50)

Unnamed: 0,variable,importance
4,avg_rating,0.585433
135,D_total,0.179223
0,CF_prediction,0.097258
138,D_avg_rating_SD,0.044473
142,D_avg_rating,0.022801
73,L_avg_rating,0.019706
66,L_total,0.011567
136,D_year_SD,0.010934
140,D_year,0.003871
69,L_avg_rating_SD,0.002613


In [27]:
# all_ratings = pd.read_csv('/Users/irenebonafonte/Documents/MasterDS/AgileDS/GMAM_noGit/data_with_predRatings/unEscaled/ratings_split.csv')
users = user_input.userId.unique()[0:50000]
all_ratings = all_ratings[all_ratings.userId.isin(users)]
ids = all_ratings.loc[all_ratings.for_testing,'movieId'].values

In [28]:
len(ids)

1320109

In [51]:
predictions

array([3.86012045, 3.44662285, 3.47272372, ..., 3.30698905, 3.34216425,
       3.64085851])

In [50]:
test_x.shape

(1059040, 204)