In [3]:
import pandas as pd
import numpy as np
import json
import itertools as it
import pymongo
import ssl
from tqdm import tqdm

In [3]:
# Read the ratings table and remove its 'rating_date' column
all_ratings = pd.read_csv('/Users/irenebonafonte/Documents/MasterDS/AgileDS/GMAM_noGit/data_for_model/ratings.csv')
all_ratings = all_ratings.drop(columns=['rating_date'])
# Shape and distinct user/movie counts (not rendered: only a cell's last expression is displayed)
all_ratings.shape, all_ratings['userId'].nunique(), all_ratings['movieId'].nunique()

# Restrict the ratings to movies whose id is listed in the content-based index file
content_ids = pd.read_csv('content_based_index.txt')['movieId'].unique()
all_ratings = all_ratings.loc[all_ratings['movieId'].isin(content_ids)]

## Input-output split

Use 15 rated movies per user as user input (half of the user's ratings when the user has fewer than 30), and the rest as output to be predicted.

In [1]:
def assign_to_set(df, n_input=15, random_state=None):
    """Flag a random subset of one user's ratings as model input.

    Intended to be applied per user (e.g. through ``groupby('userId').apply``).
    When the frame has at least ``2 * n_input`` rows, exactly ``n_input`` rows
    are flagged; otherwise half of the rows (rounded down) are flagged.

    Parameters
    ----------
    df : pandas.DataFrame
        Ratings of a single user. An existing ``as_input`` column is kept and
        the sampled rows are set to True; if the column is missing it is
        created as all-False first.
    n_input : int, default 15
        Number of ratings to flag for users with enough ratings.
    random_state : object with a ``choice`` method, optional
        E.g. ``numpy.random.default_rng(seed)`` or ``numpy.random.RandomState``.
        When None, the global ``numpy.random`` state is used.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with ``as_input`` set to True on the sampled rows.
        The frame passed in is left untouched, because pandas does not support
        functions that mutate the group object inside ``groupby.apply``.
    """
    flagged = df.copy()
    if 'as_input' not in flagged.columns:
        # Without this, the assignment below would create a NaN/True object column.
        flagged['as_input'] = False

    n_rows = len(flagged.index)
    n = n_input if n_rows >= 2 * n_input else n_rows // 2
    if n == 0:
        # Zero or one rating: nothing can be held out as input.
        return flagged

    sampler = np.random if random_state is None else random_state
    sampled_ids = sampler.choice(flagged.index, size=n, replace=False)
    flagged.loc[sampled_ids, 'as_input'] = True
    return flagged

In [5]:
# Every rating starts as "output"; assign_to_set then flips a random sample per user to input
all_ratings['as_input'] = False
all_ratings = (
    all_ratings
    .groupby('userId', group_keys=False)
    .apply(assign_to_set)
)

In [6]:
# Number of ratings flagged as input (True) versus left as output (False)
all_ratings.as_input.value_counts()

False    21735290
True      2336230
Name: as_input, dtype: int64

## Train-test split

Use 20\% of users for validation; all ratings of these users are flagged with `for_testing`.

In [7]:
# Shuffle the distinct user ids and take the first 20% of them as test users.
# NOTE(review): no random seed is set in the visible cells, so the split changes on every run.
all_users = all_ratings['userId'].unique()
np.random.shuffle(all_users)
i = int(len(all_users) * 0.2)
testing_idx = all_users[:i]
# A rating is a test rating exactly when its user was drawn above
all_ratings['for_testing'] = all_ratings['userId'].isin(testing_idx)

In [8]:
# Number of ratings that belong to test users (True) versus the remaining users (False)
all_ratings.for_testing.value_counts()

False    19240007
True      4831513
Name: for_testing, dtype: int64

In [8]:
# Preview the first five rows, including the new as_input / for_testing flags
all_ratings.head(5)

Unnamed: 0,userId,movieId,rating,as_input,for_testing
0,1,296,5.0,False,False
1,1,306,3.5,True,False
2,1,307,5.0,False,False
3,1,665,5.0,False,False
4,1,899,3.5,False,False


### Input to binary

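We could use the median rating as the threshold for binarizing. However, it probably makes more sense to use the midpoint of the rating scale (2.5), because users are likely to have seen and rated more movies that they like than movies that they dislike. In the code below this is implemented as `rating < 3.0`: ratings of 2.5 or lower become 0, and ratings of 3.0 or higher become 1.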

In [9]:
# Frequency of each distinct rating value
all_ratings.rating.value_counts()

4.0    6447742
3.0    4731283
5.0    3507930
3.5    3017481
4.5    2118631
2.0    1581343
2.5    1189064
1.0     744688
1.5     375986
0.5     357372
Name: rating, dtype: int64

In [10]:
# Median rating, shown as a candidate binarization threshold
# (the next code cell reassigns `threshold` to a fixed value)
threshold = np.median(all_ratings.rating)
threshold

4.0

In [9]:
# Binarize the ratings: anything strictly below the threshold becomes 0, everything else 1.
# With ratings in 0.5 steps, `< 3.0` means ratings up to 2.5 are 0 and ratings from 3.0 up are 1.
threshold = 3.0
# Assign through a single column label. The previous list key (`[['binary_rating']]`) paired a
# one-element column list with a length-n 1-D array, which recent pandas versions reject with
# "Columns must be same length as key".
all_ratings['binary_rating'] = np.where(all_ratings.rating < threshold, 0, 1)
all_ratings.binary_rating.value_counts()

1    19823067
0     4248453
Name: binary_rating, dtype: int64

In [10]:
# Persist the ratings together with the as_input / for_testing / binary_rating columns.
# NOTE(review): the DataFrame index is written as well (no index=False); this presumably
# explains the 'Unnamed: 0' columns that appear when the files are read back - confirm
# against the downstream reader before changing it.
all_ratings.to_csv('/Users/irenebonafonte/Documents/MasterDS/AgileDS/GMAM_noGit/data_for_model/ratings_split.csv')

In [172]:
# Display the ratings frame with its flag and binary_rating columns
all_ratings

Unnamed: 0,userId,movieId,rating,as_input,for_testing,binary_rating
16490438,106973,1,5.0,False,False,1
18733588,121460,1,5.0,False,False,1
14486242,93811,1,5.0,True,False,1
11531945,74777,1,3.0,False,False,1
14486300,93812,1,3.0,False,False,1
...,...,...,...,...,...,...
24738933,160805,175813,3.5,False,False,1
15398772,99774,175813,4.5,False,False,1
8640426,56282,175813,3.5,False,True,1
19432215,126157,175813,4.0,False,False,1


## Split ratings for parallelization

In [11]:
# Partition the users into n shuffled, disjoint chunks and write each chunk's ratings
# to its own CSV file.
all_users = all_ratings.userId.unique()
np.random.shuffle(all_users)
n = 5
n_users = len(all_users)

for i in range(n):
    # Proportional integer bounds: chunk sizes differ by at most one and the last chunk
    # ends exactly at n_users. The former fixed chunk size int(n_users / n) silently
    # dropped the remainder users whenever n_users was not divisible by n.
    lo = i * n_users // n
    hi = (i + 1) * n_users // n
    users = all_users[lo:hi]
    ratings = all_ratings[all_ratings.userId.isin(users)]
    ratings.to_csv('/Users/irenebonafonte/Documents/MasterDS/AgileDS/GMAM_noGit/data_for_model/ratings_split'+str(i)+'.csv')
    print(i, lo, hi, ratings.shape)
    
    
    
    

0 0 32507 (4792944, 6)
1 32507 65014 (4797342, 6)
2 65014 97521 (4867544, 6)
3 97521 130028 (4834336, 6)
4 130028 162535 (4778217, 6)


## Merge ratings
After predicting all output ratings with `predict_all_ratings.py`, we merge the per-split files back into a single DataFrame.

In [3]:
# Read the five per-split prediction files and concatenate them in a single call.
# Growing the frame with pd.concat inside the loop re-copied all accumulated rows on
# every iteration and started from an empty placeholder frame.
# Row labels are kept as read from each file (no ignore_index), as before.
all_ratings = pd.concat(
    (
        pd.read_csv('/Users/irenebonafonte/Documents/MasterDS/AgileDS/GMAM_noGit/data_with_predRatings/binary2.5/ratings_split'+str(i)+'.csv')
        for i in range(5)
    ),
    axis=0,
)

# Multiply both prediction columns by 5
# (presumably the stored predictions are on a 0-1 scale - TODO confirm against predict_all_ratings.py).
pred_cols = ['CF_prediction', 'CB_prediction']
all_ratings[pred_cols] = all_ratings[pred_cols] * 5
# Simple average of the two predictions
all_ratings['mean_pred'] = (all_ratings.CF_prediction + all_ratings.CB_prediction)/2
all_ratings.to_csv('/Users/irenebonafonte/Documents/MasterDS/AgileDS/GMAM_noGit/data_with_predRatings/binary2.5/ratings_split.csv')

In [9]:
# Display the merged frame, now including CF_prediction, CB_prediction and mean_pred
all_ratings

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,userId,movieId,rating,as_input,for_testing,binary_rating,CF_prediction,CB_prediction,mean_pred
0,344223,1791877,11926,1,3.0,True,False,1,,,
1,2922461,15309970,99199,1,5.0,True,False,1,,,
2,2922307,15309191,99191,1,3.5,False,True,1,4.596685,5.000000,4.798343
3,2922218,15307897,99180,1,4.0,True,False,1,,,
4,514372,2722875,18083,1,5.0,False,False,1,4.695122,5.000000,4.847561
...,...,...,...,...,...,...,...,...,...,...,...
4778212,148472,803926,5413,175813,4.0,False,True,1,4.671533,2.702703,3.687118
4778213,3550434,18361997,118906,175813,4.0,False,False,1,4.347390,5.000000,4.673695
4778214,2773054,14381892,93118,175813,3.5,False,False,1,5.000000,5.000000,5.000000
4778215,2697417,13995764,90691,175813,4.0,False,False,1,4.835897,5.000000,4.917949


In [None]:
# Non-input ratings of user 300, ordered by actual rating (highest first), top 60 rows
all_ratings[(~all_ratings.as_input) & (all_ratings.userId == 300)].sort_values('rating', ascending=False).head(60)

In [11]:
def scale(df):
    """Min-max rescale the CF and CB prediction columns to the range 0-5.

    Each of ``CF_prediction`` and ``CB_prediction`` is transformed independently
    as ``5 * (x - min) / (max - min)``, with min and max taken over the rows of
    ``df`` (one user's rows when applied through ``groupby('userId').apply``).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the columns ``CF_prediction`` and ``CB_prediction``.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with both columns rescaled; the input frame is not
        modified. Missing predictions stay NaN. A column whose non-missing
        values are all equal becomes NaN, since 0/0 is undefined.
    """
    scaled = df.copy()
    for col in ('CF_prediction', 'CB_prediction'):
        # Series.min/max skip NaN. The builtin min()/max() are order-dependent with NaN
        # and return NaN when the first value is NaN, turning the whole column into NaN.
        lowest = scaled[col].min()
        highest = scaled[col].max()
        scaled[col] = 5 * (scaled[col] - lowest) / (highest - lowest)
    return scaled

In [14]:
#all_ratings = pd.DataFrame({})
#for i in range(5):
#    ratings = pd.read_csv('/Users/irenebonafonte/Documents/MasterDS/AgileDS/GMAM_noGit/data_with_predRatings/unEscaled/ratings_split'+str(i)+'.csv')
#    all_ratings = pd.concat([all_ratings, ratings], axis=0)

# Scale to 0-5
#all_ratings = all_ratings.groupby('userId', group_keys=False).apply(scale) 
#all_ratings['mean_pred'] = (all_ratings.CF_prediction + all_ratings.CB_prediction)/2
#all_ratings.to_csv('/Users/irenebonafonte/Documents/MasterDS/AgileDS/GMAM_noGit/data_with_predRatings/unEscaled/ratings_split.csv')

In [13]:
# Display the current contents of all_ratings
all_ratings

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,userId,movieId,rating,as_input,for_testing,binary_rating,CF_prediction,CB_prediction,mean_pred
0,0,0,11926,1,3.0,True,False,1,,,
0,0,4,18083,1,5.0,False,False,1,3.563536,1.183228,2.373382
0,0,6,99178,1,3.0,False,True,1,4.525424,2.420914,3.473169
0,0,3,99180,1,4.0,True,False,1,,,
0,0,2,99191,1,3.5,False,True,1,3.283063,1.445668,2.364365
...,...,...,...,...,...,...,...,...,...,...,...
4845374,3904525,19292137,59398,175813,2.0,False,False,-1,3.169291,2.652680,2.910986
4845375,3904526,19292139,161720,175813,4.5,False,False,1,3.525469,0.275089,1.900279
4845376,3904527,19292144,24356,175813,3.5,False,False,1,3.959732,1.216012,2.587872
4845377,977943,4792943,41940,175813,3.5,False,False,1,3.534483,2.433939,2.984211


In [None]:
# Non-input ratings of user 300, ordered by actual rating (highest first), top 60 rows
all_ratings[(~all_ratings.as_input) & (all_ratings.userId == 300)].sort_values('rating', ascending=False).head(60)