In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import pickle as pkl
import sys
import seaborn as sns

from fastFM import als
from fastFM.datasets import make_user_item_regression
from scipy.sparse import csc_matrix, hstack, vstack
from sklearn.preprocessing import MultiLabelBinarizer, OneHotEncoder, LabelEncoder
from sklearn.metrics import accuracy_score, roc_auc_score, mean_absolute_error
from hyperopt import tpe, fmin, hp, Trials, STATUS_OK
from collections import OrderedDict

% matplotlib inline
sns.set_style('whitegrid')

### Import data

In [60]:
# import full data
data_path = os.path.join('..', 'data-2')
splits_path = os.path.join(data_path,'splits')
sparse_path = os.path.join(data_path, 'sparse')
columns = ['user','item','rating']

with open(os.path.join(splits_path, 'train.df'), 'rb') as file_in:
    train_df = pkl.load(file_in)
    
with open(os.path.join(splits_path, 'dev.df'), 'rb') as file_in:
    cv_df = pkl.load(file_in)
    
with open(os.path.join(splits_path, 'test.df'), 'rb') as file_in:
    test_df = pkl.load(file_in)
    
# import sparse data    
sparse = dict()
features = ['actors', 'country', 'directors-imdb', 'genres-amazon', 'genres-imdb', 'language', 'mpaa',
           'studios-amazon', 'studios-imdb', 'type', 'user-item']
for feature in features:
    with open(os.path.join(sparse_path, feature + '.dict'), 'rb') as file_in:
        sparse[feature] = pkl.load(file_in)
        
# import results from LSH and Baseline
results_lsh_train = pd.read_csv(os.path.join(data_path, 'results_lsh_train.csv'))
results_lsh_cv = pd.read_csv(os.path.join(data_path, 'results_lsh_cv.csv'))
results_lsh_test = pd.read_csv(os.path.join(data_path, 'results_lsh_test.csv'))

results_baseline_train = pd.read_csv(os.path.join(data_path, 'results_baseline_train.csv'))
results_baseline_cv = pd.read_csv(os.path.join(data_path, 'results_baseline_cv.csv'))
results_baseline_test = pd.read_csv(os.path.join(data_path, 'results_baseline_test.csv'))

train_df['pred_lsh'] = results_lsh_train['prediction'].values
train_df['pred_baseline'] = results_baseline_train['pred'].values

cv_df['pred_lsh'] = results_lsh_cv['prediction'].values
cv_df['pred_baseline'] = results_baseline_cv['pred'].values

test_df['pred_lsh'] = results_lsh_test['prediction'].values
test_df['pred_baseline'] = results_baseline_test['pred'].values

In [61]:
train_df

Unnamed: 0,item,user,rating,title,box_office,country,language,metascore,mpaa_rating,runtime,...,imdb_votes,directors,genres_amazon,actors,studios_amazon,sales_rank,pred_lsh,pred_baseline,pred_fm_ratings,pred_fm_lsh
36684,6300214575,A1O60Q7LBN4MQ,4.0,Testament,,[usa],[english],,PG,90.0,...,4849.0,[lynne littman],[],"[jane alexander, william devane, rossie harris...",[paramount],262065.0,3.658580,4.263574,4.211944,4.236853
307578,B005LAIIA8,A16IB3AQZQKKK0,5.0,Frankenweenie,35287788.0,[usa],[english],74.0,PG,87.0,...,81994.0,[tim burton],[],"[winona ryder, catherine o'hara, martin short,...","[buena, vista]",76122.0,3.945169,4.002513,3.987346,3.972175
1904,B004UXUX4Q,A3SS6VRWCTB7V,4.0,The Descendants,78500000.0,[usa],[english],84.0,R,115.0,...,214728.0,[alexander payne],[drama],"[george clooney, judy greer]","[fox, searchlight]",8233.0,3.212189,3.742103,3.728722,3.736297
261125,B00005V1WW,A1TO1P3NV7OAU6,4.0,The One,,[usa],[english],25.0,PG-13,87.0,...,82150.0,[james wong],"[science, fiction]","[jason statham, delroy lindo, carla gugino, je...","[sony, picture, home, entertainment]",65673.0,4.365854,4.194401,4.313058,4.173425
30998,6301404440,A1K3NQAO49LXL2,5.0,Who Framed Roger Rabbit,,[usa],[english],83.0,PG,104.0,...,154844.0,[robert zemeckis],[],"[bob hoskins, christopher lloyd, joanna cassid...","[walt, disney, video]",130330.0,4.218719,4.636631,4.749815,4.742123
381514,0790731886,A11FX8HL2ANK6T,5.0,Arthur,,[usa],[english],69.0,PG,97.0,...,22718.0,[steve gordon],[comedy],"[dudley moore, liza minnelli, john gielgud, ge...",[warnerbrother],4475.0,4.448013,4.687167,4.835389,4.739586
144812,B000M341QE,A1IWR4YH4ZA9BM,5.0,The Departed,132300000.0,"[usa, hong kong]","[english, cantonese]",85.0,R,151.0,...,972551.0,[martin scorsese],[drama],"[leonardo dicaprio, matt damon, jack nicholson...",[warnerbrother],15794.0,4.602179,4.973455,4.455645,4.429202
209178,B002VECM1S,A2CK0VXUDVXGMN,2.0,It's Complicated,112703470.0,"[usa, japan]","[english, french]",57.0,R,120.0,...,74681.0,[nancy meyers],[],"[john krasinski, meryl streep]","[universal, picture]",,2.839109,2.663505,2.700457,2.732101
34212,6304744404,AAOF195P95ZVC,3.0,Event Horizon,,"[uk, usa]","[english, latin]",35.0,R,96.0,...,131953.0,[paul w.s. anderson],[],"[laurence fishburne, sam neill, kathleen quinl...",[paramount],281755.0,3.181709,3.062755,3.158599,3.214903
18321,B001UV4XHY,AT3NIL6Q0G44I,4.0,Harry Potter and the Deathly Hallows: Part 1,294980434.0,"[uk, usa]",[english],65.0,PG-13,146.0,...,373457.0,[david yates],[],"[daniel radcliffe, rupert grint, emma watson, ...",[warnerbrother],15395.0,4.120982,4.330097,4.128601,4.161422


### Additional data cleaning

In [48]:
# convert years to ints
train_df['year'] = train_df['year'].apply(lambda x: x[0:4]).astype(int)
cv_df['year'] = cv_df['year'].apply(lambda x: x[0:4]).astype(int)
test_df['year'] = test_df['year'].apply(lambda x: x[0:4]).astype(int)

# Train and test using output from LSH and Baseline

In [49]:
test_df.head(2)

Unnamed: 0,item,user,rating,title,box_office,country,language,metascore,mpaa_rating,runtime,...,studios_imdb,imdb_rating,imdb_votes,directors,genres_amazon,actors,studios_amazon,sales_rank,pred_lsh,pred_baseline
314030,B0002IQJ8W,A21XVS00AIENLX,4.0,Mean Girls,0.012681,"[usa, canada]","[english, german, vietnamese, swahili]",0.32333,PG-13,-0.394437,...,"[paramount, picture, mg, film, broadway, video]",7.0,0.51092,[mark waters],[],"[lindsay lohan, jonathan bennett, rachel mcada...",[paramount],-0.365103,4.02151,4.32494
336736,B009AMAOTQ,A3OVE04AUPODOT,5.0,Last Vegas,-0.329912,[usa],[english],-0.670356,PG-13,-0.167343,...,"[cbs, film, good, universe, laurence, mark, pr...",6.6,-0.18702,[jon turteltaub],[],"[robert de niro, morgan freeman]","[cbs, film]",,4.360181,4.424858


In [50]:
def build_data(use_actors, use_country, use_directors, use_genres, use_language, use_mpaa, use_studios, use_type,
               use_scores, use_popularity, use_year, use_model_results):
    X_orig = dict(train=train_df, cv=cv_df, test=test_df)
    X = dict()
    y = dict()
    datasets = ['train', 'cv', 'test']
    for dataset in datasets:
        X[dataset] = sparse['user-item'][dataset]
        y[dataset] = X_orig[dataset]['rating']
    X['columns'] = list()

    sparse_features = list()
    dense_features = list()
    if use_actors: sparse_features.append('actors')
    if use_country: sparse_features.append('country')
    if use_directors: sparse_features.append('directors-imdb')
    if use_genres:
        sparse_features.extend(['genres-amazon','genres-imdb'])
        dense_features.append('vfx')
    if use_language: sparse_features.append('language')
    if use_mpaa: sparse_features.append('mpaa')
    if use_studios: sparse_features.extend(['studios-amazon','studios-imdb'])
    if use_type:
        sparse_features.append('type')
        dense_features.append('runtime')
    if use_scores: dense_features.extend(['metascore','imdb_rating'])
    if use_popularity: dense_features.extend(['imdb_votes','sales_rank','box_office'])
    if use_year: dense_features.append('year')
    if use_model_results: dense_features.extend(['pred_lsh', 'pred_baseline'])
    
    for feature in sparse_features:
        X['columns'] += list(sparse[feature]['columns'])
        for dataset in datasets:
            X[dataset] = hstack([X[dataset], sparse[feature][dataset]])
            
    for feature in dense_features:
        X['columns'].append(feature)
        for dataset in datasets:
            new_data = X_orig[dataset][feature]
            median = new_data.quantile(0.5)
            new_data = new_data.fillna(median).values
            new_data = new_data.reshape(X_orig[dataset].shape[0],-1)
            X[dataset] = hstack([X[dataset], new_data])
            
    return X, y    

In [51]:
X_, y_ = build_data(use_actors=True, use_country=True, use_directors=True, use_genres=True,
                 use_language=True, use_mpaa=True, use_studios=True, use_type=True,
                 use_scores=True, use_popularity=True, use_year=True, use_model_results=True)

In [57]:
X_, y_ = build_data(use_actors=False, use_country=False, use_directors=False, use_genres=True,
                 use_language=False, use_mpaa=False, use_studios=False, use_type=False,
                 use_scores=False, use_popularity=False, use_year=False, use_model_results=True)

fm = als.FMRegression(n_iter=883,
                          rank=2,
                          init_stdev=0.4,
                          l2_reg_w=7.3,
                          l2_reg_V=1.5)

fm.fit(X_['train'], y_['train'])
y_cv_pred = fm.predict(X_['cv'])
print(mean_absolute_error(y_cv_pred, y_['cv']))

0.730550124517


In [56]:
mean_absolute_error(np.clip(y_cv_pred, a_max=5, a_min=1), y_['cv'])

0.72192410185889788

In [54]:
# define the objective function that the fmin module can later optimize on
def test_fm(params):
    print('==========TESTING FM==========')
    params['n_iter'] = int(params['n_iter'])
    params['rank'] = int(params['rank'])
    print(params)
    
    fm = als.FMRegression(n_iter=params['n_iter'],
                          rank=params['rank'],
                          init_stdev=params['init_stdev'],
                          l2_reg_w=params['l2_reg_w'],
                          l2_reg_V=params['l2_reg_V'])
    
    X, y = build_data(use_actors=params['use_actors'],
                      use_country=params['use_country'],
                      use_directors=params['use_directors'],
                      use_genres=params['use_genres'],
                      use_language=params['use_language'],
                      use_mpaa=params['use_mpaa'],
                      use_studios=params['use_studios'],
                      use_type=params['use_type'],
                      use_scores=params['use_scores'],
                      use_popularity=params['use_popularity'],
                      use_year=params['use_year'],
                      use_model_results=params['use_model_results'])

    # build model and evaluate
    fm.fit(X['train'], y['train'])
    y_cv_pred = fm.predict(X['cv'])
    mae = mean_absolute_error(y_cv_pred, y['cv'])
    print('MAE:', mae)
    return mae

In [None]:
use_pretrained = True

if use_pretrained:
    with open(os.path.join(data_path, 'trials_fm_external'), 'rb') as file_in:
        trials = pkl.load(file_in)
    with open(os.path.join(data_path, 'best_fm_external.dict'), 'rb') as file_in:
        best = pkl.load(file_in)
else:
    '''set the range of hyperparameters for the FM
    '''
    trials = Trials()
    space = {
        'n_iter': hp.uniform('n_iter', 100, 1000),
        'init_stdev': hp.uniform('init_stdev', 0, 1),
        'rank': hp.uniform('rank', 2, 6),
        'l2_reg_w': hp.uniform('l2_reg_w', 0, 21),
        'l2_reg_V': hp.uniform('l2_reg_V', 0, 21),
        'use_actors': hp.choice('use_actors', [False,True]),
        'use_country': hp.choice('use_country', [False,True]),
        'use_directors': hp.choice('use_directors', [False,True]),
        'use_genres': hp.choice('use_genres', [False,True]),
        'use_language': hp.choice('use_language', [False,True]),
        'use_mpaa': hp.choice('use_mpaa', [False,True]),
        'use_studios': hp.choice('use_studios', [False,True]),
        'use_type': hp.choice('use_type', [False,True]),
        'use_scores': hp.choice('use_scores', [False,True]),
        'use_popularity': hp.choice('use_popularity', [False,True]),
        'use_year': hp.choice('use_year', [False,True]),
        'use_model_results': hp.choice('use_model_results', [False,True])
    }
    
    # Choose the Tree-structured Parzen Estimator (TPE) as the algorithm to optimize the objective function
    best = fmin(algo = tpe.suggest,
               fn = test_fm,
               trials = trials,
               max_evals = 300, # max number of tests
               space = space)
    with open(os.path.join(data_path, 'trials_fm_external'), 'wb') as file_out:
        pkl.dump(trials, file_out)
    with open(os.path.join(data_path, 'best_fm_external.dict'), 'wb') as file_out:
        pkl.dump(best, file_out)

{'init_stdev': 0.7869225950466535, 'l2_reg_V': 19.050805103823443, 'l2_reg_w': 4.92064101877817, 'n_iter': 304, 'rank': 3, 'use_actors': True, 'use_country': False, 'use_directors': False, 'use_genres': True, 'use_language': False, 'use_mpaa': False, 'use_popularity': True, 'use_scores': True, 'use_studios': True, 'use_type': True, 'use_year': False}
MAE: 2.2910883472
{'init_stdev': 0.622960291130542, 'l2_reg_V': 2.9112399305925605, 'l2_reg_w': 15.811117090928366, 'n_iter': 663, 'rank': 2, 'use_actors': False, 'use_country': True, 'use_directors': False, 'use_genres': False, 'use_language': True, 'use_mpaa': False, 'use_popularity': True, 'use_scores': False, 'use_studios': False, 'use_type': False, 'use_year': True}
MAE: 0.990856411405
{'init_stdev': 0.2627607847177368, 'l2_reg_V': 14.903734407150765, 'l2_reg_w': 5.74041436786561, 'n_iter': 216, 'rank': 5, 'use_actors': True, 'use_country': True, 'use_directors': True, 'use_genres': True, 'use_language': True, 'use_mpaa': True, 'use_p

In [52]:
fm = als.FMRegression(n_iter=200, init_stdev=0.1, rank=3, l2_reg_w=7, l2_reg_V=7)
fm.fit(X_train, y_train)
y_pred = fm.predict(X_test)

In [53]:
mean_absolute_error(y_pred, y_test)

0.72999601412891579