In [1]:
from helpers import *
from baseline import *
from baseline_helpers import *
from surprise_helpers import *
from spotlight_helpers import *
from pyfm_helpers import *
import scipy.optimize as sco

from os import listdir
from os.path import isfile, join

In [3]:
train_df = load_dataset(train_dataset)
train_df.head()

[load_dataset] Valid: (1176813, 3)


Unnamed: 0,User,Movie,Rating
0,44,1,4
1,61,1,3
2,67,1,4
3,72,1,3
4,86,1,5


In [48]:
def define_folds(df, k):
    """Defines k batch of dataset for cross validation"""
    folds_dict = dict.fromkeys(range(k))
    
    for i in range(k):
        # split half-half for train/test
        train, test = split_dataset(df, p_test=0.5)
        folds_dict[i] = dict(train=train, test=test)
    
    return folds_dict

In [49]:
folds_dict = define_folds(train_df, 5)

In [85]:
def load_models():
    print("Loading models...")
    models_dict = dict(
        # Baseline parameters: (train, test)
        baseline = dict(
            global_mean = baseline_global_mean,
            global_median = baseline_global_median,
            user_mean = baseline_user_mean,
            user_median = baseline_user_median,
            movie_mean = baseline_movie_mean,
            movie_median = baseline_movie_median,
            movie_mean_user_std = movie_mean_user_standardize,
            movie_median_user_std = movie_median_user_standardize,
            movie_mean_user_habit_std = movie_mean_user_habit_standardize,
            movie_median_user_habit_std = movie_median_user_habit_standardize,
            movie_mean_user_habit = movie_mean_user_habit,
            movie_median_user_habit = movie_median_user_habit,
        ),
        
        surprise
        surprise = dict(
            surprise_svd = SVD(n_factors=50, n_epochs=200, lr_bu=1e-9 , lr_qi=1e-5, reg_all=0.01),
            surprise_svd_pp = SVDpp(n_factors=50, n_epochs=200, lr_bu=1e-9 , lr_qi=1e-5, reg_all=0.01),
            surprise_knn = KNNBaseline(k=100, sim_options={'name': 'pearson_baseline', 'user_based': False}),
        ),
        spotlight
        spotlight = dict(
            spotlight=ExplicitFactorizationModel(loss='regression',
                                   embedding_dim=150,  # latent dimensionality
                                   n_iter=50,  # number of epochs of training
                                   batch_size=256,  # minibatch size
                                   l2=1e-5,  # strength of L2 regularization
                                   learning_rate=0.0001,
                                   use_cuda=torch.cuda.is_available()),
        ),
        # als
        
        # pyfm
        pyfm = dict(
            pyfm=pylibfm.FM(num_factors=20, num_iter=200, verbose=True, 
                          task="regression", initial_learning_rate=0.001, 
                          learning_rate_schedule="optimal")
        ),
        # keras
        # MF
    )
    
    model_msg = "{} model families loaded:\n ".format(len(list(models_dict.keys())))
    for i in list(models_dict.keys()):
        model_msg = model_msg + "{}; ".format(i)
    print(model_msg)
    return models_dict
    

In [86]:
models = load_models()

Loading models...
1 model families loaded:
 baseline; 


In [16]:
def load_algos():
    algo_dict = dict(
        baseline = baseline_algo, # baseline_algo(train, test, model)
        surprise = surprise_algo, # surprise_algo(train, test, algo, verbose=True, training=False)
        spotlight = spotlight_algo, # spotlight_algo(train, test, model, verbose=True)
        pyfm = pyfm_algo,
    )
    return algo_dict
algos = load_algos()

In [108]:
def predict_and_save(folder = "./predictions/", training = True):
    # create folder if not existent
    if not os.path.exists(folder):
        os.makedirs(folder)
    
    # load csv
    train_df = load_dataset(train_dataset, min_num_ratings = 0)
    test_df = load_dataset(test_dataset, min_num_ratings = 0)
    
    # Split training to blend
    if training:
        print("Splitting data for training...")
        train = train_df.copy()
        train_df, test_df = split_dataset(train_df, p_test=0.5)
        # folds_dict = define_folds(train_df, 5) - FOR FOLDS?
    
    # dictionary of the predictions
    predictions = dict()
        
    # load models
    models_dict = load_models()
    # load algos
    algo_dict = load_algos()
    t = Timer()
    t.start()
    for model_family_name, model_family in models_dict.items():
        algo = algo_dict[model_family_name]
        print("Predicting using algo: {}, model: {}...".format(algo, model_family_name))

        for model_name, model in model_family.items():
            print("Time: {}, predicting with model: {}".format(t.now(), model_name))
            if model_family == 'baseline':
                if training:
                    prediction = algo(train, test_df, model)
                else: # predicting
                    prediction = algo(train_df.copy(), test_df.copy(), model)
            else:
                prediction = algo(train_df, test_df, model)
            print("Time: {}, Saving results of {}...\n".format(t.now(), model_name))
            prediction.to_csv("{}{}_predictions({}).csv".format(folder, model_name, t.now()))
            predictions[model_name] = prediction
        
    return predictions, test_df
        

In [18]:
predictions, ground_truth = predict_and_save("./predictions_train/")

[load_dataset] Valid: (1176813, 3)
[load_dataset] Valid: (1176790, 3)
[split_dataset] Valid: (1176813, 3)
Loading models...
1 model families loaded:
 baseline; 
Predicting using algo: <function baseline_algo at 0x7f637a7f80d0>, model: baseline...
Time: 0:00:00.000034, predicting with model: global_mean
Time: 0:00:00.010808, Saving results of global_mean...

Time: 0:00:02.729773, predicting with model: global_median
Time: 0:00:02.752637, Saving results of global_median...

Time: 0:00:05.199607, predicting with model: user_mean
Time: 0:00:10.967199, Saving results of user_mean...

Time: 0:00:13.817070, predicting with model: user_median
Time: 0:00:21.281023, Saving results of user_median...

Time: 0:00:24.030324, predicting with model: movie_mean
Time: 0:00:25.110543, Saving results of movie_mean...

Time: 0:00:28.376726, predicting with model: movie_median
Time: 0:00:29.249773, Saving results of movie_median...

Time: 0:00:31.381042, predicting with model: movie_mean_user_std
Time: 0:02

In [19]:
def load_predictions(folder="./predictions"):
    def get_model_name(file_name):
        results = file_name.split('_predictions')
        return results[0]
        
    pred_array = [f for f in listdir(folder) if isfile(join(folder, f))]
    # Set user, col indices
    predictions = pd.read_csv(join(folder, pred_array[0]), index_col=0).copy().reset_index(drop=True)
    predictions = predictions.drop(['Rating'], axis=1)
    predictions = predictions.sort_values(by=['User', 'Movie'])

    for i, pred in enumerate(pred_array):
        print("Reading {}/{} : {}...".format(i + 1, len(pred_array), pred))
        p = pd.read_csv(join(folder, pred), index_col=0).sort_values(by=['User', 'Movie']).reset_index(drop=True)
        p = p.rename(index=str, columns={'Rating': get_model_name(pred)})
        predictions = pd.merge(predictions, p, how='outer', on=['User', 'Movie']).reset_index(drop=True)
    
    return predictions
        
predictions_tr = load_predictions("./predictions_train/")


Reading 1/12 : global_mean_predictions(0:00:00.010909).csv...
Reading 2/12 : global_median_predictions(0:00:02.752742).csv...
Reading 3/12 : movie_mean_predictions(0:00:25.111229).csv...
Reading 4/12 : movie_mean_user_habit_predictions(0:07:18.812214).csv...
Reading 5/12 : movie_mean_user_habit_std_predictions(0:04:57.073134).csv...
Reading 6/12 : movie_mean_user_std_predictions(0:02:06.749171).csv...
Reading 7/12 : movie_median_predictions(0:00:29.249886).csv...
Reading 8/12 : movie_median_user_habit_predictions(0:08:31.053131).csv...
Reading 9/12 : movie_median_user_habit_std_predictions(0:06:06.261281).csv...
Reading 10/12 : movie_median_user_std_predictions(0:03:48.336812).csv...
Reading 11/12 : user_mean_predictions(0:00:10.967303).csv...
Reading 12/12 : user_median_predictions(0:00:21.281134).csv...


In [20]:
predictions_tr.head()


Unnamed: 0,User,Movie,global_mean,global_median,movie_mean,movie_mean_user_habit,movie_mean_user_habit_std,movie_mean_user_std,movie_median,movie_median_user_habit,movie_median_user_habit_std,movie_median_user_std,user_mean,user_median
0,1,61,3.857306,4.0,3.884834,4.071006,4.073816,4.055245,4.0,4.186172,4.174677,4.175801,4.043478,4.0
1,1,68,3.857306,4.0,3.505051,3.691223,3.699835,3.703227,4.0,4.186172,3.860551,3.816298,4.043478,4.0
2,1,310,3.857306,4.0,4.557819,4.743991,4.733773,4.737487,5.0,5.186172,4.971536,4.983678,4.043478,4.0
3,1,546,3.857306,4.0,4.275251,4.461423,4.460371,4.459905,5.0,5.186172,4.858031,4.863882,4.043478,4.0
4,1,605,3.857306,4.0,4.400752,4.586924,4.559841,4.561001,5.0,5.186172,4.851478,4.874463,4.043478,4.0


In [21]:
aa = pd.merge(ground_truth, predictions_tr, how='outer', on=['User', 'Movie'])
aa[aa.isnull().any(axis=1)]

Unnamed: 0,User,Movie,Rating,global_mean,global_median,movie_mean,movie_mean_user_habit,movie_mean_user_habit_std,movie_mean_user_std,movie_median,movie_median_user_habit,movie_median_user_habit_std,movie_median_user_std,user_mean,user_median


In [46]:
ground_truth.sort_values(by=['User', 'Movie']).head()

Unnamed: 0,User,Movie,Rating
279067,1,61,5
495645,1,68,4
258936,1,310,4
549474,1,546,5
61156,1,605,5


In [74]:
def optimize(models, ground_truth, folder="./predictions/"):
    t = Timer()
    t.start()
    print("Loading predictions....")
    predictions = load_predictions(folder=folder)
    print("Time: {}, Finished loading.".format(t.now()))
    t.stop(verbose= False)
    
    # Initialize first weights (- nb columns for User, Movie)
    w0 = [1/(len(predictions.columns) - 2) for i in range(len(predictions.columns) - 2)]
    
    print("Optimizing...")
    t.start()
    res = sco.minimize(evaluate_stacking, w0, method='SLSQP', args=(models, predictions, ground_truth), options={'maxiter': 1000, 'disp': True})
    print("Time: {}. Optimization done.".format(t.now()))
    t.stop()
    
    return res

def get_best_weights(res, models, predictions, ground_truth):
    # Create best dictionnary
    best_dict = {}
    idx = 0
    for key, model_family in models.items():
        best_dict[key] = dict()
        for name in model_family.keys():
            best_dict[key][name] = res.x[idx]
            idx = idx + 1
    
    print("Best weights: \n {}".format(best_dict))
    # test
    rmse = evaluate_stacking(res.x, models, predictions, ground_truth)
    print("Best weights rmse: {}".format(rmse))
    return best_dict, rmse

In [67]:
res = optimize(models, ground_truth)

Loading predictions....
Reading 1/12 : global_mean_predictions(0:00:00.010909).csv...
Reading 2/12 : global_median_predictions(0:00:02.752742).csv...
Reading 3/12 : movie_mean_predictions(0:00:25.111229).csv...
Reading 4/12 : movie_mean_user_habit_predictions(0:07:18.812214).csv...
Reading 5/12 : movie_mean_user_habit_std_predictions(0:04:57.073134).csv...
Reading 6/12 : movie_mean_user_std_predictions(0:02:06.749171).csv...
Reading 7/12 : movie_median_predictions(0:00:29.249886).csv...
Reading 8/12 : movie_median_user_habit_predictions(0:08:31.053131).csv...
Reading 9/12 : movie_median_user_habit_std_predictions(0:06:06.261281).csv...
Reading 10/12 : movie_median_user_std_predictions(0:03:48.336812).csv...
Reading 11/12 : user_mean_predictions(0:00:10.967303).csv...
Reading 12/12 : user_median_predictions(0:00:21.281134).csv...
Time: 0:00:13.322957, Finished loading.
Optimizing...
Optimization terminated successfully.    (Exit mode 0)
            Current function value: 0.991982097036

In [75]:
best_dict, rmse = get_best_weights(res, models, predictions_tr, ground_truth)

Best weights: 
 {'baseline': {'global_mean': -0.024981370718988857, 'global_median': -0.02898930048715219, 'user_mean': 0.04665145172639839, 'user_median': 0.03575061751380726, 'movie_mean': 0.1346921975521403, 'movie_median': -0.042103646284213654, 'movie_mean_user_std': 0.35254139862350287, 'movie_median_user_std': 0.1926768467682129, 'movie_mean_user_habit_std': 0.216735516843958, 'movie_median_user_habit_std': -0.12040169416858357, 'movie_mean_user_habit': 0.20632386228911023, 'movie_median_user_habit': 0.029530390078346635}}
Best weights rmse: 0.9919820970367211


In [111]:
def predict(weight_dict):
    print("Predicting....")
    predictions, _ = predict_and_save(folder="./pred_tmp/", training=False)
    predictions = load_predictions(folder="./pred_tmp")
    print("Finished loading.")
    
    stacked = np.zeros(predictions.shape[0])
    for key, model_fam in models.items():
        weights = weight_dict[key]
        for name in model_fam.keys():
            weight = weights[name]
            print("Stacking {} * {}...".format(weight, name))
            stacked = stacked + weight * predictions[name]
    
    pred = predictions[['User', 'Movie']].copy()
    pred['Rating'] = stacked
    return pred

In [112]:
predictions = predict(best_dict)

Predicting....
[load_dataset] Valid: (1176952, 3)
[load_dataset] Valid: (1176952, 3)
Loading models...
1 model families loaded:
 baseline; 
Predicting using algo: <function baseline_algo at 0x7f637a7f80d0>, model: baseline...
Time: 0:00:00.000028, predicting with model: global_mean
Time: 0:00:00.013422, Saving results of global_mean...

Time: 0:00:05.270736, predicting with model: global_median
Time: 0:00:05.298369, Saving results of global_median...

Time: 0:00:09.327589, predicting with model: user_mean
Time: 0:00:15.683861, Saving results of user_mean...

Time: 0:00:21.105028, predicting with model: user_median
Time: 0:00:28.866726, Saving results of user_median...

Time: 0:00:32.934296, predicting with model: movie_mean
Time: 0:00:33.582614, Saving results of movie_mean...

Time: 0:00:39.057338, predicting with model: movie_median
Time: 0:00:39.829904, Saving results of movie_median...

Time: 0:00:43.930557, predicting with model: movie_mean_user_std
Time: 0:02:53.494157, Saving re

In [114]:
predictions.head()

Unnamed: 0,User,Movie,Rating
0,1,4,4.118236
1,1,8,4.062649
2,1,21,3.749176
3,1,102,4.36533
4,1,127,3.636194


In [118]:
submission = create_csv_submission(predictions)

Creating submission file...


In [119]:
submission.to_csv("blended_baseline.csv")

In [103]:
train_df = load_dataset(train_dataset)
test_df = load_dataset(test_dataset)

[load_dataset] Valid: (1176813, 3)
[load_dataset] Valid: (1176790, 3)


In [None]:
test_df

In [92]:
predictions.head()

AttributeError: 'dict' object has no attribute 'head'

In [51]:
def evaluate_stacking(weights, models, predictions, ground_truth):
    # Get stacking results
    user_movie = predictions[['User', 'Movie']]
    truth = pd.merge(user_movie, ground_truth, on=['User', 'Movie'], how='inner').reset_index(drop=True)
    pred = stack(weights, predictions, models)
    return compute_rmse(pred, truth)
    

In [63]:
def stack(weights, predictions, models):
    stacked = np.zeros(predictions.shape[0])
    idx = 0
    for key, model_fam in models.items():
        for name in model_fam.keys():
            weight = weights[idx]
#             print("Stacking {} * {}...".format(weight, name))
            stacked = stacked + weight * predictions[name]
            idx = idx + 1
    
    pred= predictions[['User', 'Movie']].copy()
    pred['Rating'] = stacked
    return pred

w0 = [1/(len(predictions_tr.columns) - 2) for i in range(len(predictions_tr.columns) - 2)]
pred = stack(w0, predictions_tr, models)
pred.head()

Unnamed: 0,User,Movie,Rating
0,1,61,4.043528
1,1,68,3.863595
2,1,310,4.567937
3,1,546,4.455485
4,1,605,4.493451


In [58]:
evaluate_stacking(w0, models, predictions_tr, ground_truth)

Stacking 0.08333333333333333 * global_mean...
Stacking 0.08333333333333333 * global_median...
Stacking 0.08333333333333333 * user_mean...
Stacking 0.08333333333333333 * user_median...
Stacking 0.08333333333333333 * movie_mean...
Stacking 0.08333333333333333 * movie_median...
Stacking 0.08333333333333333 * movie_mean_user_std...
Stacking 0.08333333333333333 * movie_median_user_std...
Stacking 0.08333333333333333 * movie_mean_user_habit_std...
Stacking 0.08333333333333333 * movie_median_user_habit_std...
Stacking 0.08333333333333333 * movie_mean_user_habit...
Stacking 0.08333333333333333 * movie_median_user_habit...


1.005557699080822

In [44]:
pred.head()

0    4.043528
1    3.863595
2    4.567937
3    4.455485
4    4.493451
dtype: float64

In [76]:
predictions_tr[predictions_tr.columns[2:]].mean(axis=1).head()

0    4.043528
1    3.863595
2    4.567937
3    4.455485
4    4.493451
dtype: float64

In [10]:
# def build_k_indices(y, k_fold, seed=42):
#     """build k indices for k-fold."""
#     num_row = y.shape[0]
#     interval = int(num_row / k_fold)
#     np.random.seed(seed)
#     indices = np.random.permutation(num_row)
#     k_indices = [indices[k * interval: (k + 1) * interval] for k in range(k_fold)]
#     return np.array(k_indices)

df = train_df.copy()
# k_indices = build_k_indices(df, 5)
folds_dict = dict.fromkeys(range(5))

for i in range(5):
#     ind_sorted = sorted(indices)
#     fold_data = df.iloc[ind_sorted]
    train, test = split_dataset(fold_data)
    folds_dict[i] = dict(train=train, test=test)

In [14]:
folds_dict[i]['train'].shape, folds_dict[i]['test'].shape

((188312, 3), (47078, 3))