In [1]:
import sys
import IPython
import numpy as np
import pandas as pd
import sklearn as sk

In [2]:
# Version tag of the generated dataset files; it is interpolated into the
# './generated/3-refined-o-<version>-...' CSV paths read further down.
version = 7

In [3]:
# Do not truncate long cell contents when DataFrames are displayed.
# `None` is the supported "no limit" value; the legacy `-1` sentinel is
# deprecated since pandas 1.0 and rejected by later releases.
pd.options.display.max_colwidth = None

In [4]:
from sklearn.model_selection import GridSearchCV

class EstimatorSelectionHelper:
    """Run one ``GridSearchCV`` per named estimator and tabulate the results.

    ``models`` maps a label to an estimator and ``params`` maps the same label
    to the parameter grid searched for that estimator.
    """

    def __init__(self, models, params):
        """Store the estimators and their grids.

        Raises:
            ValueError: if a key of ``models`` has no entry in ``params``.
        """
        missing_params = list(set(models.keys()) - set(params.keys()))
        if missing_params:
            raise ValueError("Some estimators are missing parameters: %s" % missing_params)
        self.models = models
        self.params = params
        self.keys = models.keys()
        self.grid_searches = {}

    def fit(self, X, y, cv=3, n_jobs=1, verbose=1, scoring=None, refit=False):
        """Fit a ``GridSearchCV`` on ``(X, y)`` for every registered estimator.

        The keyword arguments are forwarded to ``GridSearchCV`` and train
        scores are always requested. Each fitted search is stored in
        ``self.grid_searches`` under the estimator's key.
        """
        search_options = dict(cv=cv, n_jobs=n_jobs, verbose=verbose,
                              scoring=scoring, refit=refit, return_train_score=True)
        for name in self.keys:
            print("Running GridSearchCV for %s." % name)
            search = GridSearchCV(self.models[name], self.params[name], **search_options)
            search.fit(X, y)
            self.grid_searches[name] = search

    def score_summary(self, sort_by=None):
        """Return the ``cv_results_`` of all searches stacked in one DataFrame.

        An ``estimator`` column carries the key of the search each row came
        from. When ``sort_by`` is truthy the rows are sorted by it, descending.
        """
        frames = []
        for name in self.keys:
            columns = {'estimator': name}
            columns.update(self.grid_searches[name].cv_results_)
            frames.append(pd.DataFrame.from_dict(columns))
        scores = pd.concat(frames)
        if sort_by:
            scores = scores.sort_values(sort_by, ascending=False)
        return scores

In [5]:
# Training tables for the "iq" and "sj" datasets, indexed by the parsed
# `week_start_date` column.
features_labels_iq_train = pd.read_csv(
    './generated/3-refined-o-' + str(version) + '-train-iq.csv',
    index_col='week_start_date',
    parse_dates=['week_start_date'],
)
features_labels_sj_train = pd.read_csv(
    './generated/3-refined-o-' + str(version) + '-train-sj.csv',
    index_col='week_start_date',
    parse_dates=['week_start_date'],
)

In [6]:
from sklearn import preprocessing

In [7]:
# Feature matrix: every column except the target, passed through
# `preprocessing.scale`. The former bare `drop(...)` expression ahead of this
# was neither assigned nor displayed, so it has been removed.
X = preprocessing.scale(features_labels_sj_train.drop('total_cases', axis=1))
# Target kept as a one-column DataFrame (double brackets).
# NOTE(review): some estimators expect a 1-D target — confirm this shape is intended.
y = features_labels_sj_train[['total_cases']]

In [8]:
from sklearn.ensemble import AdaBoostRegressor, BaggingRegressor, ExtraTreesRegressor, \
    GradientBoostingRegressor, RandomForestRegressor
from sklearn.neural_network import MLPRegressor

# Fixed seed for the stochastic estimator so that re-running the notebook
# reproduces the same cross-validation scores.
RANDOM_STATE = 0

# Candidate estimators keyed by a short label. The commented-out entries are
# disabled alternatives; each one has a matching (also disabled) grid in
# `params2` below.
models2 = {
#     'AB_R': AdaBoostRegressor(),
#     'B_R': BaggingRegressor(),
#     'ET_R': ExtraTreesRegressor(),
#     'GB_R': GradientBoostingRegressor(),
#     'RF_R': RandomForestRegressor(),
    'MLP_R': MLPRegressor(random_state=RANDOM_STATE)
}

# Parameter grids keyed like `models2`; an empty grid means the estimator is
# evaluated once with its constructor settings.
params2 = { 
#     'AB_R': { 'learning_rate': np.linspace(0.05, 0.2, 11), 
#              'n_estimators': np.linspace(25, 75, endpoint=False, num=15).astype(int) },
#     'B_R': { 'n_estimators': np.linspace(25, 75, endpoint=False, num=15).astype(int) },
#     'ET_R': { 'n_estimators': np.linspace(25, 75, endpoint=False, num=15).astype(int) },
#     'GB_R': { 'learning_rate': np.linspace(0.05, 0.2, 11), 
#              'n_estimators': np.linspace(25, 75, endpoint=False, num=15).astype(int), 
#              'min_samples_leaf': [6, 8, 10]},
#     'RF_R': { 'n_estimators': np.linspace(25, 75, endpoint=False, num=16).astype(int), 
#              'min_samples_leaf': [6, 8, 10] },
    'MLP_R': {}
}

In [9]:
# Grid-search every estimator in `models2` with 5-fold cross-validation,
# recording both negative MAE and negative MSE for each candidate.
helper2 = EstimatorSelectionHelper(models2, params2)
helper2.fit(
    X,
    y,
    cv=5,
    n_jobs=-1,
    scoring=['neg_mean_absolute_error', 'neg_mean_squared_error'],
)

Running GridSearchCV for MLP_R.
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   14.0s finished


In [10]:
# First ten candidates ranked by mean test MAE (negated, so larger is better).
helper2.score_summary(sort_by='mean_test_neg_mean_absolute_error').loc[
    :,
    ['estimator', 'mean_test_neg_mean_absolute_error', 'mean_test_neg_mean_squared_error', 'params'],
].head(10)

Unnamed: 0,estimator,mean_test_neg_mean_absolute_error,mean_test_neg_mean_squared_error,params
0,MLP_R,-25.879623,-2505.02773,{}


In [11]:
# Hold-out split: the leading rows of each frame are used for fitting and the
# remaining rows for scoring. Positional slicing keeps the two parts disjoint
# even when a frame has fewer rows than the split size; `tail(len - N)` would
# receive a negative count there and return rows that overlap the first part.
SJ_SUBTRAIN_ROWS = 800
IQ_SUBTRAIN_ROWS = 400

sj_train_subtrain = features_labels_sj_train.iloc[:SJ_SUBTRAIN_ROWS]
sj_train_subtest = features_labels_sj_train.iloc[SJ_SUBTRAIN_ROWS:]

iq_train_subtrain = features_labels_iq_train.iloc[:IQ_SUBTRAIN_ROWS]
iq_train_subtest = features_labels_iq_train.iloc[IQ_SUBTRAIN_ROWS:]

In [12]:
from statsmodels.tools import eval_measures
import statsmodels.api as sm
import statsmodels.formula.api as smf

def get_best_model(train, test, alphas=None):
    """Pick the negative-binomial ``alpha`` on a hold-out set, then refit.

    A GLM ``total_cases ~ 1 + <every other column of train>`` is fitted on
    ``train`` once per candidate ``alpha``. Each fit is scored by the mean
    absolute error of its integer-truncated predictions on ``test``. The
    lowest-error ``alpha`` is then used to refit on ``train`` and ``test``
    concatenated.

    Args:
        train: DataFrame holding ``total_cases`` plus the predictor columns.
        test: DataFrame with the same columns, used to score the candidates.
        alphas: optional iterable of candidate ``alpha`` values. Defaults to
            the five powers of ten from 1e-8 to 1e-4.

    Returns:
        The fitted results object of the final GLM.

    Raises:
        ValueError: if no candidate produced a comparable score (empty grid,
            or every score was NaN).
    """
    # Step 1: specify the form of the model
    predictors = [column for column in train.columns if column != 'total_cases']
    model_formula = "total_cases ~ 1 + " + " + ".join(predictors)

    if alphas is None:
        alphas = 10 ** np.arange(-8, -3, dtype=np.float64)

    # Start from "no winner yet" so that the first real score always wins,
    # whatever its magnitude; a fixed numeric threshold would leave the winner
    # unset whenever every candidate scored above it.
    best_alpha = None
    best_score = float('inf')

    # Step 2: find the best hyper parameter, alpha
    for alpha in alphas:
        candidate = smf.glm(formula=model_formula,
                            data=train,
                            family=sm.families.NegativeBinomial(alpha=alpha))
        predictions = candidate.fit().predict(test).astype(int)
        score = eval_measures.meanabs(predictions, test.total_cases)

        if score < best_score:
            best_alpha = alpha
            best_score = score

    if best_alpha is None:
        raise ValueError("no candidate alpha produced a usable score")

    print('best alpha = ', best_alpha)
    print('best score = ', best_score)

    # Step 3: refit on entire dataset
    full_dataset = pd.concat([train, test])
    model = smf.glm(formula=model_formula,
                    data=full_dataset,
                    family=sm.families.NegativeBinomial(alpha=best_alpha))

    return model.fit()
    
# Select alpha and refit one model per dataset ("sj" first, then "iq").
sj_best_model = get_best_model(train=sj_train_subtrain, test=sj_train_subtest)
iq_best_model = get_best_model(train=iq_train_subtrain, test=iq_train_subtest)

best alpha =  1e-08
best score =  22.92063492063492
best alpha =  1e-08
best score =  6.864077669902913


In [13]:
# Test-set feature tables for the "iq" and "sj" datasets, indexed by the
# parsed `week_start_date` column.
features_iq_test = pd.read_csv(
    './generated/3-refined-o-' + str(version) + '-test-iq.csv',
    index_col='week_start_date',
    parse_dates=['week_start_date'],
)

features_sj_test = pd.read_csv(
    './generated/3-refined-o-' + str(version) + '-test-sj.csv',
    index_col='week_start_date',
    parse_dates=['week_start_date'],
)

In [23]:
# Predict on the test features, smooth with a centred 7-row rolling mean,
# back-/forward-fill the NaN edges the window leaves, then truncate to int.
sj_predictions = (
    sj_best_model.predict(features_sj_test)
    .rolling(7, center=True)
    .mean()
    .bfill()
    .ffill()
    .astype(int)
)
iq_predictions = (
    iq_best_model.predict(features_iq_test)
    .rolling(7, center=True)
    .mean()
    .bfill()
    .ffill()
    .astype(int)
)

In [24]:
# Template of the submission file; its first three columns form the index.
submission = pd.read_csv("../submission_format.csv",
                         index_col=[0, 1, 2])

# "sj" predictions first, then "iq", matching the concatenation order below.
# Item assignment always writes the column; attribute-style assignment would
# only set a plain Python attribute if `total_cases` were absent from the
# template, leaving the written CSV without predictions.
submission["total_cases"] = np.concatenate([sj_predictions, iq_predictions])
submission.to_csv("./generated/nb-x3.csv")

In [25]:
# NOTE(review): the input is whatever is on the system clipboard when this
# cell runs, so the cell cannot be reproduced from the notebook alone —
# consider reading the data from a file instead.
besty = pd.read_clipboard()

In [28]:
# Display the frame that was read from the clipboard.
besty

Unnamed: 0,18
0,18
1,18
2,18
3,18
4,18
5,17
6,18
7,20
8,16
9,17


In [29]:
# Same smoothing as applied to the model predictions: centred 7-row rolling
# mean, NaN edges back-/forward-filled, truncated to int. The result
# overwrites `besty`, so re-running this cell smooths the data again.
besty = (
    besty
    .rolling(7, center=True)
    .mean()
    .bfill()
    .ffill()
    .astype(int)
)

In [33]:
# Copy the current `besty` frame to the system clipboard, omitting the index.
besty.to_clipboard(index=False)