In [193]:
import pandas as pd
import numpy as np
import math, sys
from sklearn.pipeline import Pipeline
from sklearn.metrics import make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.linear_model import Lasso
from xgboost import XGBRegressor
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone
from joblib import dump, load
from itertools import product
import matplotlib.pyplot as plt
import seaborn as sns

warnings.filterwarnings("ignore")

df = pd.read_csv('../data/raw/intern_data.csv', index_col=0)
num_cols = ['a', 'b', 'd', 'e', 'f', 'g']
ctg_cols = ['c', 'h']
Y_LABEL = 'y'
df_dummy = pd.get_dummies(df, columns=ctg_cols)
df_dummy

Unnamed: 0,a,b,d,e,f,g,y,c_blue,c_green,c_red,c_yellow,h_black,h_white
7,0.951786,0.669570,0.170130,0.623469,0.925886,0.812685,3.707514,0,1,0,0,0,1
43,0.510447,0.922627,0.087899,0.025415,0.698444,0.658545,2.689243,0,0,1,0,0,1
47,0.294838,0.351081,0.710892,0.699661,0.545722,0.836863,2.886508,0,0,0,1,1,0
53,0.798645,0.572042,0.026137,0.609730,0.488668,0.342675,2.478168,0,1,0,0,1,0
54,0.689666,0.395323,0.172448,0.736433,0.708408,0.695521,3.182666,0,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4889,0.195745,0.791511,0.784001,0.778692,0.407301,0.895939,3.646691,0,1,0,0,0,1
4910,0.995119,0.076542,0.326500,0.829949,0.500763,0.545784,3.270344,0,0,0,1,0,1
4920,0.091773,0.326965,0.922553,0.257745,0.348771,0.624851,2.672514,0,1,0,0,0,1
4931,0.761853,0.654755,0.252334,0.128781,0.658069,0.405367,1.259850,1,0,0,0,1,0


In [172]:
df_dummy['ehc'] = df_dummy['e'] * df_dummy['h_white'] * df_dummy['c_blue'].map({0: 1, 1: 0})
df_dummy['e(h+c)'] = df_dummy['e'] * (df_dummy['h_white'] + df_dummy['c_blue'].map({0: 1, 1: 0}))
df_dummy['f+g'] = df_dummy['f'] + df_dummy['g']
df_dummy['(f+g)*h*c'] = (df_dummy['f'] + df_dummy['g']) * df_dummy['h_white'] * df_dummy['c_blue'].map({0: 1, 1: 0})
df_dummy['(f+g)*(h+c)'] = (df_dummy['f'] + df_dummy['g']) * (df_dummy['h_white'] + df_dummy['c_blue'].map({0: 1, 1: 0}))

df_dummy['ehc(f+g)'] = df_dummy['ehc'] * df_dummy['f+g']
df_dummy['e(h+c)(f+g)'] = df_dummy['e(h+c)'] * df_dummy['f+g']

df_dummy['ehc+f+g)'] = df_dummy['ehc'] + df_dummy['f+g']
df_dummy['e(h+c)+f+g)'] = df_dummy['e(h+c)'] + df_dummy['f+g']

df_dummy['eh(f+g)*(h+c)'] = df_dummy['ehc'] * df_dummy['(f+g)*(h+c)']
df_dummy['eh+(f+g)*(h+c)'] = df_dummy['ehc'] + df_dummy['(f+g)*(h+c)']

df_dummy['e(h+c)(f+g)*(h+c)'] = df_dummy['e(h+c)'] * df_dummy['(f+g)*(h+c)']
df_dummy['e(h+c)+(f+g)*(h+c)'] = df_dummy['e(h+c)'] + df_dummy['(f+g)*(h+c)']
df_dummy

Unnamed: 0,a,b,d,e,f,g,y,c_blue,c_green,c_red,...,(f+g)*h*c,(f+g)*(h+c),ehc(f+g),e(h+c)(f+g),ehc+f+g),e(h+c)+f+g),eh(f+g)*(h+c),eh+(f+g)*(h+c),e(h+c)(f+g)*(h+c),e(h+c)+(f+g)*(h+c)
7,0.951786,0.669570,0.170130,0.623469,0.925886,0.812685,3.707514,0,1,0,...,1.738571,3.477142,1.083945,2.167891,2.362040,2.985509,2.167891,4.100611,4.335781,4.724080
43,0.510447,0.922627,0.087899,0.025415,0.698444,0.658545,2.689243,0,0,1,...,1.356988,2.713977,0.034488,0.068977,1.382404,1.407819,0.068977,2.739392,0.137953,2.764808
47,0.294838,0.351081,0.710892,0.699661,0.545722,0.836863,2.886508,0,0,0,...,0.000000,1.382585,0.000000,0.967340,1.382585,2.082245,0.000000,1.382585,0.967340,2.082245
53,0.798645,0.572042,0.026137,0.609730,0.488668,0.342675,2.478168,0,1,0,...,0.000000,0.831344,0.000000,0.506895,0.831344,1.441074,0.000000,0.831344,0.506895,1.441074
54,0.689666,0.395323,0.172448,0.736433,0.708408,0.695521,3.182666,0,0,1,...,1.403930,2.807860,1.033900,2.067800,2.140363,2.876795,2.067800,3.544293,4.135600,4.280725
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4889,0.195745,0.791511,0.784001,0.778692,0.407301,0.895939,3.646691,0,1,0,...,1.303240,2.606481,1.014822,2.029644,2.081932,2.860623,2.029644,3.385172,4.059289,4.163864
4910,0.995119,0.076542,0.326500,0.829949,0.500763,0.545784,3.270344,0,0,0,...,1.046547,2.093094,0.868581,1.737162,1.876496,2.706446,1.737162,2.923043,3.474324,3.752993
4920,0.091773,0.326965,0.922553,0.257745,0.348771,0.624851,2.672514,0,1,0,...,0.973623,1.947245,0.250946,0.501893,1.231368,1.489113,0.501893,2.204990,1.003786,2.462735
4931,0.761853,0.654755,0.252334,0.128781,0.658069,0.405367,1.259850,1,0,0,...,0.000000,0.000000,0.000000,0.000000,1.063436,1.063436,0.000000,0.000000,0.000000,0.000000


In [173]:
#column_to_nmlz = num_cols + ['eh']
column_to_nmlz = list(set(df.columns) - set([Y_LABEL]) - set(ctg_cols))
def normalize_data(df_input, scaler='robust'):
    result = df_input.copy(deep=True)
    if scaler == 'robust':
        num_pipeline = Pipeline([('robust_scaler', RobustScaler())])
    else:
        num_pipeline = Pipeline([('std_scaler', StandardScaler())])    
    #column_to_nmlz = list(set(df_input.columns) - set([y_label]))
    result[column_to_nmlz] = num_pipeline.fit_transform(df_input[column_to_nmlz])
    return result
df_dummy_normed = normalize_data(df_dummy, scaler='std')
df_dummy_normed

Unnamed: 0,a,b,d,e,f,g,y,c_blue,c_green,c_red,...,(f+g)*h*c,(f+g)*(h+c),ehc(f+g),e(h+c)(f+g),ehc+f+g),e(h+c)+f+g),eh(f+g)*(h+c),eh+(f+g)*(h+c),e(h+c)(f+g)*(h+c),e(h+c)+(f+g)*(h+c)
7,1.552595,0.561935,-1.166187,0.447364,1.368097,1.071096,3.707514,0,1,0,...,1.738571,3.477142,1.083945,2.167891,2.362040,2.985509,2.167891,4.100611,4.335781,4.724080
43,0.044537,1.428104,-1.455388,-1.753411,0.588872,0.528442,2.689243,0,0,1,...,1.356988,2.713977,0.034488,0.068977,1.382404,1.407819,0.068977,2.739392,0.137953,2.764808
47,-0.692202,-0.528193,0.735634,0.727741,0.065641,1.156217,2.886508,0,0,0,...,0.000000,1.382585,0.000000,0.967340,1.382585,2.082245,0.000000,1.382585,0.967340,2.082245
53,1.029311,0.228115,-1.672603,0.396806,-0.129825,-0.583588,2.478168,0,1,0,...,0.000000,0.831344,0.000000,0.506895,0.831344,1.441074,0.000000,0.831344,0.506895,1.441074
54,0.656931,-0.376760,-1.158036,0.863059,0.623012,0.658619,3.182666,0,0,1,...,1.403930,2.807860,1.033900,2.067800,2.140363,2.876795,2.067800,3.544293,4.135600,4.280725
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4889,-1.030802,0.979316,0.992755,1.018567,-0.408593,1.364197,3.646691,0,1,0,...,1.303240,2.606481,1.014822,2.029644,2.081932,2.860623,2.029644,3.385172,4.059289,4.163864
4910,1.700667,-1.467889,-0.616245,1.207189,-0.088388,0.131463,3.270344,0,0,0,...,1.046547,2.093094,0.868581,1.737162,1.876496,2.706446,1.737162,2.923043,3.474324,3.752993
4920,-1.386076,-0.610738,1.480035,-0.898462,-0.609117,0.409822,2.672514,0,1,0,...,0.973623,1.947245,0.250946,0.501893,1.231368,1.489113,0.501893,2.204990,1.003786,2.462735
4931,0.903592,0.511226,-0.877083,-1.373038,0.450548,-0.362880,1.259850,1,0,0,...,0.000000,0.000000,0.000000,0.000000,1.063436,1.063436,0.000000,0.000000,0.000000,0.000000


In [174]:
def get_train_test_sets(test_size=0.2, dummy=False, normed=False):
    if dummy is True and normed is True:
        df_input = df_dummy_normed
    elif dummy is True and normed is False:
        df_input = df_dummy
    else:
        df_input = df
            
    train_set, test_set = train_test_split(df_input, test_size=test_size, random_state=42)
    x_train = train_set.drop(columns=[Y_LABEL], axis=1)
    x_test = test_set.drop(columns=[Y_LABEL], axis=1)
    #y_train, y_test = num_pipeline.fit_transform(train_set[[Y_LABEL]]), num_pipeline.fit_transform(test_set[[Y_LABEL]])
    y_train, y_test = train_set[Y_LABEL], test_set[Y_LABEL]
    y_all = df_input[Y_LABEL]
    x_all = df_input.drop(columns=[Y_LABEL])
    return x_train, x_test, y_train, y_test, x_all, y_all

x_train, x_test, y_train, y_test, x_all, y_all = get_train_test_sets(dummy=True, normed=False)
x_train

Unnamed: 0,a,b,d,e,f,g,c_blue,c_green,c_red,c_yellow,...,(f+g)*h*c,(f+g)*(h+c),ehc(f+g),e(h+c)(f+g),ehc+f+g),e(h+c)+f+g),eh(f+g)*(h+c),eh+(f+g)*(h+c),e(h+c)(f+g)*(h+c),e(h+c)+(f+g)*(h+c)
2364,0.090244,0.155677,0.272952,0.038507,0.744463,0.350609,1,0,0,0,...,0.000000,1.095072,0.000000,0.042168,1.095072,1.133579,0.000000,1.095072,0.042168,1.133579
4237,0.559451,0.932398,0.589896,0.276736,0.122540,0.018161,0,0,0,1,...,0.000000,0.140701,0.000000,0.038937,0.140701,0.417437,0.000000,0.140701,0.038937,0.417437
226,0.130311,0.473161,0.110819,0.114864,0.093494,0.054012,0,0,0,1,...,0.147505,0.295011,0.016943,0.033886,0.262370,0.377234,0.033886,0.409875,0.067772,0.524740
3197,0.464829,0.418934,0.043737,0.563155,0.766045,0.913024,0,0,1,0,...,0.000000,1.679069,0.000000,0.945576,1.679069,2.242224,0.000000,1.679069,0.945576,2.242224
3291,0.296832,0.050419,0.374237,0.961693,0.792818,0.410662,0,1,0,0,...,1.203480,2.406960,1.157378,2.314757,2.165173,3.126866,2.314757,3.368653,4.629513,4.330346
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1019,0.294201,0.348118,0.376214,0.790190,0.586324,0.527688,1,0,0,0,...,0.000000,1.114012,0.000000,0.880281,1.114012,1.904202,0.000000,1.114012,0.880281,1.904202
2628,0.661600,0.041014,0.633891,0.107164,0.913002,0.350590,0,0,1,0,...,1.263592,2.527184,0.135411,0.270822,1.370755,1.477919,0.270822,2.634347,0.541644,2.741511
3476,0.790146,0.141229,0.859917,0.268874,0.109422,0.475036,1,0,0,0,...,0.000000,0.584458,0.000000,0.157145,0.584458,0.853331,0.000000,0.584458,0.157145,0.853331
4258,0.212004,0.717193,0.244838,0.109056,0.955032,0.628380,0,1,0,0,...,0.000000,1.583412,0.000000,0.172681,1.583412,1.692468,0.000000,1.583412,0.172681,1.692468


In [175]:
def mape(y_preds, y_test, **kwargs):
    return np.absolute((y_preds - y_test) / y_test).mean()

def train_n_evaluate(model, model_name, cv=3, scoring=mape):
    print("training {} regressor...".format(model_name))
    #model.fit(x_train, y_train)
    print("{} regressor trained, saving model...".format(model_name))
    dump(model, '../models/{}.joblib'.format(model_name))
    print("saving model finished, getting validation scores...")
    scores = -cross_val_score(model, x_train, y_train, cv=cv, scoring=make_scorer(mape, greater_is_better=False))
    print("cross val scores for score:{}, avg:{}, std:{}".format(scores, scores.mean(), scores.std()))

In [176]:
def dict_product(d):
    keys = d.keys()
    for element in product(*d.values()):
        yield dict(zip(keys, element))
        
def evaluate_on_testset(model, verbose=1):
    y_preds_test = model.predict(x_test)
    mae_test = np.absolute((y_preds_test.reshape(1, -1) - y_test.values)).mean()
    if verbose > 0:
        print("MAE on test set: {}".format(mae_test))
    return mae_test

def GridSearchWithVal(model_class, param_grid, metrics='mae'):
    combinations = list(dict_product(param_grid))
    min_metrics = math.inf
    best_comb = None
    print("{} combinations in total. Metric:{}".format(len(combinations), metrics))
    for idx, comb in enumerate(combinations):
        model = model_class(**comb)
        model.fit(x_train, y_train)
        y_preds = model.predict(x_test)
        #print("y_preds:{}".format(y_preds))
        #print("y_test:{}".format(y_test.values))
        error = 0;
        if metrics == 'mape':
            metrics_num = np.absolute((y_preds - y_test) / y_test).mean()
        else:
            metrics_num = np.absolute(y_preds - y_test).mean()
        if metrics_num < min_metrics:
            min_metrics = metrics_num
            best_comb = comb
        progress_str = "{} / {}, best comb: {}, best score: {}".format(idx + 1, len(combinations), best_comb, min_metrics)
        sys.stdout.write('\r' + progress_str)
            
    print("best params:{}".format(best_comb))     
    print("min {}: {}".format(metrics, min_metrics))
    return best_comb

In [180]:
rf_param_grid = {"n_estimators" : [200, 230, 250, 260],
                  "criterion" : ["mae"],
                  "max_depth": [None],
                  "max_features": [0.1, 0.2, 0.5, 0.8, None],
                  "min_samples_split": [2, 3, 4, 5],
                  "min_samples_leaf": [1, 3, 5, 10],
                  "bootstrap": [True, False]
                 }
rf_best = GridSearchWithVal(RandomForestRegressor, rf_param_grid, metrics='mape')

640 combinations in total. Metric:mape
640 / 640, best comb: {'n_estimators': 250, 'criterion': 'mae', 'max_depth': None, 'max_features': None, 'min_samples_split': 2, 'min_samples_leaf': 1, 'bootstrap': True}, best score: 0.05858810388762005best params:{'n_estimators': 250, 'criterion': 'mae', 'max_depth': None, 'max_features': None, 'min_samples_split': 2, 'min_samples_leaf': 1, 'bootstrap': True}
min mape: 0.05858810388762005


In [181]:
rf_best_model = RandomForestRegressor(**rf_best)
rf_best_model.fit(x_train, y_train)
train_n_evaluate(rf_best_model, 'random_forest', cv=10)

training random_forest regressor...
random_forest regressor trained, saving model...
saving model finished, getting validation scores...
cross val scores for score:[0.06352496 0.05128431 0.06347095 0.04380266 0.05123607 0.04689049
 0.06505285 0.04485762 0.04114323 0.0644763 ], avg:0.053573945008639746, std:0.009104678197254652


In [182]:
krr_param_grid = {"alpha": [0.3, 0.5, 0.6, 0.7, 0.9, 1, 2, 5, 7],
                  "kernel": ['polynomial', 'rbf', 'sigmoid'],
                  "degree": [1, 2, 3, 5],
                  "coef0": [1, 2, 3, 5, 7, 10, 12, 14]
                 }
krr_best = GridSearchWithVal(KernelRidge, krr_param_grid, metrics='mape')
krr_best_model = KernelRidge(**krr_best)
krr_best_model.fit(x_train, y_train)
train_n_evaluate(krr_best_model, 'Kernel_Ridge', cv=10)

864 combinations in total. Metric:mape
864 / 864, best comb: {'alpha': 0.3, 'kernel': 'polynomial', 'degree': 2, 'coef0': 14}, best score: 0.0317666114400075best params:{'alpha': 0.3, 'kernel': 'polynomial', 'degree': 2, 'coef0': 14}
min mape: 0.0317666114400075
training Kernel_Ridge regressor...
Kernel_Ridge regressor trained, saving model...
saving model finished, getting validation scores...
cross val scores for score:[0.03986139 0.027992   0.03517795 0.02755166 0.03227555 0.02630381
 0.03941805 0.0299494  0.0300616  0.03612501], avg:0.0324716424753347, std:0.004669648545742274


In [183]:
lasso_param_grid = {"alpha" : [1e-4, 5e-4, 8e4, 1e-3, 3e-3, 5e-3, 0.01, 0.05, 0.1, 0.2, 0.5],
                    "fit_intercept": [True, False],
                    "normalize": [True, False],
                    "precompute": [True, False],
                    "tol": [1e-3, 5e-3, 6e-3, 8e-3, 9e-3, 0.01, 0.02, 0.03, 0.04, 0.05, 0.08, 0.1, 0.2],
                    "positive": [True, False],
                    "selection": ["cyclic", "random"]
                 }
lasso_best_param = GridSearchWithVal(Lasso, lasso_param_grid, metrics='mape')
lasso_best_model = Lasso(**lasso_best_param)
lasso_best_model.fit(x_train, y_train)
train_n_evaluate(lasso_best_model, 'Lasso', cv=10)

4576 combinations in total. Metric:mape
4576 / 4576, best comb: {'alpha': 0.001, 'fit_intercept': True, 'normalize': False, 'precompute': True, 'tol': 0.001, 'positive': False, 'selection': 'cyclic'}, best score: 0.030396695948559763best params:{'alpha': 0.001, 'fit_intercept': True, 'normalize': False, 'precompute': True, 'tol': 0.001, 'positive': False, 'selection': 'cyclic'}
min mape: 0.030396695948559763
training Lasso regressor...
Lasso regressor trained, saving model...
saving model finished, getting validation scores...
cross val scores for score:[0.03927578 0.02680738 0.03616828 0.02795596 0.0336462  0.02744191
 0.04044584 0.02899455 0.02918416 0.03537721], avg:0.03252972662207824, std:0.004837444619386056


In [187]:
xgb_param_grid = {"objective" : ['reg:squarederror'],
                  "n_estimators": [250, 300, 400, 500],
                  "base_score" : [0.05, 0.06, 0.07, 0.08, 1],
                  "max_depth": [3, 5, 7, 10],
                  "gamma": [1e-4, 3e-4, 5e-4, 0.001, 0.003, 0.005],
                  "min_child_weight": range(1, 9, 2),
                  "learning_rate": [0.05, 0.1, 0.15, 0.2, 0.25, 0.3]
                 }
xgb_best = GridSearchWithVal(XGBRegressor, xgb_param_grid, metrics='mape')

11520 combinations in total. Metric:mape
11520 / 11520, best comb: {'objective': 'reg:squarederror', 'n_estimators': 250, 'base_score': 0.07, 'max_depth': 5, 'gamma': 0.0003, 'min_child_weight': 7, 'learning_rate': 0.2}, best score: 0.039342083002841526best params:{'objective': 'reg:squarederror', 'n_estimators': 250, 'base_score': 0.07, 'max_depth': 5, 'gamma': 0.0003, 'min_child_weight': 7, 'learning_rate': 0.2}
min mape: 0.039342083002841526


In [188]:
xbg_best_model = XGBRegressor(**xgb_best)
xbg_best_model.fit(x_train, y_train)
train_n_evaluate(xbg_best_model, 'xgboost', cv=10)

training xgboost regressor...
xgboost regressor trained, saving model...
saving model finished, getting validation scores...
cross val scores for score:[0.05486808 0.040516   0.05157001 0.0397778  0.03925434 0.0418365
 0.05244702 0.03444873 0.03358722 0.05053638], avg:0.0438842072200908, std:0.0073904143100523645


In [194]:
gbr_model = GradientBoostingRegressor()
model_name = 'dt_model_gbr'

# run cross validation on model to find best parameters
param_grid = {"n_estimators" : [1000, 1050, 1100],
                  "loss" : ["huber", "lad"],
                  "learning_rate": [0.01, 0.05, 0.1, 0.2, 0.25, 0.3, 0.35, 0.4],
                  "max_depth": [6, 7, 8, 9, 10, 12, 13, 15, 17, 20, 25],
                  "min_samples_leaf": [20, 30, 40, 45, 50, 55, 60, 70, 80],
                  "max_features": [0.7, 0.5, 0.3, 0.2, 0.1, 0.05, None] 
             }
cv_gbr_model = GridSearchCV(gbr_model, param_grid, n_jobs=-1).fit(x_train, y_train)
train_n_evaluate(cv_gbr_model, 'gb_GridSearchCV', cv=10)

KeyboardInterrupt: 

In [189]:
gb_param_grid = {"n_estimators" : [1000, 1050, 1100],
                  "loss" : ["huber", "lad"],
                  "learning_rate": [0.01, 0.05, 0.1, 0.2, 0.25, 0.3, 0.35, 0.4],
                  "max_depth": [6, 7, 8, 9, 10, 12, 13, 15, 17, 20, 25],
                  "min_samples_leaf": [20, 30, 40, 45, 50, 55, 60, 70, 80],
                  "max_features": [0.7, 0.5, 0.3, 0.2, 0.1, 0.05, None] 
                 }
gb_best = GridSearchWithVal(GradientBoostingRegressor, gb_param_grid, metrics='mape')

33264 combinations in total. Metric:mae
1 / 33264, best comb: {'n_estimators': 1000, 'loss': 'huber', 'learning_rate': 0.01, 'max_depth': 6, 'min_samples_leaf': 20, 'max_features': 0.7}, best score: 0.10552022282760691

KeyboardInterrupt: 

In [192]:
class AveragingModels(BaseEstimator, RegressorMixin, TransformerMixin):
    def __init__(self, models):
        self.models = models
        
    def fit(self, X, y):
        self.models_ = [clone(x) for x in self.models]        
        for model in self.models_:
            model.fit(X, y)
        return self
    
    def predict(self, X):
        predictions = np.column_stack([model.predict(X) for model in self.models_])
        return np.mean(predictions, axis=1)
    
averaged_models = AveragingModels(models=(xbg_best_model, lasso_best_model, krr_best_model, rf_best_model))
averaged_models.fit(x_train, y_train)
train_n_evaluate(averaged_models, 'averaged', cv=10)

training averaged regressor...
averaged regressor trained, saving model...
saving model finished, getting validation scores...
cross val scores for score:[0.0458757  0.03303246 0.04215937 0.02997912 0.034429   0.03267847
 0.04602536 0.0266083  0.028591   0.04059421], avg:0.03599729957965181, std:0.006773011282718774
