In [99]:
import pandas as pd
import numpy as np
import math, sys
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from joblib import dump, load
from itertools import product
import matplotlib.pyplot as plt
import seaborn as sns

warnings.filterwarnings("ignore")

df = pd.read_csv('../data/raw/intern_data.csv', index_col=0)
num_cols = ['a', 'b', 'd', 'e', 'f', 'g']
ctg_cols = ['c', 'h']
Y_LABEL = 'y'
df_dummy = pd.get_dummies(df, columns=ctg_cols)
df_dummy

Unnamed: 0,a,b,d,e,f,g,y,c_blue,c_green,c_red,c_yellow,h_black,h_white
7,0.951786,0.669570,0.170130,0.623469,0.925886,0.812685,3.707514,0,1,0,0,0,1
43,0.510447,0.922627,0.087899,0.025415,0.698444,0.658545,2.689243,0,0,1,0,0,1
47,0.294838,0.351081,0.710892,0.699661,0.545722,0.836863,2.886508,0,0,0,1,1,0
53,0.798645,0.572042,0.026137,0.609730,0.488668,0.342675,2.478168,0,1,0,0,1,0
54,0.689666,0.395323,0.172448,0.736433,0.708408,0.695521,3.182666,0,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4889,0.195745,0.791511,0.784001,0.778692,0.407301,0.895939,3.646691,0,1,0,0,0,1
4910,0.995119,0.076542,0.326500,0.829949,0.500763,0.545784,3.270344,0,0,0,1,0,1
4920,0.091773,0.326965,0.922553,0.257745,0.348771,0.624851,2.672514,0,1,0,0,0,1
4931,0.761853,0.654755,0.252334,0.128781,0.658069,0.405367,1.259850,1,0,0,0,1,0


In [100]:
df_dummy['eh'] = df_dummy['e'] * df_dummy['h_white'] * df_dummy['c_blue'].map({0: 1, 1: 0})
df_dummy['f+g'] = (df_dummy['f'] + df_dummy['g']) * df_dummy['h_white'] * df_dummy['c_blue'].map({0: 1, 1: 0})
df_dummy['eh(f+g)'] = df_dummy['eh'] * df_dummy['f+g']
df_dummy['eh+f+g'] = df_dummy['eh'] + df_dummy['f+g']
df_dummy

Unnamed: 0,a,b,d,e,f,g,y,c_blue,c_green,c_red,c_yellow,h_black,h_white,eh,f+g,eh(f+g),eh+f+g
7,0.951786,0.669570,0.170130,0.623469,0.925886,0.812685,3.707514,0,1,0,0,0,1,0.623469,1.738571,1.083945,2.362040
43,0.510447,0.922627,0.087899,0.025415,0.698444,0.658545,2.689243,0,0,1,0,0,1,0.025415,1.356988,0.034488,1.382404
47,0.294838,0.351081,0.710892,0.699661,0.545722,0.836863,2.886508,0,0,0,1,1,0,0.000000,0.000000,0.000000,0.000000
53,0.798645,0.572042,0.026137,0.609730,0.488668,0.342675,2.478168,0,1,0,0,1,0,0.000000,0.000000,0.000000,0.000000
54,0.689666,0.395323,0.172448,0.736433,0.708408,0.695521,3.182666,0,0,1,0,0,1,0.736433,1.403930,1.033900,2.140363
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4889,0.195745,0.791511,0.784001,0.778692,0.407301,0.895939,3.646691,0,1,0,0,0,1,0.778692,1.303240,1.014822,2.081932
4910,0.995119,0.076542,0.326500,0.829949,0.500763,0.545784,3.270344,0,0,0,1,0,1,0.829949,1.046547,0.868581,1.876496
4920,0.091773,0.326965,0.922553,0.257745,0.348771,0.624851,2.672514,0,1,0,0,0,1,0.257745,0.973623,0.250946,1.231368
4931,0.761853,0.654755,0.252334,0.128781,0.658069,0.405367,1.259850,1,0,0,0,1,0,0.000000,0.000000,0.000000,0.000000


In [101]:
#column_to_nmlz = num_cols + ['eh']
column_to_nmlz = list(set(df.columns) - set([Y_LABEL]) - set(ctg_cols))
def normalize_data(df_input, scaler='robust'):
    result = df_input.copy(deep=True)
    if scaler == 'robust':
        num_pipeline = Pipeline([('robust_scaler', RobustScaler())])
    else:
        num_pipeline = Pipeline([('std_scaler', StandardScaler())])    
    #column_to_nmlz = list(set(df_input.columns) - set([y_label]))
    result[column_to_nmlz] = num_pipeline.fit_transform(df_input[column_to_nmlz])
    return result
df_dummy_normed = normalize_data(df_dummy, scaler='std')
df_dummy_normed

Unnamed: 0,a,b,d,e,f,g,y,c_blue,c_green,c_red,c_yellow,h_black,h_white,eh,f+g,eh(f+g),eh+f+g
7,1.552595,0.561935,-1.166187,0.447364,1.368097,1.071096,3.707514,0,1,0,0,0,1,0.623469,1.738571,1.083945,2.362040
43,0.044537,1.428104,-1.455388,-1.753411,0.588872,0.528442,2.689243,0,0,1,0,0,1,0.025415,1.356988,0.034488,1.382404
47,-0.692202,-0.528193,0.735634,0.727741,0.065641,1.156217,2.886508,0,0,0,1,1,0,0.000000,0.000000,0.000000,0.000000
53,1.029311,0.228115,-1.672603,0.396806,-0.129825,-0.583588,2.478168,0,1,0,0,1,0,0.000000,0.000000,0.000000,0.000000
54,0.656931,-0.376760,-1.158036,0.863059,0.623012,0.658619,3.182666,0,0,1,0,0,1,0.736433,1.403930,1.033900,2.140363
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4889,-1.030802,0.979316,0.992755,1.018567,-0.408593,1.364197,3.646691,0,1,0,0,0,1,0.778692,1.303240,1.014822,2.081932
4910,1.700667,-1.467889,-0.616245,1.207189,-0.088388,0.131463,3.270344,0,0,0,1,0,1,0.829949,1.046547,0.868581,1.876496
4920,-1.386076,-0.610738,1.480035,-0.898462,-0.609117,0.409822,2.672514,0,1,0,0,0,1,0.257745,0.973623,0.250946,1.231368
4931,0.903592,0.511226,-0.877083,-1.373038,0.450548,-0.362880,1.259850,1,0,0,0,1,0,0.000000,0.000000,0.000000,0.000000


In [102]:
def get_train_test_sets(test_size=0.2, dummy=False, normed=False):
    if dummy is True and normed is True:
        df_input = df_dummy_normed
    elif dummy is True and normed is False:
        df_input = df_dummy
    else:
        df_input = df
            
    train_set, test_set = train_test_split(df_input, test_size=test_size, random_state=42)
    x_train = train_set.drop(columns=[Y_LABEL], axis=1)
    x_test = test_set.drop(columns=[Y_LABEL], axis=1)
    #y_train, y_test = num_pipeline.fit_transform(train_set[[Y_LABEL]]), num_pipeline.fit_transform(test_set[[Y_LABEL]])
    y_train, y_test = train_set[Y_LABEL], test_set[Y_LABEL]
    y_all = df_input[Y_LABEL]
    x_all = df_input.drop(columns=[Y_LABEL])
    return x_train, x_test, y_train, y_test, x_all, y_all

x_train, x_test, y_train, y_test, x_all, y_all = get_train_test_sets(dummy=True, normed=False)
x_train

Unnamed: 0,a,b,d,e,f,g,c_blue,c_green,c_red,c_yellow,h_black,h_white,eh,f+g,eh(f+g),eh+f+g
2364,0.090244,0.155677,0.272952,0.038507,0.744463,0.350609,1,0,0,0,0,1,0.000000,0.000000,0.000000,0.000000
4237,0.559451,0.932398,0.589896,0.276736,0.122540,0.018161,0,0,0,1,1,0,0.000000,0.000000,0.000000,0.000000
226,0.130311,0.473161,0.110819,0.114864,0.093494,0.054012,0,0,0,1,0,1,0.114864,0.147505,0.016943,0.262370
3197,0.464829,0.418934,0.043737,0.563155,0.766045,0.913024,0,0,1,0,1,0,0.000000,0.000000,0.000000,0.000000
3291,0.296832,0.050419,0.374237,0.961693,0.792818,0.410662,0,1,0,0,0,1,0.961693,1.203480,1.157378,2.165173
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1019,0.294201,0.348118,0.376214,0.790190,0.586324,0.527688,1,0,0,0,0,1,0.000000,0.000000,0.000000,0.000000
2628,0.661600,0.041014,0.633891,0.107164,0.913002,0.350590,0,0,1,0,0,1,0.107164,1.263592,0.135411,1.370755
3476,0.790146,0.141229,0.859917,0.268874,0.109422,0.475036,1,0,0,0,0,1,0.000000,0.000000,0.000000,0.000000
4258,0.212004,0.717193,0.244838,0.109056,0.955032,0.628380,0,1,0,0,1,0,0.000000,0.000000,0.000000,0.000000


In [103]:
def train_n_evaluate(model, model_name, cv=3, scoring='r2'):
    print("training {} regressor...".format(model_name))
    #model.fit(x_train, y_train)
    print("{} regressor trained, saving model...".format(model_name))
    dump(model, '../models/{}.joblib'.format(model_name))
    print("saving model finished, getting validation scores...")
    scores = -cross_val_score(model, x_train, y_train, cv=cv, scoring=scoring)
    print("cross val scores for score:{}, avg:{}, std:{}".format(scores, scores.mean(), scores.std()))

In [86]:
rf_reg = RandomForestRegressor(n_estimators=400, verbose=1, criterion='mse')
train_n_evaluate(rf_reg, "random_forest", cv=10)

training random_forest regressor...
random_forest regressor trained, saving model...
saving model finished, getting validation scores...


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 400 out of 400 | elapsed:    1.7s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 400 out of 400 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 400 out of 400 | elapsed:    1.2s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 400 out of 400 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 400 out of 400 | elapsed:    1.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 400 out of 400 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_j

cross val scores for score:[-0.88145859 -0.91423767 -0.86305172 -0.90147832 -0.83207893 -0.80765632
 -0.79117993 -0.91466206 -0.85580476 -0.88495383], avg:-0.8646562146262099, std:0.04098663238886827


[Parallel(n_jobs=1)]: Done 400 out of 400 | elapsed:    1.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 400 out of 400 | elapsed:    0.0s finished


In [106]:
def dict_product(d):
    keys = d.keys()
    for element in product(*d.values()):
        yield dict(zip(keys, element))
        
def evaluate_on_testset(model, verbose=1):
    y_preds_test = model.predict(x_test)
    mae_test = np.absolute((y_preds_test.reshape(1, -1) - y_test.values)).mean()
    if verbose > 0:
        print("MAE on test set: {}".format(mae_test))
    return mae_test

def GridSearchWithVal(model_class, param_grid, metrics='mae'):
    combinations = list(dict_product(param_grid))
    min_metrics = math.inf
    best_comb = None
    print("{} combinations in total".format(len(combinations)))
    for idx, comb in enumerate(combinations):
        model = model_class(**comb)
        model.fit(x_train, y_train)
        y_preds = model.predict(x_test)
        #print("y_preds:{}".format(y_preds))
        #print("y_test:{}".format(y_test.values))
        error = 0;
        if metrics == 'mape':
            metrics = np.absolute((y_preds - y_test) / y_test).mean()
        else:
            metrics = np.absolute(y_preds - y_test).mean()
        if metrics < min_metrics:
            min_metrics = metrics
            best_comb = comb
        progress_str = "{} / {}, best comb: {}, best score: {}".format(idx + 1, len(combinations), best_comb, min_metrics)
        sys.stdout.write('\r' + progress_str)
            
    print("best params:{}".format(best_comb))     
    print("min metrics:{}".format(min_metrics))
    return best_comb

In [105]:
rf_param_grid = {"n_estimators" : [230, 235, 240, 245, 250],
                  "criterion" : ["mae"],
                  "max_depth": [5, 7, 10, None],
                  "max_features": [0.1, 0.2, 0.5, None],
                  "min_samples_split": [2, 3, 4],
                  "min_samples_leaf": [1, 3, 5, 10],
                  "bootstrap": [True, False]
                 }
rf_best = GridSearchWithVal(RandomForestRegressor, rf_param_grid, metrics='mape')

1920 combinations in total
1920 / 1920, best comb: {'n_estimators': 250, 'criterion': 'mae', 'max_depth': None, 'max_features': 0.2, 'min_samples_split': 3, 'min_samples_leaf': 1, 'bootstrap': False}, best score: 0.06927973599343844best params:{'n_estimators': 250, 'criterion': 'mae', 'max_depth': None, 'max_features': 0.2, 'min_samples_split': 3, 'min_samples_leaf': 1, 'bootstrap': False}
min mae:0.06927973599343844


In [63]:
rf_best_params = {'n_estimators': 250, 'criterion': 'mae', 'max_depth': None, 'max_features': 0.5, 
                  'min_samples_split': 2, 'min_samples_leaf': 1}
rf_best = RandomForestRegressor(**rf_best_params)
rf_best.fit(x_train, y_train)
train_n_evaluate(rf_best, 'random forest', cv=10)

training random forest regressor...
random forest regressor trained, saving model...
saving model finished, getting validation scores...
cross val scores for score1:[0.1609142  0.14313115 0.16788294 0.15437114 0.1648534  0.17687647
 0.17030481 0.16292208 0.18273472 0.19423064]


In [60]:
xgb_param_grid = {"objective" : ['reg:squarederror'],
                  "n_estimators": [230, 250, 280],
                  "base_score" : [0.01, 0.02, 0.03, 0.04],
                  "max_depth": [2, 3, 4, 5, 6],
                  "gamma": [0.001, 0.003, 0.005, 0.01],
                  "min_child_weight": range(1, 8, 2),
                  "learning_rate": [0.005, 0.01, 0.05, 0.1]
                 }
xgb_best = GridSearchWithVal(XGBRegressor, xgb_param_grid)

3840 combinations in total
3840 / 3840, best comb: {'objective': 'reg:squarederror', 'n_estimators': 280, 'base_score': 0.01, 'max_depth': 2, 'gamma': 0.001, 'min_child_weight': 7, 'learning_rate': 0.1}, best score: 0.1152357203099218best params:{'objective': 'reg:squarederror', 'n_estimators': 280, 'base_score': 0.01, 'max_depth': 2, 'gamma': 0.001, 'min_child_weight': 7, 'learning_rate': 0.1}
min mae:0.1152357203099218


In [63]:
xgb_best_params = {'objective': 'reg:squarederror', 'n_estimators': 250, 'base_score': 0.03, 'max_depth': 4, 
                   'gamma': 0.005, 'min_child_weight': 3, 'learning_rate': 0.05}
xbg_best = XGBRegressor(**xgb_best_params)
xbg_best.fit(x_train, y_train)
evaluate_on_testset(xbg_best)

MAE on test set: 0.1357397422494753


  if getattr(data, 'base', None) is not None and \
  data.base is not None and isinstance(data, np.ndarray) \


In [65]:
gb_param_grid = {"n_estimators" : [950, 1000, 1050, 1100],
                  "loss" : ["huber", "lad"],
                  "learning_rate": [0.01, 0.005, 0.1, 0.2],
                  "max_depth": [7, 8, 9, 10, 12, 15, 20],
                  "min_samples_leaf": [5, 10, 20, 50, 40, 60],
                  "max_features": [0.5, 0.3, 0.2, 0.1, 0.05, "sqrt", None] 
                 }
gb_best = GridSearchWithVal(GradientBoostingRegressor, gb_param_grid)

7200 combinations in total
7200 / 7200, best comb: {'n_estimators': 1050, 'loss': 'huber', 'learning_rate': 0.1, 'max_depth': 12, 'min_samples_leaf': 50, 'max_features': 0.05}, best score: 0.10486292886151947best params:{'n_estimators': 1050, 'loss': 'huber', 'learning_rate': 0.1, 'max_depth': 12, 'min_samples_leaf': 50, 'max_features': 0.05}
min mae:0.10486292886151947


In [None]:
gb_param_grid = {"n_estimators" : [1000, 1010, 1030],
                  "loss" : ["huber", "lad"],
                  "learning_rate": [0.01, 0.05, 0.1, 0.2, 0.25, 0.3, 0.35, 0.4],
                  "max_depth": [6, 7, 8, 9, 10, 12, 13, 15, 17, 20, 25],
                  "min_samples_leaf": [20, 30, 40, 45, 50, 55, 60, 70, 80, 100],
                  "max_features": [0.9, 0.7, 0.5, 0.3, 0.2, 0.1, 0.05, None] 
                 }
gb_best = GridSearchWithVal(GradientBoostingRegressor, gb_param_grid)

42240 combinations in total
4495 / 42240, best comb: {'n_estimators': 1000, 'loss': 'huber', 'learning_rate': 0.2, 'max_depth': 8, 'min_samples_leaf': 45, 'max_features': 0.1}, best score: 0.0962017987542613637

In [92]:
def kfold_cv(model, cv=10):
    kfold = KFold(n_splits=cv, shuffle=True, random_state=42)
    mae_tests = []
    for idx, (train, test) in enumerate(kfold.split(x_all, y_all)):
        #print("train:{}".format(train))
        #print("test:{}".format(test))
        #print("s:{}".format(x_train[x_train.index.isin(train)]))
        #train = x_train[x_train.index.isin(train)]
        #_model = create_keras_model(optimizer='Adadelta', neuron=20, init='lecun_uniform', act_alpha=0.05)
        #model.fit(x_all[x_all.index.isin(train)], y_all[y_all.index.isin(train)], epochs=500, verbose=0)
        model.fit(x_all[x_all.index.isin(train)], y_all[y_all.index.isin(train)])
        _y_preds_test = model.predict(x_all[x_all.index.isin(test)])
        _mae_test = np.absolute((_y_preds_test.reshape(1, -1) - y_all[y_all.index.isin(test)].values)).mean()
        mae_tests.append(_mae_test)
        print("MAE on test set {}: {}".format(idx, _mae_test))
    print("avg MAE: {}, std:{}".format(np.array(mae_tests).mean(), np.array(mae_tests).std()))

gb_model = GradientBoostingRegressor(**gb_best)
train_n_evaluate(gb_model, 'GradientBoosting', scoring='neg_mean_absolute_error', cv=10)

training GradientBoosting regressor...
GradientBoosting regressor trained, saving model...
saving model finished, getting validation scores...
cross val scores for score:[0.08378105 0.11041904 0.11244233 0.09953988 0.12384259 0.10057475
 0.13254724 0.11107828 0.1058806  0.12061528], avg:0.11007210499721823, std:0.013124721875022635


In [77]:
attemps_mae = []
attemps_model = []
for i in range(100):
    gb_model = GradientBoostingRegressor(**gb_best)
    gb_model.fit(x_train, y_train)
    mae = evaluate_on_testset(gb_model, verbose=0)
    attemps_mae.append(mae)
    attemps_model.append(gb_model)
min(attemps_mae)

0.1019687721685082