In [17]:
import pandas as pd
import numpy as np
import math, sys
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from joblib import dump, load
from itertools import product
import matplotlib.pyplot as plt
import seaborn as sns

df = pd.read_csv('../data/raw/intern_data.csv', index_col=0)
num_cols = ['a', 'b', 'd', 'e', 'f', 'g']
ctg_cols = ['c', 'h']
Y_LABEL = 'y'
df_dummy = pd.get_dummies(df, columns=ctg_cols)
df_dummy

Unnamed: 0,a,b,d,e,f,g,y,c_blue,c_green,c_red,c_yellow,h_black,h_white
7,0.951786,0.669570,0.170130,0.623469,0.925886,0.812685,3.707514,0,1,0,0,0,1
43,0.510447,0.922627,0.087899,0.025415,0.698444,0.658545,2.689243,0,0,1,0,0,1
47,0.294838,0.351081,0.710892,0.699661,0.545722,0.836863,2.886508,0,0,0,1,1,0
53,0.798645,0.572042,0.026137,0.609730,0.488668,0.342675,2.478168,0,1,0,0,1,0
54,0.689666,0.395323,0.172448,0.736433,0.708408,0.695521,3.182666,0,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4889,0.195745,0.791511,0.784001,0.778692,0.407301,0.895939,3.646691,0,1,0,0,0,1
4910,0.995119,0.076542,0.326500,0.829949,0.500763,0.545784,3.270344,0,0,0,1,0,1
4920,0.091773,0.326965,0.922553,0.257745,0.348771,0.624851,2.672514,0,1,0,0,0,1
4931,0.761853,0.654755,0.252334,0.128781,0.658069,0.405367,1.259850,1,0,0,0,1,0


In [18]:
def normalize_data(df_input, scaler='robust'):
    result = df_input.copy(deep=True)
    if scaler == 'robust':
        num_pipeline = Pipeline([('robust_scaler', RobustScaler())])
    else:
        num_pipeline = Pipeline([('std_scaler', StandardScaler())])    
    #column_to_nmlz = list(set(df_input.columns) - set([y_label]))
    result[df_input.columns] = num_pipeline.fit_transform(df_input[df_input.columns])
    return result
df_dummy_normed = normalize_data(df_dummy, scaler='std')
df_dummy_normed

Unnamed: 0,a,b,d,e,f,g,y,c_blue,c_green,c_red,c_yellow,h_black,h_white
7,1.552595,0.561935,-1.166187,0.447364,1.368097,1.071096,1.731107,-0.571192,1.741338,-0.558870,-0.605079,-0.524891,0.524891
43,0.044537,1.428104,-1.455388,-1.753411,0.588872,0.528442,0.010879,-0.571192,-0.574271,1.789324,-0.605079,-0.524891,0.524891
47,-0.692202,-0.528193,0.735634,0.727741,0.065641,1.156217,0.344132,-0.571192,-0.574271,-0.558870,1.652678,1.905159,-1.905159
53,1.029311,0.228115,-1.672603,0.396806,-0.129825,-0.583588,-0.345702,-0.571192,1.741338,-0.558870,-0.605079,1.905159,-1.905159
54,0.656931,-0.376760,-1.158036,0.863059,0.623012,0.658619,0.844450,-0.571192,-0.574271,1.789324,-0.605079,-0.524891,0.524891
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4889,-1.030802,0.979316,0.992755,1.018567,-0.408593,1.364197,1.628355,-0.571192,1.741338,-0.558870,-0.605079,-0.524891,0.524891
4910,1.700667,-1.467889,-0.616245,1.207189,-0.088388,0.131463,0.992569,-0.571192,-0.574271,-0.558870,1.652678,-0.524891,0.524891
4920,-1.386076,-0.610738,1.480035,-0.898462,-0.609117,0.409822,-0.017382,-0.571192,1.741338,-0.558870,-0.605079,-0.524891,0.524891
4931,0.903592,0.511226,-0.877083,-1.373038,0.450548,-0.362880,-2.403881,1.750726,-0.574271,-0.558870,-0.605079,1.905159,-1.905159


In [30]:
def get_train_test_sets(test_size=0.2, dummy=False, normed=False):
    if dummy is True and normed is True:
        df_input = df_dummy_normed
    elif dummy is True and normed is False:
        df_input = df_dummy
    else:
        df_input = df
            
    train_set, test_set = train_test_split(df_input, test_size=test_size, random_state=42)
    x_train = train_set.drop(columns=[Y_LABEL], axis=1)
    x_test = test_set.drop(columns=[Y_LABEL], axis=1)
    #y_train, y_test = num_pipeline.fit_transform(train_set[[Y_LABEL]]), num_pipeline.fit_transform(test_set[[Y_LABEL]])
    y_train, y_test = train_set[Y_LABEL], test_set[Y_LABEL]
    return x_train, x_test, y_train, y_test

x_train, x_test, y_train, y_test = get_train_test_sets(dummy=True, normed=False)
x_train

Unnamed: 0,a,b,d,e,f,g,c_blue,c_green,c_red,c_yellow,h_black,h_white
2364,0.090244,0.155677,0.272952,0.038507,0.744463,0.350609,1,0,0,0,0,1
4237,0.559451,0.932398,0.589896,0.276736,0.122540,0.018161,0,0,0,1,1,0
226,0.130311,0.473161,0.110819,0.114864,0.093494,0.054012,0,0,0,1,0,1
3197,0.464829,0.418934,0.043737,0.563155,0.766045,0.913024,0,0,1,0,1,0
3291,0.296832,0.050419,0.374237,0.961693,0.792818,0.410662,0,1,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...
1019,0.294201,0.348118,0.376214,0.790190,0.586324,0.527688,1,0,0,0,0,1
2628,0.661600,0.041014,0.633891,0.107164,0.913002,0.350590,0,0,1,0,0,1
3476,0.790146,0.141229,0.859917,0.268874,0.109422,0.475036,1,0,0,0,0,1
4258,0.212004,0.717193,0.244838,0.109056,0.955032,0.628380,0,1,0,0,1,0


In [31]:
def train_n_evaluate(model, model_name, cv=3):
    print("training {} regressor...".format(model_name))
    model.fit(x_train, y_train)
    print("{} regressor trained, saving model...".format(model_name))
    dump(model, '../models/{}.joblib'.format(model_name))
    print("saving model finished, getting validation scores...")
    scores = cross_val_score(model, x_train, y_train, cv=cv, scoring='r2')
    print("cross val scores for score1:{}".format(scores))

In [32]:
rf_reg = RandomForestRegressor(n_estimators=2000, verbose=1, criterion='mse')
train_n_evaluate(rf_reg, "random_forest", cv=10)

training random_forest regressor...


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 2000 out of 2000 | elapsed:    3.7s finished


random_forest regressor trained, saving model...
saving model finished, getting validation scores...


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 2000 out of 2000 | elapsed:    3.4s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 2000 out of 2000 | elapsed:    0.1s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 2000 out of 2000 | elapsed:    3.2s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 2000 out of 2000 | elapsed:    0.1s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 2000 out of 2000 | elapsed:    3.2s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 2000 out of 2000 | elapsed:    0.1s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[

cross val scores for score1:[0.89419044 0.86552859 0.87722828 0.89483252 0.86356884 0.80027666
 0.86137755 0.92291656 0.85312511 0.8594391 ]


[Parallel(n_jobs=1)]: Done 2000 out of 2000 | elapsed:    3.2s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 2000 out of 2000 | elapsed:    0.1s finished


In [33]:
def dict_product(d):
    keys = d.keys()
    for element in product(*d.values()):
        yield dict(zip(keys, element))
        
def evaluate_on_testset(model):
    y_preds_test = model.predict(x_test)
    mae_test = np.absolute((y_preds_test.reshape(1, -1) - y_test.values)).mean()
    print("MAE on test set: {}".format(mae_test))

def GridSearchWithVal(model_class, param_grid):
    combinations = list(dict_product(param_grid))
    min_mae = math.inf
    best_comb = None
    print("{} combinations in total".format(len(combinations)))
    for idx, comb in enumerate(combinations):
        model = model_class(**comb)
        model.fit(x_train, y_train)
        y_preds = model.predict(x_test)
        #print("y_preds:{}".format(y_preds))
        #print("y_test:{}".format(y_test.values))
        error = 0;
        mae = np.absolute((y_preds - y_test)).mean()
        if mae < min_mae:
            min_mae = mae
            best_comb = comb
        progress_str = "{} / {}, best comb: {}, best score: {}".format(idx + 1, len(combinations), best_comb, min_mae)
        sys.stdout.write('\r' + progress_str)
            
    print("best params:{}".format(best_comb))     
    print("min mae:{}".format(min_mae))
    return best_comb

In [40]:
rf_param_grid = {"n_estimators" : [230, 250, 280],
                  "criterion" : ["mae"],
                  "max_depth": [5, 6, 7, 8, None],
                  "max_features": [0.2, 0.5, 'sqrt', 'auto'],
                  "min_samples_split": [2, 3, 4, 5],
                  "min_samples_leaf": [1, 3],
                  "bootstrap": [True, False]
                 }
rf_best = GridSearchWithVal(RandomForestRegressor, rf_param_grid)

960 combinations in total
960 / 960, best comb: {'n_estimators': 230, 'criterion': 'mae', 'max_depth': None, 'max_features': 0.5, 'min_samples_split': 4, 'min_samples_leaf': 3, 'bootstrap': False}, best score: 0.15365986811015422best params:{'n_estimators': 230, 'criterion': 'mae', 'max_depth': None, 'max_features': 0.5, 'min_samples_split': 4, 'min_samples_leaf': 3, 'bootstrap': False}
min mae:0.15365986811015422


In [63]:
rf_best_params = {'n_estimators': 250, 'criterion': 'mae', 'max_depth': None, 'max_features': 0.5, 
                  'min_samples_split': 2, 'min_samples_leaf': 1}
rf_best = RandomForestRegressor(**rf_best_params)
rf_best.fit(x_train, y_train)
train_n_evaluate(rf_best, 'random forest', cv=10)

training random forest regressor...
random forest regressor trained, saving model...
saving model finished, getting validation scores...
cross val scores for score1:[0.1609142  0.14313115 0.16788294 0.15437114 0.1648534  0.17687647
 0.17030481 0.16292208 0.18273472 0.19423064]


In [36]:
xgb_param_grid = {"objective" : ['reg:squarederror'],
                  "n_estimators": [250, 2000, 2500, 3000],
                  "base_score" : [0.01, 0.02, 0.03, 0.04],
                  "max_depth": [4, 5, 6, 7],
                  "gamma": [0.001, 0.003, 0.005, 0.01],
                  "min_child_weight": range(1, 8, 2),
                  "learning_rate": [0.01, 0.05, 0.1, 0.2]
                 }
xgb_best = GridSearchWithVal(XGBRegressor, xgb_param_grid)

4096 combinations in total
4096 / 4096, best comb: {'objective': 'reg:squarederror', 'n_estimators': 250, 'base_score': 0.03, 'max_depth': 4, 'gamma': 0.005, 'min_child_weight': 3, 'learning_rate': 0.05}, best score: 0.1109435347653505best params:{'objective': 'reg:squarederror', 'n_estimators': 250, 'base_score': 0.03, 'max_depth': 4, 'gamma': 0.005, 'min_child_weight': 3, 'learning_rate': 0.05}
min mae:0.1109435347653505


In [64]:
xgb_best_params = {'objective': 'reg:squarederror', 'n_estimators': 250, 'base_score': 0.03, 'max_depth': 4, 
                   'gamma': 0.005, 'min_child_weight': 3, 'learning_rate': 0.05}
xbg_best = XGBRegressor(**xgb_best_params)
xbg_best.fit(x_train, y_train)
train_n_evaluate(xbg_best, 'xgboost', cv=10)

training xgboost regressor...
xgboost regressor trained, saving model...
saving model finished, getting validation scores...
cross val scores for score1:[0.12420282 0.13197989 0.12570782 0.10763928 0.13253649 0.12066563
 0.13092093 0.13677098 0.12392393 0.13907098]


In [34]:
gb_param_grid = {"n_estimators" : [200, 1000, 1500, 2000],
                  "loss" : ["huber", "lad"],
                  "learning_rate": [0.01, 0.005, 0.1],
                  "max_depth": [4, 5, 6, 7],
                  "min_samples_leaf": [50, 100, 120],
                  "max_features": [0.5, 0.3, None] 
                 }
gb_best = GridSearchWithVal(GradientBoostingRegressor, gb_param_grid)

864 combinations in total
864 / 864, best comb: {'n_estimators': 2000, 'loss': 'huber', 'learning_rate': 0.01, 'max_depth': 4, 'min_samples_leaf': 50, 'max_features': 0.3}, best score: 0.11571270237292694best params:{'n_estimators': 2000, 'loss': 'huber', 'learning_rate': 0.01, 'max_depth': 4, 'min_samples_leaf': 50, 'max_features': 0.3}
min mae:0.11571270237292694
