In [1]:
import pandas as pd
import numpy as np
import math, sys
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from joblib import dump, load
from itertools import product
import matplotlib.pyplot as plt
import seaborn as sns

df = pd.read_csv('../data/raw/intern_data.csv', index_col=0)
num_cols = ['a', 'b', 'd', 'e', 'f', 'g']
ctg_cols = ['c', 'h']
Y_LABEL = 'y'
df_dummy = pd.get_dummies(df, columns=ctg_cols)
df_dummy

Unnamed: 0,a,b,d,e,f,g,y,c_blue,c_green,c_red,c_yellow,h_black,h_white
7,0.951786,0.669570,0.170130,0.623469,0.925886,0.812685,3.707514,0,1,0,0,0,1
43,0.510447,0.922627,0.087899,0.025415,0.698444,0.658545,2.689243,0,0,1,0,0,1
47,0.294838,0.351081,0.710892,0.699661,0.545722,0.836863,2.886508,0,0,0,1,1,0
53,0.798645,0.572042,0.026137,0.609730,0.488668,0.342675,2.478168,0,1,0,0,1,0
54,0.689666,0.395323,0.172448,0.736433,0.708408,0.695521,3.182666,0,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4889,0.195745,0.791511,0.784001,0.778692,0.407301,0.895939,3.646691,0,1,0,0,0,1
4910,0.995119,0.076542,0.326500,0.829949,0.500763,0.545784,3.270344,0,0,0,1,0,1
4920,0.091773,0.326965,0.922553,0.257745,0.348771,0.624851,2.672514,0,1,0,0,0,1
4931,0.761853,0.654755,0.252334,0.128781,0.658069,0.405367,1.259850,1,0,0,0,1,0


In [2]:
def get_train_test_sets(df_input, test_size=0.2):
    train_set, test_set = train_test_split(df_input, test_size=test_size, random_state=42)
    x_train = train_set.drop(columns=[Y_LABEL], axis=1)
    x_test = test_set.drop(columns=[Y_LABEL], axis=1)
    #y_train, y_test = num_pipeline.fit_transform(train_set[[Y_LABEL]]), num_pipeline.fit_transform(test_set[[Y_LABEL]])
    y_train, y_test = train_set[Y_LABEL], test_set[Y_LABEL]
    return x_train, x_test, y_train, y_test

x_train, x_test, y_train, y_test = get_train_test_sets(df_dummy)
y_train

2364    1.976107
4237    1.909539
226     2.334746
3197    2.464800
3291    3.472318
          ...   
1019    2.640457
2628    2.348951
3476    1.386866
4258    2.462889
1002    2.739158
Name: y, Length: 400, dtype: float64

In [3]:
def train_n_evaluate(model, model_name, cv=3):
    print("training {} regressor...".format(model_name))
    model.fit(x_train, y_train)
    print("{} regressor trained, saving model...".format(model_name))
    dump(model, '../models/{}.joblib'.format(model_name))
    print("saving model finished, getting validation scores...")
    scores = cross_val_score(model, x_train, y_train, cv=cv, scoring='r2')
    print("cross val scores for score1:{}".format(scores))

In [18]:
rf_reg = RandomForestRegressor(n_estimators=2000, verbose=1, criterion='mse')
train_n_evaluate(rf_reg, "random_forest", cv=5)

training random_forest regressor...


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 2000 out of 2000 | elapsed:    6.2s finished


random_forest regressor trained, saving model...
saving model finished, getting validation scores...


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 2000 out of 2000 | elapsed:    4.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 2000 out of 2000 | elapsed:    0.2s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 2000 out of 2000 | elapsed:    4.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 2000 out of 2000 | elapsed:    0.2s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 2000 out of 2000 | elapsed:    4.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 2000 out of 2000 | elapsed:    0.2s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[

cross val scores for score1:[0.86695559 0.89572946 0.82881456 0.87821673 0.81749803]


[Parallel(n_jobs=1)]: Done 2000 out of 2000 | elapsed:    4.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 2000 out of 2000 | elapsed:    0.2s finished


In [54]:
def dict_product(d):
    keys = d.keys()
    for element in product(*d.values()):
        yield dict(zip(keys, element))
        
def evaluate_on_testset(model):
    y_preds_test = model.predict(x_test)
    mae_test = np.absolute((y_preds_test.reshape(1, -1) - y_test.values)).mean()
    print("MAE on test set: {}".format(mae_test))

def GridSearchWithVal(model_class, param_grid):
    combinations = list(dict_product(param_grid))
    min_mae = math.inf
    best_comb = None
    print("{} combinations in total".format(len(combinations)))
    for idx, comb in enumerate(combinations):
        model = model_class(**comb)
        model.fit(x_train, y_train)
        y_preds = model.predict(x_test)
        #print("y_preds:{}".format(y_preds))
        #print("y_test:{}".format(y_test.values))
        error = 0;
        mae = np.absolute((y_preds - y_test)).mean()
        if mae < min_mae:
            min_mae = mae
            best_comb = comb
        progress_str = "{} / {}, best comb: {}, best score: {}".format(idx + 1, len(combinations), best_comb, min_mae)
        sys.stdout.write('\r' + progress_str)
            
    print("best params:{}".format(best_comb))     
    print("min mae:{}".format(min_mae))
    return best_comb

In [9]:
rf_param_grid = {"n_estimators" : [150, 200, 250, 500, 1000, 1500],
                  "criterion" : ["mae"],
                  "max_depth": [5, 6, 7, 8, None],
                  "max_features": [0.2, 0.5, 'sqrt', 'auto'],
                  "min_samples_split": [2, 3, 4, 5],
                  "min_samples_leaf": [1, 3],
                  #"bootstrap": [True, False]
                 }
rf_best = GridSearchWithVal(RandomForestRegressor, rf_param_grid)

960 combinations in total
960 / 960, best comb: {'n_estimators': 250, 'criterion': 'mae', 'max_depth': None, 'max_features': 0.5, 'min_samples_split': 2, 'min_samples_leaf': 1}, best score: 0.16225088358780332best params:{'n_estimators': 250, 'criterion': 'mae', 'max_depth': None, 'max_features': 0.5, 'min_samples_split': 2, 'min_samples_leaf': 1}
min mae:0.16225088358780332


In [55]:
xgb_param_grid = {"objective" : ['reg:squarederror'],
                  "n_estimators": [200, 250, 300, 500, 1000, 1500],
                  "base_score" : [0.01, 0.02, 0.03, 0.04],
                  "max_depth": [4, 5, 6, 7],
                  "gamma": [0.001, 0.003, 0.005, 0.01],
                  "min_child_weight": range(1, 8, 2),
                  "learning_rate": [0.01, 0.05, 0.1, 0.2]
                 }
xgb_best = GridSearchWithVal(XGBRegressor, xgb_param_grid)

MAE on test set: 0.16258204581981756
