# Task2 20minute version: main algorithm
import needed library

In [20]:
import pandas as pd
import datetime as dt
import numpy as np
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn import preprocessing
from sklearn.metrics import fbeta_score, make_scorer, mean_absolute_error
import pickle
import xgboost as xgb
from sklearn.ensemble import RandomForestRegressor

Loading data

In [21]:
# loading training set
training = pd.read_csv('training2.csv')
del training['time']

# loading submission set
submission = pd.read_csv('submission2.csv')
submission_ori = submission.copy()

# prepare columns for X and y
target = 'volume'
predictors = [x for x in training.columns if x not in [target]]

# prepare training data
X = training[predictors]
X = preprocessing.normalize(X);
y = training[target]
#y = preprocessing.normalize(y);

# prepare submission data
X_submission = submission[predictors] # submission predictors

# initialize model
mlp_model = MLPRegressor(
    hidden_layer_sizes=(20, 20),
    solver='adam',
    activation = 'relu',
    max_iter = 200000,
    learning_rate = 'adaptive',
    learning_rate_init = 0.01)

In [14]:
def mape(y_true, y_pred):
    return np.mean(np.abs(y_true - y_pred) / y_true)

loss = make_scorer(mape, greater_is_better=False)

#  Start Grid Searching

## Step 1: 尋找 最佳的 max_depth 和 min_child_weight
介紹各參數可以解決的問題：
* max_depth
   * 顧名思義為各子分類器（決策樹）的深度。
   * 過深的決策樹很容易造成 overfitting。
* min_child_weight
    * 決定在 training 的過程中，各子節點的權重不能過小。
    * 這樣可以避免分類器分類過細，導致 overfitting。

In [31]:
param_test1 = {
    'hidden_layer_sizes': [(6), (4, 4), (5, 5), (4, 4, 4), (6, 6, 6)]#[(6), (20, 20), (10), (20), (20, 20, 20), (40, 20), (40, 30, 20, 10), (200, 200)],
}

gsearch1 = GridSearchCV(
    estimator = mlp_model,
    param_grid = param_test1,
    scoring = loss,
    cv = 5)

gsearch1.fit(X, y)
gsearch1.grid_scores_



[mean: -2.56428, std: 0.10630, params: {'hidden_layer_sizes': 6},
 mean: -2.55781, std: 0.05790, params: {'hidden_layer_sizes': (4, 4)},
 mean: -2.56857, std: 0.07283, params: {'hidden_layer_sizes': (5, 5)},
 mean: -2.59928, std: 0.09154, params: {'hidden_layer_sizes': (4, 4, 4)},
 mean: -2.56244, std: 0.12699, params: {'hidden_layer_sizes': (6, 6, 6)}]

In [29]:
gsearch1.best_params_

{'hidden_layer_sizes': (4, 4)}

印出最佳的參數組合：

經過跳要搜尋找到了最佳的 max_depth 為 5，和最佳的 min_child_weight 為 5。  
因此決定再尋找 5 和 7 周圍的數字有沒有比 5 和 7 更好的結果。

In [32]:
# replace the best model to xgb_model
mlp_model = gsearch1.best_estimator_

param_test2 = {
    'solver':['lbfgs', 'sgd', 'adam']
}

gsearch2 = GridSearchCV(
    estimator = mlp_model,
    param_grid = param_test2,
    scoring = loss,
    cv = 5)

gsearch2.fit(X, y)
gsearch2.grid_scores_



[mean: -2.55603, std: 0.09288, params: {'solver': 'lbfgs'},
 mean: -2.57376, std: 0.08729, params: {'solver': 'sgd'},
 mean: -2.57442, std: 0.09333, params: {'solver': 'adam'}]

印出最佳的參數組合：

In [7]:
print('best_params:', gsearch2.best_params_, 'best_score:', gsearch2.best_score_)

best_params: {'solver': 'adam'} best_score: -3.61061011245


最佳的結果變為：  
best max_depth: 5  
best min_child_weight: 5

## Step2: 尋找 最佳的 gamma
介紹參數 gamma 可以解決的問題：  
* 再決定該節點要不要分離前，確定分離後的 loss 的值下降超過一定的閥值，才會分離該節點。  
* 該閥值就是 gamma
* 若 gamma 越大，模型越保守

In [12]:
# replace the best model to xgb_model
mlp_model = gsearch2.best_estimator_

param_test3 = {
    'learning_rate_init':[0.01, 0.1, 0.05, 0.2]
}

gsearch3 = GridSearchCV(
    estimator = mlp_model,
    param_grid = param_test3,
    scoring = loss,
    cv = 5)

gsearch3.fit(X, y)
gsearch3.grid_scores_



[mean: -5.28397, std: 5.57455, params: {'learning_rate_init': 0.01},
 mean: -4.50005, std: 4.66710, params: {'learning_rate_init': 0.1},
 mean: -9.11911, std: 13.25371, params: {'learning_rate_init': 0.05},
 mean: -2.58049, std: 0.09097, params: {'learning_rate_init': 0.2}]

In [10]:
gsearch2.best_estimator_

MLPRegressor(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=6, learning_rate='adaptive',
       learning_rate_init=0.01, max_iter=200000, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False)

In [14]:
print('best_params:', gsearch3.best_params_, 'best_score:', gsearch3.best_score_)

best_params: {'gamma': 0.4} best_score: 0.799207632233


最佳的 gamma 為 0.2

## Step3: 尋找 最佳的 subsample 與 colsample_bytree

In [17]:
xgb_model = gsearch3.best_estimator_
param_test4 = {
     'subsample':[i/10.0 for i in range(6,10)],
     'colsample_bytree':[i/10.0 for i in range(6,10)]
}

gsearch4 = GridSearchCV(
    estimator = xgb_model,
    param_grid = param_test4,
    scoring = loss,
    cv = 5)

gsearch4.fit(X, y)
gsearch4.grid_scores_



[mean: -0.63090, std: 0.23557, params: {'colsample_bytree': 0.6, 'subsample': 0.6},
 mean: -0.61565, std: 0.26337, params: {'colsample_bytree': 0.6, 'subsample': 0.7},
 mean: -0.59694, std: 0.26087, params: {'colsample_bytree': 0.6, 'subsample': 0.8},
 mean: -0.61737, std: 0.25821, params: {'colsample_bytree': 0.6, 'subsample': 0.9},
 mean: -0.56920, std: 0.24744, params: {'colsample_bytree': 0.7, 'subsample': 0.6},
 mean: -0.56298, std: 0.22767, params: {'colsample_bytree': 0.7, 'subsample': 0.7},
 mean: -0.58071, std: 0.25970, params: {'colsample_bytree': 0.7, 'subsample': 0.8},
 mean: -0.55018, std: 0.27078, params: {'colsample_bytree': 0.7, 'subsample': 0.9},
 mean: -0.51142, std: 0.22797, params: {'colsample_bytree': 0.8, 'subsample': 0.6},
 mean: -0.53027, std: 0.23749, params: {'colsample_bytree': 0.8, 'subsample': 0.7},
 mean: -0.48830, std: 0.23968, params: {'colsample_bytree': 0.8, 'subsample': 0.8},
 mean: -0.52119, std: 0.25451, params: {'colsample_bytree': 0.8, 'subsample'

In [18]:
gsearch4.best_params_

{'colsample_bytree': 0.9, 'subsample': 0.8}

In [19]:
xgb_model = gsearch4.best_estimator_
param_test6 = {
    'reg_alpha':[1e-5, 1e-2, 0.1, 1, 100]
}

gsearch6 = GridSearchCV(
    estimator = xgb_model,
    param_grid = param_test6,
    scoring = loss,
    cv = 5)

gsearch6.fit(X, y)
gsearch6.grid_scores_



[mean: -0.46537, std: 0.21436, params: {'reg_alpha': 1e-05},
 mean: -0.46548, std: 0.21429, params: {'reg_alpha': 0.01},
 mean: -0.48015, std: 0.23282, params: {'reg_alpha': 0.1},
 mean: -0.47803, std: 0.22437, params: {'reg_alpha': 1},
 mean: -0.46626, std: 0.20268, params: {'reg_alpha': 100}]

In [20]:
gsearch6.best_params_

{'reg_alpha': 1e-05}

In [21]:
xgb_model = gsearch6.best_estimator_.fit(X, y)

y_submission = xgb_model.predict(X_submission)

In [22]:
submission_ori[target] = y_submission
submission[target] = submission_ori[target]
submission_ori = submission_ori[submission_ori.columns[0:4]]
submission_ori.to_csv('submission2_result.csv', index=False)

In [23]:
submission

Unnamed: 0,tollgate_id,time_window,direction,volume,pressure,sea_pressure,wind_direction,wind_speed,temperature,rel_humidity,precipitation,week,hour,minute
0,1.0,"[2016-10-25 08:00:00,2016-10-25 08:20:00)",0.0,50.670559,1011.3,1016.3,7.0,3.8,18.0,81.0,0.0,1.0,8.0,0.0
1,1.0,"[2016-10-25 08:20:00,2016-10-25 08:40:00)",0.0,52.457321,1011.3,1016.3,7.0,3.8,18.0,81.0,0.0,1.0,8.0,20.0
2,1.0,"[2016-10-25 08:40:00,2016-10-25 09:00:00)",0.0,52.459213,1011.3,1016.3,7.0,3.8,18.0,81.0,0.0,1.0,8.0,40.0
3,1.0,"[2016-10-25 09:00:00,2016-10-25 09:20:00)",0.0,53.855652,1011.3,1016.3,7.0,3.8,18.0,81.0,0.0,1.0,9.0,0.0
4,1.0,"[2016-10-25 09:20:00,2016-10-25 09:40:00)",0.0,52.533863,1011.3,1016.3,7.0,3.8,18.0,81.0,0.0,1.0,9.0,20.0
5,1.0,"[2016-10-25 09:40:00,2016-10-25 10:00:00)",0.0,51.642765,1011.3,1016.3,7.0,3.8,18.0,81.0,0.0,1.0,9.0,40.0
6,1.0,"[2016-10-25 08:00:00,2016-10-25 08:20:00)",1.0,96.182449,1011.3,1016.3,7.0,3.8,18.0,81.0,0.0,1.0,8.0,0.0
7,1.0,"[2016-10-25 08:20:00,2016-10-25 08:40:00)",1.0,99.676285,1011.3,1016.3,7.0,3.8,18.0,81.0,0.0,1.0,8.0,20.0
8,1.0,"[2016-10-25 08:40:00,2016-10-25 09:00:00)",1.0,99.679382,1011.3,1016.3,7.0,3.8,18.0,81.0,0.0,1.0,8.0,40.0
9,1.0,"[2016-10-25 09:00:00,2016-10-25 09:20:00)",1.0,102.442223,1011.3,1016.3,7.0,3.8,18.0,81.0,0.0,1.0,9.0,0.0
