In [1]:
# imports
import models
import utils
import pandas as pd

import numpy as np

from sklearn.preprocessing import MinMaxScaler, Imputer

In [20]:
# Load the pickled train/test feature tables and report each table's shape.
train_data = utils.load_pickle("./final_features_train2.pkl")
test_data = utils.load_pickle("./final_features_test2.pkl")
for split in (train_data, test_data):
    print(split.shape)

(307511, 608)
(48744, 607)


In [21]:
# Set aside the training labels and the SK_ID_CURR column of both splits
# (the ids are kept as single-column DataFrames).
train_Y = utils.get_train_labels(train_data)
id_column = ['SK_ID_CURR']
train_ids, test_ids = train_data[id_column], test_data[id_column]

In [22]:
# Build the model matrices: drop id/label columns, blank out overflow values,
# impute missing values with the training median and scale every feature to 0-1.

# The frames are still modified in place because a later cell reads
# train_data.columns; errors='ignore' makes the cell safe to re-run once the
# columns are already gone (the original raised KeyError on a second run).
train_data.drop(columns=['SK_ID_CURR', 'TARGET'], inplace=True, errors='ignore')
test_data.drop(columns=['SK_ID_CURR'], inplace=True, errors='ignore')

train_X = train_data.values
test_X = test_data.values
feature_names = train_data.columns # required to create feature importances

# Cleaning dataframes of overflow values.
# The original test (x >= float64 max) only caught positive overflow; negative
# overflow (-inf / -max) is now replaced by NaN as well so that it is imputed.
float_max = np.finfo(np.float64).max
train_X[(train_X >= float_max) | (train_X <= -float_max)] = np.nan
test_X[(test_X >= float_max) | (test_X <= -float_max)] = np.nan

# Median imputation of missing values; statistics come from the training set only.
# NOTE(review): `Imputer` was removed from sklearn.preprocessing in scikit-learn 0.22;
# on newer versions the equivalent is sklearn.impute.SimpleImputer(strategy='median').
imputer = Imputer(strategy = 'median')
train_X = imputer.fit_transform(train_X)
test_X = imputer.transform(test_X)

# Scale each feature to 0-1 using the training min/max
# (test values outside the training range can fall outside 0-1).
scaler = MinMaxScaler(feature_range = (0, 1))
train_X = scaler.fit_transform(train_X)
test_X = scaler.transform(test_X)

In [29]:
# Optional export step, currently disabled.
# When uncommented, these lines wrap the processed matrices in DataFrames,
# prepend the id (and, for train, label) columns and pickle the results.
# NOTE: enabling them rebinds train_X / test_X to DataFrames instead of arrays.
# train_X = pd.DataFrame(train_X)
# train_X = pd.concat([train_ids, train_Y, train_X], axis=1)
# test_X = pd.DataFrame(test_X)
# test_X = pd.concat([test_ids, test_X], axis=1)
# utils.save_pickle("finaltrain.pkl", train_X)
# utils.save_pickle("finaltest.pkl", test_X)

File saved at  finaltrain.pkl
File saved at  finaltest.pkl


In [30]:
# Strip the id/label columns again, but only if train_X / test_X are DataFrames
# (which happens when the optional export cell above has been run un-commented).
# After the regular preprocessing they are NumPy arrays, on which `.drop` does
# not exist, so the unguarded calls raised AttributeError on a fresh run.
if isinstance(train_X, pd.DataFrame):
    train_X = train_X.drop(columns=['SK_ID_CURR', 'TARGET'], errors='ignore')
if isinstance(test_X, pd.DataFrame):
    test_X = test_X.drop(columns=['SK_ID_CURR'], errors='ignore')

In [None]:
# Make sure the model inputs are plain NumPy arrays.
# np.asarray leaves an existing ndarray untouched and converts a DataFrame to
# its underlying array; the previous `.values` access raised AttributeError
# whenever train_X / test_X were already arrays.
feature_names = train_data.columns # required to create feature importances
train_X = np.asarray(train_X)
test_X = np.asarray(test_X)

In [33]:
# Baseline run with models.gbm_basic, then write its test predictions to disk.
basic_run = models.gbm_basic(train_X, train_Y, test_X, feature_names)
model, predictions, feature_importances, metrics = basic_run
utils.create_and_save_submission(
    test_ids,
    predictions,
    save_path='../test_predictions/lgbm_simple.csv',
)

Training Data Shape:  (307511, 606)
Testing Data Shape:  (48744, 606)
Training until validation scores don't improve for 100 rounds.
[200]	train's auc: 0.830269	train's binary_logloss: 0.514133	valid's auc: 0.782059	valid's binary_logloss: 0.533941
Early stopping, best iteration is:
[282]	train's auc: 0.847352	train's binary_logloss: 0.495114	valid's auc: 0.783017	valid's binary_logloss: 0.521844
Training until validation scores don't improve for 100 rounds.
[200]	train's auc: 0.831233	train's binary_logloss: 0.512977	valid's auc: 0.779384	valid's binary_logloss: 0.533948
[400]	train's auc: 0.868313	train's binary_logloss: 0.471264	valid's auc: 0.780495	valid's binary_logloss: 0.507764
Early stopping, best iteration is:
[409]	train's auc: 0.869816	train's binary_logloss: 0.469533	valid's auc: 0.780618	valid's binary_logloss: 0.506629
Training until validation scores don't improve for 100 rounds.
[200]	train's auc: 0.830263	train's binary_logloss: 0.514243	valid's auc: 0.783614	valid's 

In [34]:
# Per-fold train/valid AUC of the baseline run, with an 'overall' summary row last.
print(metrics)

      fold     train     valid
0        0  0.847352  0.783017
1        1  0.869816  0.780618
2        2  0.870411  0.785172
3        3  0.853020  0.783829
4        4  0.852939  0.782638
5  overall  0.858707  0.783001


In [None]:
# Reload the local helper modules so edits are picked up without restarting the kernel.
# importlib.reload replaces imp.reload: the `imp` module is deprecated and no
# longer exists in Python 3.12+.
import importlib
importlib.reload(models)
importlib.reload(utils)

# Random hyperparameter search: up to 100 evaluations, with samples=30000
# passed to the search (presumably a row subsample size - confirm in models).
model, predictions, feature_importances, metrics = models.gbm_random_search(train_X, train_Y, test_X, feature_names, samples=30000, max_evals=100)


  0%|          | 0/100 [00:00<?, ?it/s][A
  1%|          | 1/100 [13:00<21:27:40, 780.41s/it][A

fs



  2%|▏         | 2/100 [13:34<15:08:52, 556.46s/it][A

fs



  3%|▎         | 3/100 [14:17<10:50:28, 402.35s/it][A

fs



  4%|▍         | 4/100 [14:20<7:32:08, 282.59s/it] [A

fs



  5%|▌         | 5/100 [14:22<5:14:26, 198.60s/it][A

fs



  6%|▌         | 6/100 [33:17<12:31:14, 479.51s/it][A

fs



  7%|▋         | 7/100 [39:00<11:19:49, 438.59s/it][A

fs


In [35]:
# Hyperparameters found by running "gbm_random_search".
# The original note describes this set as equivalent to the one above.
hyperparams = {
    "is_unbalance": False,
    "n_estimators": 2673,
    "num_leaves": 77,
    "learning_rate": 0.007641070180129345,
    "min_child_samples": 460,
    "boosting_type": "gbdt",
    "subsample_for_bin": 240000,
    # regularisation
    "reg_lambda": 0.2040816326530612,
    "reg_alpha": 0.8775510204081632,
    # row / column sampling
    "subsample": 0.9494949494949496,
    "colsample_bytree": 0.7333333333333333,
}

In [36]:
# Reload the local models module so edits are picked up without restarting the kernel.
# importlib.reload replaces imp.reload: the `imp` module is deprecated and no
# longer exists in Python 3.12+.
import importlib
importlib.reload(models)

# Fit with the random-search hyperparameters and save the test predictions.
model, predictions, feature_importances, metrics = models.gbm_with_params(train_X, train_Y, test_X, feature_names, hyperparams)
utils.create_and_save_submission(test_ids, predictions, save_path='../test_predictions/lgbm_random_tuned_bayes.csv')

Training Data Shape:  (307511, 606)
Testing Data Shape:  (48744, 606)
Training until validation scores don't improve for 100 rounds.
[200]	train's auc: 0.786262	train's binary_logloss: 0.242407	valid's auc: 0.760017	valid's binary_logloss: 0.246512
[400]	train's auc: 0.810863	train's binary_logloss: 0.23096	valid's auc: 0.771527	valid's binary_logloss: 0.240742
[600]	train's auc: 0.829953	train's binary_logloss: 0.223031	valid's auc: 0.778639	valid's binary_logloss: 0.238077
[800]	train's auc: 0.845841	train's binary_logloss: 0.216482	valid's auc: 0.782239	valid's binary_logloss: 0.236735
[1000]	train's auc: 0.860079	train's binary_logloss: 0.210679	valid's auc: 0.784238	valid's binary_logloss: 0.235997
[1200]	train's auc: 0.872447	train's binary_logloss: 0.205475	valid's auc: 0.7854	valid's binary_logloss: 0.23555
[1400]	train's auc: 0.883423	train's binary_logloss: 0.200746	valid's auc: 0.786142	valid's binary_logloss: 0.235284
[1600]	train's auc: 0.89331	train's binary_logloss: 0.19

In [37]:
# Per-fold train/valid scores (presumably AUC, as in the training log) for the
# run with `hyperparams`; the last row is the 'overall' summary.
print(metrics)

      fold     train     valid
0        0  0.915992  0.786949
1        1  0.898426  0.783576
2        2  0.896294  0.789087
3        3  0.916229  0.788463
4        4  0.919041  0.784381
5  overall  0.909196  0.786465


In [38]:
# Parameters obtained through Bayesian optimization, taken from the public kernel
# https://www.kaggle.com/jsaguiar/updated-0-792-lb-lightgbm-with-simple-features
bayesian_params = {
    "n_estimators": 1327,
    "learning_rate": 0.02,
    "num_leaves": 34,
    "colsample_bytree": 0.9497036,
    "subsample": 0.8715623,
    "max_depth": 8,
    "reg_alpha": 0.041545473,
    "reg_lambda": 0.0735294,
    "min_split_gain": 0.0222415,
    "min_child_weight": 39.3259775,
    # logging-related switches
    "silent": -1,
    "verbose": -1,
}

In [39]:
# Fit with the public-kernel parameters (`bayesian_params`).
bayes_run = models.gbm_with_params(
    train_X, train_Y, test_X, feature_names, bayesian_params
)
model, predictions, feature_importances, metrics = bayes_run

Training Data Shape:  (307511, 606)
Testing Data Shape:  (48744, 606)
Training until validation scores don't improve for 100 rounds.
[200]	train's auc: 0.794907	train's binary_logloss: 0.235526	valid's auc: 0.772531	valid's binary_logloss: 0.240284
[400]	train's auc: 0.818277	train's binary_logloss: 0.226195	valid's auc: 0.781837	valid's binary_logloss: 0.236899
[600]	train's auc: 0.834087	train's binary_logloss: 0.219911	valid's auc: 0.784936	valid's binary_logloss: 0.235777
[800]	train's auc: 0.846892	train's binary_logloss: 0.214738	valid's auc: 0.786169	valid's binary_logloss: 0.235345
[1000]	train's auc: 0.857774	train's binary_logloss: 0.210162	valid's auc: 0.78677	valid's binary_logloss: 0.235131
Early stopping, best iteration is:
[1057]	train's auc: 0.860719	train's binary_logloss: 0.208916	valid's auc: 0.786952	valid's binary_logloss: 0.235081
Training until validation scores don't improve for 100 rounds.
[200]	train's auc: 0.795561	train's binary_logloss: 0.23492	valid's auc:

In [41]:
# Write the current `predictions` for the test ids to the submission file.
utils.create_and_save_submission(
    test_ids,
    predictions,
    save_path='../test_predictions/lgbm_first_tuned_bayes.csv',
)

Predictions saved to:  ../test_predictions/lgbm_first_tuned_bayes.csv


In [40]:
# Per-fold train/valid AUC for the run with `bayesian_params`, 'overall' row last.
print(metrics)

      fold     train     valid
0        0  0.860719  0.786952
1        1  0.860115  0.782791
2        2  0.852945  0.788298
3        3  0.872976  0.787310
4        4  0.867749  0.784546
5  overall  0.862901  0.785958


In [42]:
# Hyperparameters produced by our own Bayesian-optimization tuning run.
own_bayes = {
    "is_unbalance": False,
    "n_estimators": 1327,
    "num_leaves": 106,
    "learning_rate": 0.0126346500398102,
    "min_child_samples": 390,
    "boosting_type": "gbdt",
    "subsample_for_bin": 80000,
    # regularisation
    "reg_lambda": 0.38268769901820565,
    "reg_alpha": 0.5129992714397862,
    # row / column sampling
    "subsample": 0.7177561548329953,
    "colsample_bytree": 0.6149378064887835,
}

In [43]:
# Fit with our own tuned parameters (`own_bayes`) and save the test predictions.
own_bayes_run = models.gbm_with_params(
    train_X, train_Y, test_X, feature_names, own_bayes
)
model, predictions, feature_importances, metrics = own_bayes_run
utils.create_and_save_submission(
    test_ids,
    predictions,
    save_path='../test_predictions/lgbm_second_tuned_bayes.csv',
)

Training Data Shape:  (307511, 606)
Testing Data Shape:  (48744, 606)
Training until validation scores don't improve for 100 rounds.
[200]	train's auc: 0.815527	train's binary_logloss: 0.230407	valid's auc: 0.769971	valid's binary_logloss: 0.24155
[400]	train's auc: 0.852788	train's binary_logloss: 0.214294	valid's auc: 0.781388	valid's binary_logloss: 0.237129
[600]	train's auc: 0.881863	train's binary_logloss: 0.20181	valid's auc: 0.784952	valid's binary_logloss: 0.235769
[800]	train's auc: 0.905048	train's binary_logloss: 0.191259	valid's auc: 0.786276	valid's binary_logloss: 0.235268
[1000]	train's auc: 0.923375	train's binary_logloss: 0.182021	valid's auc: 0.786832	valid's binary_logloss: 0.235117
Early stopping, best iteration is:
[1096]	train's auc: 0.931016	train's binary_logloss: 0.177866	valid's auc: 0.787033	valid's binary_logloss: 0.235059
Training until validation scores don't improve for 100 rounds.
[200]	train's auc: 0.816101	train's binary_logloss: 0.229823	valid's auc:

In [44]:
# Per-fold train/valid AUC for the run with `own_bayes`, 'overall' row last.
print(metrics)

      fold     train     valid
0        0  0.931016  0.787033
1        1  0.924463  0.783593
2        2  0.926619  0.788767
3        3  0.922586  0.788447
4        4  0.923573  0.784308
5  overall  0.925652  0.786423
