In [16]:
import pandas as pd
import numpy as np
import warnings; warnings.filterwarnings('ignore') # Just ignore user warning
pd.options.display.max_columns = None

datatrain = pd.read_csv('input_data/data_train_preprocessed_final.csv')
X = datatrain.drop('flag_kredit_macet', axis=1)
y = datatrain['flag_kredit_macet']

In [17]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

In [18]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, BaggingClassifier, ExtraTreesClassifier
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

In [19]:
LR = LogisticRegression(class_weight='balanced', n_jobs=-1, random_state=1, max_iter=10, tol=1e-3)
BNB = BernoulliNB()
DTC = DecisionTreeClassifier(class_weight='balanced', random_state=1)
RFC = RandomForestClassifier(class_weight='balanced', n_jobs=-1 ,random_state=1)
GBC = GradientBoostingClassifier(random_state=1)
BC = BaggingClassifier(n_jobs=-1, random_state=1)
ETC = ExtraTreesClassifier(class_weight='balanced', n_jobs=-1, random_state=1)
XGB = XGBClassifier(n_jobs=-1, random_state=1)
LGB = LGBMClassifier(class_weight='balanced', n_jobs=-1, random_state=1)
NN = MLPClassifier()

In [22]:
PARAM_SEARCH_LR = {'penalty':['l1','l2'], 'C':[1, 0.1, 0.5, 10, 50, 100]}
PARAM_SEARCH_BNB = {'alpha':[1, 0.8, 0.5, 0.3, 0]}
PARAM_SEARCH_DTC = {'criterion':['gini', 'entropy'], 'max_depth':[None, 5, 10, 25, 50]}
PARAM_SEARCH_RFC = {'criterion':['gini', 'entropy'], 'n_estimators':[10, 50, 100, 500]}
PARAM_SEARCH_GBC = {'n_estimators':[10, 100, 500], 'max_depth':[3, 5, 10]}
PARAM_SEARCH_BC = {'n_estimators':[10, 25, 50, 100, 500]}
PARAM_SEARCH_ETC = {'criterion':['gini', 'entropy'], 'max_depth':[None, 5, 10, 25, 50]}
PARAM_SEARCH_XGB = {'max_depth':[3, 5, 10], 'learning_rate':[0.1, 0.5, 0.01, 0.05], 'n_estimators':[10, 50, 100]}
PARAM_SEARCH_LGB = {'num_leaves':[16, 32, 64], 'max_depth':[-1, 3, 5, 10], 'learning_rate':[0.1, 0.5, 0.01, 0.05], 'n_estimators':[10, 50, 100]}
PARAM_SEARCH_NN = {'hidden_layer_sizes':[(16,), (32,), (64,), (128,), (32,8), (32,16), (32,32), (64,16), (64,32), (64,64), (128,32), (128,64), (128,128), (32,32,32), (128,32,32), (128,64,32)], 'alpha':(10.0 ** -np.arange(1, 7)), 'learning_rate':['constant','invscaling','adaptive'], 'solver':['lbfgs','sgd','adam']}

In [8]:
grid_search_LR = GridSearchCV(LR, PARAM_SEARCH_LR, scoring='roc_auc', cv=10, n_jobs=-1).fit(X,y)
grid_search_LR.best_estimator_

LogisticRegression(C=100, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=10,
          multi_class='warn', n_jobs=-1, penalty='l2', random_state=1,
          solver='warn', tol=0.001, verbose=0, warm_start=False)

In [9]:
grid_search_BNB = GridSearchCV(BNB, PARAM_SEARCH_BNB, scoring='roc_auc', cv=10, n_jobs=-1).fit(X,y)
grid_search_BNB.best_estimator_

BernoulliNB(alpha=1, binarize=0.0, class_prior=None, fit_prior=True)

In [10]:
grid_search_DTC = GridSearchCV(DTC, PARAM_SEARCH_DTC, scoring='roc_auc', cv=10, n_jobs=-1).fit(X,y)
grid_search_DTC.best_estimator_

DecisionTreeClassifier(class_weight='balanced', criterion='entropy',
            max_depth=5, max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=1,
            splitter='best')

In [11]:
grid_search_RFC = GridSearchCV(RFC, PARAM_SEARCH_RFC, scoring='roc_auc', cv=10, n_jobs=-1).fit(X,y)
grid_search_RFC.best_estimator_

RandomForestClassifier(bootstrap=True, class_weight='balanced',
            criterion='entropy', max_depth=None, max_features='auto',
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=500, n_jobs=-1, oob_score=False, random_state=1,
            verbose=0, warm_start=False)

In [12]:
grid_search_GBC = GridSearchCV(GBC, PARAM_SEARCH_GBC, scoring='roc_auc', cv=10, n_jobs=-1).fit(X,y)
grid_search_GBC.best_estimator_

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=10,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              n_iter_no_change=None, presort='auto', random_state=1,
              subsample=1.0, tol=0.0001, validation_fraction=0.1,
              verbose=0, warm_start=False)

In [13]:
grid_search_BC = GridSearchCV(BC, PARAM_SEARCH_BC, scoring='roc_auc', cv=10, n_jobs=-1).fit(X,y)
grid_search_BC.best_estimator_

BaggingClassifier(base_estimator=None, bootstrap=True,
         bootstrap_features=False, max_features=1.0, max_samples=1.0,
         n_estimators=500, n_jobs=-1, oob_score=False, random_state=1,
         verbose=0, warm_start=False)

In [14]:
grid_search_ETC = GridSearchCV(ETC, PARAM_SEARCH_ETC, scoring='roc_auc', cv=10, n_jobs=-1).fit(X,y)
grid_search_ETC.best_estimator_

ExtraTreesClassifier(bootstrap=False, class_weight='balanced',
           criterion='gini', max_depth=25, max_features='auto',
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=10, n_jobs=-1, oob_score=False, random_state=1,
           verbose=0, warm_start=False)

In [15]:
grid_search_XGB = GridSearchCV(XGB, PARAM_SEARCH_XGB, scoring='roc_auc', cv=10, n_jobs=-1).fit(X,y)
grid_search_XGB.best_estimator_

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=10, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=-1, nthread=None, objective='binary:logistic',
       random_state=1, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
       seed=None, silent=True, subsample=1)

In [16]:
rand_search_LGB = RandomizedSearchCV(LGB, PARAM_SEARCH_LGB, scoring='roc_auc', cv=10, n_iter=100, random_state=1, n_jobs=-1).fit(X,y)
rand_search_LGB.best_estimator_

LGBMClassifier(boosting_type='gbdt', class_weight='balanced',
        colsample_bytree=1.0, learning_rate=0.05, max_depth=-1,
        min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
        n_estimators=100, n_jobs=-1, num_leaves=32, objective=None,
        random_state=1, reg_alpha=0.0, reg_lambda=0.0, silent=True,
        subsample=1.0, subsample_for_bin=200000, subsample_freq=1)

In [25]:
grid_search_NN = GridSearchCV(NN, PARAM_SEARCH_NN, scoring='roc_auc', cv=10, n_jobs=-1).fit(X,y)
grid_search_NN.best_estimator_

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [24]:
rand_search_NN = RandomizedSearchCV(NN, PARAM_SEARCH_NN, scoring='roc_auc', cv=10, n_jobs=-1, n_iter=100, random_state=100).fit(X,y)
rand_search_NN.best_estimator_

MLPClassifier(activation='relu', alpha=1e-05, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(16,), learning_rate='invscaling',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
       random_state=None, shuffle=True, solver='adam', tol=0.0001,
       validation_fraction=0.1, verbose=False, warm_start=False)

In [26]:
PARAM_SEARCH_NN

{'hidden_layer_sizes': [(16,),
  (32,),
  (64,),
  (128,),
  (32, 8),
  (32, 16),
  (32, 32),
  (64, 16),
  (64, 32),
  (64, 64),
  (128, 32),
  (128, 64),
  (128, 128),
  (32, 32, 32),
  (128, 32, 32),
  (128, 64, 32)],
 'alpha': array([1.e-01, 1.e-02, 1.e-03, 1.e-04, 1.e-05, 1.e-06]),
 'learning_rate': ['constant', 'invscaling', 'adaptive'],
 'solver': ['lbfgs', 'sgd', 'adam']}