In [6]:
import time
import numpy as np
import pandas as pd
import lightgbm as lgb
from functools import partial
import optuna
from sklearn.model_selection import KFold,StratifiedKFold
from sklearn.model_selection import cross_validate
from sklearn.model_selection import GridSearchCV
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA

# Notebook-wide switches.
# use_stack: read 'input/train_stack.csv' / 'input/test_stack.csv' instead of
# the plain 'input/train.csv' / 'input/test.csv'.
use_stack = False
# new_study: create a fresh Optuna study in storage.db; when False, reopen the
# existing study of the same name instead.
new_study = True
# do_lda: append an LDA projection of the features as an extra "lda" column.
do_lda = False
# fix_data_skew: undersample rows whose target != 1 down to the number of
# target == 1 rows before training.
fix_data_skew = False

In [7]:
# Load the train/test tables; the '_stack' variants are used when use_stack is set.
data_suffix = '_stack' if use_stack else ''
train_df = pd.read_csv('input/train' + data_suffix + '.csv')
test_df = pd.read_csv('input/test' + data_suffix + '.csv')

if fix_data_skew:
    # Balance the classes by undersampling: keep every target == 1 row and an
    # equally sized random subset of the remaining rows.
    trues = train_df.loc[train_df['target'] == 1]
    # Seeded so the sampled subset (and therefore every downstream score) is
    # reproducible across kernel restarts.
    falses = train_df.loc[train_df['target'] != 1].sample(frac=1, random_state=42)[:len(trues)]
    # Shuffle the combined frame, then reset the index so that later
    # label-aligned column assignments line up with row positions.
    train_df = (
        pd.concat([trues, falses], ignore_index=True)
        .sample(frac=1, random_state=42)
        .reset_index(drop=True)
    )

# Feature matrices and target; ID_code is an identifier column, not a feature.
X_test = test_df.drop('ID_code', axis=1)
X = train_df.drop(['ID_code', 'target'], axis=1)
y = train_df['target']

In [8]:
if do_lda:
    # The target is used as a binary label (the model is trained with
    # objective "binary"), so LDA can produce at most n_classes - 1 = 1
    # discriminant axis. Requesting more than that is rejected by current
    # scikit-learn releases, so ask for exactly one.
    lda = LDA(solver='svd', n_components=1, store_covariance=True)
    # Carry over the original row labels: pandas aligns on index when a
    # Series is assigned to a column, and X may not have a 0..n-1 index.
    X_lda = pd.DataFrame(lda.fit_transform(X, y), index=X.index)
    X_test_lda = pd.DataFrame(lda.transform(X_test), index=X_test.index)
    # Assign the single projection column (a Series), not the whole frame.
    X["lda"] = X_lda[0]
    X_test["lda"] = X_test_lda[0]

In [9]:
# Number of cross-validation folds.
n_splits = 5
# Stratified, shuffled splitter with a fixed integer seed, so repeated calls to
# folds.split(X, y) yield the same partition.
folds = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

def objective(X, y, trial):
    """Optuna objective: cross-validated LightGBM AUC for one set of hyperparameters.

    Trains one LightGBM model per fold of the module-level ``folds`` splitter,
    with early stopping on the held-out fold, and averages the best validation
    AUC over the folds.

    Args:
        X: feature table; rows are selected positionally with ``.iloc``.
        y: target values, indexed positionally with ``.iloc`` in step with ``X``.
        trial: Optuna trial used to sample the tunable hyperparameters.

    Returns:
        ``1.0 - mean validation AUC``, so that a minimizing study maximizes AUC.
    """
    # based on kernel params; the trailing number is the centre of each search range
    params = {
        "objective" : "binary",
        "metric" : "auc",
        "boosting": 'gbdt',
        "max_depth" : -1,
        "num_leaves" : trial.suggest_int('num_leaves', 11, 15), #13
        "learning_rate" : 0.01,
        "bagging_freq": trial.suggest_int('bagging_freq', 3, 7), #5
        "bagging_fraction" : trial.suggest_uniform('bagging_fraction', 0.3, 0.5), #0.4
        "feature_fraction" : trial.suggest_uniform('feature_fraction', 0.04, 0.06), #0.05
        "min_data_in_leaf": trial.suggest_int('min_data_in_leaf', 75, 85), #80
        # The key must be spelled exactly "min_sum_hessian_in_leaf"; a misspelled
        # key is not a LightGBM parameter, so the sampled value would never
        # reach the model.
        "min_sum_hessian_in_leaf": trial.suggest_int('min_sum_hessian_in_leaf', 8, 12), #10
        "tree_learner": "serial",
        "boost_from_average": "false",
        #"lambda_l1" : 5,
        #"lambda_l2" : 5,
        "bagging_seed" : 13,
        "verbosity" : 1,
        "seed": 42
    }

    fold_scores = []
    for fold_n, (train_index, valid_index) in enumerate(folds.split(X, y)):
        print('Fold', fold_n, 'started at', time.ctime())
        X_train, X_valid = X.iloc[train_index], X.iloc[valid_index]
        y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]

        train_data = lgb.Dataset(X_train, label=y_train)
        valid_data = lgb.Dataset(X_valid, label=y_valid)

        model = lgb.train(params, train_data, num_boost_round=20000,
                          valid_sets=[train_data, valid_data], verbose_eval=300,
                          early_stopping_rounds=100)
        # 'valid_1' is the second entry of valid_sets, i.e. the held-out fold.
        fold_scores.append(model.best_score['valid_1']['auc'])

    # Average over the folds actually run rather than a separately kept count.
    score = sum(fold_scores) / len(fold_scores)
    print(score)
    return 1.0 - score

In [None]:
def main():
    """Run the hyperparameter search and print the best parameters found.

    Creates the 'lgbm_kfold_study' study in the SQLite storage when
    ``new_study`` is set, otherwise reopens it, then runs 3000 trials of
    ``objective`` bound to the module-level ``X`` and ``y``.
    """
    study_location = dict(study_name='lgbm_kfold_study', storage='sqlite:///storage.db')
    # Same keyword arguments either way; only the constructor differs.
    open_study = optuna.create_study if new_study else optuna.Study
    study = open_study(**study_location)

    bound_objective = partial(objective, X, y)
    study.optimize(bound_objective, n_trials=3000)
    print('params:', study.best_params)


if __name__ == '__main__':
    main()

[I 2019-03-29 08:58:24,217] A new study created with name: lgbm_kfold_study


Fold 0 started at Fri Mar 29 08:58:24 2019
Training until validation scores don't improve for 100 rounds.
[300]	training's auc: 0.883779	valid_1's auc: 0.866422
[600]	training's auc: 0.896171	valid_1's auc: 0.878117
[900]	training's auc: 0.902332	valid_1's auc: 0.884015
[1200]	training's auc: 0.906038	valid_1's auc: 0.886536
[1500]	training's auc: 0.908767	valid_1's auc: 0.888683
[1800]	training's auc: 0.911187	valid_1's auc: 0.890317
[2100]	training's auc: 0.913356	valid_1's auc: 0.89163
[2400]	training's auc: 0.91575	valid_1's auc: 0.893049
[2700]	training's auc: 0.917734	valid_1's auc: 0.894053
[3000]	training's auc: 0.919597	valid_1's auc: 0.894824
[3300]	training's auc: 0.921395	valid_1's auc: 0.895586
[3600]	training's auc: 0.92306	valid_1's auc: 0.896248
[3900]	training's auc: 0.924554	valid_1's auc: 0.896733
[4200]	training's auc: 0.926101	valid_1's auc: 0.897261
[4500]	training's auc: 0.927549	valid_1's auc: 0.897723
[4800]	training's auc: 0.92901	valid_1's auc: 0.898041
[5100