In [13]:
import time
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import KFold,StratifiedKFold
from sklearn.model_selection import GridSearchCV

In [14]:
# Load the training and test tables from the local input/ directory.
# (Alternative training file used at some point: 'input/train_min.csv'.)
train_df = pd.read_csv('input/train.csv')
test_df = pd.read_csv('input/test.csv')

# Label vector, then the feature matrices: everything except the 'ID_code'
# column (and, for the training table, the 'target' column as well).
y = train_df['target']
X = train_df.drop(columns=['ID_code', 'target'])
X_test = test_df.drop(columns=['ID_code'])

In [15]:
# Cross-validation splitter shared by the training cell: stratified on the
# label, shuffled, with a fixed seed so the fold assignment is repeatable.
n_splits = 10
folds = StratifiedKFold(
    n_splits=n_splits,
    random_state=42,
    shuffle=True,
)

In [16]:
# Candidate LightGBM parameter sets, each kept with the scores noted when it
# was tried. Exactly one of them is selected as `params` at the bottom of this
# cell; the others are retained only as a record of earlier experiments.

# cv: 0.89951
PARAMS_TUNED = {
    'num_leaves': 8,
    'min_data_in_leaf': 42,
    'objective': 'binary',
    'max_depth': 16,
    'learning_rate': 0.0123,
    'boosting': 'gbdt',
    'bagging_freq': 5,
    'bagging_fraction': 0.8,
    'feature_fraction': 0.8201,
    'bagging_seed': 11,
    'reg_alpha': 1.728910519108444,
    'reg_lambda': 4.9847051755586085,
    'random_state': 42,
    'metric': 'auc',
    'verbosity': -1,
    # NOTE(review): the LightGBM parameter docs list 'subsample' as an alias
    # of 'bagging_fraction', which is also set above to a different value
    # (0.8) -- confirm which of the two was intended.
    'subsample': 0.81,
    'min_gain_to_split': 0.01077313523861969,
    'min_child_weight': 19.428902804238373,
    'num_threads': 4,
}

# cv: 0.90775 (full)
# lb: 0.896 with fold_n=10 => overfit?
PARAMS_DEEP = {
    'objective': 'binary',
    'max_depth': 127,
    'metric': 'auc',
    'boosting': 'gbdt',
}

# 1st round looks good: cv 0.900043
# lb: 0.899 with n_splits=10
# try larger max_bin?
# Identical to PARAMS_TUNED apart from the three overrides listed here.
PARAMS_TUNED_LARGE_LEAF = dict(
    PARAMS_TUNED,
    min_data_in_leaf=1000,
    max_depth=-1,
    max_bin=511,
)

# The parameter set handed to lgb.train by the training cell. A copy is taken
# so that later changes to `params` cannot alter the recorded candidate.
params = dict(PARAMS_DEEP)

In [17]:
# Train one LightGBM model per cross-validation fold and average the models'
# test-set predictions into `prediction`.
#
# Early stopping and periodic logging are passed as callbacks: the
# `early_stopping_rounds` / `verbose_eval` keyword arguments of lgb.train were
# deprecated in LightGBM 3.3 and removed in 4.0. This cell therefore needs
# LightGBM >= 3.3 (the release that introduced lgb.log_evaluation).
NUM_BOOST_ROUND = 200000      # upper bound on boosting rounds per fold
EARLY_STOPPING_ROUNDS = 200   # stop once validation scores stall this long
LOG_PERIOD = 300              # print evaluation results every N rounds

# Ask the splitter for the fold count so the average always matches the
# number of models actually trained.
n_folds = folds.get_n_splits()

prediction = np.zeros(len(X_test))
for fold_n, (train_index, valid_index) in enumerate(folds.split(X, y)):
    print('Fold', fold_n, 'started at', time.ctime())
    X_train, X_valid = X.iloc[train_index], X.iloc[valid_index]
    y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]

    train_data = lgb.Dataset(X_train, label=y_train)
    valid_data = lgb.Dataset(X_valid, label=y_valid)

    # The training set is listed among valid_sets so its metric is logged
    # alongside the held-out fold's.
    model = lgb.train(
        params,
        train_data,
        num_boost_round=NUM_BOOST_ROUND,
        valid_sets=[train_data, valid_data],
        callbacks=[
            lgb.early_stopping(stopping_rounds=EARLY_STOPPING_ROUNDS),
            lgb.log_evaluation(period=LOG_PERIOD),
        ],
    )

    # Each fold's model contributes an equal share of the final prediction,
    # scored at its best iteration.
    prediction += model.predict(X_test, num_iteration=model.best_iteration) / n_folds

Fold 0 started at Sat Mar  2 07:33:51 2019
Training until validation scores don't improve for 200 rounds.
[300]	training's auc: 0.96627	valid_1's auc: 0.887899
Early stopping, best iteration is:
[366]	training's auc: 0.974201	valid_1's auc: 0.889333
Fold 1 started at Sat Mar  2 07:35:33 2019
Training until validation scores don't improve for 200 rounds.
[300]	training's auc: 0.96629	valid_1's auc: 0.886376
[600]	training's auc: 0.991282	valid_1's auc: 0.887753
Early stopping, best iteration is:
[494]	training's auc: 0.98571	valid_1's auc: 0.888385
Fold 2 started at Sat Mar  2 07:37:41 2019
Training until validation scores don't improve for 200 rounds.
[300]	training's auc: 0.966766	valid_1's auc: 0.886374
[600]	training's auc: 0.991187	valid_1's auc: 0.888281
Early stopping, best iteration is:
[483]	training's auc: 0.984847	valid_1's auc: 0.888717
Fold 3 started at Sat Mar  2 07:39:46 2019
Training until validation scores don't improve for 200 rounds.
[300]	training's auc: 0.966864	val

In [18]:
# Pair each test row's ID_code with its value from the `prediction` array and
# write the two-column table to submission.csv (no index column).
sub = pd.DataFrame({
    "ID_code": test_df["ID_code"].values,
    "target": prediction,
})
sub.to_csv("submission.csv", index=False)