In [3]:
import time
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import KFold,StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA

In [4]:
# Load the training and test tables. The code below relies on an `ID_code`
# column in both files and a `target` column in the training file.
train_df = pd.read_csv('input/train.csv')
#train_df = pd.read_csv('input/train_min.csv')
test_df = pd.read_csv('input/test.csv')

# Switch read by the LDA cell: add a linear-discriminant feature when True.
do_lda = True

# When True, undersample the rows with target != 1 down to the number of
# rows with target == 1 before building X / y.
fix_data_skew = False

if fix_data_skew:
    trues = train_df.loc[train_df['target'] == 1]
    falses = train_df.loc[train_df['target'] != 1]
    # Seeded so the undersampled subset is reproducible, in line with the
    # fixed seeds used for the folds and the model. `min` keeps the original
    # "take at most len(trues) rows" behaviour when there are fewer negatives.
    falses = falses.sample(n=min(len(trues), len(falses)), random_state=42)
    # Shuffle the combined rows, then restore a clean 0..n-1 index so that
    # later index-aligned column assignments match rows by position.
    train_df = (
        pd.concat([trues, falses], ignore_index=True)
        .sample(frac=1, random_state=42)
        .reset_index(drop=True)
    )

# Feature matrices (identifier and label removed) and the label vector.
X_test = test_df.drop('ID_code', axis=1)
X = train_df.drop(['ID_code', 'target'], axis=1)
y = train_df['target']

In [5]:
if do_lda:
    # Fit on the raw features only, so re-running this cell does not feed a
    # previously added `lda` column back into the discriminant.
    X_raw = X.drop(columns=["lda"], errors="ignore")
    X_test_raw = X_test.drop(columns=["lda"], errors="ignore")

    # scikit-learn requires n_components <= min(n_classes - 1, n_features).
    # One component is valid for any target with at least two classes, and it
    # is the only projection stored in the single `lda` column below.
    lda = LDA(solver='svd', n_components=1, store_covariance=True)

    # NOTE(review): the discriminant is fit on every labelled row, including
    # rows that later serve as validation folds, so cross-validated scores
    # that use this feature are optimistic. Fitting inside each fold would
    # avoid that.
    # Carrying the source index keeps the column assignment row-aligned even
    # when X's index is not 0..n-1.
    X_lda = pd.DataFrame(lda.fit_transform(X_raw, y), index=X.index)
    X_test_lda = pd.DataFrame(lda.transform(X_test_raw), index=X_test.index)

    # Column 0 is the single discriminant axis.
    X["lda"] = X_lda[0]
    X_test["lda"] = X_test_lda[0]

In [6]:
# Cross-validation layout used by the training loop: the number of folds is
# also the divisor when averaging test predictions.
n_splits = 10

# Shuffled, label-stratified splitter with a fixed seed for repeatable folds.
folds = StratifiedKFold(
    n_splits=n_splits,
    shuffle=True,
    random_state=42,
)

In [10]:
# LightGBM hyper-parameters.
#
# Only the final `params` dict in this cell is active. Earlier parameter sets
# are kept as comments for the experiment record instead of being assigned
# and immediately overwritten.
#
# NOTE(review): in the saved notebook this cell's execution count (10) is
# later than the training cell's (8), so the stored training log may have
# been produced with a different parameter set than the one below. Re-run
# the notebook top to bottom to confirm.

# --- superseded set A --- cv: 0.89951
# params = {
#          'num_leaves': 8,
#          'min_data_in_leaf': 42,
#          'objective': 'binary',
#          'max_depth': 16,
#          'learning_rate': 0.0123,
#          'boosting': 'gbdt',
#          'bagging_freq': 5,
#          'bagging_fraction': 0.8,
#          'feature_fraction': 0.8201,
#          'bagging_seed': 11,
#          'reg_alpha': 1.728910519108444,
#          'reg_lambda': 4.9847051755586085,
#          'random_state': 42,
#          'metric': 'auc',
#          'verbosity': -1,
#          'subsample': 0.81,
#          'min_gain_to_split': 0.01077313523861969,
#          'min_child_weight': 19.428902804238373,
#          'num_threads': 4
#         }

# --- superseded set B ---
# 1st round looks good: cv 0.900043
# lb: 0.899 with n_splits=10
# try larger max_bin?
# (This set gave both 'bagging_fraction' and 'subsample'; LightGBM documents
# 'subsample' as an alias of 'bagging_fraction', so only one should be set.)
# params = {
#          'num_leaves': 8,
#          'min_data_in_leaf': 1000,
#          'objective': 'binary',
#          'max_depth': -1,
#          'learning_rate': 0.0123,
#          'boosting': 'gbdt',
#          'bagging_freq': 5,
#          'bagging_fraction': 0.8,
#          'feature_fraction': 0.8201,
#          'bagging_seed': 11,
#          'reg_alpha': 1.728910519108444,
#          'reg_lambda': 4.9847051755586085,
#          'random_state': 42,
#          'metric': 'auc',
#          'verbosity': -1,
#          'subsample': 0.81,
#          'min_gain_to_split': 0.01077313523861969,
#          'min_child_weight': 19.428902804238373,
#          'num_threads': 4,
#          'max_bin': 511,
#         }

# --- active set ---
params = {
    'metric': 'auc',
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'min_data_in_leaf': 118,
    # Depth 1 means one split per tree, so the num_leaves value below is
    # presumably never reached (verify against the LightGBM parameter docs).
    'max_depth': 1,
    'num_leaves': 23,
    'learning_rate': 0.04019176517639987,
    'bagging_freq': 3,
    #'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.3, 0.9),
    'feature_fraction': 0.8990901412442585,
    'bagging_seed': 11,
    'reg_alpha':  1.1173044727720816,
    'reg_lambda': 6.9285776442737514,
    'random_state': 42,
    'verbosity': -1,
    # Row-subsampling fraction; takes the place of the commented-out
    # 'bagging_fraction' entry above.
    'subsample': 0.8054415526396443,
    'min_child_weight': 1.0653789180368052e-05,
    'num_threads': 4,
}

In [8]:
# Train one LightGBM model per CV fold and average the per-fold test
# predictions into `prediction` (one value per row of X_test).

# Upper bound on boosting rounds; early stopping is what actually ends a fold.
max_boost_rounds = 2000000
# Stop once the validation metric has not improved for this many rounds.
early_stopping_patience = 1000
# Print the evaluation metrics every this many rounds.
log_period = 300

# The `verbose_eval=` / `early_stopping_rounds=` keywords of `lgb.train` were
# removed in newer LightGBM releases; the callbacks below express the same
# behaviour. The logging callback is named `log_evaluation` in newer releases
# and `print_evaluation` in older ones, so use whichever one is available.
make_log_callback = getattr(lgb, 'log_evaluation', None) or lgb.print_evaluation

prediction = np.zeros(len(X_test))
for fold_n, (train_index, valid_index) in enumerate(folds.split(X, y)):
    print('Fold', fold_n, 'started at', time.ctime())
    X_train, X_valid = X.iloc[train_index], X.iloc[valid_index]
    y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]

    train_data = lgb.Dataset(X_train, label=y_train)
    valid_data = lgb.Dataset(X_valid, label=y_valid)

    # The training set is also passed as a validation set so its metric is
    # logged next to the held-out fold's metric.
    model = lgb.train(
        params,
        train_data,
        num_boost_round=max_boost_rounds,
        valid_sets=[train_data, valid_data],
        callbacks=[
            lgb.early_stopping(early_stopping_patience),
            make_log_callback(log_period),
        ],
    )

    # Predict with the best iteration found by early stopping; dividing by
    # n_splits turns the running sum into a mean over folds.
    prediction += model.predict(X_test, num_iteration=model.best_iteration) / n_splits

Fold 0 started at Tue Mar  5 13:48:51 2019
Training until validation scores don't improve for 1000 rounds.
[300]	training's auc: 0.867673	valid_1's auc: 0.869299
[600]	training's auc: 0.873971	valid_1's auc: 0.874732
[900]	training's auc: 0.878379	valid_1's auc: 0.878296
[1200]	training's auc: 0.88185	valid_1's auc: 0.881292
[1500]	training's auc: 0.884675	valid_1's auc: 0.883758
[1800]	training's auc: 0.887101	valid_1's auc: 0.885704
[2100]	training's auc: 0.889179	valid_1's auc: 0.887401
[2400]	training's auc: 0.891	valid_1's auc: 0.888808
[2700]	training's auc: 0.892623	valid_1's auc: 0.889793
[3000]	training's auc: 0.894053	valid_1's auc: 0.890932
[3300]	training's auc: 0.895291	valid_1's auc: 0.891807
[3600]	training's auc: 0.896449	valid_1's auc: 0.892572
[3900]	training's auc: 0.897425	valid_1's auc: 0.893316
[4200]	training's auc: 0.898439	valid_1's auc: 0.89391
[4500]	training's auc: 0.899291	valid_1's auc: 0.894494
[4800]	training's auc: 0.900107	valid_1's auc: 0.894942
[5100

[900]	training's auc: 0.878689	valid_1's auc: 0.875229
[1200]	training's auc: 0.882145	valid_1's auc: 0.878139
[1500]	training's auc: 0.884934	valid_1's auc: 0.880709
[1800]	training's auc: 0.887352	valid_1's auc: 0.882761
[2100]	training's auc: 0.889402	valid_1's auc: 0.884627
[2400]	training's auc: 0.891204	valid_1's auc: 0.885891
[2700]	training's auc: 0.892797	valid_1's auc: 0.887083
[3000]	training's auc: 0.894219	valid_1's auc: 0.888204
[3300]	training's auc: 0.895471	valid_1's auc: 0.889234
[3600]	training's auc: 0.896653	valid_1's auc: 0.890092
[3900]	training's auc: 0.89766	valid_1's auc: 0.890912
[4200]	training's auc: 0.898623	valid_1's auc: 0.89164
[4500]	training's auc: 0.899484	valid_1's auc: 0.89227
[4800]	training's auc: 0.900328	valid_1's auc: 0.892828
[5100]	training's auc: 0.90104	valid_1's auc: 0.893194
[5400]	training's auc: 0.901739	valid_1's auc: 0.893673
[5700]	training's auc: 0.902391	valid_1's auc: 0.894014
[6000]	training's auc: 0.902983	valid_1's auc: 0.8942

[4500]	training's auc: 0.898603	valid_1's auc: 0.900293
[4800]	training's auc: 0.899347	valid_1's auc: 0.900917
[5100]	training's auc: 0.900097	valid_1's auc: 0.901472
[5400]	training's auc: 0.900792	valid_1's auc: 0.902016
[5700]	training's auc: 0.901386	valid_1's auc: 0.902313
[6000]	training's auc: 0.901983	valid_1's auc: 0.902751
[6300]	training's auc: 0.902557	valid_1's auc: 0.903209
[6600]	training's auc: 0.903124	valid_1's auc: 0.903611
[6900]	training's auc: 0.903605	valid_1's auc: 0.903964
[7200]	training's auc: 0.904046	valid_1's auc: 0.904071
[7500]	training's auc: 0.904471	valid_1's auc: 0.904336
[7800]	training's auc: 0.904923	valid_1's auc: 0.904602
[8100]	training's auc: 0.905327	valid_1's auc: 0.904777
[8400]	training's auc: 0.905701	valid_1's auc: 0.904998
[8700]	training's auc: 0.906087	valid_1's auc: 0.905193
[9000]	training's auc: 0.906464	valid_1's auc: 0.90533
[9300]	training's auc: 0.906798	valid_1's auc: 0.905554
[9600]	training's auc: 0.907124	valid_1's auc: 0.

[14700]	training's auc: 0.911764	valid_1's auc: 0.903389
[15000]	training's auc: 0.911949	valid_1's auc: 0.903432
[15300]	training's auc: 0.912126	valid_1's auc: 0.90349
[15600]	training's auc: 0.912316	valid_1's auc: 0.903544
[15900]	training's auc: 0.91249	valid_1's auc: 0.903583
[16200]	training's auc: 0.912656	valid_1's auc: 0.903519
[16500]	training's auc: 0.912844	valid_1's auc: 0.903545
[16800]	training's auc: 0.912995	valid_1's auc: 0.903495
Early stopping, best iteration is:
[15951]	training's auc: 0.912515	valid_1's auc: 0.90361
Fold 7 started at Tue Mar  5 14:56:20 2019
Training until validation scores don't improve for 1000 rounds.
[300]	training's auc: 0.868629	valid_1's auc: 0.861476
[600]	training's auc: 0.87481	valid_1's auc: 0.867107
[900]	training's auc: 0.879241	valid_1's auc: 0.870952
[1200]	training's auc: 0.882716	valid_1's auc: 0.873844
[1500]	training's auc: 0.885564	valid_1's auc: 0.876206
[1800]	training's auc: 0.887951	valid_1's auc: 0.878371
[2100]	training'

[4200]	training's auc: 0.898849	valid_1's auc: 0.891257
[4500]	training's auc: 0.89972	valid_1's auc: 0.891874
[4800]	training's auc: 0.900525	valid_1's auc: 0.892134
[5100]	training's auc: 0.901274	valid_1's auc: 0.892638
[5400]	training's auc: 0.901957	valid_1's auc: 0.892918
[5700]	training's auc: 0.902595	valid_1's auc: 0.893212
[6000]	training's auc: 0.90322	valid_1's auc: 0.893479
[6300]	training's auc: 0.903788	valid_1's auc: 0.89372
[6600]	training's auc: 0.904307	valid_1's auc: 0.893885
[6900]	training's auc: 0.904811	valid_1's auc: 0.894059
[7200]	training's auc: 0.905275	valid_1's auc: 0.894206
[7500]	training's auc: 0.9057	valid_1's auc: 0.894405
[7800]	training's auc: 0.906141	valid_1's auc: 0.894533
[8100]	training's auc: 0.906532	valid_1's auc: 0.894705
[8400]	training's auc: 0.906919	valid_1's auc: 0.89478
[8700]	training's auc: 0.907336	valid_1's auc: 0.894987
[9000]	training's auc: 0.907724	valid_1's auc: 0.895151
[9300]	training's auc: 0.908031	valid_1's auc: 0.89518

In [9]:
# Assemble the submission table: the test identifiers next to the
# fold-averaged predictions, then write it without the row index.
sub = pd.DataFrame(
    {
        "ID_code": test_df.ID_code.values,
        "target": prediction,
    }
)
sub.to_csv("submission.csv", index=False)