In [1]:
import time
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import KFold,StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA

In [2]:
# --- Run configuration -------------------------------------------------------
# do_lda: when True, the LDA cell below appends a discriminant projection to
#         X and X_test as an extra "lda" feature.
do_lda = False
# fix_data_skew: when True, rows with target != 1 are randomly undersampled to
#                at most the number of target == 1 rows, then everything is shuffled.
fix_data_skew = False
# Seed for the optional undersampling/shuffle so a fresh "Run All" reproduces
# the same balanced training set.
sample_seed = 42

# --- Load raw data -----------------------------------------------------------
train_df = pd.read_csv('input/train.csv')
# Alternative training file (presumably a reduced sample for quick runs -- confirm):
#train_df = pd.read_csv('input/train_min.csv')
test_df = pd.read_csv('input/test.csv')

if fix_data_skew:
    trues = train_df.loc[train_df['target'] == 1]
    # Shuffle the remaining rows and keep the first len(trues) of them
    # (all of them if there are fewer).
    falses = (
        train_df.loc[train_df['target'] != 1]
        .sample(frac=1, random_state=sample_seed)[:len(trues)]
    )
    # Shuffle the balanced set, then rebuild a 0..n-1 index so that positional
    # and label-based alignment agree in every later cell.
    train_df = (
        pd.concat([trues, falses], ignore_index=True)
        .sample(frac=1, random_state=sample_seed)
        .reset_index(drop=True)
    )

# --- Feature / target split --------------------------------------------------
# ID_code is an identifier column, not a feature; it is reused for the submission.
X_test = test_df.drop('ID_code', axis=1)
X = train_df.drop(['ID_code', 'target'], axis=1)
y = train_df['target']

# Last expression of the cell, so the preview is actually rendered.
train_df.head()

In [3]:
if do_lda:
    # Fit on the original feature columns only, so re-running this cell does not
    # feed a previously added "lda" column back into the projection.
    lda_feature_cols = [col for col in X.columns if col != "lda"]

    # LDA can produce at most (n_classes - 1) components. The model below is
    # trained with a binary objective, so exactly one discriminant axis exists;
    # asking for more is rejected by current scikit-learn.
    lda = LDA(solver='svd', n_components=1, store_covariance=True)

    # NOTE(review): the projection is fit on the full training set before the
    # cross-validation split, so the "lda" feature has seen the validation-fold
    # labels; fold AUCs are optimistic when do_lda is enabled.
    # Re-use X / X_test's own index so the column assignment below lines up
    # row-for-row even if the training frame was shuffled.
    X_lda = pd.DataFrame(lda.fit_transform(X[lda_feature_cols], y), index=X.index)
    X_test_lda = pd.DataFrame(lda.transform(X_test[lda_feature_cols]), index=X_test.index)

    # Column 0 is the single discriminant component.
    X["lda"] = X_lda[0]
    X_test["lda"] = X_test_lda[0]

In [4]:
# Cross-validation scheme: stratified folds, shuffled with a fixed seed so the
# same split is produced on every run. n_splits is also used by the training
# loop to average per-fold predictions and scores.
n_splits = 5
folds = StratifiedKFold(
    n_splits=n_splits,
    shuffle=True,
    random_state=42,
)

In [5]:
# LightGBM training parameters, passed unchanged to lgb.train().
# NOTE(review): the long decimal values look like the output of a
# hyper-parameter search -- confirm their provenance before hand-editing.
params = dict(
    # Task definition
    metric='auc',
    boosting_type='gbdt',
    objective='binary',
    # Tree shape
    min_data_in_leaf=2881,
    max_depth=0,
    num_leaves=3,
    # Boosting and row/column sampling
    learning_rate=0.01,
    bagging_freq=3,
    # Previously left as a tuning placeholder:
    #   'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.3, 0.9)
    feature_fraction=0.8453828656355421,
    bagging_seed=11,
    # Regularisation
    reg_alpha=1.1173044727720816,
    reg_lambda=6.9285776442737514,
    # Reproducibility / logging
    random_state=42,
    verbosity=-1,
    subsample=0.8421287738494433,
    min_child_weight=36.93038816860224,
    # Runtime / histogram settings
    num_threads=4,
    max_bin=483,
)

In [6]:
# Cross-validated training: one LightGBM model per fold. The test-set
# prediction and the validation AUC are both averaged over the n_splits folds.
score = 0.0
prediction = np.zeros(len(X_test))

# lgb.train() no longer accepts `verbose_eval=` / `early_stopping_rounds=`
# (removed in LightGBM 4.0); the same behaviour is requested through callbacks.
# Older releases expose the logging callback as `print_evaluation`.
log_evaluation = getattr(lgb, 'log_evaluation', None) or lgb.print_evaluation

for fold_n, (train_index, valid_index) in enumerate(folds.split(X, y)):
    print('Fold', fold_n, 'started at', time.ctime())
    X_train, X_valid = X.iloc[train_index], X.iloc[valid_index]
    y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]

    train_data = lgb.Dataset(X_train, label=y_train)
    valid_data = lgb.Dataset(X_valid, label=y_valid)

    # num_boost_round is effectively "unbounded": training is expected to end
    # via early stopping once the validation metric has not improved for 800
    # rounds. Metrics are logged every 300 rounds.
    model = lgb.train(
        params,
        train_data,
        num_boost_round=2000000,
        # Order matters: the training set is reported as "training" and the
        # hold-out fold as "valid_1", which is the key read from best_score below.
        valid_sets=[train_data, valid_data],
        callbacks=[
            lgb.early_stopping(stopping_rounds=800),
            log_evaluation(period=300),
        ],
    )

    # Predict with the best iteration found by early stopping, not the last one.
    prediction += model.predict(X_test, num_iteration=model.best_iteration) / n_splits
    score += model.best_score['valid_1']['auc'] / n_splits

# Mean validation AUC across folds.
print(score)

Fold 0 started at Fri Mar  8 15:14:34 2019
Training until validation scores don't improve for 800 rounds.
[300]	training's auc: 0.763894	valid_1's auc: 0.755421
[600]	training's auc: 0.803638	valid_1's auc: 0.796102
[900]	training's auc: 0.824243	valid_1's auc: 0.816561
[1200]	training's auc: 0.838573	valid_1's auc: 0.830135
[1500]	training's auc: 0.848948	valid_1's auc: 0.839926
[1800]	training's auc: 0.856728	valid_1's auc: 0.84737
[2100]	training's auc: 0.86311	valid_1's auc: 0.853287
[2400]	training's auc: 0.868888	valid_1's auc: 0.858499
[2700]	training's auc: 0.873314	valid_1's auc: 0.862411
[3000]	training's auc: 0.877112	valid_1's auc: 0.865793
[3300]	training's auc: 0.880634	valid_1's auc: 0.868908
[3600]	training's auc: 0.883577	valid_1's auc: 0.871514
[3900]	training's auc: 0.886398	valid_1's auc: 0.873991
[4200]	training's auc: 0.888777	valid_1's auc: 0.876039
[4500]	training's auc: 0.890779	valid_1's auc: 0.877823
[4800]	training's auc: 0.892801	valid_1's auc: 0.879531
[51

[13800]	training's auc: 0.914783	valid_1's auc: 0.897688
[14100]	training's auc: 0.915085	valid_1's auc: 0.897843
[14400]	training's auc: 0.915357	valid_1's auc: 0.898004
[14700]	training's auc: 0.915583	valid_1's auc: 0.898145
[15000]	training's auc: 0.915868	valid_1's auc: 0.898274
[15300]	training's auc: 0.916147	valid_1's auc: 0.898473
[15600]	training's auc: 0.916368	valid_1's auc: 0.898552
[15900]	training's auc: 0.916608	valid_1's auc: 0.898611
[16200]	training's auc: 0.916838	valid_1's auc: 0.898739
[16500]	training's auc: 0.917066	valid_1's auc: 0.898843
[16800]	training's auc: 0.917311	valid_1's auc: 0.898919
[17100]	training's auc: 0.917533	valid_1's auc: 0.899009
[17400]	training's auc: 0.917778	valid_1's auc: 0.899074
[17700]	training's auc: 0.917991	valid_1's auc: 0.899171
[18000]	training's auc: 0.918199	valid_1's auc: 0.899244
[18300]	training's auc: 0.918417	valid_1's auc: 0.899285
[18600]	training's auc: 0.918618	valid_1's auc: 0.899348
[18900]	training's auc: 0.91881

[27300]	training's auc: 0.922325	valid_1's auc: 0.905947
[27600]	training's auc: 0.92249	valid_1's auc: 0.905967
[27900]	training's auc: 0.922651	valid_1's auc: 0.905958
[28200]	training's auc: 0.922798	valid_1's auc: 0.905933
Early stopping, best iteration is:
[27518]	training's auc: 0.922452	valid_1's auc: 0.905972
Fold 3 started at Fri Mar  8 15:48:06 2019
Training until validation scores don't improve for 800 rounds.
[300]	training's auc: 0.767513	valid_1's auc: 0.751512
[600]	training's auc: 0.803304	valid_1's auc: 0.788101
[900]	training's auc: 0.823503	valid_1's auc: 0.808762
[1200]	training's auc: 0.837394	valid_1's auc: 0.822844
[1500]	training's auc: 0.84779	valid_1's auc: 0.833545
[1800]	training's auc: 0.856382	valid_1's auc: 0.842137
[2100]	training's auc: 0.863134	valid_1's auc: 0.84884
[2400]	training's auc: 0.868402	valid_1's auc: 0.854127
[2700]	training's auc: 0.872846	valid_1's auc: 0.858633
[3000]	training's auc: 0.8768	valid_1's auc: 0.862373
[3300]	training's auc:

[14100]	training's auc: 0.915677	valid_1's auc: 0.893715
[14400]	training's auc: 0.915959	valid_1's auc: 0.893931
[14700]	training's auc: 0.916211	valid_1's auc: 0.894087
[15000]	training's auc: 0.916482	valid_1's auc: 0.894219
[15300]	training's auc: 0.916762	valid_1's auc: 0.894449
[15600]	training's auc: 0.917024	valid_1's auc: 0.894656
[15900]	training's auc: 0.917268	valid_1's auc: 0.894795
[16200]	training's auc: 0.917511	valid_1's auc: 0.894931
[16500]	training's auc: 0.917752	valid_1's auc: 0.895006
[16800]	training's auc: 0.917978	valid_1's auc: 0.895126
[17100]	training's auc: 0.918212	valid_1's auc: 0.895219
[17400]	training's auc: 0.918425	valid_1's auc: 0.89529
[17700]	training's auc: 0.918656	valid_1's auc: 0.895388
[18000]	training's auc: 0.918886	valid_1's auc: 0.895496
[18300]	training's auc: 0.919081	valid_1's auc: 0.895614
[18600]	training's auc: 0.919278	valid_1's auc: 0.895719
[18900]	training's auc: 0.919465	valid_1's auc: 0.895796
[19200]	training's auc: 0.919659

In [7]:
# Assemble the submission: one row per test ID_code with the fold-averaged
# prediction as "target", written without the DataFrame index.
submission_columns = {
    "ID_code": test_df["ID_code"].values,
    "target": prediction,
}
sub = pd.DataFrame(submission_columns)
sub.to_csv("submission.csv", index=False)