In [75]:
import time
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import KFold,StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.model_selection import train_test_split

In [76]:
# --- Run configuration -------------------------------------------------
more_features = True    # read the '*_more_features.csv' variants of the inputs
use_staking = False     # when True, training is restricted to the held-out split
do_lda = False          # when True, an LDA projection is appended as a feature
fix_data_skew = False   # when True, undersample the target != 1 rows
skew_seed = 0           # seeds the undersampling + shuffle so reruns are reproducible

# --- Load the train / test tables --------------------------------------
file_suffix = '_more_features' if more_features else ''
train_df = pd.read_csv('input/train' + file_suffix + '.csv')
test_df = pd.read_csv('input/test' + file_suffix + '.csv')

if fix_data_skew:
    # Balance the classes: keep every target == 1 row and draw an equally
    # sized random subset of the remaining rows (or all of them if fewer).
    trues = train_df.loc[train_df['target'] == 1]
    others = train_df.loc[train_df['target'] != 1]
    falses = others.sample(n=min(len(trues), len(others)), random_state=skew_seed)
    # Shuffle the combined frame, then reset the index so later code that
    # aligns on index labels sees a clean 0..n-1 range.
    train_df = (
        pd.concat([trues, falses], ignore_index=True)
        .sample(frac=1, random_state=skew_seed)
        .reset_index(drop=True)
    )

# Feature matrices and label vector used by the rest of the notebook.
X_test = test_df.drop('ID_code', axis=1)
X = train_df.drop(['ID_code', 'target'], axis=1)
y = train_df['target']

In [77]:
if do_lda:
    # The target is binary (the model below uses objective='binary'), and LDA
    # can produce at most n_classes - 1 discriminant axes, i.e. exactly one.
    # Asking for more is rejected by scikit-learn.
    lda = LDA(solver='svd', n_components=1, store_covariance=True)
    X_lda = pd.DataFrame(lda.fit_transform(X, y))
    X_test_lda = pd.DataFrame(lda.transform(X_test))
    # Assign by position (.values), not by index label: X may carry a
    # shuffled index while X_lda always has a fresh 0..n-1 index, and a
    # label-aligned assignment would attach projections to the wrong rows.
    X["lda"] = X_lda[0].values
    X_test["lda"] = X_test_lda[0].values

In [78]:
# train_test_split returns (X_train, X_test, y_train, y_test); only the two
# "test" pieces are kept here. NOTE: despite the "bottomhalf" name, with
# test_size=0.8 this subset is 80% of the rows, not half.
_, X_bottomhalf, _, y_bottomhalf = train_test_split(
    X,
    y,
    test_size=0.8,
    random_state=10,
)

In [79]:
# When the staking switch is on, train only on the held-out subset
# produced by the split above instead of the full training data.
if use_staking:
    X, y = X_bottomhalf, y_bottomhalf

In [80]:
# Cross-validation scheme: stratified, shuffled folds with a fixed seed.
# n_splits is reused by the training loop to average predictions and scores.
n_splits = 5
folds = StratifiedKFold(
    shuffle=True,
    random_state=3,
    n_splits=n_splits,
)

In [81]:
# LightGBM training parameters, passed unchanged to lgb.train().
# Key order matches the original literal.
params = dict(
    # task definition
    metric='auc',
    boosting_type='gbdt',
    objective='binary',
    # tree shape
    min_data_in_leaf=2881,
    max_depth=0,
    num_leaves=3,
    # optimisation / sampling
    learning_rate=0.01,
    bagging_freq=3,
    feature_fraction=0.8453828656355421,
    bagging_seed=11,
    # regularisation
    reg_alpha=1.1173044727720816,
    reg_lambda=6.9285776442737514,
    # reproducibility / logging
    random_state=42,
    verbosity=-1,
    subsample=0.8421287738494433,
    min_child_weight=36.93038816860224,
    # resources / binning
    num_threads=4,
    max_bin=483,
)

In [82]:
# Stratified K-fold training: fit one booster per fold, average the
# per-fold test predictions and the per-fold best validation AUC.
score = 0.0
prediction = np.zeros(len(X_test))
for fold_n, (train_index, valid_index) in enumerate(folds.split(X, y)):
    print('Fold', fold_n, 'started at', time.ctime())
    X_train, X_valid = X.iloc[train_index], X.iloc[valid_index]
    y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]

    train_data = lgb.Dataset(X_train, label=y_train)
    valid_data = lgb.Dataset(X_valid, label=y_valid)

    # Early stopping and periodic logging are passed as callbacks: the
    # `early_stopping_rounds` / `verbose_eval` keyword arguments were
    # deprecated in LightGBM 3.3 and removed from lgb.train() in 4.0.
    # (lgb.log_evaluation requires LightGBM >= 3.3.)
    model = lgb.train(
        params,
        train_data,
        num_boost_round=2000000,  # effectively unbounded; early stopping ends training
        valid_sets=[train_data, valid_data],
        callbacks=[
            lgb.early_stopping(stopping_rounds=800),
            lgb.log_evaluation(period=300),
        ],
    )

    # Each fold contributes an equal share to the final test prediction.
    prediction += model.predict(X_test, num_iteration=model.best_iteration) / n_splits
    # 'valid_1' is the default name of the second entry in valid_sets.
    score += model.best_score['valid_1']['auc'] / n_splits

print(score)

Fold 0 started at Mon Mar 18 14:42:22 2019
Training until validation scores don't improve for 800 rounds.
[300]	training's auc: 0.765057	valid_1's auc: 0.760191
[600]	training's auc: 0.802376	valid_1's auc: 0.79666
[900]	training's auc: 0.823052	valid_1's auc: 0.817255
[1200]	training's auc: 0.837289	valid_1's auc: 0.831402
[1500]	training's auc: 0.848333	valid_1's auc: 0.842851
[1800]	training's auc: 0.856246	valid_1's auc: 0.850237
[2100]	training's auc: 0.862402	valid_1's auc: 0.856187
[2400]	training's auc: 0.867928	valid_1's auc: 0.861487
[2700]	training's auc: 0.87232	valid_1's auc: 0.865986
[3000]	training's auc: 0.876172	valid_1's auc: 0.869648
[3300]	training's auc: 0.879667	valid_1's auc: 0.873021
[3600]	training's auc: 0.88263	valid_1's auc: 0.875569
[3900]	training's auc: 0.885514	valid_1's auc: 0.878202
[4200]	training's auc: 0.887751	valid_1's auc: 0.880309
[4500]	training's auc: 0.889806	valid_1's auc: 0.882144
[4800]	training's auc: 0.891697	valid_1's auc: 0.883865
[510

[19200]	training's auc: 0.919689	valid_1's auc: 0.896673
[19500]	training's auc: 0.919883	valid_1's auc: 0.896755
[19800]	training's auc: 0.920081	valid_1's auc: 0.896801
[20100]	training's auc: 0.920284	valid_1's auc: 0.896904
[20400]	training's auc: 0.920473	valid_1's auc: 0.896977
[20700]	training's auc: 0.920669	valid_1's auc: 0.897032
[21000]	training's auc: 0.920872	valid_1's auc: 0.897065
[21300]	training's auc: 0.921052	valid_1's auc: 0.897082
[21600]	training's auc: 0.921236	valid_1's auc: 0.897128
[21900]	training's auc: 0.921418	valid_1's auc: 0.897123
[22200]	training's auc: 0.9216	valid_1's auc: 0.897142
[22500]	training's auc: 0.921784	valid_1's auc: 0.897194
[22800]	training's auc: 0.921963	valid_1's auc: 0.89722
[23100]	training's auc: 0.922134	valid_1's auc: 0.897253
[23400]	training's auc: 0.922291	valid_1's auc: 0.897241
[23700]	training's auc: 0.922458	valid_1's auc: 0.897259
Early stopping, best iteration is:
[23055]	training's auc: 0.922104	valid_1's auc: 0.897267

[12300]	training's auc: 0.913006	valid_1's auc: 0.895745
[12600]	training's auc: 0.91334	valid_1's auc: 0.895988
[12900]	training's auc: 0.913694	valid_1's auc: 0.89626
[13200]	training's auc: 0.914026	valid_1's auc: 0.896541
[13500]	training's auc: 0.914343	valid_1's auc: 0.89676
[13800]	training's auc: 0.914646	valid_1's auc: 0.897001
[14100]	training's auc: 0.914934	valid_1's auc: 0.897157
[14400]	training's auc: 0.91519	valid_1's auc: 0.897278
[14700]	training's auc: 0.9155	valid_1's auc: 0.897488
[15000]	training's auc: 0.915806	valid_1's auc: 0.897643
[15300]	training's auc: 0.916072	valid_1's auc: 0.897829
[15600]	training's auc: 0.916325	valid_1's auc: 0.897934
[15900]	training's auc: 0.916603	valid_1's auc: 0.898077
[16200]	training's auc: 0.91683	valid_1's auc: 0.898174
[16500]	training's auc: 0.917071	valid_1's auc: 0.898313
[16800]	training's auc: 0.917318	valid_1's auc: 0.898426
[17100]	training's auc: 0.917563	valid_1's auc: 0.898556
[17400]	training's auc: 0.917782	valid

[25800]	training's auc: 0.922989	valid_1's auc: 0.899563
[26100]	training's auc: 0.923163	valid_1's auc: 0.899585
[26400]	training's auc: 0.923333	valid_1's auc: 0.899591
[26700]	training's auc: 0.923493	valid_1's auc: 0.8996
[27000]	training's auc: 0.923661	valid_1's auc: 0.899587
[27300]	training's auc: 0.923833	valid_1's auc: 0.899562
Early stopping, best iteration is:
[26751]	training's auc: 0.923524	valid_1's auc: 0.899612
0.899363708306008


In [83]:
# Assemble the submission table (test IDs alongside the fold-averaged
# predictions) and write it to disk without the index column.
sub = pd.DataFrame({
    "ID_code": test_df.ID_code.values,
    "target": prediction,
})
sub.to_csv("submission.csv", index=False)