In [1]:
import time
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import KFold,StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.model_selection import train_test_split

In [2]:
# --- Run configuration -------------------------------------------------------
more_features = False   # True -> read the '*_more_features.csv' input variants
use_staking = False     # True -> a later cell swaps X/y for a held-out subset
do_lda = False          # True -> a later cell appends an LDA projection column
fix_data_skew = False   # True -> undersample non-positive rows to match positives
sampling_seed = 42      # seeds the undersampling shuffles so reruns are identical

# --- Load data ---------------------------------------------------------------
file_suffix = '_more_features' if more_features else ''
train_df = pd.read_csv('input/train' + file_suffix + '.csv')
test_df = pd.read_csv('input/test' + file_suffix + '.csv')

if fix_data_skew:
    # Keep every row with target == 1 and an equally sized random subset of the
    # remaining rows, then shuffle the combined frame. Both shuffles are seeded;
    # without a seed each kernel restart would train on a different subset.
    trues = train_df.loc[train_df['target'] == 1]
    falses = (
        train_df.loc[train_df['target'] != 1]
        .sample(frac=1, random_state=sampling_seed)[:len(trues)]
    )
    train_df = (
        pd.concat([trues, falses], ignore_index=True)
        .sample(frac=1, random_state=sampling_seed)
    )

# --- Feature matrices and target ---------------------------------------------
# drop() returns new frames, so columns added to X / X_test later do not
# modify train_df / test_df.
X_test = test_df.drop('ID_code', axis=1)
X = train_df.drop(['ID_code', 'target'], axis=1)
y = train_df['target']

In [3]:
if do_lda:
    # LDA yields at most min(n_features, n_classes - 1) components. The target
    # is modelled as binary in this notebook (objective 'binary'), so only one
    # discriminant axis exists; asking for 5 is rejected by scikit-learn.
    lda = LDA(solver='svd', n_components=1, store_covariance=True)

    # Exclude a previously added "lda" column so re-running this cell does not
    # feed the old projection back in as an input feature.
    feature_cols = [col for col in X.columns if col != "lda"]

    X_lda = lda.fit_transform(X[feature_cols], y)
    X_test_lda = lda.transform(X_test[feature_cols])

    # Assign plain NumPy columns (positional). Assigning a DataFrame with a
    # fresh RangeIndex would align on index labels and scramble the values
    # whenever X has been shuffled (e.g. by the fix_data_skew path).
    X["lda"] = X_lda[:, 0]
    X_test["lda"] = X_test_lda[:, 0]

In [4]:
# train_test_split returns [X_train, X_test, y_train, y_test]; only the "test"
# side is kept. With test_size=0.8 that side holds 80% of the rows (despite the
# "bottomhalf" name), selected with a fixed random_state.
holdout_parts = train_test_split(X, y, test_size=0.8, random_state=10)
X_bottomhalf, y_bottomhalf = holdout_parts[1], holdout_parts[3]
del holdout_parts

In [5]:
# When enabled, everything downstream trains on the held-out subset
# instead of the full training frame.
if use_staking:
    X, y = X_bottomhalf, y_bottomhalf

In [6]:
# Number of CV folds; the training loop also divides each fold's predictions
# and validation score by this value to form averages.
n_splits = 10

# Shuffled, class-stratified splitter with a fixed seed.
folds = StratifiedKFold(
    n_splits=n_splits,
    shuffle=True,
    random_state=3,
)

In [7]:
# LightGBM training parameters, passed unchanged to lgb.train().
# Key order matches the original literal.
params = dict(
    # Task definition
    metric='auc',
    boosting_type='gbdt',
    objective='binary',
    # Tree shape. NOTE(review): per LightGBM docs max_depth <= 0 means
    # "no limit", so tree size is effectively bounded by num_leaves.
    min_data_in_leaf=2881,
    max_depth=0,
    num_leaves=3,
    learning_rate=0.01,
    # Row / column subsampling. bagging_fraction is not set under its own
    # name; NOTE(review): 'subsample' below is documented as its alias.
    bagging_freq=3,
    feature_fraction=0.8453828656355421,
    bagging_seed=11,
    # Regularisation
    reg_alpha=1.1173044727720816,
    reg_lambda=6.9285776442737514,
    # Reproducibility / logging
    random_state=42,
    verbosity=-1,
    subsample=0.8421287738494433,
    min_child_weight=36.93038816860224,
    # Runtime
    num_threads=4,
    max_bin=483,
)

In [8]:
# Accumulators, reset on every run of this cell:
#   score      -> mean over folds of each fold's best validation AUC
#   prediction -> mean over folds of each fold model's test-set predictions
score = 0.0
prediction = np.zeros(len(X_test))

for fold_n, (train_index, valid_index) in enumerate(folds.split(X, y)):
    print('Fold', fold_n, 'started at', time.ctime())

    # Positional row selection for this fold.
    X_train, y_train = X.iloc[train_index], y.iloc[train_index]
    X_valid, y_valid = X.iloc[valid_index], y.iloc[valid_index]

    train_data = lgb.Dataset(X_train, label=y_train)
    valid_data = lgb.Dataset(X_valid, label=y_valid)

    # The round cap is deliberately huge; training is expected to end via
    # early stopping on the validation set (reported as 'valid_1').
    # NOTE(review): verbose_eval / early_stopping_rounds are keyword arguments
    # of older lgb.train() signatures; confirm against the installed LightGBM
    # version (newer releases expect callbacks instead).
    model = lgb.train(
        params,
        train_data,
        num_boost_round=2000000,
        valid_sets=[train_data, valid_data],
        verbose_eval=300,
        early_stopping_rounds=800,
    )

    # Predict with the best iteration found by early stopping; each fold
    # contributes 1/n_splits of the final average.
    fold_prediction = model.predict(X_test, num_iteration=model.best_iteration)
    fold_auc = model.best_score['valid_1']['auc']
    prediction += fold_prediction / n_splits
    score += fold_auc / n_splits

print(score)

Fold 0 started at Mon Mar 18 20:40:38 2019
Training until validation scores don't improve for 800 rounds.
[300]	training's auc: 0.765601	valid_1's auc: 0.755843
[600]	training's auc: 0.80259	valid_1's auc: 0.793208
[900]	training's auc: 0.823635	valid_1's auc: 0.814386
[1200]	training's auc: 0.837263	valid_1's auc: 0.827718
[1500]	training's auc: 0.847513	valid_1's auc: 0.838356
[1800]	training's auc: 0.855469	valid_1's auc: 0.84634
[2100]	training's auc: 0.862186	valid_1's auc: 0.853164
[2400]	training's auc: 0.867878	valid_1's auc: 0.85854
[2700]	training's auc: 0.872018	valid_1's auc: 0.862647
[3000]	training's auc: 0.875914	valid_1's auc: 0.866399
[3300]	training's auc: 0.879244	valid_1's auc: 0.869393
[3600]	training's auc: 0.882339	valid_1's auc: 0.872414
[3900]	training's auc: 0.884851	valid_1's auc: 0.874619
[4200]	training's auc: 0.887169	valid_1's auc: 0.876653
[4500]	training's auc: 0.889407	valid_1's auc: 0.878681
[4800]	training's auc: 0.891216	valid_1's auc: 0.880199
[510

[16800]	training's auc: 0.915608	valid_1's auc: 0.905875
[17100]	training's auc: 0.915831	valid_1's auc: 0.905942
[17400]	training's auc: 0.916041	valid_1's auc: 0.906022
[17700]	training's auc: 0.916253	valid_1's auc: 0.906139
[18000]	training's auc: 0.916458	valid_1's auc: 0.906171
[18300]	training's auc: 0.916694	valid_1's auc: 0.906235
[18600]	training's auc: 0.916876	valid_1's auc: 0.906256
[18900]	training's auc: 0.917062	valid_1's auc: 0.906294
[19200]	training's auc: 0.917264	valid_1's auc: 0.906326
[19500]	training's auc: 0.91745	valid_1's auc: 0.9064
[19800]	training's auc: 0.917647	valid_1's auc: 0.906461
[20100]	training's auc: 0.917838	valid_1's auc: 0.906467
[20400]	training's auc: 0.91802	valid_1's auc: 0.90652
[20700]	training's auc: 0.918192	valid_1's auc: 0.906604
[21000]	training's auc: 0.918375	valid_1's auc: 0.906639
[21300]	training's auc: 0.918545	valid_1's auc: 0.906672
[21600]	training's auc: 0.918703	valid_1's auc: 0.906707
[21900]	training's auc: 0.918868	val

[4200]	training's auc: 0.887897	valid_1's auc: 0.871471
[4500]	training's auc: 0.890024	valid_1's auc: 0.873287
[4800]	training's auc: 0.892017	valid_1's auc: 0.875018
[5100]	training's auc: 0.893668	valid_1's auc: 0.876528
[5400]	training's auc: 0.895203	valid_1's auc: 0.877884
[5700]	training's auc: 0.896607	valid_1's auc: 0.879169
[6000]	training's auc: 0.897977	valid_1's auc: 0.880288
[6300]	training's auc: 0.899178	valid_1's auc: 0.881217
[6600]	training's auc: 0.900244	valid_1's auc: 0.882172
[6900]	training's auc: 0.901235	valid_1's auc: 0.882866
[7200]	training's auc: 0.902258	valid_1's auc: 0.883673
[7500]	training's auc: 0.903229	valid_1's auc: 0.884443
[7800]	training's auc: 0.904101	valid_1's auc: 0.885193
[8100]	training's auc: 0.904875	valid_1's auc: 0.885959
[8400]	training's auc: 0.905574	valid_1's auc: 0.88653
[8700]	training's auc: 0.906306	valid_1's auc: 0.887019
[9000]	training's auc: 0.906992	valid_1's auc: 0.887561
[9300]	training's auc: 0.907607	valid_1's auc: 0.

[19500]	training's auc: 0.917812	valid_1's auc: 0.898983
[19800]	training's auc: 0.918009	valid_1's auc: 0.899042
[20100]	training's auc: 0.9182	valid_1's auc: 0.899114
[20400]	training's auc: 0.918379	valid_1's auc: 0.899197
[20700]	training's auc: 0.918551	valid_1's auc: 0.899245
[21000]	training's auc: 0.918708	valid_1's auc: 0.899283
[21300]	training's auc: 0.918884	valid_1's auc: 0.899379
[21600]	training's auc: 0.919056	valid_1's auc: 0.899468
[21900]	training's auc: 0.919201	valid_1's auc: 0.899509
[22200]	training's auc: 0.919361	valid_1's auc: 0.899561
[22500]	training's auc: 0.91953	valid_1's auc: 0.899564
[22800]	training's auc: 0.919692	valid_1's auc: 0.899601
[23100]	training's auc: 0.919845	valid_1's auc: 0.899639
[23400]	training's auc: 0.919993	valid_1's auc: 0.899618
[23700]	training's auc: 0.920156	valid_1's auc: 0.89959
Early stopping, best iteration is:
[23103]	training's auc: 0.919846	valid_1's auc: 0.899646
Fold 5 started at Mon Mar 18 23:08:08 2019
Training until

[15600]	training's auc: 0.915136	valid_1's auc: 0.899003
[15900]	training's auc: 0.91538	valid_1's auc: 0.899149
[16200]	training's auc: 0.915604	valid_1's auc: 0.899216
[16500]	training's auc: 0.91583	valid_1's auc: 0.899267
[16800]	training's auc: 0.916083	valid_1's auc: 0.899412
[17100]	training's auc: 0.916302	valid_1's auc: 0.899539
[17400]	training's auc: 0.916522	valid_1's auc: 0.899584
[17700]	training's auc: 0.916728	valid_1's auc: 0.899645
[18000]	training's auc: 0.91693	valid_1's auc: 0.899747
[18300]	training's auc: 0.917142	valid_1's auc: 0.899747
[18600]	training's auc: 0.917346	valid_1's auc: 0.899814
[18900]	training's auc: 0.917522	valid_1's auc: 0.899832
[19200]	training's auc: 0.917712	valid_1's auc: 0.89986
[19500]	training's auc: 0.917911	valid_1's auc: 0.899915
[19800]	training's auc: 0.918076	valid_1's auc: 0.899967
[20100]	training's auc: 0.918265	valid_1's auc: 0.899971
[20400]	training's auc: 0.918444	valid_1's auc: 0.900003
[20700]	training's auc: 0.918618	va

[6900]	training's auc: 0.900525	valid_1's auc: 0.888953
[7200]	training's auc: 0.901573	valid_1's auc: 0.89
[7500]	training's auc: 0.902531	valid_1's auc: 0.890869
[7800]	training's auc: 0.903368	valid_1's auc: 0.891658
[8100]	training's auc: 0.9042	valid_1's auc: 0.8925
[8400]	training's auc: 0.904933	valid_1's auc: 0.893153
[8700]	training's auc: 0.905677	valid_1's auc: 0.893879
[9000]	training's auc: 0.906329	valid_1's auc: 0.894475
[9300]	training's auc: 0.906997	valid_1's auc: 0.895011
[9600]	training's auc: 0.907576	valid_1's auc: 0.895684
[9900]	training's auc: 0.908177	valid_1's auc: 0.896132
[10200]	training's auc: 0.908679	valid_1's auc: 0.896511
[10500]	training's auc: 0.909178	valid_1's auc: 0.897016
[10800]	training's auc: 0.909665	valid_1's auc: 0.897371
[11100]	training's auc: 0.910121	valid_1's auc: 0.897807
[11400]	training's auc: 0.910558	valid_1's auc: 0.898187
[11700]	training's auc: 0.91098	valid_1's auc: 0.898534
[12000]	training's auc: 0.911358	valid_1's auc: 0.8

[22200]	training's auc: 0.919874	valid_1's auc: 0.896301
[22500]	training's auc: 0.920032	valid_1's auc: 0.896304
[22800]	training's auc: 0.920193	valid_1's auc: 0.896322
[23100]	training's auc: 0.920349	valid_1's auc: 0.896382
[23400]	training's auc: 0.9205	valid_1's auc: 0.896451
[23700]	training's auc: 0.920656	valid_1's auc: 0.896455
[24000]	training's auc: 0.920799	valid_1's auc: 0.896527
[24300]	training's auc: 0.920943	valid_1's auc: 0.896531
[24600]	training's auc: 0.921092	valid_1's auc: 0.896532
[24900]	training's auc: 0.921256	valid_1's auc: 0.896569
[25200]	training's auc: 0.921404	valid_1's auc: 0.896625
[25500]	training's auc: 0.921561	valid_1's auc: 0.896633
[25800]	training's auc: 0.92168	valid_1's auc: 0.896648
[26100]	training's auc: 0.92182	valid_1's auc: 0.896646
[26400]	training's auc: 0.921967	valid_1's auc: 0.896675
[26700]	training's auc: 0.922098	valid_1's auc: 0.896676
[27000]	training's auc: 0.922242	valid_1's auc: 0.896688
[27300]	training's auc: 0.922387	va

In [9]:
# Build the two-column submission frame (ID_code, target) from the test IDs
# and the fold-averaged predictions, then write it without the index column.
sub = pd.DataFrame(
    {
        "ID_code": test_df["ID_code"].values,
        "target": prediction,
    }
)
sub.to_csv("submission.csv", index=False)