In [48]:
import time
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import KFold,StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.model_selection import train_test_split

In [49]:
# --- Run configuration ---------------------------------------------------
# more_features: read the CSV variants whose file names end in "_more_features".
more_features = True
# do_lda: when True, a later cell appends an LDA projection column to X / X_test.
do_lda = False
# fix_data_skew: when True, down-sample the non-positive rows so that both
# groups have the same number of rows.
fix_data_skew = False

# Build the optional file-name suffix once so train and test cannot diverge.
csv_suffix = '_more_features' if more_features else ''
train_df = pd.read_csv('input/train' + csv_suffix + '.csv')
test_df = pd.read_csv('input/test' + csv_suffix + '.csv')

if fix_data_skew:
    trues = train_df.loc[train_df['target'] == 1]
    # Seeded shuffles: without a random_state the balanced subset (and every
    # score computed from it) would change on each Restart & Run All.
    falses = (
        train_df.loc[train_df['target'] != 1]
        .sample(frac=1, random_state=42)[:len(trues)]
    )
    train_df = (
        pd.concat([trues, falses], ignore_index=True)
        .sample(frac=1, random_state=42)
    )

# Feature matrices: drop the identifier (and the label, for the training set).
X_test = test_df.drop('ID_code', axis=1)
X = train_df.drop(['ID_code', 'target'], axis=1)
y = train_df['target']

In [50]:
if do_lda:
    # Fit on the original feature columns only, so re-running this cell does
    # not feed a previously added "lda" column back into the projection.
    lda_train_inputs = X.drop(columns="lda", errors="ignore")
    lda_test_inputs = X_test.drop(columns="lda", errors="ignore")

    # Only one projected column is stored below, and LDA cannot produce more
    # than n_classes - 1 components, so request exactly one. (A larger value
    # is rejected by recent scikit-learn releases for a two-class target.)
    lda = LDA(solver='svd', n_components=1, store_covariance=True)
    X_lda = pd.DataFrame(lda.fit_transform(lda_train_inputs, y))
    X_test_lda = pd.DataFrame(lda.transform(lda_test_inputs))

    # Assign raw arrays rather than DataFrames: pandas aligns on index labels
    # when a frame/series is assigned, and X's index is not guaranteed to be
    # 0..n-1 in row order (it is shuffled when fix_data_skew is enabled).
    X["lda"] = X_lda.iloc[:, 0].values
    X_test["lda"] = X_test_lda.iloc[:, 0].values

In [51]:
# Keep only the "test" side of a seeded 50/50 random split as the working set.
# The unused halves are not bound to `_`: assigning to `_` in a notebook
# shadows IPython's last-output variable and keeps an unused object alive.
split_parts = train_test_split(X, y, test_size=0.5, random_state=10)
X_bottomhalf, y_bottomhalf = split_parts[1], split_parts[3]
del split_parts

In [52]:
# Preview the first five rows of the working half (rich display as cell output).
X_bottomhalf.head(n=5)

Unnamed: 0,var_0,var_1,var_2,var_3,var_4,var_5,var_6,var_7,var_8,var_9,...,var_198,var_199,lg,mlp,rf,gnb,qda,models_mean,models_min,models_max
188807,13.945,2.3554,14.3314,4.1451,11.254,-1.215,5.1511,11.9268,0.9658,7.7772,...,16.1058,9.0279,0.080034,0.088045,0.06,0.00812,0.026667,0.05335,0.026667,0.080034
199122,13.9229,4.5074,12.294,4.2445,12.477,-1.9773,6.7787,13.7641,-1.8372,8.5231,...,16.8185,1.5988,0.038435,0.085816,0.18,0.028455,0.004738,0.021587,0.004738,0.038435
17944,7.4262,-6.5208,14.0846,4.1349,7.9542,-17.4894,4.6857,12.2016,-2.582,7.1201,...,17.0191,9.7219,0.135709,0.107551,0.12,0.034572,0.006636,0.071173,0.006636,0.135709
199504,14.6156,2.0123,10.6881,8.3708,12.5464,-2.9699,5.8589,20.3338,-4.7277,5.8077,...,14.8694,0.371,0.763758,0.258768,0.22,0.925023,0.833709,0.798733,0.763758,0.833709
168703,8.7499,-3.1035,9.5713,6.0757,12.9089,0.5579,5.5508,17.4529,-5.2844,6.3188,...,9.5926,5.9859,0.120143,0.11015,0.08,0.112423,0.061366,0.090754,0.061366,0.120143


In [53]:
# Cross-validation layout: stratified, shuffled folds with a fixed seed so the
# fold membership is the same on every run. n_splits is reused by the training
# loop to average per-fold results.
n_splits = 5
folds = StratifiedKFold(
    shuffle=True,
    random_state=3,
    n_splits=n_splits,
)

In [54]:
# LightGBM training parameters for the binary classifier, scored by AUC.
# Several keys are LightGBM aliases (see the LightGBM "Parameters" page):
# 'subsample' -> bagging_fraction, 'reg_alpha'/'reg_lambda' -> L1/L2 penalties,
# 'min_child_weight' -> min_sum_hessian_in_leaf, 'random_state' -> seed.
params = dict(
    # Task definition
    metric='auc',
    boosting_type='gbdt',
    objective='binary',
    # Tree shape: very small trees (3 leaves) with large leaves; per the
    # LightGBM docs a max_depth <= 0 means "no depth limit".
    min_data_in_leaf=2881,
    max_depth=0,
    num_leaves=3,
    learning_rate=0.01,
    # Row / column subsampling. There is no explicit 'bagging_fraction' entry;
    # the row fraction is supplied through the 'subsample' alias further down.
    bagging_freq=3,
    feature_fraction=0.8453828656355421,
    bagging_seed=11,
    # Regularisation
    reg_alpha=1.1173044727720816,
    reg_lambda=6.9285776442737514,
    # Reproducibility / logging
    random_state=42,
    verbosity=-1,
    subsample=0.8421287738494433,
    min_child_weight=36.93038816860224,
    # Runtime / histogram resolution
    num_threads=4,
    max_bin=483,
)

In [55]:
# Cross-validated training on the working half.
# For every fold a booster is trained with early stopping; its test-set
# predictions and its best validation AUC each contribute 1/n_splits to the
# running averages `prediction` and `score`.
score = 0.0
prediction = np.zeros(len(X_test))
for fold_n, (train_index, valid_index) in enumerate(folds.split(X_bottomhalf, y_bottomhalf)):
    print('Fold', fold_n, 'started at', time.ctime())

    # The split yields positional indices, hence .iloc.
    X_train = X_bottomhalf.iloc[train_index]
    y_train = y_bottomhalf.iloc[train_index]
    X_valid = X_bottomhalf.iloc[valid_index]
    y_valid = y_bottomhalf.iloc[valid_index]

    train_data = lgb.Dataset(X_train, label=y_train)
    valid_data = lgb.Dataset(X_valid, label=y_valid)

    # The training set is listed first so its AUC is logged next to the
    # validation AUC; the validation set is therefore reported as 'valid_1'.
    model = lgb.train(
        params,
        train_data,
        num_boost_round=2000000,
        valid_sets=[train_data, valid_data],
        verbose_eval=300,
        early_stopping_rounds=800,
    )

    # Predict with the best iteration found by early stopping.
    fold_prediction = model.predict(X_test, num_iteration=model.best_iteration)
    prediction += fold_prediction / n_splits

    fold_auc = model.best_score['valid_1']['auc']
    score += fold_auc / n_splits

print(score)

Fold 0 started at Mon Mar 18 11:49:36 2019
Training until validation scores don't improve for 800 rounds.
[300]	training's auc: 0.886505	valid_1's auc: 0.886111
[600]	training's auc: 0.888686	valid_1's auc: 0.888369
[900]	training's auc: 0.890376	valid_1's auc: 0.88953
[1200]	training's auc: 0.891735	valid_1's auc: 0.890122
[1500]	training's auc: 0.892995	valid_1's auc: 0.89066
[1800]	training's auc: 0.8941	valid_1's auc: 0.891077
[2100]	training's auc: 0.895101	valid_1's auc: 0.891418
[2400]	training's auc: 0.896051	valid_1's auc: 0.891721
[2700]	training's auc: 0.897019	valid_1's auc: 0.891938
[3000]	training's auc: 0.897888	valid_1's auc: 0.892161
[3300]	training's auc: 0.898739	valid_1's auc: 0.892371
[3600]	training's auc: 0.899526	valid_1's auc: 0.892583
[3900]	training's auc: 0.900284	valid_1's auc: 0.892824
[4200]	training's auc: 0.90104	valid_1's auc: 0.892931
[4500]	training's auc: 0.901811	valid_1's auc: 0.893059
[4800]	training's auc: 0.902515	valid_1's auc: 0.893221
[5100]

[1500]	training's auc: 0.895279	valid_1's auc: 0.881275
[1800]	training's auc: 0.896361	valid_1's auc: 0.881728
[2100]	training's auc: 0.897403	valid_1's auc: 0.88216
[2400]	training's auc: 0.898355	valid_1's auc: 0.88246
[2700]	training's auc: 0.899275	valid_1's auc: 0.882712
[3000]	training's auc: 0.90013	valid_1's auc: 0.882907
[3300]	training's auc: 0.90094	valid_1's auc: 0.883133
[3600]	training's auc: 0.901722	valid_1's auc: 0.883276
[3900]	training's auc: 0.902471	valid_1's auc: 0.883511
[4200]	training's auc: 0.903205	valid_1's auc: 0.883725
[4500]	training's auc: 0.903882	valid_1's auc: 0.883866
[4800]	training's auc: 0.904555	valid_1's auc: 0.884081
[5100]	training's auc: 0.905243	valid_1's auc: 0.884209
[5400]	training's auc: 0.905877	valid_1's auc: 0.884386
[5700]	training's auc: 0.906476	valid_1's auc: 0.884578
[6000]	training's auc: 0.907073	valid_1's auc: 0.884648
[6300]	training's auc: 0.907685	valid_1's auc: 0.884803
[6600]	training's auc: 0.908292	valid_1's auc: 0.884

In [56]:
# Write the submission file: one row per test ID with the fold-averaged
# prediction as its target, without the DataFrame index.
sub = pd.DataFrame(
    {
        "ID_code": test_df["ID_code"].values,
        "target": prediction,
    }
)
sub.to_csv("submission.csv", index=False)