In [1]:
import time
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import KFold,StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA

In [2]:
# Load the raw competition files.
train_df = pd.read_csv('input/train.csv')
#train_df = pd.read_csv('input/train_min.csv')  # alternative input file (presumably a reduced sample; confirm)
test_df = pd.read_csv('input/test.csv')

# Feature/experiment toggles read by later cells.
do_lda = False         # when True, the LDA cell appends discriminant feature(s)
fix_data_skew = False  # when True, undersample target != 1 rows down to the target == 1 count

if fix_data_skew:
    trues = train_df.loc[train_df['target'] == 1]
    # Seeded shuffle so the same negatives are kept on every Restart & Run All.
    falses = train_df.loc[train_df['target'] != 1].sample(frac=1, random_state=42)[:len(trues)]
    # Shuffle the balanced frame (seeded) and rebuild a clean 0..n-1 index so
    # positional and label-based indexing agree in later cells.
    train_df = (
        pd.concat([trues, falses], ignore_index=True)
        .sample(frac=1, random_state=42)
        .reset_index(drop=True)
    )

# Split into model inputs: every column except the id (and the label for train).
X_test = test_df.drop('ID_code', axis=1)
X = train_df.drop(['ID_code', 'target'], axis=1)
y = train_df['target']

# Preview as the cell's last expression so the notebook actually renders it.
train_df.head()

In [3]:
if do_lda:
    # LDA can produce at most min(n_features, n_classes - 1) discriminant axes,
    # so cap the requested 5 components; scikit-learn rejects larger values.
    n_lda = min(5, y.nunique() - 1, X.shape[1])
    lda = LDA(solver='svd', n_components=n_lda, store_covariance=True)

    # Compute both projections before touching X / X_test so the fitted
    # feature set matches what transform() receives.
    lda_train = lda.fit_transform(X, y)
    lda_test = lda.transform(X_test)

    # Keep the original single-column name "lda"; number the columns only when
    # more than one component is actually returned.
    n_out = lda_train.shape[1]
    lda_cols = ['lda'] if n_out == 1 else ['lda_%d' % i for i in range(n_out)]

    # Assign raw numpy columns (positional) rather than a fresh DataFrame:
    # a DataFrame would be aligned on index labels, which scrambles the values
    # whenever X carries a shuffled index.
    for j, col in enumerate(lda_cols):
        X[col] = lda_train[:, j]
        X_test[col] = lda_test[:, j]

In [4]:
# Cross-validation splitter: shuffled stratified folds with a fixed seed.
# n_splits is also used by the training cell to average per-fold results.
n_splits = 5
folds = StratifiedKFold(
    n_splits,
    shuffle=True,
    random_state=42,
)

In [5]:
# LightGBM hyper-parameters.
#
# Tuned configuration, as recorded by the author:
#   local cv: 0.9000721582954704
#   lb: 0.899 (best)
params = dict([
    ('metric', 'auc'),
    ('boosting_type', 'gbdt'),
    ('objective', 'binary'),
    ('min_data_in_leaf', 2881),
    ('max_depth', 0),
    ('num_leaves', 3),
    ('learning_rate', 0.04019176517639987),
    ('bagging_freq', 3),
    # 'bagging_fraction' was left out of the final set; the search line was:
    #   trial.suggest_uniform('bagging_fraction', 0.3, 0.9)
    ('feature_fraction', 0.8990901412442585),
    ('bagging_seed', 11),
    ('reg_alpha', 1.1173044727720816),
    ('reg_lambda', 6.9285776442737514),
    ('random_state', 42),
    ('verbosity', -1),
    ('subsample', 0.8054415526396443),
    ('min_child_weight', 38.138072621096654),
    ('num_threads', 4),
    ('max_bin', 483),
])

# best with lower learning rate: same settings, only learning_rate is replaced.
# This is the configuration the training cell actually uses.
params = dict(params, learning_rate=0.01)

In [6]:
# Cross-validated training.
# For each fold: train a booster with early stopping on the held-out part,
# add its test prediction (divided by n_splits) to `prediction`, and add its
# best validation AUC (divided by n_splits) to `score`.
score = 0.0
prediction = np.zeros(len(X_test))
for fold_n, (train_index, valid_index) in enumerate(folds.split(X, y)):
    print('Fold', fold_n, 'started at', time.ctime())
    X_train, X_valid = X.iloc[train_index], X.iloc[valid_index]
    y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]

    train_data = lgb.Dataset(X_train, label=y_train)
    # Tie the validation set to the training Dataset, as the LightGBM docs
    # recommend for validation data.
    valid_data = lgb.Dataset(X_valid, label=y_valid, reference=train_data)

    # Early stopping and periodic logging are passed as callbacks: the
    # `early_stopping_rounds=` / `verbose_eval=` keyword arguments were removed
    # from lgb.train in LightGBM 4.0. (lgb.log_evaluation needs LightGBM >= 3.3.)
    model = lgb.train(
        params,
        train_data,
        num_boost_round=2000000,  # effectively unbounded; early stopping ends training
        valid_sets=[train_data, valid_data],
        # Explicit names (same as LightGBM's defaults) so the 'valid_1' lookup
        # below does not depend on implicit naming.
        valid_names=['training', 'valid_1'],
        callbacks=[
            lgb.early_stopping(stopping_rounds=800),
            lgb.log_evaluation(period=300),
        ],
    )

    # Predict with the best iteration found by early stopping.
    prediction += model.predict(X_test, num_iteration=model.best_iteration) / n_splits
    score += model.best_score['valid_1']['auc'] / n_splits

# Mean of the per-fold best validation AUCs.
print(score)

Fold 0 started at Fri Mar  8 08:42:42 2019
Training until validation scores don't improve for 800 rounds.
[300]	training's auc: 0.764669	valid_1's auc: 0.756533
[600]	training's auc: 0.803224	valid_1's auc: 0.796304
[900]	training's auc: 0.824397	valid_1's auc: 0.817409
[1200]	training's auc: 0.838128	valid_1's auc: 0.830186
[1500]	training's auc: 0.848913	valid_1's auc: 0.840308
[1800]	training's auc: 0.85702	valid_1's auc: 0.847915
[2100]	training's auc: 0.863008	valid_1's auc: 0.853545
[2400]	training's auc: 0.868707	valid_1's auc: 0.85876
[2700]	training's auc: 0.873313	valid_1's auc: 0.862967
[3000]	training's auc: 0.877123	valid_1's auc: 0.866334
[3300]	training's auc: 0.8805	valid_1's auc: 0.869347
[3600]	training's auc: 0.883479	valid_1's auc: 0.872079
[3900]	training's auc: 0.886282	valid_1's auc: 0.874396
[4200]	training's auc: 0.888709	valid_1's auc: 0.876496
[4500]	training's auc: 0.890804	valid_1's auc: 0.878329
[4800]	training's auc: 0.892702	valid_1's auc: 0.879899
[5100

[18000]	training's auc: 0.917984	valid_1's auc: 0.899581
[18300]	training's auc: 0.918199	valid_1's auc: 0.899644
[18600]	training's auc: 0.918412	valid_1's auc: 0.899657
[18900]	training's auc: 0.918602	valid_1's auc: 0.899702
[19200]	training's auc: 0.918809	valid_1's auc: 0.899775
[19500]	training's auc: 0.918986	valid_1's auc: 0.899798
[19800]	training's auc: 0.919178	valid_1's auc: 0.899822
[20100]	training's auc: 0.919363	valid_1's auc: 0.899909
[20400]	training's auc: 0.919538	valid_1's auc: 0.89993
[20700]	training's auc: 0.919725	valid_1's auc: 0.899963
[21000]	training's auc: 0.919905	valid_1's auc: 0.899998
[21300]	training's auc: 0.920066	valid_1's auc: 0.900035
[21600]	training's auc: 0.92023	valid_1's auc: 0.90006
[21900]	training's auc: 0.920378	valid_1's auc: 0.900067
[22200]	training's auc: 0.920544	valid_1's auc: 0.900053
[22500]	training's auc: 0.92075	valid_1's auc: 0.900068
[22800]	training's auc: 0.92093	valid_1's auc: 0.900117
[23100]	training's auc: 0.921098	val

[8400]	training's auc: 0.906589	valid_1's auc: 0.888916
[8700]	training's auc: 0.907294	valid_1's auc: 0.889469
[9000]	training's auc: 0.908035	valid_1's auc: 0.890006
[9300]	training's auc: 0.908634	valid_1's auc: 0.890452
[9600]	training's auc: 0.909264	valid_1's auc: 0.891005
[9900]	training's auc: 0.909849	valid_1's auc: 0.891484
[10200]	training's auc: 0.910352	valid_1's auc: 0.891816
[10500]	training's auc: 0.910834	valid_1's auc: 0.892141
[10800]	training's auc: 0.911316	valid_1's auc: 0.892571
[11100]	training's auc: 0.911779	valid_1's auc: 0.892999
[11400]	training's auc: 0.912202	valid_1's auc: 0.893285
[11700]	training's auc: 0.912647	valid_1's auc: 0.893601
[12000]	training's auc: 0.912992	valid_1's auc: 0.89381
[12300]	training's auc: 0.913366	valid_1's auc: 0.894079
[12600]	training's auc: 0.913734	valid_1's auc: 0.894269
[12900]	training's auc: 0.914046	valid_1's auc: 0.894528
[13200]	training's auc: 0.914369	valid_1's auc: 0.894761
[13500]	training's auc: 0.914678	valid

[21600]	training's auc: 0.920867	valid_1's auc: 0.896409
[21900]	training's auc: 0.921038	valid_1's auc: 0.896453
[22200]	training's auc: 0.92121	valid_1's auc: 0.896472
[22500]	training's auc: 0.921389	valid_1's auc: 0.896564
[22800]	training's auc: 0.92156	valid_1's auc: 0.896571
[23100]	training's auc: 0.921738	valid_1's auc: 0.896567
[23400]	training's auc: 0.92188	valid_1's auc: 0.896607
[23700]	training's auc: 0.922043	valid_1's auc: 0.896639
[24000]	training's auc: 0.922213	valid_1's auc: 0.896708
[24300]	training's auc: 0.922391	valid_1's auc: 0.896696
[24600]	training's auc: 0.922528	valid_1's auc: 0.896695
[24900]	training's auc: 0.922691	valid_1's auc: 0.89668
[25200]	training's auc: 0.922861	valid_1's auc: 0.896683
[25500]	training's auc: 0.923008	valid_1's auc: 0.896701
[25800]	training's auc: 0.923195	valid_1's auc: 0.896725
[26100]	training's auc: 0.923367	valid_1's auc: 0.896726
[26400]	training's auc: 0.923535	valid_1's auc: 0.896711
[26700]	training's auc: 0.923688	va

In [7]:
# Build the submission table (test ids plus the fold-averaged prediction)
# and write it out without the index column.
id_codes = test_df["ID_code"].values
sub = pd.DataFrame({"ID_code": id_codes}).assign(target=prediction)
sub.to_csv("submission.csv", index=False)