In [37]:
import time
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import KFold,StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.model_selection import train_test_split

In [38]:
more_features = False
use_staking = False
use_kernal_params = True

train_df = pd.read_csv('input/train' + ('_more_features' if more_features else '') + '.csv')
test_df = pd.read_csv('input/test'  + ('_more_features' if more_features else '') + '.csv')

do_lda = False

fix_data_skew = False

if fix_data_skew:
    trues = train_df.loc[train_df['target'] == 1]
    falses = train_df.loc[train_df['target'] != 1].sample(frac=1)[:len(trues)]
    train_df = pd.concat([trues, falses], ignore_index=True).sample(frac=1)
else:
    train_df = train_df
    
X_test = test_df.drop('ID_code',axis=1)
X = train_df.drop(['ID_code','target'],axis=1)
y = train_df['target']

In [39]:
if do_lda:    
    lda = LDA(solver='svd', n_components=5, store_covariance=True)
    X_lda = pd.DataFrame(lda.fit_transform(X, y))
    X_test_lda = pd.DataFrame(lda.transform(X_test))
    X["lda"] = X_lda
    X_test["lda"] = X_test_lda

In [40]:
_, X_bottomhalf, _, y_bottomhalf = train_test_split(X, y, test_size=0.8, random_state=10)

In [41]:
if use_staking:
    X = X_bottomhalf
    y = y_bottomhalf

In [42]:
n_splits = 5
folds = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=3)

In [43]:
params = {
    'metric': 'auc',
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'min_data_in_leaf': 2881,
    'max_depth': 0,
    'num_leaves': 3,
    'learning_rate': 0.01,
    'bagging_freq': 3,
    #'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.3, 0.9),
    'feature_fraction': 0.8453828656355421,
    'bagging_seed': 11,
    'reg_alpha':  1.1173044727720816,
    'reg_lambda': 6.9285776442737514,
    'random_state': 42,
    'verbosity': -1,
    'subsample':0.8421287738494433,
    'min_child_weight': 36.93038816860224,
    'num_threads': 4,
    'max_bin': 483
}

if use_kernal_params:
    # https://www.kaggle.com/jiweiliu/lgb-2-leaves-augment (lb: 0.901)
    params = {
        "objective" : "binary",
        "metric" : "auc",
        "boosting": 'gbdt',
        "max_depth" : -1,
        "num_leaves" : 13,
        "learning_rate" : 0.01,
        "bagging_freq": 5,
        "bagging_fraction" : 0.4,
        "feature_fraction" : 0.05,
        "min_data_in_leaf": 80,
        "min_sum_heassian_in_leaf": 10,
        "tree_learner": "serial",
        "boost_from_average": "false",
        #"lambda_l1" : 5,
        #"lambda_l2" : 5,
        "bagging_seed" : 13,
        "verbosity" : 1,
        "seed": 42
    }

In [44]:
score = 0.0
prediction = np.zeros(len(X_test))
for fold_n, (train_index, valid_index) in enumerate(folds.split(X,y)):
    print('Fold', fold_n, 'started at', time.ctime())
    X_train, X_valid = X.iloc[train_index], X.iloc[valid_index]
    y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]
    
    train_data = lgb.Dataset(X_train, label=y_train)
    valid_data = lgb.Dataset(X_valid, label=y_valid)
        
    model = lgb.train(params,train_data,num_boost_round=2000000,
                    valid_sets = [train_data, valid_data],verbose_eval=300,early_stopping_rounds = 800)
    
    prediction += model.predict(X_test, num_iteration=model.best_iteration)/n_splits
    score += model.best_score['valid_1']['auc'] / n_splits

print(score)

Fold 0 started at Tue Mar 19 14:23:29 2019
Training until validation scores don't improve for 800 rounds.
[300]	training's auc: 0.881715	valid_1's auc: 0.870899
[600]	training's auc: 0.892154	valid_1's auc: 0.87966
[900]	training's auc: 0.89841	valid_1's auc: 0.885077
[1200]	training's auc: 0.902342	valid_1's auc: 0.888326
[1500]	training's auc: 0.906518	valid_1's auc: 0.891301
[1800]	training's auc: 0.910003	valid_1's auc: 0.893669
[2100]	training's auc: 0.913104	valid_1's auc: 0.895598
[2400]	training's auc: 0.915798	valid_1's auc: 0.89727
[2700]	training's auc: 0.918297	valid_1's auc: 0.898321
[3000]	training's auc: 0.920696	valid_1's auc: 0.899423
[3300]	training's auc: 0.922835	valid_1's auc: 0.90029
[3600]	training's auc: 0.924885	valid_1's auc: 0.901183
[3900]	training's auc: 0.92677	valid_1's auc: 0.90182
[4200]	training's auc: 0.928538	valid_1's auc: 0.902258
[4500]	training's auc: 0.930244	valid_1's auc: 0.902603
[4800]	training's auc: 0.932002	valid_1's auc: 0.902859
[5100]	

[1800]	training's auc: 0.911065	valid_1's auc: 0.887265
[2100]	training's auc: 0.914063	valid_1's auc: 0.889402
[2400]	training's auc: 0.916791	valid_1's auc: 0.891052
[2700]	training's auc: 0.919327	valid_1's auc: 0.892586
[3000]	training's auc: 0.921675	valid_1's auc: 0.893787
[3300]	training's auc: 0.9237	valid_1's auc: 0.894773
[3600]	training's auc: 0.92565	valid_1's auc: 0.895467
[3900]	training's auc: 0.927507	valid_1's auc: 0.896244
[4200]	training's auc: 0.929301	valid_1's auc: 0.89686
[4500]	training's auc: 0.931028	valid_1's auc: 0.89734
[4800]	training's auc: 0.932672	valid_1's auc: 0.897809
[5100]	training's auc: 0.934259	valid_1's auc: 0.898128
[5400]	training's auc: 0.935781	valid_1's auc: 0.898433
[5700]	training's auc: 0.937224	valid_1's auc: 0.89879
[6000]	training's auc: 0.938621	valid_1's auc: 0.899035
[6300]	training's auc: 0.940078	valid_1's auc: 0.899188
[6600]	training's auc: 0.941476	valid_1's auc: 0.899372
[6900]	training's auc: 0.942863	valid_1's auc: 0.89946

In [45]:
sub = pd.DataFrame({"ID_code": test_df.ID_code.values})
sub["target"] = prediction
sub.to_csv("submission.csv", index=False)