In [12]:
import time
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import KFold,StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA

In [13]:
# Load the competition data. The commented-out line swaps in 'input/train_min.csv'
# in place of the full training file.
train_df = pd.read_csv('input/train.csv')
#train_df = pd.read_csv('input/train_min.csv')
test_df = pd.read_csv('input/test.csv')

# Toggle: append an LDA projection of the features as an extra "lda" column.
do_lda = True

# Toggle: undersample the rows with target != 1 down to the number of target == 1 rows.
fix_data_skew = False

if fix_data_skew:
    trues = train_df.loc[train_df['target'] == 1]
    # Seeded shuffle so the same rows are kept on every run.
    falses = (
        train_df.loc[train_df['target'] != 1]
        .sample(frac=1, random_state=42)[:len(trues)]
    )
    # Shuffle the balanced frame, then rebuild a clean 0..n-1 index so that any
    # later index-aligned column assignment lines up with row positions.
    train_df = (
        pd.concat([trues, falses], ignore_index=True)
        .sample(frac=1, random_state=42)
        .reset_index(drop=True)
    )

# Feature matrices: the identifier (and, for training data, the label) is dropped.
X_test = test_df.drop('ID_code', axis=1)
X = train_df.drop(['ID_code', 'target'], axis=1)
y = train_df['target']

In [14]:
if do_lda:
    # LDA can produce at most n_classes - 1 discriminant axes. The notebook stores
    # the projection in a single "lda" column, so exactly one component is requested
    # (asking for more than the limit is rejected by current scikit-learn).
    lda = LDA(solver='svd', n_components=1, store_covariance=True)

    # Fit on the raw features only, so re-running this cell never feeds a
    # previously added "lda" column back into the projection.
    X_raw = X.drop(columns='lda', errors='ignore')
    X_test_raw = X_test.drop(columns='lda', errors='ignore')

    # Carry the source frame's index so the column assignment below matches rows
    # one-to-one even when the training frame's index is not 0..n-1.
    X_lda = pd.DataFrame(lda.fit_transform(X_raw, y), index=X.index)
    X_test_lda = pd.DataFrame(lda.transform(X_test_raw), index=X_test.index)

    # NOTE(review): the projection is fitted with the labels of *all* training rows
    # before the CV split, so each validation fold has influenced this feature;
    # CV scores may be optimistic. Fitting per fold would avoid that.
    X["lda"] = X_lda[0]
    X_test["lda"] = X_test_lda[0]

In [15]:
# Cross-validation splitter: shuffled with a fixed seed so the split is repeatable.
# `n_splits` is also the divisor used when averaging per-fold scores/predictions.
n_splits = 5
folds = StratifiedKFold(
    random_state=42,
    shuffle=True,
    n_splits=n_splits,
)

In [18]:
# LightGBM training parameters.
#
# Experiment log:
#   * cv: 0.89951   - min_data_in_leaf=42, max_depth=16, no max_bin set
#   * cv: 0.900043  - min_data_in_leaf=1000, max_depth=-1, max_bin=511
#                     ("1st round looks good"; lb: 0.899 with n_splits=10)
#   * now trying    - same as the previous entry but with a larger max_bin (1023)
#
# Earlier versions of this cell also passed 'subsample': 0.81. In LightGBM that
# key is an alias of 'bagging_fraction'; when both are given the primary name wins
# and the alias is ignored, so the effective row-sampling rate was always 0.8.
# The alias is dropped here to avoid carrying two conflicting values.
params = {
    'num_leaves': 8,
    'min_data_in_leaf': 1000,
    'objective': 'binary',
    'max_depth': -1,
    'learning_rate': 0.0123,
    'boosting': 'gbdt',
    'bagging_freq': 5,
    'bagging_fraction': 0.8,
    'feature_fraction': 0.8201,
    'bagging_seed': 11,
    'reg_alpha': 1.728910519108444,
    'reg_lambda': 4.9847051755586085,
    'random_state': 42,
    'metric': 'auc',
    'verbosity': -1,
    'min_gain_to_split': 0.01077313523861969,
    'min_child_weight': 19.428902804238373,
    'num_threads': 4,
    'max_bin': 1023,
}

In [19]:
# Accumulators, both averaged over the folds:
#   score      - best validation AUC of each fold
#   prediction - test-set predictions of each fold's model at its best iteration
score = 0.0
prediction = np.zeros(len(X_test))

# LightGBM 4.0 removed the `verbose_eval` / `early_stopping_rounds` keyword
# arguments of lgb.train in favour of callbacks. Older releases expose the
# logging callback as `print_evaluation` rather than `log_evaluation`, so pick
# whichever this installation provides.
log_evaluation = getattr(lgb, 'log_evaluation', None) or lgb.print_evaluation

for fold_n, (train_index, valid_index) in enumerate(folds.split(X, y)):
    print('Fold', fold_n, 'started at', time.ctime())
    X_train, X_valid = X.iloc[train_index], X.iloc[valid_index]
    y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]

    train_data = lgb.Dataset(X_train, label=y_train)
    valid_data = lgb.Dataset(X_valid, label=y_valid)

    # The round limit is effectively "unbounded": training ends when the
    # validation metric has not improved for 200 rounds. Metrics are logged
    # every 300 rounds.
    model = lgb.train(
        params,
        train_data,
        num_boost_round=2000000,
        valid_sets=[train_data, valid_data],
        callbacks=[lgb.early_stopping(200), log_evaluation(300)],
    )

    prediction += model.predict(X_test, num_iteration=model.best_iteration) / n_splits
    # 'valid_1' is the name LightGBM assigns to the second entry of valid_sets.
    score += model.best_score['valid_1']['auc'] / n_splits

print(score)

Fold 0 started at Tue Mar  5 21:10:34 2019
Training until validation scores don't improve for 200 rounds.
[300]	training's auc: 0.86561	valid_1's auc: 0.863795
[600]	training's auc: 0.870838	valid_1's auc: 0.868624
[900]	training's auc: 0.874566	valid_1's auc: 0.871854
[1200]	training's auc: 0.877519	valid_1's auc: 0.874434
[1500]	training's auc: 0.88	valid_1's auc: 0.876541
[1800]	training's auc: 0.882109	valid_1's auc: 0.87836
[2100]	training's auc: 0.883937	valid_1's auc: 0.879918
[2400]	training's auc: 0.885566	valid_1's auc: 0.881286
[2700]	training's auc: 0.887001	valid_1's auc: 0.882512
[3000]	training's auc: 0.888314	valid_1's auc: 0.883558
[3300]	training's auc: 0.889481	valid_1's auc: 0.884506
[3600]	training's auc: 0.890547	valid_1's auc: 0.885346
[3900]	training's auc: 0.89154	valid_1's auc: 0.886113
[4200]	training's auc: 0.892448	valid_1's auc: 0.886801
[4500]	training's auc: 0.89328	valid_1's auc: 0.887427
[4800]	training's auc: 0.894073	valid_1's auc: 0.887983
[5100]	tr

[14400]	training's auc: 0.905344	valid_1's auc: 0.895685
[14700]	training's auc: 0.905527	valid_1's auc: 0.895797
[15000]	training's auc: 0.905708	valid_1's auc: 0.895886
[15300]	training's auc: 0.905883	valid_1's auc: 0.895991
[15600]	training's auc: 0.906052	valid_1's auc: 0.896069
[15900]	training's auc: 0.906221	valid_1's auc: 0.896174
[16200]	training's auc: 0.906384	valid_1's auc: 0.896239
[16500]	training's auc: 0.906543	valid_1's auc: 0.89633
[16800]	training's auc: 0.906701	valid_1's auc: 0.896415
[17100]	training's auc: 0.90685	valid_1's auc: 0.89648
[17400]	training's auc: 0.906997	valid_1's auc: 0.896566
[17700]	training's auc: 0.907141	valid_1's auc: 0.896629
[18000]	training's auc: 0.907284	valid_1's auc: 0.896701
[18300]	training's auc: 0.907417	valid_1's auc: 0.896765
[18600]	training's auc: 0.90756	valid_1's auc: 0.896839
[18900]	training's auc: 0.907692	valid_1's auc: 0.896909
[19200]	training's auc: 0.90782	valid_1's auc: 0.896978
[19500]	training's auc: 0.907951	val

KeyboardInterrupt: 

In [None]:
# Build the submission table (test identifiers next to the accumulated
# `prediction` array) and write it out without the index column.
sub = pd.DataFrame(
    {
        "ID_code": test_df["ID_code"].values,
        "target": prediction,
    }
)
sub.to_csv("submission.csv", index=False)