In [75]:
import time
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import KFold,StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

In [76]:
more_features = True
use_staking = False
use_kernal_params = False
do_augment = False
fast = True

train_df = pd.read_csv('input/train' + ('_more_features' if more_features else '') + '.csv')
test_df = pd.read_csv('input/test'  + ('_more_features' if more_features else '') + '.csv')

do_lda = False

fix_data_skew = False

if fix_data_skew:
    trues = train_df.loc[train_df['target'] == 1]
    falses = train_df.loc[train_df['target'] != 1].sample(frac=1)[:len(trues)]
    train_df = pd.concat([trues, falses], ignore_index=True).sample(frac=1)
else:
    train_df = train_df
    
X_test = test_df.drop('ID_code',axis=1)
X = train_df.drop(['ID_code','target'],axis=1)
y = train_df['target']

In [77]:
if do_lda:    
    lda = LDA(solver='svd', n_components=5, store_covariance=True)
    X_lda = pd.DataFrame(lda.fit_transform(X, y))
    X_test_lda = pd.DataFrame(lda.transform(X_test))
    X["lda"] = X_lda
    X_test["lda"] = X_test_lda

In [78]:
_, X_bottomhalf, _, y_bottomhalf = train_test_split(X, y, test_size=0.8, random_state=10)

In [79]:
if use_staking:
    X = X_bottomhalf
    y = y_bottomhalf

In [80]:
n_splits = 5
folds = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

In [81]:
params = {
    'metric': 'auc',
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'min_data_in_leaf': 2881,
    'max_depth': 0,
    'num_leaves': 3,
    'learning_rate': 0.01,
    'bagging_freq': 3,
    #'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.3, 0.9),
    'feature_fraction': 0.8990901412442585,
    'bagging_seed': 11,
    'reg_alpha':  1.1173044727720816,
    'reg_lambda': 6.9285776442737514,
    'random_state': 42,
    'verbosity': -1,
    'subsample': 0.8054415526396443,
    'min_child_weight': 38.138072621096654,
    'num_threads': 4,
    'max_bin': 483
}

if use_kernal_params:
    # https://www.kaggle.com/jiweiliu/lgb-2-leaves-augment (lb: 0.901)
    params = {
        "objective" : "binary",
        "metric" : "auc",
        "boosting": 'gbdt',
        "max_depth" : -1,
        "num_leaves" : 13,
        "learning_rate" : 0.01,
        "bagging_freq": 5,
        "bagging_fraction" : 0.4,
        "feature_fraction" : 0.05,
        "min_data_in_leaf": 80,
        "min_sum_heassian_in_leaf": 10,
        "tree_learner": "serial",
        "boost_from_average": "false",
        #"lambda_l1" : 5,
        #"lambda_l2" : 5,
        "bagging_seed" : 13,
        "verbosity" : 1,
        "seed": 42
    }

In [82]:
def augment(x,y,t=2):
    xs,xn = [],[]
    for i in range(t):
        mask = y>0
        x1 = x[mask].copy()
        ids = np.arange(x1.shape[0])
        for c in range(x1.shape[1]):
            np.random.shuffle(ids)
            x1[:,c] = x1[ids][:,c]
        xs.append(x1)

    for i in range(t//2):
        mask = y==0
        x1 = x[mask].copy()
        ids = np.arange(x1.shape[0])
        for c in range(x1.shape[1]):
            np.random.shuffle(ids)
            x1[:,c] = x1[ids][:,c]
        xn.append(x1)

    xs = np.vstack(xs)
    xn = np.vstack(xn)
    ys = np.ones(xs.shape[0])
    yn = np.zeros(xn.shape[0])
    x = np.vstack([x,xs,xn])
    y = np.concatenate([y,ys,yn])
    return x,y

In [83]:
oof = train_df[['ID_code', 'target']]
oof['predict'] = 0

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [84]:
predictions = test_df[['ID_code']]
val_aucs = []

In [85]:
score = 0.0
prediction = np.zeros(len(X_test))
for fold_n, (train_index, valid_index) in enumerate(folds.split(X,y)):
    print('Fold', fold_n, 'started at', time.ctime())
    X_train, X_valid = X.iloc[train_index], X.iloc[valid_index]
    y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]
    
    N = 5
    
    if not do_augment:
        N = 1
    
    p_valid,yp = 0,0
    for i in range(N):
        if do_augment:
            X_t, y_t = augment(X_train.values, y_train.values)
            X_t = pd.DataFrame(X_t)
            X_t = X_t.add_prefix('var_')

            trn_data = lgb.Dataset(X_t, label=y_t)
            val_data = lgb.Dataset(X_valid, label=y_valid)
        else:
            trn_data = lgb.Dataset(X_train, label=y_train)
            val_data = lgb.Dataset(X_valid, label=y_valid)
        
        evals_result = {}
        
        early_stopping_rounds = 800 if fast else 3000
        
        lgb_clf = lgb.train(params,
                        trn_data,
                        num_boost_round=2000000,
                        valid_sets = [trn_data, val_data],
                        early_stopping_rounds=early_stopping_rounds,
                        verbose_eval = 300,
                        evals_result=evals_result
                       )
        p_valid += lgb_clf.predict(X_valid)
        yp += lgb_clf.predict(X_test)
    
    oof['predict'][valid_index] = p_valid/N
    val_score = roc_auc_score(y_valid, p_valid)
    val_aucs.append(val_score)
    
    predictions['fold{}'.format(fold_n+1)] = yp/N

Fold 0 started at Fri Mar 22 16:22:12 2019
Training until validation scores don't improve for 800 rounds.
[300]	training's auc: 0.764777	valid_1's auc: 0.754708
[600]	training's auc: 0.801878	valid_1's auc: 0.793412
[900]	training's auc: 0.823175	valid_1's auc: 0.814761
[1200]	training's auc: 0.837529	valid_1's auc: 0.828442
[1500]	training's auc: 0.848245	valid_1's auc: 0.838525
[1800]	training's auc: 0.856823	valid_1's auc: 0.846712
[2100]	training's auc: 0.862918	valid_1's auc: 0.852529
[2400]	training's auc: 0.868462	valid_1's auc: 0.85769
[2700]	training's auc: 0.873175	valid_1's auc: 0.861985
[3000]	training's auc: 0.876928	valid_1's auc: 0.865428
[3300]	training's auc: 0.880535	valid_1's auc: 0.868727
[3600]	training's auc: 0.883634	valid_1's auc: 0.871564
[3900]	training's auc: 0.886354	valid_1's auc: 0.873877
[4200]	training's auc: 0.88884	valid_1's auc: 0.875921
[4500]	training's auc: 0.89095	valid_1's auc: 0.877839
[4800]	training's auc: 0.892821	valid_1's auc: 0.879377
[510

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._set_labels(key, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  exec(code_obj, self.user_global_ns, self.user_ns)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Fold 1 started at Fri Mar 22 16:33:15 2019
Training until validation scores don't improve for 800 rounds.
[300]	training's auc: 0.766511	valid_1's auc: 0.761758
[600]	training's auc: 0.803149	valid_1's auc: 0.796904
[900]	training's auc: 0.823796	valid_1's auc: 0.817009
[1200]	training's auc: 0.838587	valid_1's auc: 0.8311
[1500]	training's auc: 0.848766	valid_1's auc: 0.841203
[1800]	training's auc: 0.856959	valid_1's auc: 0.848955
[2100]	training's auc: 0.863382	valid_1's auc: 0.854976
[2400]	training's auc: 0.868769	valid_1's auc: 0.85986
[2700]	training's auc: 0.873271	valid_1's auc: 0.864044
[3000]	training's auc: 0.877154	valid_1's auc: 0.867585
[3300]	training's auc: 0.880644	valid_1's auc: 0.87086
[3600]	training's auc: 0.883672	valid_1's auc: 0.873418
[3900]	training's auc: 0.886089	valid_1's auc: 0.87549
[4200]	training's auc: 0.888464	valid_1's auc: 0.877432
[4500]	training's auc: 0.890709	valid_1's auc: 0.879394
[4800]	training's auc: 0.892546	valid_1's auc: 0.880908
[5100]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._set_labels(key, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  exec(code_obj, self.user_global_ns, self.user_ns)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Fold 2 started at Fri Mar 22 16:45:37 2019
Training until validation scores don't improve for 800 rounds.
[300]	training's auc: 0.764992	valid_1's auc: 0.76254
[600]	training's auc: 0.802184	valid_1's auc: 0.798797
[900]	training's auc: 0.822872	valid_1's auc: 0.818979
[1200]	training's auc: 0.836707	valid_1's auc: 0.832685
[1500]	training's auc: 0.847416	valid_1's auc: 0.843119
[1800]	training's auc: 0.855516	valid_1's auc: 0.850741
[2100]	training's auc: 0.86204	valid_1's auc: 0.857002
[2400]	training's auc: 0.867479	valid_1's auc: 0.862152
[2700]	training's auc: 0.872001	valid_1's auc: 0.866595
[3000]	training's auc: 0.875776	valid_1's auc: 0.870212
[3300]	training's auc: 0.878997	valid_1's auc: 0.873362
[3600]	training's auc: 0.881973	valid_1's auc: 0.876165
[3900]	training's auc: 0.884512	valid_1's auc: 0.878457
[4200]	training's auc: 0.887048	valid_1's auc: 0.881085
[4500]	training's auc: 0.88915	valid_1's auc: 0.883087
[4800]	training's auc: 0.890948	valid_1's auc: 0.884722
[510

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._set_labels(key, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  exec(code_obj, self.user_global_ns, self.user_ns)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Fold 3 started at Fri Mar 22 16:57:21 2019
Training until validation scores don't improve for 800 rounds.
[300]	training's auc: 0.770218	valid_1's auc: 0.755082
[600]	training's auc: 0.802459	valid_1's auc: 0.787287
[900]	training's auc: 0.822779	valid_1's auc: 0.807788
[1200]	training's auc: 0.836732	valid_1's auc: 0.821527
[1500]	training's auc: 0.84804	valid_1's auc: 0.833122
[1800]	training's auc: 0.855998	valid_1's auc: 0.841373
[2100]	training's auc: 0.862898	valid_1's auc: 0.847892
[2400]	training's auc: 0.868214	valid_1's auc: 0.853247
[2700]	training's auc: 0.87291	valid_1's auc: 0.85771
[3000]	training's auc: 0.876814	valid_1's auc: 0.861486
[3300]	training's auc: 0.880092	valid_1's auc: 0.864741
[3600]	training's auc: 0.88346	valid_1's auc: 0.867919
[3900]	training's auc: 0.886171	valid_1's auc: 0.870501
[4200]	training's auc: 0.888497	valid_1's auc: 0.872599
[4500]	training's auc: 0.890782	valid_1's auc: 0.8746
[4800]	training's auc: 0.89271	valid_1's auc: 0.876392
[5100]	t

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._set_labels(key, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  exec(code_obj, self.user_global_ns, self.user_ns)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Fold 4 started at Fri Mar 22 17:12:13 2019
Training until validation scores don't improve for 800 rounds.
[300]	training's auc: 0.768854	valid_1's auc: 0.760335
[600]	training's auc: 0.804107	valid_1's auc: 0.793325
[900]	training's auc: 0.825234	valid_1's auc: 0.812691
[1200]	training's auc: 0.839882	valid_1's auc: 0.825796
[1500]	training's auc: 0.850473	valid_1's auc: 0.835349
[1800]	training's auc: 0.858693	valid_1's auc: 0.842478
[2100]	training's auc: 0.865107	valid_1's auc: 0.848347
[2400]	training's auc: 0.870244	valid_1's auc: 0.852804
[2700]	training's auc: 0.874863	valid_1's auc: 0.857096
[3000]	training's auc: 0.878837	valid_1's auc: 0.860577
[3300]	training's auc: 0.882206	valid_1's auc: 0.863774
[3600]	training's auc: 0.88492	valid_1's auc: 0.866264
[3900]	training's auc: 0.887545	valid_1's auc: 0.868587
[4200]	training's auc: 0.889792	valid_1's auc: 0.870701
[4500]	training's auc: 0.891709	valid_1's auc: 0.872529
[4800]	training's auc: 0.893459	valid_1's auc: 0.874132
[5

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._set_labels(key, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  exec(code_obj, self.user_global_ns, self.user_ns)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [86]:
mean_auc = np.mean(val_aucs)
std_auc = np.std(val_aucs)
all_auc = roc_auc_score(oof['target'], oof['predict'])
print("Mean auc: %.9f, std: %.9f. All auc: %.9f." % (mean_auc, std_auc, all_auc))

Mean auc: 0.900017835, std: 0.003169575. All auc: 0.899931864.


In [87]:
# submission
predictions['target'] = np.mean(predictions[[col for col in predictions.columns if col not in ['ID_code', 'target']]].values, axis=1)
predictions.to_csv('lgb_all_predictions.csv', index=None)
sub_df = pd.DataFrame({"ID_code":test_df["ID_code"].values})
sub_df["target"] = predictions['target']
sub_df.to_csv("lgb_submission.csv", index=False)
oof.to_csv('lgb_oof.csv', index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
