In [20]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from numba import jit
import lightgbm as lgb
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold, KFold
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [21]:
%%time

train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
features = [c for c in train.columns if c not in ['ID_code', 'target']]
target = train['target']

Wall time: 2min 14s
Parser   : 274 ms


In [22]:
print ("Test ",test.shape)
print ("Train ",train.shape)

Test  (200000, 201)
Train  (200000, 202)


In [23]:
@jit
def augment(x,y,t=2):
    xs,xn = [],[]
    for i in range(t):
        mask = y>0
        x1 = x[mask].copy()
        ids = np.arange(x1.shape[0])
        for c in range(x1.shape[1]):
            np.random.shuffle(ids)
            x1[:,c] = x1[ids][:,c]
        xs.append(x1)

    for i in range(t//2):
        mask = y==0
        x1 = x[mask].copy()
        ids = np.arange(x1.shape[0])
        for c in range(x1.shape[1]):
            np.random.shuffle(ids)
            x1[:,c] = x1[ids][:,c]
        xn.append(x1)

    xs = np.vstack(xs)
    xn = np.vstack(xn)
    ys = np.ones(xs.shape[0])
    yn = np.zeros(xn.shape[0])
    x = np.vstack([x,xs,xn])
    y = np.concatenate([y,ys,yn])
    return x,y

In [24]:
param = {
    'bagging_freq': 5,
    'bagging_fraction': 0.335,
    'boost_from_average':'false',
    'boost': 'gbdt',
    'feature_fraction': 0.041,
    'learning_rate': 0.0043,
    'max_depth': 2,
    'metric':'auc',
    'min_data_in_leaf': 180,
    'min_sum_hessian_in_leaf': 10.0,
    'num_leaves': 3,
    'num_threads': 8,
    'tree_learner': 'serial',
    'objective': 'binary', 
    'verbosity': -1,
    'max_bin' : 900,
}

In [25]:
#kfold = 15
#folds = StratifiedKFold(n_splits=kfold, shuffle=False, random_state=44000)
num_folds = 5
features = [c for c in train.columns if c not in ['ID_code', 'target']]

folds = KFold(n_splits=num_folds, random_state=3319)
oof = np.zeros(len(train))
getVal = np.zeros(len(train))
predictions = np.zeros(len(target))
feature_importance_df = pd.DataFrame()

In [26]:
for fold_, (trn_idx, val_idx) in enumerate(folds.split(train.values, target.values)):
    
    X_train, y_train = train.iloc[trn_idx][features], target.iloc[trn_idx]
    X_valid, y_valid = train.iloc[val_idx][features], target.iloc[val_idx]
    
    X_tr, y_tr = augment(X_train.values, y_train.values)
    X_tr = pd.DataFrame(X_tr)
    
    print("Fold idx:{}".format(fold_ + 1))
    trn_data = lgb.Dataset(X_tr, label=y_tr)
    val_data = lgb.Dataset(X_valid, label=y_valid)
    
    clf = lgb.train(param, trn_data, 1000000, valid_sets = [trn_data, val_data], verbose_eval=7000, 
                    early_stopping_rounds = 3000)
    oof[val_idx] = clf.predict(train.iloc[val_idx][features], num_iteration=clf.best_iteration)
    getVal[val_idx]+= clf.predict(train.iloc[val_idx][features], num_iteration=clf.best_iteration) / folds.n_splits
    
    fold_importance_df = pd.DataFrame()
    fold_importance_df["feature"] = features
    fold_importance_df["importance"] = clf.feature_importance()
    fold_importance_df["fold"] = fold_ + 1
    feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)
    
    predictions += clf.predict(test[features], num_iteration=clf.best_iteration) / folds.n_splits

Fold idx:1
Training until validation scores don't improve for 3000 rounds.
[7000]	training's auc: 0.875832	valid_1's auc: 0.864444
[14000]	training's auc: 0.894115	valid_1's auc: 0.88152
[21000]	training's auc: 0.902383	valid_1's auc: 0.889548
[28000]	training's auc: 0.906893	valid_1's auc: 0.893995
[35000]	training's auc: 0.9097	valid_1's auc: 0.896568
[42000]	training's auc: 0.91168	valid_1's auc: 0.898086
[49000]	training's auc: 0.913191	valid_1's auc: 0.898895
[56000]	training's auc: 0.914504	valid_1's auc: 0.899371
[63000]	training's auc: 0.915691	valid_1's auc: 0.899608
[70000]	training's auc: 0.916788	valid_1's auc: 0.899742
Early stopping, best iteration is:
[72458]	training's auc: 0.91716	valid_1's auc: 0.899774
Fold idx:2
Training until validation scores don't improve for 3000 rounds.
[7000]	training's auc: 0.876868	valid_1's auc: 0.864832
[14000]	training's auc: 0.894905	valid_1's auc: 0.882017


KeyboardInterrupt: 

In [16]:
print("\n >> CV score: {:<8.5f}".format(roc_auc_score(target, oof)))


 >> CV score: 0.90153 


# Submission

In [17]:
submission = pd.DataFrame({"ID_code": test.ID_code.values})
submission["target"] = predictions
submission.to_csv("submission_902.csv", index=False)

In [19]:
submission.head()

Unnamed: 0,ID_code,target
0,test_0,0.138552
1,test_1,0.28076
2,test_2,0.246681
3,test_3,0.264208
4,test_4,0.057968


In [19]:
submission.head()

Unnamed: 0,ID_code,target
0,test_0,0.142333
1,test_1,0.274733
2,test_2,0.241985
3,test_3,0.291388
4,test_4,0.058313


In [16]:
import pandas as pd
submission = pd.read_csv("submission_902.csv")

In [17]:
submission.head()

Unnamed: 0,ID_code,target
0,test_0,0.138552
1,test_1,0.28076
2,test_2,0.246681
3,test_3,0.264208
4,test_4,0.057968


In [19]:
submission.head()

Unnamed: 0,ID_code,target
0,test_0,1.016
1,test_1,2.058812
2,test_2,1.808912
3,test_3,1.937436
4,test_4,0.425076


In [13]:
param = {
    'bagging_freq': 5,
    'bagging_fraction': 0.335,
    'boost_from_average':'false',
    'boost': 'gbdt',
    'feature_fraction': 0.041,
    'learning_rate': 0.0083,
    'max_depth': 2,
    'metric':'auc',
    'min_data_in_leaf': 80,
    'min_sum_hessian_in_leaf': 10.0,
    'num_leaves': 3,
    'num_threads': 8,
    'tree_learner': 'serial',
    'objective': 'binary', 
    'verbosity': -1
}

In [14]:
#kfold = 15
#folds = StratifiedKFold(n_splits=kfold, shuffle=False, random_state=44000)
num_folds = 10
features = [c for c in train.columns if c not in ['ID_code', 'target']]

folds = KFold(n_splits=num_folds, random_state=3319)
oof = np.zeros(len(train))
getVal = np.zeros(len(train))
predictions = np.zeros(len(target))
feature_importance_df = pd.DataFrame()

In [15]:
for fold_, (trn_idx, val_idx) in enumerate(folds.split(train.values, target.values)):
    
    X_train, y_train = train.iloc[trn_idx][features], target.iloc[trn_idx]
    X_valid, y_valid = train.iloc[val_idx][features], target.iloc[val_idx]
    
    X_tr, y_tr = augment(X_train.values, y_train.values)
    X_tr = pd.DataFrame(X_tr)
    
    print("Fold idx:{}".format(fold_ + 1))
    trn_data = lgb.Dataset(X_tr, label=y_tr)
    val_data = lgb.Dataset(X_valid, label=y_valid)
    
    clf = lgb.train(param, trn_data, 1000000, valid_sets = [trn_data, val_data], verbose_eval=7000, 
                    early_stopping_rounds = 3000)
    oof[val_idx] = clf.predict(train.iloc[val_idx][features], num_iteration=clf.best_iteration)
    getVal[val_idx]+= clf.predict(train.iloc[val_idx][features], num_iteration=clf.best_iteration) / folds.n_splits
    
    fold_importance_df = pd.DataFrame()
    fold_importance_df["feature"] = features
    fold_importance_df["importance"] = clf.feature_importance()
    fold_importance_df["fold"] = fold_ + 1
    feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)
    
    predictions += clf.predict(test[features], num_iteration=clf.best_iteration) / folds.n_splits

Fold idx:1
Training until validation scores don't improve for 3000 rounds.
[7000]	training's auc: 0.890823	valid_1's auc: 0.879476
[14000]	training's auc: 0.904244	valid_1's auc: 0.89344
[21000]	training's auc: 0.908918	valid_1's auc: 0.897871
[28000]	training's auc: 0.911416	valid_1's auc: 0.899439
[35000]	training's auc: 0.913292	valid_1's auc: 0.90013
[42000]	training's auc: 0.915	valid_1's auc: 0.900344
Early stopping, best iteration is:
[42418]	training's auc: 0.915093	valid_1's auc: 0.900388
Fold idx:2
Training until validation scores don't improve for 3000 rounds.
[7000]	training's auc: 0.89124	valid_1's auc: 0.88223
[14000]	training's auc: 0.904714	valid_1's auc: 0.894908
[21000]	training's auc: 0.909455	valid_1's auc: 0.899065
[28000]	training's auc: 0.911939	valid_1's auc: 0.900483
[35000]	training's auc: 0.913806	valid_1's auc: 0.900779
Early stopping, best iteration is:
[35271]	training's auc: 0.913878	valid_1's auc: 0.900807
Fold idx:3
Training until validation scores don'

In [13]:
param = {
    'bagging_freq': 5,
    'bagging_fraction': 0.335,
    'boost_from_average':'false',
    'boost': 'gbdt',
    'feature_fraction': 0.041,
    'learning_rate': 0.0083,
    'max_depth': 2,
    'metric':'auc',
    'min_data_in_leaf': 80,
    'min_sum_hessian_in_leaf': 10.0,
    'num_leaves': 3,
    'num_threads': 8,
    'tree_learner': 'serial',
    'objective': 'binary', 
    'verbosity': -1
}

In [14]:
#kfold = 15
#folds = StratifiedKFold(n_splits=kfold, shuffle=False, random_state=44000)
num_folds = 10
features = [c for c in train.columns if c not in ['ID_code', 'target']]

folds = KFold(n_splits=num_folds, random_state=3319)
oof = np.zeros(len(train))
getVal = np.zeros(len(train))
predictions = np.zeros(len(target))
feature_importance_df = pd.DataFrame()

In [15]:
for fold_, (trn_idx, val_idx) in enumerate(folds.split(train.values, target.values)):
    
    X_train, y_train = train.iloc[trn_idx][features], target.iloc[trn_idx]
    X_valid, y_valid = train.iloc[val_idx][features], target.iloc[val_idx]
    
    X_tr, y_tr = augment(X_train.values, y_train.values)
    X_tr = pd.DataFrame(X_tr)
    
    print("Fold idx:{}".format(fold_ + 1))
    trn_data = lgb.Dataset(X_tr, label=y_tr)
    val_data = lgb.Dataset(X_valid, label=y_valid)
    
    clf = lgb.train(param, trn_data, 1000000, valid_sets = [trn_data, val_data], verbose_eval=7000, 
                    early_stopping_rounds = 3000)
    oof[val_idx] = clf.predict(train.iloc[val_idx][features], num_iteration=clf.best_iteration)
    getVal[val_idx]+= clf.predict(train.iloc[val_idx][features], num_iteration=clf.best_iteration) / folds.n_splits
    
    fold_importance_df = pd.DataFrame()
    fold_importance_df["feature"] = features
    fold_importance_df["importance"] = clf.feature_importance()
    fold_importance_df["fold"] = fold_ + 1
    feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)
    
    predictions += clf.predict(test[features], num_iteration=clf.best_iteration) / folds.n_splits

Fold idx:1
Training until validation scores don't improve for 3000 rounds.
[7000]	training's auc: 0.890823	valid_1's auc: 0.879476
[14000]	training's auc: 0.904244	valid_1's auc: 0.89344
[21000]	training's auc: 0.908918	valid_1's auc: 0.897871
[28000]	training's auc: 0.911416	valid_1's auc: 0.899439
[35000]	training's auc: 0.913292	valid_1's auc: 0.90013
[42000]	training's auc: 0.915	valid_1's auc: 0.900344
Early stopping, best iteration is:
[42418]	training's auc: 0.915093	valid_1's auc: 0.900388
Fold idx:2
Training until validation scores don't improve for 3000 rounds.
[7000]	training's auc: 0.89124	valid_1's auc: 0.88223
[14000]	training's auc: 0.904714	valid_1's auc: 0.894908
[21000]	training's auc: 0.909455	valid_1's auc: 0.899065
[28000]	training's auc: 0.911939	valid_1's auc: 0.900483
[35000]	training's auc: 0.913806	valid_1's auc: 0.900779
Early stopping, best iteration is:
[35271]	training's auc: 0.913878	valid_1's auc: 0.900807
Fold idx:3
Training until validation scores don'

In [None]:
param = {
    'bagging_freq': 5,
    'bagging_fraction': 0.335,
    'boost_from_average':'false',
    'boost': 'gbdt',
    'feature_fraction': 0.041,
    'learning_rate': 0.0083,
    'max_depth': -1,
    'metric':'auc',
    'min_data_in_leaf': 80,
    'min_sum_hessian_in_leaf': 10.0,
    'num_leaves': 13,
    'num_threads': 8,
    'tree_learner': 'serial',
    'objective': 'binary', 
    'verbosity': -1
}

In [15]:
for fold_, (trn_idx, val_idx) in enumerate(folds.split(train.values, target.values)):
    
    X_train, y_train = train.iloc[trn_idx][features], target.iloc[trn_idx]
    X_valid, y_valid = train.iloc[val_idx][features], target.iloc[val_idx]
    
    X_tr, y_tr = augment(X_train.values, y_train.values)
    X_tr = pd.DataFrame(X_tr)
    
    print("Fold idx:{}".format(fold_ + 1))
    trn_data = lgb.Dataset(X_tr, label=y_tr)
    val_data = lgb.Dataset(X_valid, label=y_valid)
    
    clf = lgb.train(param, trn_data, 1000000, valid_sets = [trn_data, val_data], verbose_eval=5000, early_stopping_rounds = 4000)
    oof[val_idx] = clf.predict(train.iloc[val_idx][features], num_iteration=clf.best_iteration)
    getVal[val_idx]+= clf.predict(train.iloc[val_idx][features], num_iteration=clf.best_iteration) / folds.n_splits
    
    fold_importance_df = pd.DataFrame()
    fold_importance_df["feature"] = features
    fold_importance_df["importance"] = clf.feature_importance()
    fold_importance_df["fold"] = fold_ + 1
    feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)
    
    predictions += clf.predict(test[features], num_iteration=clf.best_iteration) / folds.n_splits

Fold idx:1
Training until validation scores don't improve for 4000 rounds.
[5000]	training's auc: 0.911087	valid_1's auc: 0.896164
[10000]	training's auc: 0.921085	valid_1's auc: 0.900371
[15000]	training's auc: 0.9287	valid_1's auc: 0.901346
[20000]	training's auc: 0.93561	valid_1's auc: 0.901193
Early stopping, best iteration is:
[16533]	training's auc: 0.930866	valid_1's auc: 0.901554
Fold idx:2
Training until validation scores don't improve for 4000 rounds.
[5000]	training's auc: 0.912634	valid_1's auc: 0.895351
[10000]	training's auc: 0.922375	valid_1's auc: 0.898337
[15000]	training's auc: 0.929878	valid_1's auc: 0.8988
Early stopping, best iteration is:
[15532]	training's auc: 0.930621	valid_1's auc: 0.898902
Fold idx:3
Training until validation scores don't improve for 4000 rounds.
[5000]	training's auc: 0.912753	valid_1's auc: 0.889083
[10000]	training's auc: 0.922531	valid_1's auc: 0.892518
[15000]	training's auc: 0.930076	valid_1's auc: 0.892495
Early stopping, best iteratio