In [1]:
# Data manipulation and set-up
import numpy as np
import pandas as pd

# Visualisation
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(font_scale=1)

# Modelling (including set-up & evaluation)
from sklearn.model_selection import KFold, StratifiedKFold
import lightgbm as lgb
import xgboost as xgb
from sklearn.metrics import roc_auc_score

# Ignore warnings
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.filterwarnings('ignore')

In [2]:
random_state = 42
np.random.seed(random_state)
train = pd.read_csv('../input/train.csv')
test = pd.read_csv('../input/test.csv')

In [3]:
# Data augmentation
def augment(x,y,t=2):
    xs,xn = [],[]
    for i in range(t):
        mask = y>0
        x1 = x[mask].copy()
        ids = np.arange(x1.shape[0])
        for c in range(x1.shape[1]):
            np.random.shuffle(ids)
            x1[:,c] = x1[ids][:,c]
        xs.append(x1)

    for i in range(t//2):
        mask = y==0
        x1 = x[mask].copy()
        ids = np.arange(x1.shape[0])
        for c in range(x1.shape[1]):
            np.random.shuffle(ids)
            x1[:,c] = x1[ids][:,c]
        xn.append(x1)

    xs = np.vstack(xs)
    xn = np.vstack(xn)
    ys = np.ones(xs.shape[0])
    yn = np.zeros(xn.shape[0])
    x = np.vstack([x,xs,xn])
    y = np.concatenate([y,ys,yn])
    return x,y

In [4]:
# Model parameters
lgb_params = {
    "objective" : "binary",
    "metric" : "auc",
    "boosting": 'gbdt',
    "max_depth" : -1,
    "num_leaves" : 31,
    "learning_rate" : 0.01,
    "bagging_freq": 5,
    "bagging_fraction" : 0.4,
    "feature_fraction" : 0.05,
    "min_data_in_leaf": 150,
    "min_sum_heassian_in_leaf": 10,
    "tree_learner": "serial",
    "boost_from_average": "false",
    "bagging_seed" : random_state,
    "verbosity" : 1,
    "seed": random_state}

In [5]:
skf = StratifiedKFold(n_splits=7, shuffle=True, random_state=random_state)
oof = train[['ID_code', 'target']]
oof['predict'] = 0
predictions = test[['ID_code']]
val_aucs = []
feature_importance = pd.DataFrame()

In [6]:
features = [col for col in train.columns if col not in ['target', 'ID_code']]
X_test = test[features].values

In [7]:
for fold, (trn_idx, val_idx) in enumerate(skf.split(train, train['target'])):
    X_train, y_train = train.iloc[trn_idx][features], train.iloc[trn_idx]['target']
    X_valid, y_valid = train.iloc[val_idx][features], train.iloc[val_idx]['target']
    
    N = 3
    p_valid,yp = 0,0
    for i in range(N):
        X_t, y_t = augment(X_train.values, y_train.values)
        X_t = pd.DataFrame(X_t)
        X_t = X_t.add_prefix('var_')
    
        trn_data = lgb.Dataset(X_t, label=y_t)
        val_data = lgb.Dataset(X_valid, label=y_valid)
        evals_result = {}
        lgb_clf = lgb.train(lgb_params,
                        trn_data,
                        100000,
                        valid_sets = [trn_data, val_data],
                        early_stopping_rounds=3000,
                        verbose_eval = 1000,
                        evals_result=evals_result
                       )
        p_valid += lgb_clf.predict(X_valid)
        yp += lgb_clf.predict(X_test)
    fold_importance = pd.DataFrame()
    fold_importance["feature"] = features
    fold_importance["importance"] = lgb_clf.feature_importance()
    fold_importance["fold"] = fold + 1
    feature_importance = pd.concat([feature_importance, fold_importance], axis=0)
    oof['predict'][val_idx] = p_valid/N
    val_score = roc_auc_score(y_valid, p_valid)
    val_aucs.append(val_score)
    
    predictions['fold{}'.format(fold+1)] = yp/N

Training until validation scores don't improve for 3000 rounds.
[1000]	training's auc: 0.908775	valid_1's auc: 0.892624
[2000]	training's auc: 0.91704	valid_1's auc: 0.897436
[3000]	training's auc: 0.923209	valid_1's auc: 0.899713
[4000]	training's auc: 0.92848	valid_1's auc: 0.900872
[5000]	training's auc: 0.93321	valid_1's auc: 0.901501
[6000]	training's auc: 0.937671	valid_1's auc: 0.901821
[7000]	training's auc: 0.941879	valid_1's auc: 0.901806
[8000]	training's auc: 0.94588	valid_1's auc: 0.901771
[9000]	training's auc: 0.949656	valid_1's auc: 0.901779
Early stopping, best iteration is:
[6173]	training's auc: 0.938408	valid_1's auc: 0.901895
Training until validation scores don't improve for 3000 rounds.
[1000]	training's auc: 0.907191	valid_1's auc: 0.89206
[2000]	training's auc: 0.915736	valid_1's auc: 0.897656
[3000]	training's auc: 0.921922	valid_1's auc: 0.899839
[4000]	training's auc: 0.927245	valid_1's auc: 0.901311
[5000]	training's auc: 0.932113	valid_1's auc: 0.901944
[6

In [8]:
# Submission
predictions['target'] = np.mean(predictions[[col for col in predictions.columns if col not in ['ID_code', 'target']]].values, axis=1)
predictions.to_csv('lgb_all_predictions.csv', index=None)
sub = pd.DataFrame({"ID_code":test["ID_code"].values})
sub["target"] = predictions['target']
sub.to_csv("lgb_submission.csv", index=False)
oof.to_csv('lgb_oof.csv', index=False)

In [9]:
sub.head()

Unnamed: 0,ID_code,target
0,test_0,0.14349
1,test_1,0.284526
2,test_2,0.241314
3,test_3,0.274705
4,test_4,0.061411
