# Project 

### Import modules

In [1]:
import gc
import random
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd

import xgboost as xgb
from xgboost import XGBClassifier
from catboost import CatBoostRegressor
from catboost import CatBoostClassifier
from sklearn.calibration import CalibratedClassifierCV
from sklearn.cross_validation import StratifiedKFold, cross_val_score, KFold
from sklearn.decomposition import PCA
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.metrics import log_loss
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.naive_bayes import BernoulliNB
from sklearn.preprocessing import StandardScaler



In [2]:
from lightgbm import sklearn as lgbmsk
from lightgbm import LGBMRegressor, LGBMClassifier
import lightgbm as lgb

### Functions

In [3]:
def stacking(X_train, X_test, y_train, skf, clfs):
    """Build out-of-fold meta-features for a list of classifiers.

    For every classifier and every fold, the classifier is fitted on the
    fold's training rows; its positive-class probability (column 1 of
    ``predict_proba``) is stored for the fold's held-out rows and for the
    whole of ``X_test``. Test predictions are averaged over the folds.

    Parameters
    ----------
    X_train, X_test : array-like indexable by integer index arrays
        Feature matrices (rows are samples).
    y_train : array-like indexable by integer index arrays
        Training labels, aligned with ``X_train``.
    skf : iterable of (train_indices, test_indices)
        Cross-validation folds. May be a list or a one-shot iterator.
    clfs : sequence of estimators exposing ``fit`` and ``predict_proba``

    Returns
    -------
    meta_train : ndarray of shape (n_train, len(clfs))
        Held-out predictions; rows never used as a held-out fold stay 0.
    meta_test : ndarray of shape (n_test, len(clfs))
        Fold-averaged test predictions.
    """
    # The folds are re-used for every classifier and counted with len(), so
    # materialise them once; this also lets callers pass a generator.
    folds = list(skf)
    meta_train = np.zeros((X_train.shape[0], len(clfs)))
    meta_test = np.zeros((X_test.shape[0], len(clfs)))

    for j, clf in enumerate(clfs):
        print('Clf', j+1)
        meta_test_j = np.zeros((X_test.shape[0], len(folds)))
        for i, (train, test) in enumerate(folds):
            print('Fold', i+1)
            clf.fit(X_train[train], y_train[train])
            meta_train[test, j] = clf.predict_proba(X_train[test])[:, 1]
            meta_test_j[:, i] = clf.predict_proba(X_test)[:, 1]
        meta_test[:, j] = meta_test_j.mean(1)
        # Free per-classifier temporaries before the next (large) model.
        gc.collect()

    return meta_train, meta_test

## Stacking level 1

### one hot encoding + drop columns

In [8]:
# Load the raw train/test files and set aside the labels and the test ids
# before the frames are modified by the following cells.
train = pd.read_csv("data/train.csv")
test = pd.read_csv("data/test.csv")

y_train = train['target'].values
id_test = test.id.values


In [9]:
# Square / rescale a handful of continuous columns, identically on both frames.
# Disabled experiments, kept for reference:
#   ps_ind_sum_bin      = <frame>[BIN].sum(axis = 1)
#   ps_reg_mult         = ps_reg_01 * ps_reg_02 * ps_reg_03
#   ps_car_13_ps_reg_03 = ps_car_13 * ps_reg_03
for frame in (train, test):
    frame['ps_car_15'] = frame['ps_car_15'] ** 2
    frame['ps_car_14'] = frame['ps_car_14'] ** 2
    frame['ps_car_12'] = round(frame['ps_car_12'] ** 2, 4) * 10000
    frame['ps_car_13'] = frame['ps_car_13'] ** 2 * 48400
    frame['ps_reg_03'] = (4 * frame['ps_reg_03']) ** 2

In [10]:
# Materialise the 5 shuffled, stratified folds as a list so the same splits
# can be passed to every stacking() call below.
skf = list(StratifiedKFold(y_train, n_folds=5, shuffle=True, random_state=115))

In [11]:
# Columns excluded from the level-1 feature set (the id plus selected features).
drop_columns = [
    'id', 'ps_calc_10', 'ps_calc_01', 'ps_calc_04', 'ps_car_02_cat',
    'ps_calc_14', 'ps_calc_08', 'ps_calc_17_bin', 'ps_car_10_cat',
    'ps_ind_11_bin', 'ps_calc_12', 'ps_calc_09', 'ps_car_06_cat',
    'ps_calc_05', 'ps_calc_16_bin', 'ps_calc_20_bin', 'ps_calc_18_bin',
]

# The label only exists in train; it was already saved in y_train.
train.drop(drop_columns + ['target'], axis=1, inplace=True)
test.drop(drop_columns, axis=1, inplace=True)

In [12]:
# Column groups, selected by substring of the column name.
CAT = [col for col in train.columns if 'cat' in col]
BIN = [col for col in train.columns if 'bin' in col]

In [13]:
# One-hot encode the categorical columns on train+test stacked together so
# both end up with the same dummy columns.
data = pd.concat((train, test), axis=0, ignore_index=True)
for col in CAT:
    dummies = pd.get_dummies(data[col], prefix=col)
    data = pd.concat((data, dummies), axis=1)
    # The raw categorical column is replaced by its dummies.
    data.drop(col, axis=1, inplace=True)

In [14]:
# Split the stacked frame back into its train and test parts by position.
n_train_rows = train.shape[0]
train = data.iloc[:n_train_rows, :]
test = data.iloc[n_train_rows:, :]
# Restart the test index at 0.
test.index = range(len(test))

In [15]:
# float32 numpy copies of the encoded frames; these are what stacking() receives.
X_train_n = np.array(train, dtype=np.float32)
X_test_n = np.array(test, dtype=np.float32)

In [None]:
# Level-1 base models trained on the one-hot encoded features.
clfs = [ExtraTreesClassifier(n_estimators=800, criterion='gini', max_depth=37, max_features=25, 
                             min_samples_split=4, min_samples_leaf=2, n_jobs=-1, random_state=888),                  
        XGBClassifier(n_estimators=600, learning_rate=0.03, max_depth=10, colsample_bytree=0.4, 
                      min_child_weight=1, seed=88888), 
        RandomForestClassifier(n_estimators=800, criterion='gini')]

meta_train, meta_test = stacking(X_train_n, X_test_n, y_train, skf, clfs)

# Index with the frames X_train_n / X_test_n were built from. `X_train` and
# `X_test` are not defined at this point in the notebook, so using them here
# raised a NameError only after the (very long) training had finished.
meta_train_1 = pd.DataFrame(meta_train, index=train.index, columns=['base_et', 'base_xgb', 'base_rf'])
meta_test_1 = pd.DataFrame(meta_test, index=test.index, columns=['base_et', 'base_xgb', 'base_rf'])

Clf 1
Fold 1


In [21]:
# Wrap the level-1 predictions in frames, one column per base model.
base_tree_columns = ['base_et', 'base_xgb', 'base_rf']
meta_train_1 = pd.DataFrame(meta_train, columns=base_tree_columns, index=train.index)
meta_test_1 = pd.DataFrame(meta_test, columns=base_tree_columns, index=test.index)

In [22]:
# Persist the level-1 tree-model predictions (the index is not written).
for frame, path in ((meta_train_1, 'save_meta/meta_train_1.csv'),
                    (meta_test_1, 'save_meta/meta_test_1.csv')):
    frame.to_csv(path, index=False)

#### Base Logistic Regression

In [23]:
# A single L2-penalised logistic regression as an additional base model.
clfs = [LogisticRegression(penalty='l2', C=1.0, n_jobs=-1)]
meta_train, meta_test = stacking(X_train_n, X_test_n, y_train, skf, clfs)

lr_columns = ['base_lr']
meta_train_2 = pd.DataFrame(meta_train, columns=lr_columns, index=train.index)
meta_test_2 = pd.DataFrame(meta_test, columns=lr_columns, index=test.index)

Clf 1
Fold 1
Fold 2
Fold 3
Fold 4
Fold 5


In [99]:
# Persist the logistic-regression predictions (the index is not written).
for frame, path in ((meta_train_2, 'save_meta/meta_train_2.csv'),
                    (meta_test_2, 'save_meta/meta_test_2.csv')):
    frame.to_csv(path, index=False)

### add target encoding

In [26]:
import time
from sklearn.preprocessing import LabelEncoder
def add_noise(series, noise_level):
    """Scale every value by (1 + noise_level * z), z drawn from a standard normal."""
    jitter = np.random.randn(len(series))
    return series * (1 + noise_level * jitter)


def target_encode(trn_series=None,
                  tst_series=None,
                  target=None,
                  min_samples_leaf=1,
                  smoothing=1,
                  noise_level=0):
    """
    Replace a categorical feature by a smoothed per-category target mean.

    Smoothing is computed like in the following paper by Daniele Micci-Barreca
    https://kaggle2.blob.core.windows.net/forum-message-attachments/225952/7441/high%20cardinality%20categoricals.pdf
    trn_series : training categorical feature as a pd.Series
    tst_series : test categorical feature as a pd.Series (same name as trn_series)
    target : target data as a pd.Series (same length as trn_series)
    min_samples_leaf (int) : minimum samples to take category average into account
    smoothing (int) : smoothing effect to balance categorical average vs prior
    noise_level : relative noise applied to both outputs through add_noise

    Returns the encoded train and test series, both named "<feature>_mean" and
    carrying the index of their input series. Categories without statistics
    receive the global target mean.
    """
    assert len(trn_series) == len(target)
    assert trn_series.name == tst_series.name
    feature = trn_series.name

    # Per-category mean and count of the target.
    stats = pd.concat([trn_series, target], axis=1).groupby(by=feature)[target.name].agg(["mean", "count"])
    # Sigmoid weight: the bigger the count, the less the prior is taken into account.
    weight = 1 / (1 + np.exp(-(stats["count"] - min_samples_leaf) / smoothing))
    prior = target.mean()
    stats[target.name] = prior * (1 - weight) + stats["mean"] * weight
    stats.drop(["mean", "count"], axis=1, inplace=True)

    # Lookup table: one row per category with its smoothed 'average'.
    lookup = stats.reset_index().rename(columns={'index': target.name, target.name: 'average'})

    def _encode(series):
        merged = pd.merge(series.to_frame(series.name), lookup, on=series.name, how='left')
        encoded = merged['average'].rename(feature + '_mean').fillna(prior)
        # pd.merge does not keep the index so restore it
        encoded.index = series.index
        return encoded

    ft_trn_series = _encode(trn_series)
    ft_tst_series = _encode(tst_series)
    return add_noise(ft_trn_series, noise_level), add_noise(ft_tst_series, noise_level)

gc.enable()

# Start again from the raw files for the target-encoded feature set.
trn_df = pd.read_csv("data/train.csv")
sub_df = pd.read_csv("data/test.csv")

id_test = sub_df['id']
# Take the label out of the training frame and keep it as its own Series.
target = trn_df.pop("target")

# Features kept for the boosted models. The trailing comments are the figures
# recorded next to each feature in the original list (score / shadow score).
train_features = [
    "ps_car_13",      #        : 1571.65 / shadow  609.23
    "ps_reg_03",      #        : 1408.42 / shadow  511.15
    "ps_ind_05_cat",  #        : 1387.87 / shadow   84.72
    "ps_ind_03",      #        : 1219.47 / shadow  230.55
    "ps_ind_15",      #        :  922.18 / shadow  242.00
    "ps_reg_02",      #        :  920.65 / shadow  267.50
    "ps_car_14",      #        :  798.48 / shadow  549.58
    "ps_car_12",      #        :  731.93 / shadow  293.62
    "ps_car_01_cat",  #        :  698.07 / shadow  178.72
    "ps_car_07_cat",  #        :  694.53 / shadow   36.35
    "ps_ind_17_bin",  #        :  620.77 / shadow   23.15
    "ps_car_03_cat",  #        :  611.73 / shadow   50.67
    "ps_reg_01",      #        :  598.60 / shadow  178.57
    "ps_car_15",      #        :  593.35 / shadow  226.43
    "ps_ind_01",      #        :  547.32 / shadow  154.58
    "ps_ind_16_bin",  #        :  475.37 / shadow   34.17
    "ps_ind_07_bin",  #        :  435.28 / shadow   28.92
    "ps_car_06_cat",  #        :  398.02 / shadow  212.43
    "ps_car_04_cat",  #        :  376.87 / shadow   76.98
    "ps_ind_06_bin",  #        :  370.97 / shadow   36.13
    "ps_car_09_cat",  #        :  214.12 / shadow   81.38
    "ps_car_02_cat",  #        :  203.03 / shadow   26.67
    "ps_ind_02_cat",  #        :  189.47 / shadow   65.68
    "ps_car_11",      #        :  173.28 / shadow   76.45
    "ps_car_05_cat",  #        :  172.75 / shadow   62.92
    "ps_calc_09",     #        :  169.13 / shadow  129.72
    "ps_calc_05",     #        :  148.83 / shadow  120.68
    "ps_ind_08_bin",  #        :  140.73 / shadow   27.63
    "ps_car_08_cat",  #        :  120.87 / shadow   28.82
    "ps_ind_09_bin",  #        :  113.92 / shadow   27.05
    "ps_ind_04_cat",  #        :  107.27 / shadow   37.43
    "ps_ind_18_bin",  #        :   77.42 / shadow   25.97
    "ps_ind_12_bin",  #        :   39.67 / shadow   15.52
    "ps_ind_14",      #        :   37.37 / shadow   16.65
]
# add combinations
# Pairs of columns to fuse into one label-encoded categorical feature.
combs = [
    ('ps_reg_01', 'ps_car_02_cat'),
    ('ps_reg_01', 'ps_car_04_cat'),
]
start = time.time()
for n_c, (f1, f2) in enumerate(combs):
    name1 = f1 + "_plus_" + f2
    elapsed_minutes = (time.time() - start) / 60
    print('current feature %60s %4d in %5.1f' % (name1, n_c + 1, elapsed_minutes), end='')
    print('\r' * 75, end='')
    # Fuse the two values of each row into a single "<v1>_<v2>" string.
    for frame in (trn_df, sub_df):
        frame[name1] = frame[f1].apply(str) + "_" + frame[f2].apply(str)
    # Fit the encoder on train and test together so both share the same codes.
    lbl = LabelEncoder()
    lbl.fit(list(trn_df[name1].values) + list(sub_df[name1].values))
    for frame in (trn_df, sub_df):
        frame[name1] = lbl.transform(list(frame[name1].values))

    train_features.append(name1)
    
# Restrict both frames to the selected features (including the new combinations).
trn_df = trn_df[train_features]
sub_df = sub_df[train_features]

# Every column whose name contains "_cat" gets a smoothed target-mean companion.
f_cats = [f for f in trn_df.columns if "_cat" in f]

for f in f_cats:
    encoded_trn, encoded_sub = target_encode(
        trn_series=trn_df[f],
        tst_series=sub_df[f],
        target=target,
        min_samples_leaf=200,
        smoothing=10,
        noise_level=0,
    )
    trn_df[f + "_avg"] = encoded_trn
    sub_df[f + "_avg"] = encoded_sub

trn_df['ps_car_15'] = trn_df['ps_car_15'] ** 2
sub_df['ps_car_15'] = sub_df['ps_car_15'] ** 2

current feature                                 ps_reg_01_plus_ps_car_04_cat    2 in   0.0

In [27]:
# float32 numpy copies of the target-encoded frames (this rebinds the arrays
# built from the one-hot encoded frames earlier in the notebook).
X_train_n = np.array(trn_df, dtype=np.float32)
X_test_n = np.array(sub_df, dtype=np.float32)

In [28]:
        
# LightGBM params (three configurations)
lgb_params_1 = dict(
    learning_rate=0.01,
    n_estimators=1250,
    max_bin=10,
    subsample=0.8,
    subsample_freq=10,
    colsample_bytree=0.8,
    min_child_samples=500,
)

lgb_params_2 = dict(
    learning_rate=0.005,
    n_estimators=3700,
    subsample=0.7,
    subsample_freq=2,
    colsample_bytree=0.3,
    num_leaves=16,
)

lgb_params_3 = dict(
    learning_rate=0.02,
    n_estimators=800,
    max_depth=4,
)

# RandomForest params
rf_params = {
    'n_estimators': 200,
    'max_depth': 6,
    'min_samples_split': 70,
    'min_samples_leaf': 30,
}

# ExtraTrees params
et_params = {
    'n_estimators': 155,
    'max_features': 0.3,
    'max_depth': 6,
    'min_samples_split': 40,
    'min_samples_leaf': 18,
}

# XGBoost params
xgb_params = dict(
    objective='binary:logistic',
    learning_rate=0.02,
    n_estimators=1000,
    gamma=9,
    max_depth=4,
    subsample=0.9,
    colsample_bytree=0.9,
    min_child_weight=10,
)

# CatBoost params
cat_params = dict(
    iterations=900,
    depth=8,
    rsm=0.95,
    learning_rate=0.03,
    l2_leaf_reg=3.5,
    border_count=8,
    gradient_iterations=4,
)
# Instantiate one estimator per parameter dictionary defined above.
lgb_model_1 = LGBMClassifier(**lgb_params_1)

lgb_model_2 = LGBMClassifier(**lgb_params_2)

lgb_model_3 = LGBMClassifier(**lgb_params_3)

# NOTE(review): rf_model, et_model and cat_model are not part of the `clfs`
# list built in the next cell — confirm whether that is intended.
rf_model = RandomForestClassifier(**rf_params)

et_model = ExtraTreesClassifier(**et_params)

xgb_model = XGBClassifier(**xgb_params)

cat_model = CatBoostClassifier(**cat_params)

In [29]:
# Base models stacked on the target-encoded features: the three LightGBM
# configurations and the XGBoost model.
clfs = [lgb_model_1,
        lgb_model_2,
        lgb_model_3,
        xgb_model]

In [None]:
# Out-of-fold predictions of the boosted models on the target-encoded features.
meta_train, meta_test = stacking(X_train_n, X_test_n, y_train, skf, clfs)

boosted_columns = ['base_lgb_1', 'base_lgb_2', 'base_lgb_3', 'base_xgb_2']
meta_train_3 = pd.DataFrame(meta_train, columns=boosted_columns, index=train.index)
meta_test_3 = pd.DataFrame(meta_test, columns=boosted_columns, index=test.index)

Clf 1
Fold 1
Fold 2
Fold 3
Fold 4
Fold 5
Clf 2
Fold 1
Fold 2
Fold 3
Fold 4
Fold 5
Clf 3
Fold 1
Fold 2
Fold 3
Fold 4
Fold 5
Clf 4
Fold 1
Fold 2
Fold 3
Fold 4
Fold 5


In [None]:
# Persist the boosted-model predictions (the index is not written).
for frame, path in ((meta_train_3, 'save_meta/meta_train_3.csv'),
                    (meta_test_3, 'save_meta/meta_test_3.csv')):
    frame.to_csv(path, index=False)

### add new features

In [100]:
# Reload the raw files for the hand-crafted feature set.
train = pd.read_csv("data/train.csv")
test = pd.read_csv("data/test.csv")

y_train = train['target'].values
id_test = test.id.values


# Collapse ps_ind_06..09_bin into one code: 1-4 for the first flag (in that
# order) that equals 1, 5 when none of them is set. np.select picks the first
# matching condition, which reproduces the original nested if/else, but runs
# vectorised instead of calling a Python lambda once per row.
ind_06_09_flags = ['ps_ind_06_bin', 'ps_ind_07_bin', 'ps_ind_08_bin', 'ps_ind_09_bin']
for frame in (train, test):
    frame['ps_ind_0609_bin'] = np.select(
        [frame[flag] == 1 for flag in ind_06_09_flags],
        [1, 2, 3, 4],
        default=5,
    ).astype(np.int64)
    # The four source flags are replaced by the combined code.
    frame.drop(ind_06_09_flags, axis=1, inplace=True)

# Squared and rescaled versions of ps_car_13 / ps_car_12, rounded, on both frames.
for frame in (train, test):
    car_13 = frame['ps_car_13']
    frame['ps_car_13'] = (car_13 * car_13 * 48400).round(0)

for frame in (train, test):
    car_12 = frame['ps_car_12']
    frame['ps_car_12'] = (car_12 * car_12).round(4) * 10000

# Print every ordered pair of 'bin' columns whose product is 0 on all rows,
# i.e. columns that are never non-zero together.
bin_columns = [col for col in train.columns if 'bin' in col]
for c in bin_columns:
    for cc in bin_columns:
        if ((train[cc] * train[c]) == 0).all():
            print(c, cc)

# Collapse ps_ind_16/17/18_bin into one code: 1 if ps_ind_16_bin is set,
# else 2 if ps_ind_17_bin is set, else 3. Vectorised with np.select (first
# matching condition wins) instead of a per-row Python lambda.
ind_16_18_flags = ['ps_ind_16_bin', 'ps_ind_17_bin', 'ps_ind_18_bin']
for frame in (train, test):
    frame['ps_ind_161718_bin'] = np.select(
        [frame['ps_ind_16_bin'] == 1, frame['ps_ind_17_bin'] == 1],
        [1, 2],
        default=3,
    ).astype(np.int64)
    # The three source flags are replaced by the combined code.
    frame.drop(ind_16_18_flags, axis=1, inplace=True)


ps_ind_16_bin ps_ind_17_bin
ps_ind_16_bin ps_ind_18_bin
ps_ind_17_bin ps_ind_16_bin
ps_ind_17_bin ps_ind_18_bin
ps_ind_18_bin ps_ind_16_bin
ps_ind_18_bin ps_ind_17_bin


In [104]:
# Remove every 'ps_calc_*' column from both frames.
col_to_drop = train.columns[train.columns.str.startswith('ps_calc_')]
train = train.drop(col_to_drop, axis=1)
test = test.drop(col_to_drop, axis=1)

# Shrink memory: float64 -> float32 ...
float_columns = train.select_dtypes(include=['float64']).columns
for c in float_columns:
    train[c] = train[c].astype(np.float32)
    test[c] = test[c].astype(np.float32)
# ... and int64 -> int8, skipping train's first two int64 columns.
int_columns = train.select_dtypes(include=['int64']).columns[2:]
for c in int_columns:
    train[c] = train[c].astype(np.int8)
    test[c] = test[c].astype(np.int8)

print(train.shape, test.shape)

# Custom evaluation metric (normalised Gini), passed to the boosters as `feval`.

def gini(y, pred):
    """Return the (unnormalised) Gini coefficient of `pred` against labels `y`.

    Rows are sorted by decreasing prediction, ties broken by original
    position; the score is derived from the cumulative sum of the labels in
    that order. `y` and `pred` must have the same length.
    """
    # np.float was removed from NumPy; np.float64 is the same dtype.
    g = np.asarray(np.c_[y, pred, np.arange(len(y))], dtype=np.float64)
    # lexsort: last key is primary -> descending prediction, then ascending position.
    g = g[np.lexsort((g[:, 2], -1 * g[:, 1]))]
    gs = g[:, 0].cumsum().sum() / g[:, 0].sum()
    gs -= (len(y) + 1) / 2.
    return gs / len(y)

def gini_xgb(pred, y):
    """xgboost `feval`: returns ('gini', score); `y` exposes get_label()."""
    y = y.get_label()
    # Divide by the Gini of a perfect ordering to normalise the score.
    return 'gini', gini(y, pred) / gini(y, y)

def gini_lgb(preds, dtrain):
    """lightgbm `feval`: returns ('gini', score, True), True = higher is better."""
    y = list(dtrain.get_label())
    score = gini(y, preds) / gini(y, y)
    return 'gini', score, True

(595212, 34) (892816, 33)


In [106]:
# xgb
params = {'eta': 0.02, 'max_depth': 4, 'subsample': 0.9, 'colsample_bytree': 0.9, 
          'objective': 'binary:logistic', 'eval_metric': 'auc', 'silent': True}

X = train.drop(['id', 'target'], axis=1)
features = X.columns
X = X.values
y = train['target'].values
sub=test['id'].to_frame()
sub['target']=0

sub_train = train['id'].to_frame()
sub_train['target']=0

nrounds=10**6  # upper bound only; early stopping decides the actual number of rounds
kfold = 5

# The full train/test matrices do not change between splits, so build their
# DMatrix once instead of once per split.
d_test_full = xgb.DMatrix(test[features].values)
d_train_full = xgb.DMatrix(train[features].values)

skf = StratifiedShuffleSplit( n_splits=kfold, random_state=0)
for i, (train_index, test_index) in enumerate(skf.split(X, y)):
    print(' xgb kfold: {}  of  {} : '.format(i+1, kfold))
    X_train, X_valid = X[train_index], X[test_index]
    y_train, y_valid = y[train_index], y[test_index]
    d_train = xgb.DMatrix(X_train, y_train) 
    d_valid = xgb.DMatrix(X_valid, y_valid) 
    watchlist = [(d_train, 'train'), (d_valid, 'valid')]
    xgb_model = xgb.train(params, d_train, nrounds, watchlist, early_stopping_rounds=100, 
                          feval=gini_xgb, maximize=True, verbose_eval=100)
    # Average the test predictions over the splits.
    sub['target'] += xgb_model.predict(d_test_full, 
                        ntree_limit=xgb_model.best_ntree_limit+50) / (kfold)
    
    # NOTE(review): this predicts on ALL training rows, including the rows the
    # model of this split was fitted on, so sub_train holds in-sample rather
    # than out-of-fold predictions.
    sub_train['target'] += xgb_model.predict(d_train_full, 
                        ntree_limit=xgb_model.best_ntree_limit+50) / (kfold)
    
gc.collect()
sub.head(2)

#sub.to_csv('save_meta/test_sub_xgb.csv', index=False, float_format='%.5f')
#sub_train.to_csv('save_meta/train_sub_xgb.csv', index=False, float_format='%.5f')




 xgb kfold: 1  of  5 : 
[0]	train-auc:0.597993	valid-auc:0.577773	train-gini:0.194886	valid-gini:0.157185
Multiple eval metrics have been passed: 'valid-gini' will be used for early stopping.

Will train until valid-gini hasn't improved in 100 rounds.
[100]	train-auc:0.624611	valid-auc:0.601221	train-gini:0.249222	valid-gini:0.202427
[200]	train-auc:0.637785	valid-auc:0.615977	train-gini:0.27557	valid-gini:0.231954
[300]	train-auc:0.646992	valid-auc:0.625348	train-gini:0.293984	valid-gini:0.250696
[400]	train-auc:0.65341	valid-auc:0.629984	train-gini:0.30682	valid-gini:0.259967
[500]	train-auc:0.658329	valid-auc:0.632275	train-gini:0.316658	valid-gini:0.264549
[600]	train-auc:0.66221	valid-auc:0.633597	train-gini:0.324421	valid-gini:0.267194
[700]	train-auc:0.665484	valid-auc:0.634453	train-gini:0.330969	valid-gini:0.268907
[800]	train-auc:0.668549	valid-auc:0.635254	train-gini:0.337099	valid-gini:0.270508
[900]	train-auc:0.671388	valid-auc:0.635552	train-gini:0.342776	valid-gini:0.271

NameError: name 'sub_lgb' is not defined

In [109]:
# lgb
# Accumulators for the split-averaged LightGBM predictions.
sub_lgb = test['id'].to_frame()
sub_lgb['target'] = 0
sub_train_lgb = train['id'].to_frame()
sub_train_lgb['target'] = 0

params = {
    'metric': 'auc',
    'learning_rate': 0.01,
    'max_depth': 8,
    'max_bin': 10,
    'objective': 'binary',
    'feature_fraction': 0.8,
    'bagging_fraction': 0.9,
    'bagging_freq': 5,
    'min_data': 500,
}

skf = StratifiedShuffleSplit(n_splits=kfold, random_state=1)
for i, (train_index, test_index) in enumerate(skf.split(X, y)):
    print(' lgb kfold: {}  of  {} : '.format(i+1, kfold))
    X_train, X_eval = X[train_index], X[test_index]
    y_train, y_eval = y[train_index], y[test_index]
    lgb_train_set = lgb.Dataset(X_train, label=y_train)
    lgb_eval_set = lgb.Dataset(X_eval, label=y_eval)
    lgb_model = lgb.train(params, lgb_train_set, nrounds, lgb_eval_set,
                          verbose_eval=100, feval=gini_lgb, early_stopping_rounds=100)
    # Each split contributes 1/kfold of the final prediction.
    test_pred = lgb_model.predict(test[features].values, num_iteration=lgb_model.best_iteration)
    sub_lgb['target'] += test_pred / (kfold)
    train_pred = lgb_model.predict(train[features].values, num_iteration=lgb_model.best_iteration)
    sub_train_lgb['target'] += train_pred / (kfold)

#sub.to_csv('save_meta/test_sub_lgb.csv', index=False, float_format='%.5f') 
#sub_train.to_csv('save_meta/train_sub_lgb.csv', index=False, float_format='%.5f')



 lgb kfold: 1  of  5 : 
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's auc: 0.621403	valid_0's gini: 0.242806
[200]	valid_0's auc: 0.622771	valid_0's gini: 0.245543
[300]	valid_0's auc: 0.625825	valid_0's gini: 0.25165
[400]	valid_0's auc: 0.628044	valid_0's gini: 0.256088
[500]	valid_0's auc: 0.630923	valid_0's gini: 0.261846
[600]	valid_0's auc: 0.633248	valid_0's gini: 0.266496
[700]	valid_0's auc: 0.634877	valid_0's gini: 0.269754
[800]	valid_0's auc: 0.635977	valid_0's gini: 0.271955
[900]	valid_0's auc: 0.636836	valid_0's gini: 0.273672
[1000]	valid_0's auc: 0.637252	valid_0's gini: 0.274504
[1100]	valid_0's auc: 0.637248	valid_0's gini: 0.274496
Early stopping, best iteration is:
[1003]	valid_0's auc: 0.637299	valid_0's gini: 0.274598
 lgb kfold: 2  of  5 : 
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's auc: 0.611369	valid_0's gini: 0.222739
[200]	valid_0's auc: 0.614874	valid_0's gini: 0.229747
[300]	valid_0's auc

1871

In [119]:
# Put the xgb and lgb 'target' predictions side by side (ids are not kept).
meta_train_4 = pd.concat((sub_train['target'], sub_train_lgb['target']), axis=1)
meta_test_4 = pd.concat((sub['target'], sub_lgb['target']), axis=1)
gc.collect()

5403

In [120]:
# Name the two prediction columns after the model that produced them.
meta_train_4.columns= ['xgb2','lgb2']
meta_test_4.columns = ['xgb2','lgb2']

In [122]:
# Persist the xgb2 / lgb2 predictions (the index is not written).
for frame, path in ((meta_train_4, 'save_meta/meta_train_4.csv'),
                    (meta_test_4, 'save_meta/meta_test_4.csv')):
    frame.to_csv(path, index=False)

## Stacking level 2

In [8]:
# Reload the raw files for the level-2 models.
train = pd.read_csv("data/train.csv")
test = pd.read_csv("data/test.csv")

# Labels and test ids are taken before the columns are dropped. (The file
# used to be read a second time just to rebuild the same y_train.)
y_train = train['target'].values
id_test = test['id'].values

train.drop(['id', 'target'], axis=1, inplace=True)
test.drop(['id'], axis=1, inplace=True)

# Column groups, selected by substring of the column name.
CAT = [col for col in train.columns if 'cat' in col]
BIN = [col for col in train.columns if 'bin' in col]

# NOTE(review): preprocess_data is not defined in any cell visible in this
# notebook; it must be defined or imported before this cell can run.
X_train, X_test = preprocess_data(train, test, BIN)

In [11]:
# One-hot encode the categorical columns on train+test stacked together so
# both end up with the same dummy columns.
data = pd.concat((X_train, X_test), axis=0, ignore_index=True)
for col in CAT:
    dummies = pd.get_dummies(data[col], prefix=col)
    data = pd.concat((data, dummies), axis=1)
    # The raw categorical column is replaced by its dummies.
    data.drop(col, axis=1, inplace=True)

In [12]:
# First X_train.shape[0] rows of the stacked frame. `.ix` no longer exists in
# pandas; on the 0..n-1 index of `data` the old label slice (inclusive end)
# selects exactly these positions.
X_train = data.iloc[:X_train.shape[0], :]

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  """Entry point for launching an IPython kernel.


In [13]:
# Remaining rows of the stacked frame (positional; `.ix` no longer exists in
# pandas). The rows keep the index labels they have in `data`.
X_test = data.iloc[X_train.shape[0]:, :]

In [14]:
# Squared ps_car_15 as an extra column on both frames.
for frame in (X_train, X_test):
    frame['ps_car_15_carre'] = frame['ps_car_15'] ** 2

In [15]:
# Columns removed for level 2: four binary flags, and the raw ps_car_15 now
# that its square is available as ps_car_15_carre.
level2_drop_columns = ['ps_ind_10_bin', 'ps_ind_11_bin', 'ps_ind_12_bin',
                       'ps_ind_13_bin', 'ps_car_15']
X_train = X_train.drop(level2_drop_columns, axis=1)
X_test = X_test.drop(level2_drop_columns, axis=1)

In [20]:
# Append the level-1 predictions as extra feature columns.
# NOTE(review): meta_train_level1 is not defined in any visible cell —
# presumably the concatenation of the saved meta_train_* frames; confirm.
X_train_2 = pd.concat([X_train, meta_train_level1], axis=1)

In [21]:
# Append the level-1 test predictions; ignore_index=True here replaces the
# column labels with 0..k-1 (axis=1), unlike the train counterpart.
# NOTE(review): meta_test_level1 is not defined in any visible cell. concat
# aligns rows on the index and X_test keeps the labels it had inside `data`,
# so verify both frames share the same index or rows will be misaligned.
X_test_2 = pd.concat([X_test, meta_test_level1], axis = 1, ignore_index= True)

In [26]:
# float32 numpy copies of the level-2 feature frames, as passed to stacking().
X_train_n = np.array(X_train_2, dtype=np.float32)
X_test_n = np.array(X_test_2, dtype=np.float32)

In [28]:
# Same fold definition (5 folds, shuffled, random_state=115) as in level 1.
skf = list(StratifiedKFold(y_train, n_folds=5, shuffle=True, random_state=115))

In [None]:
# Level-2 models, trained on the raw features plus the level-1 predictions.
clfs = [
    ExtraTreesClassifier(n_estimators=1000, max_features=25, criterion='gini',
                         min_samples_split=2, max_depth=36, min_samples_leaf=2,
                         n_jobs=-1, random_state=888),
    XGBClassifier(n_estimators=600, learning_rate=0.03, max_depth=10,
                  colsample_bytree=0.4, min_child_weight=1, seed=129),
    CatBoostClassifier(iterations=800, learning_rate=0.01),
    LGBMClassifier(max_depth=10, learning_rate=0.03, n_estimators=500,
                   subsample_for_bin=500000, min_child_weight=5,
                   min_child_samples=10, subsample=1.0, subsample_freq=1,
                   colsample_bytree=1.0, random_state=130, n_jobs=-1),
]

meta_train, meta_test = stacking(X_train_n, X_test_n, y_train, skf, clfs)

main_columns = ['main_et', 'main_xgb', 'main_catboost', 'main_lgb']
meta_train_0 = pd.DataFrame(meta_train, columns=main_columns, index=X_train.index)
meta_test_0 = pd.DataFrame(meta_test, columns=main_columns, index=X_test.index)

Clf 1
Fold 1
Fold 2
Fold 3
Fold 4
Fold 5
Clf 2
Fold 1
Fold 2
Fold 3


In [None]:
# Persist the level-2 predictions (the index is not written).
for frame, path in ((meta_train_0, 'save_meta/meta_train_0.csv'),
                    (meta_test_0, 'save_meta/meta_test_0.csv')):
    frame.to_csv(path, index=False)

In [6]:
# Reload the level-2 predictions written by the previous cell.
meta_train_0 = pd.read_csv('save_meta/meta_train_0.csv')
meta_test_0 = pd.read_csv('save_meta/meta_test_0.csv')

##  Final model

In [14]:
# Reload the raw files for the final model.
train = pd.read_csv("data/train.csv")
test = pd.read_csv("data/test.csv")

# Labels and test ids are taken before the columns are dropped.
y_train = train.target.values
id_test = test.id.values

train.drop(['id', 'target'], axis=1, inplace=True)
test.drop(['id'], axis=1, inplace=True)

# Column groups, selected by substring of the column name.
CAT = [col for col in train.columns if 'cat' in col]
BIN = [col for col in train.columns if 'bin' in col]

#X_train, X_test = preprocess_data(train, test, BIN)

In [15]:
# Plain aliases (no copy): columns added to X_train / X_test in the next
# cells are added to `train` / `test` as well.
X_train = train
X_test = test

In [16]:
def recon(reg):
    """Split the integer round((40 * reg) ** 2) into a pair (F, M).

    F is the remainder of that integer modulo 27, with a zero remainder
    mapped to 27 (so F lies in 1..27), and M = (integer - F) // 27.
    """
    integer = int(np.round((40 * reg) ** 2))  # gives 2060 for our example
    remainder = integer % 27
    # A multiple of 27 is attributed to F = 27 rather than F = 0.
    F = remainder if remainder else 27
    M = (integer - F) // 27
    return F, M

In [17]:
# Extra features for the final model, built identically on both frames.
# Disabled experiments, kept for reference:
#   ps_ind_sum_bin      = <frame>[BIN].sum(axis = 1)
#   ps_reg_mult         = ps_reg_01 * ps_reg_02 * ps_reg_03
#   ps_car_13_ps_reg_03 = ps_car_13 * ps_reg_03
ind_16_18 = ['ps_ind_16_bin', 'ps_ind_17_bin', 'ps_ind_18_bin']
for frame in (X_train, X_test):
    frame['ps_car_15_carre'] = frame['ps_car_15'] ** 2
    # 1 when none of the three ps_ind_16/17/18 flags is set, else 0.
    flag_total = frame[ind_16_18].sum(axis=1)
    frame['ps_ind_19_bin'] = flag_total.apply(lambda x: 1 if x == 0 else 0)
    # Both components returned by recon() for ps_reg_03.
    f_and_m = frame['ps_reg_03'].apply(recon)
    frame['ps_reg_F'] = f_and_m.apply(lambda pair: pair[0])
    frame['ps_reg_M'] = f_and_m.apply(lambda pair: pair[1])

drop_columns = [
    'ps_calc_10', 'ps_calc_01', 'ps_calc_04', 'ps_car_02_cat', 'ps_calc_14',
    'ps_calc_08', 'ps_calc_17_bin', 'ps_car_10_cat', 'ps_ind_11_bin',
    'ps_calc_12', 'ps_calc_09', 'ps_car_06_cat', 'ps_calc_05',
    'ps_calc_16_bin', 'ps_calc_20_bin', 'ps_calc_18_bin',
]

X_train = X_train.drop(drop_columns, axis=1)
X_test = X_test.drop(drop_columns, axis=1)

In [18]:
# Reload every saved meta-feature file; index_col=False keeps the first
# column as data instead of using it as the index.
meta_train_1, meta_train_2, meta_train_3, meta_train_4 = (
    pd.read_csv(path, index_col=False)
    for path in ('save_meta/meta_train_1.csv',
                 'save_meta/meta_train_2.csv',
                 'save_meta/meta_train_3.csv',
                 'save_meta/meta_train_4.csv')
)

meta_test_1, meta_test_2, meta_test_3, meta_test_4 = (
    pd.read_csv(path, index_col=False)
    for path in ('save_meta/meta_test_1.csv',
                 'save_meta/meta_test_2.csv',
                 'save_meta/meta_test_3.csv',
                 'save_meta/meta_test_4.csv')
)

In [19]:
# Final feature matrix: engineered features plus the meta-features of sets 1-3.
# NOTE(review): meta_train_4 / meta_test_4 are loaded above but not included
# here — confirm that leaving them out is intentional.
X_train_3 = pd.concat((X_train, meta_train_1, meta_train_2, meta_train_3), axis=1)
X_test_3 = pd.concat((X_test, meta_test_1, meta_test_2, meta_test_3), axis=1)

In [20]:
# NOTE(review): `cv` is not referenced by any later cell visible in this
# notebook (the final training uses StratifiedShuffleSplit) — possibly unused.
cv = StratifiedKFold(y_train, n_folds=5, shuffle=True, random_state=115)

In [21]:
# Define the gini metric - from https://www.kaggle.com/c/ClaimPredictionChallenge/discussion/703#5897
def gini(actual, pred, cmpcol = 0, sortcol = 1):
    """Return the (unnormalised) Gini coefficient of `pred` against `actual`.

    Rows are sorted by decreasing prediction, ties broken by original
    position; the score is derived from the cumulative sum of `actual` in
    that order. `cmpcol` and `sortcol` are accepted for compatibility with
    the referenced implementation but are not used.
    """
    assert( len(actual) == len(pred) )
    # np.float was removed from NumPy; np.float64 is the same dtype. The
    # array is no longer called `all`, which shadowed the builtin.
    rows = np.asarray(np.c_[ actual, pred, np.arange(len(actual)) ], dtype=np.float64)
    # lexsort: last key is primary -> descending prediction, then ascending position.
    rows = rows[ np.lexsort((rows[:,2], -1*rows[:,1])) ]
    totalLosses = rows[:,0].sum()
    giniSum = rows[:,0].cumsum().sum() / totalLosses
    
    giniSum -= (len(actual) + 1) / 2.
    return giniSum / len(actual)
 
def gini_normalized(a, p):
    """Gini of predictions `p`, divided by the Gini of a perfect ordering of `a`."""
    return gini(a, p) / gini(a, a)

# Create an XGBoost-compatible metric from Gini

def gini_xgb(preds, dtrain):
    """xgboost `feval`: returns ('gini', normalised score); `dtrain` exposes get_label()."""
    labels = dtrain.get_label()
    gini_score = gini_normalized(labels, preds)
    return 'gini', gini_score
    
# We drop these variables as we don't want to train on them
# The other 57 columns are all numerical and can be trained on without preprocessing

In [22]:
from sklearn.model_selection import StratifiedShuffleSplit
y = y_train

X = X_train_3
x_test = X_test_3.values
xgbscores = []

# Create a submission file
sub = pd.DataFrame()
sub['id'] = id_test
# Float accumulator: np.zeros_like(id_test) would inherit the ids' dtype,
# while averaged probabilities are added to this column below.
sub['target'] = np.zeros(len(id_test))

# Set xgb parameters

params = {}
params['objective'] = 'binary:logistic'
params['eta'] = 0.03
params['gamma'] = 9
params['silent'] = True
params['max_depth'] = 6
params['subsample'] = 0.9
params['colsample_bytree'] = 0.85
params['colsample_bylevel'] = 0.9
params['tree_method'] = 'exact'

# For each split, hold out a stratified random 25% of the rows as validation data

kfold = 5
sss = StratifiedShuffleSplit(n_splits=kfold, test_size=0.25, random_state=0)
# The test matrix is identical for every split: convert it once.
d_test = xgb.DMatrix(x_test)
for i, (train_index, test_index) in enumerate(sss.split(X, y)):
    print('[Fold %d/%d]' % (i + 1, kfold))
    X_train, X_valid = X.values[train_index], X.values[test_index]
    y_train, y_valid = y[train_index], y[test_index]
    # Convert our data into XGBoost format
    d_train = xgb.DMatrix(X_train, y_train)
    d_valid = xgb.DMatrix(X_valid, y_valid)
    watchlist = [(d_train, 'train'), (d_valid, 'valid')]

    # Train the model! We pass in a max of 2,000 rounds (with early stopping after 100)
    # and the custom metric (maximize=True tells xgb that higher metric is better)
    mdl = xgb.train(params, d_train, 2000, watchlist, early_stopping_rounds=100, feval=gini_xgb, maximize=True, verbose_eval=100)

    print('[Fold %d/%d Prediciton:]' % (i + 1, kfold))
    # Predict on our test data; each split contributes 1/kfold of the final value
    p_test = mdl.predict(d_test)
    sub['target'] += p_test/kfold

# Write the submission file
sub.to_csv('stacking4.csv', index=False)

[Fold 1/5]
[0]	train-error:0.036437	valid-error:0.036444	train-gini:0.278544	valid-gini:0.269835
Multiple eval metrics have been passed: 'valid-gini' will be used for early stopping.

Will train until valid-gini hasn't improved in 100 rounds.
[100]	train-error:0.036446	valid-error:0.036444	train-gini:0.299351	valid-gini:0.283691
Stopping. Best iteration:
[52]	train-error:0.036433	valid-error:0.036437	train-gini:0.294394	valid-gini:0.283761

[Fold 1/5 Prediciton:]
[Fold 2/5]
[0]	train-error:0.03644	valid-error:0.036451	train-gini:0.27671	valid-gini:0.272083
Multiple eval metrics have been passed: 'valid-gini' will be used for early stopping.

Will train until valid-gini hasn't improved in 100 rounds.
[100]	train-error:0.036449	valid-error:0.036444	train-gini:0.298639	valid-gini:0.288787
[200]	train-error:0.036442	valid-error:0.036458	train-gini:0.326282	valid-gini:0.288337
Stopping. Best iteration:
[116]	train-error:0.036449	valid-error:0.036444	train-gini:0.30217	valid-gini:0.289168

[