# Approach 1

## import module

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedShuffleSplit, StratifiedKFold
from sklearn.preprocessing import MinMaxScaler, StandardScaler
import lightgbm as lgb
import xgboost as xgb
from sklearn.model_selection import cross_val_score, train_test_split

In [2]:
from sklearn.pipeline import make_pipeline, make_union
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder

In [3]:
from dodoml.pipeline import (
    ColumnsSelector, UniqueCountColumnSelector, TolerantLabelEncoder, FillNaN,
    ColumnApplier, OrdinalEncoder, CountFrequencyEncoder, Logify, BoxCoxTransformer,
    YToLog)
from dodoml import compute_features_impact, compute_partial_dependence, lift_curve
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.model_selection import train_test_split

from sklearn.metrics import auc, roc_curve, roc_auc_score
from dodoml import compute_ace
import sys
sys.path.append("../../src")

## Create functions

### gini function

In [4]:
# Define the gini metric - from https://www.kaggle.com/c/ClaimPredictionChallenge/discussion/703#5897
def gini(actual, pred, cmpcol = 0, sortcol = 1):
    assert( len(actual) == len(pred) )
    all = np.asarray(np.c_[ actual, pred, np.arange(len(actual)) ], dtype=np.float)
    all = all[ np.lexsort((all[:,2], -1*all[:,1])) ]
    totalLosses = all[:,0].sum()
    giniSum = all[:,0].cumsum().sum() / totalLosses
    
    giniSum -= (len(actual) + 1) / 2.
    return giniSum / len(actual)
 
def gini_normalized(a, p):
    return gini(a, p) / gini(a, a)

# Create an XGBoost-compatible metric from Gini

def gini_xgb(preds, dtrain):
    labels = dtrain.get_label()
    gini_score = gini_normalized(labels, preds)
    return 'gini', gini_score
    
# We drop these variables as we don't want to train on them
# The other 57 columns are all numerical and can be trained on without preprocessing

In [5]:
def check_dimension(train, test):
    '''
    check if number of columns in train set == number of columns in test set
    '''
    if (train.shape[1] == test.shape[1]):
        return True
    else:
        print ('shape of train:', train.shape)
        print ('shape of test:', test.shape)
        return False
        

### funtions adding features

## import data

In [9]:
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')

print('Train shape:', train.shape)
print('Test shape:', test.shape)


Train shape: (595212, 59)
Test shape: (892816, 58)


## preprocess data

In [10]:
id_train = train['id'].values
y = train.target.values
id_test = test['id'].values

#train['ps_ind_sum_bin'] = train[BIN].sum(axis = 1 )
#train['ps_reg_mult'] = train.ps_reg_01 * train.ps_reg_02 * train.ps_reg_03
#train['ps_car_13_ps_reg_03']= train.ps_car_13* train.ps_reg_03
train['ps_car_15'] = (train.ps_car_15)**2
train['ps_car_14'] = (train.ps_car_14)**2
train['ps_car_12'] = round((train.ps_car_12)**2,4) * 10000
train['ps_car_13'] = (train.ps_car_13)**2 * 48400
train['ps_reg_03'] = (4*train.ps_reg_03)**2

#test['ps_ind_sum_bin'] = test[BIN].sum(axis = 1 )
#test['ps_reg_mult'] = test.ps_reg_01 * test.ps_reg_02 * test.ps_reg_03
#test['ps_car_13_ps_reg_03']= test.ps_car_13* test.ps_reg_03
test['ps_car_15'] = (test.ps_car_15)**2
test['ps_car_14'] = (test.ps_car_14)**2
test['ps_car_12'] = round((test.ps_car_12)**2,4) * 10000
test['ps_car_13'] = (test.ps_car_13)**2 * 48400
test['ps_reg_03'] = (4*test.ps_reg_03)**2

In [8]:
drop_columns = ['id', 'ps_calc_10', 'ps_calc_01', 'ps_calc_04', 'ps_car_02_cat', 'ps_calc_14',
               'ps_calc_08', 'ps_calc_17_bin', 'ps_car_10_cat', 'ps_ind_11_bin', 'ps_calc_12',
               'ps_calc_09', 'ps_car_06_cat', 'ps_calc_05','ps_calc_16_bin', 'ps_calc_20_bin',
                'ps_calc_18_bin']

train.drop(drop_columns, axis = 1, inplace = True)
train.drop('target', axis = 1, inplace = True)
test.drop(drop_columns, axis = 1, inplace = True)

### OHE with get_numpies

In [9]:
CAT = []
for col in train.columns:
    if 'cat' in col:
        CAT.append(col)
        
BIN = []
for col in train.columns:
    if 'bin' in col:
        BIN.append(col)
        
CALC = []
for col in train.columns:
    if 'calc' in col:
        CALC.append(col)

In [10]:
data = pd.concat((train, test), axis=0, ignore_index=True)
for col in CAT:
    data = pd.concat((data, pd.get_dummies(data[col], prefix=col)), axis=1)
    data.drop(col, axis=1, inplace=True)

In [11]:
train = data.iloc[:train.shape[0],:]
test = data.iloc[train.shape[0]:,:]

In [12]:
test.index = range(len(test))

## Model

In [13]:
check_dimension(train, test)

True

In [14]:
train.shape

(595212, 190)

In [15]:
# Create a submission file
sub = pd.DataFrame()
sub['id'] = id_test
sub['target'] = np.zeros_like(id_test)

In [None]:
X = train.values
x_test = test.values
xgbscores = []

# Set xgb parameters

params = {'eta': 0.02, 'max_depth': 5, 'subsample': 0.9, 'colsample_bytree':
      0.9, 'objective': 'binary:logistic', 'eval_metric': 'auc', 'silent':
      True, 'tree_method' : 'exact'}

kfold = 5
sss = StratifiedKFold(n_splits=kfold, random_state=0)
for i, (train_index, test_index) in enumerate(sss.split(X, y)):
    print('------------------Beginning Fold %d/%d ------------------' % (i + 1, kfold))
    X_train, X_valid = X[train_index], X[test_index]
    y_train, y_valid = y[train_index], y[test_index]
    d_train = xgb.DMatrix(X_train, y_train)
    d_valid = xgb.DMatrix(X_valid, y_valid)
    d_test = xgb.DMatrix(x_test)
    watchlist = [(d_train, 'train'), (d_valid, 'valid')]

    mdl = xgb.train(params, d_train, 2000, watchlist, early_stopping_rounds=100, 
                    feval=gini_xgb, maximize=True, verbose_eval=100)

    print('------------------End Fold %d/%d-------------------' % (i + 1, kfold))
    # Predict on our test data
    p_test = mdl.predict(d_test)
    sub['target'] += p_test/kfold

sub.to_csv('xgb3.csv', index=False)

------------------Beginning Fold 1/5 ------------------
[0]	train-auc:0.607515	valid-auc:0.604684	train-gini:0.214695	valid-gini:0.209841
Multiple eval metrics have been passed: 'valid-gini' will be used for early stopping.

Will train until valid-gini hasn't improved in 100 rounds.
[100]	train-auc:0.631212	valid-auc:0.623651	train-gini:0.262424	valid-gini:0.247304
[200]	train-auc:0.648701	valid-auc:0.631466	train-gini:0.297402	valid-gini:0.262931
[300]	train-auc:0.662035	valid-auc:0.636979	train-gini:0.324071	valid-gini:0.273959
[400]	train-auc:0.672204	valid-auc:0.639461	train-gini:0.344408	valid-gini:0.278923
[500]	train-auc:0.680473	valid-auc:0.640381	train-gini:0.360945	valid-gini:0.280762
[600]	train-auc:0.687634	valid-auc:0.640669	train-gini:0.375269	valid-gini:0.281338
[700]	train-auc:0.693963	valid-auc:0.641112	train-gini:0.387926	valid-gini:0.282223
Stopping. Best iteration:
[685]	train-auc:0.693056	valid-auc:0.641116	train-gini:0.386113	valid-gini:0.282233

-----------------

# Approach 2

## import module

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from xgboost import XGBClassifier
from sklearn.model_selection import StratifiedKFold
import gc
#from numba import jit
from sklearn.preprocessing import LabelEncoder
import time 

## Define functions

### Gini Function

In [2]:
"""
This simple scripts demonstrates the use of xgboost eval results to get the best round
for the current fold and accross folds. 
It also shows an upsampling method that limits cross-validation overfitting.
"""

def eval_gini(y_true, y_prob):
    """
    Original author CPMP : https://www.kaggle.com/cpmpml
    In kernel : https://www.kaggle.com/cpmpml/extremely-fast-gini-computation
    """
    y_true = np.asarray(y_true)
    y_true = y_true[np.argsort(y_prob)]
    ntrue = 0
    gini = 0
    delta = 0
    n = len(y_true)
    for i in range(n-1, -1, -1):
        y_i = y_true[i]
        ntrue += y_i
        gini += y_i * delta
        delta += 1 - y_i
    gini = 1 - 2 * gini / (ntrue * (n - ntrue))
    return gini

def gini_xgb(preds, dtrain):
    labels = dtrain.get_label()
    gini_score = eval_gini(labels, preds)
    return [('gini', gini_score)]

### target encoding

In [3]:
def add_noise(series, noise_level):
    return series * (1 + noise_level * np.random.randn(len(series)))


def target_encode(trn_series=None,
                  tst_series=None,
                  target=None,
                  min_samples_leaf=1,
                  smoothing=1,
                  noise_level=0):
    """
    Smoothing is computed like in the following paper by Daniele Micci-Barreca
    https://kaggle2.blob.core.windows.net/forum-message-attachments/225952/7441/high%20cardinality%20categoricals.pdf
    trn_series : training categorical feature as a pd.Series
    tst_series : test categorical feature as a pd.Series
    target : target data as a pd.Series
    min_samples_leaf (int) : minimum samples to take category average into account
    smoothing (int) : smoothing effect to balance categorical average vs prior
    """
    assert len(trn_series) == len(target)
    assert trn_series.name == tst_series.name
    temp = pd.concat([trn_series, target], axis=1)
    # Compute target mean
    averages = temp.groupby(by= trn_series.name)[target.name].agg(["mean", "count"])
    # Compute smoothing
    smoothing = 1 / (1 + np.exp(-(averages["count"] - min_samples_leaf) / smoothing))
    # Apply average function to all target data
    prior = target.mean()
    # The bigger the count the less full_avg is taken into account
    averages[target.name] = prior * (1 - smoothing) + averages["mean"] * smoothing
    averages.drop(["mean", "count"], axis=1, inplace=True)
    # Apply averages to trn and tst series
    ft_trn_series = pd.merge(
        trn_series.to_frame(trn_series.name),
        averages.reset_index().rename(columns={'index': target.name, target.name: 'average'}),
        on=trn_series.name,
        how='left')['average'].rename(trn_series.name + '_mean').fillna(prior)
    # pd.merge does not keep the index so restore it
    ft_trn_series.index = trn_series.index
    ft_tst_series = pd.merge(
        tst_series.to_frame(tst_series.name),
        averages.reset_index().rename(columns={'index': target.name, target.name: 'average'}),
        on=tst_series.name,
        how='left')['average'].rename(trn_series.name + '_mean').fillna(prior)
    # pd.merge does not keep the index so restore it
    ft_tst_series.index = tst_series.index
    return add_noise(ft_trn_series, noise_level), add_noise(ft_tst_series, noise_level)

In [5]:
gc.enable()

trn_df = pd.read_csv("data/train.csv", index_col=0)
sub_df = pd.read_csv("data/test.csv", index_col=0)

target = trn_df["target"]
trn_df.drop("target", axis =1, inplace = True)

train_features = [
    "ps_car_13",  #            : 1571.65 / shadow  609.23
     "ps_reg_03",  #            : 1408.42 / shadow  511.15
     "ps_ind_05_cat",  #        : 1387.87 / shadow   84.72
     "ps_ind_03",  #            : 1219.47 / shadow  230.55
     "ps_ind_15",  #            :  922.18 / shadow  242.00
     "ps_reg_02",  #            :  920.65 / shadow  267.50
     "ps_car_14",  #            :  798.48 / shadow  549.58
     "ps_car_12",  #            :  731.93 / shadow  293.62
     "ps_car_01_cat",  #        :  698.07 / shadow  178.72
     "ps_car_07_cat",  #        :  694.53 / shadow   36.35
     "ps_ind_17_bin",  #        :  620.77 / shadow   23.15
     "ps_car_03_cat",  #        :  611.73 / shadow   50.67
     "ps_reg_01",  #            :  598.60 / shadow  178.57
     "ps_car_15",  #            :  593.35 / shadow  226.43
     "ps_ind_01",  #            :  547.32 / shadow  154.58
     "ps_ind_16_bin",  #        :  475.37 / shadow   34.17
     "ps_ind_07_bin",  #        :  435.28 / shadow   28.92
	"ps_car_06_cat",  #        :  398.02 / shadow  212.43
	"ps_car_04_cat",  #        :  376.87 / shadow   76.98
	"ps_ind_06_bin",  #        :  370.97 / shadow   36.13
	"ps_car_09_cat",  #        :  214.12 / shadow   81.38
	"ps_car_02_cat",  #        :  203.03 / shadow   26.67
	"ps_ind_02_cat",  #        :  189.47 / shadow   65.68
	"ps_car_11",  #            :  173.28 / shadow   76.45
	"ps_car_05_cat",  #        :  172.75 / shadow   62.92
	"ps_calc_09",  #           :  169.13 / shadow  129.72
	"ps_calc_05",  #           :  148.83 / shadow  120.68
	"ps_ind_08_bin",  #        :  140.73 / shadow   27.63
	"ps_car_08_cat",  #        :  120.87 / shadow   28.82
	"ps_ind_09_bin",  #        :  113.92 / shadow   27.05
	"ps_ind_04_cat",  #        :  107.27 / shadow   37.43
	"ps_ind_18_bin",  #        :   77.42 / shadow   25.97
	"ps_ind_12_bin",  #        :   39.67 / shadow   15.52
	"ps_ind_14",  #            :   37.37 / shadow   16.65
]

# add combinations
combs = [
    ('ps_reg_01', 'ps_car_02_cat'),  
    ('ps_reg_01', 'ps_car_04_cat'),
]
start = time.time()
for n_c, (f1, f2) in enumerate(combs):
    name1 = f1 + "_plus_" + f2
    print('current feature %60s %4d in %5.1f'
          % (name1, n_c + 1, (time.time() - start) / 60), end='')
    print('\r' * 75, end='')
    trn_df[name1] = trn_df[f1].apply(lambda x: str(x)) + "_" + trn_df[f2].apply(lambda x: str(x))
    sub_df[name1] = sub_df[f1].apply(lambda x: str(x)) + "_" + sub_df[f2].apply(lambda x: str(x))
    # Label Encode
    lbl = LabelEncoder()
    lbl.fit(list(trn_df[name1].values) + list(sub_df[name1].values))
    trn_df[name1] = lbl.transform(list(trn_df[name1].values))
    sub_df[name1] = lbl.transform(list(sub_df[name1].values))

    train_features.append(name1)

trn_df = trn_df[train_features]
sub_df = sub_df[train_features]

f_cats = [f for f in trn_df.columns if "_cat" in f]

for f in f_cats:
    trn_df[f + "_avg"], sub_df[f + "_avg"] = target_encode(trn_series=trn_df[f],
                                         tst_series=sub_df[f],
                                         target=target,
                                         min_samples_leaf=200,
                                         smoothing=10,
                                         noise_level=0)

current feature                                 ps_reg_01_plus_ps_car_04_cat    2 in   0.0

In [None]:
n_splits = 5
n_estimators = 200
folds = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=15) 
imp_df = np.zeros((len(trn_df.columns), n_splits))
xgb_evals = np.zeros((n_estimators, n_splits))
oof = np.empty(len(trn_df))
sub_preds = np.zeros(len(sub_df))
increase = True
np.random.seed(0)

for fold_, (trn_idx, val_idx) in enumerate(folds.split(target, target)):
    trn_dat, trn_tgt = trn_df.iloc[trn_idx], target.iloc[trn_idx]
    val_dat, val_tgt = trn_df.iloc[val_idx], target.iloc[val_idx]

    clf = XGBClassifier(n_estimators=n_estimators,
                        max_depth=4,
                        objective="binary:logistic",
                        learning_rate=.1, 
                        subsample=.8, 
                        colsample_bytree=.8,
                        gamma=1,
                        reg_alpha=0,
                        reg_lambda=1,
                        nthread=2)
    # Upsample during cross validation to avoid having the same samples
    # in both train and validation sets
    # Validation set is not up-sampled to monitor overfitting
    if increase:
        # Get positive examples
        pos = pd.Series(trn_tgt == 1)
        # Add positive examples
        trn_dat = pd.concat([trn_dat, trn_dat.loc[pos]], axis=0)
        trn_tgt = pd.concat([trn_tgt, trn_tgt.loc[pos]], axis=0)
        # Shuffle data
        idx = np.arange(len(trn_dat))
        np.random.shuffle(idx)
        trn_dat = trn_dat.iloc[idx]
        trn_tgt = trn_tgt.iloc[idx]
        
    clf.fit(trn_dat, trn_tgt, 
            eval_set=[(trn_dat, trn_tgt), (val_dat, val_tgt)],
            eval_metric=gini_xgb,
            early_stopping_rounds=None,
            verbose=False)
            
    # Keep feature importances
    imp_df[:, fold_] = clf.feature_importances_

    # Find best round for validation set
    xgb_evals[:, fold_] = clf.evals_result_["validation_1"]["gini"]
    # Xgboost provides best round starting from 0 so it has to be incremented
    best_round = np.argsort(xgb_evals[:, fold_])[::-1][0]

    # Predict OOF and submission probas with the best round
    oof[val_idx] = clf.predict_proba(val_dat, ntree_limit=best_round)[:, 1]
    # Update submission
    sub_preds += clf.predict_proba(sub_df, ntree_limit=best_round)[:, 1] / n_splits

    # Display results
    print("Fold %2d : %.6f @%4d / best score is %.6f @%4d"
          % (fold_ + 1,
             eval_gini(val_tgt, oof[val_idx]),
             n_estimators,
             xgb_evals[best_round, fold_],
             best_round))
          
print("Full OOF score : %.6f" % eval_gini(target, oof))

# Compute mean score and std
mean_eval = np.mean(xgb_evals, axis=1)
std_eval = np.std(xgb_evals, axis=1)
best_round = np.argsort(mean_eval)[::-1][0]

print("Best mean score : %.6f + %.6f @%4d"
      % (mean_eval[best_round], std_eval[best_round], best_round))
    
importances = sorted([(trn_df.columns[i], imp) for i, imp in enumerate(imp_df.mean(axis=1))],
                     key=lambda x: x[1])

for f, imp in importances[::-1]:
    print("%-34s : %10.4f" % (f, imp))
    
sub_df["target"] = sub_preds

sub_df[["target"]].to_csv("submission.csv", index=True, float_format="%.9f")


Fold  1 : 0.287436 @ 200 / best score is 0.287500 @ 176


In [None]:
# Create a submission file
import xgboost as xgb
sub = pd.DataFrame()
sub['id'] = id_test
sub['target'] = np.zeros_like(id_test)

X = trn_df.values
x_test = sub_df.values
xgbscores = []

# Set xgb parameters

params = {'eta': 0.02, 'max_depth': 5, 'subsample': 0.9, 'colsample_bytree':
      0.9, 'objective': 'binary:logistic', 'eval_metric': 'auc', 'silent':
      True, 'tree_method' : 'exact'}

kfold = 5
sss = StratifiedKFold(n_splits=kfold, random_state=0)
for i, (train_index, test_index) in enumerate(sss.split(X, y)):
    print('------------------Beginning Fold %d/%d ------------------' % (i + 1, kfold))
    X_train, X_valid = X[train_index], X[test_index]
    y_train, y_valid = y[train_index], y[test_index]
    d_train = xgb.DMatrix(X_train, y_train)
    d_valid = xgb.DMatrix(X_valid, y_valid)
    d_test = xgb.DMatrix(x_test)
    watchlist = [(d_train, 'train'), (d_valid, 'valid')]

    mdl = xgb.train(params, d_train, 2000, watchlist, early_stopping_rounds=100, 
                    feval=gini_xgb, maximize=True, verbose_eval=100)

    print('------------------End Fold %d/%d-------------------' % (i + 1, kfold))
    # Predict on our test data
    p_test = mdl.predict(d_test)
    sub['target'] += p_test/kfold

sub.to_csv('xgb3.csv', index=False)

------------------Beginning Fold 1/5 ------------------
[0]	train-auc:0.609307	valid-auc:0.607336	train-gini:0.218241	valid-gini:0.214305
Multiple eval metrics have been passed: 'valid-gini' will be used for early stopping.

Will train until valid-gini hasn't improved in 100 rounds.
[100]	train-auc:0.635054	valid-auc:0.626248	train-gini:0.270107	valid-gini:0.252498
[200]	train-auc:0.651116	valid-auc:0.634765	train-gini:0.302231	valid-gini:0.269529
[300]	train-auc:0.662976	valid-auc:0.639395	train-gini:0.325951	valid-gini:0.27879
[400]	train-auc:0.672253	valid-auc:0.641786	train-gini:0.344506	valid-gini:0.283572
[500]	train-auc:0.679386	valid-auc:0.642935	train-gini:0.358773	valid-gini:0.285871
[600]	train-auc:0.685309	valid-auc:0.643074	train-gini:0.370619	valid-gini:0.286148
[700]	train-auc:0.690722	valid-auc:0.643379	train-gini:0.381445	valid-gini:0.286758
[800]	train-auc:0.695842	valid-auc:0.64324	train-gini:0.391684	valid-gini:0.28648
Stopping. Best iteration:
[736]	train-auc:0.692