# Allstate Stacking Baseline

## Imports. We will use the 'vecstack' package, which implements stacking routines

In [1]:
from __future__ import division
import gc
import numpy as np
np.set_printoptions(suppress = True)
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from xgboost import XGBRegressor
from vecstack import stacking



## Load data

In [2]:
# All three competition files live in the same directory and are read with
# the same CSV options; only the training file additionally disables
# low-memory chunked parsing.
dir_path = '../input/'
csv_opts = dict(sep = ',', header = 0, nrows = None)

train_df = pd.read_csv(dir_path + 'train.csv', low_memory = False, **csv_opts)
test_df = pd.read_csv(dir_path + 'test.csv', **csv_opts)
subm_df = pd.read_csv(dir_path + 'sample_submission.csv', **csv_opts)

## Prepare column lists

In [3]:
id_col = 'id'
y_col = 'loss'

# Feature columns: everything in the training frame except the row id and
# the target, kept in file order. list.remove raises if either is missing.
all_cols = train_df.columns.tolist()
for non_feature in (id_col, y_col):
    all_cols.remove(non_feature)

# Columns pandas parsed as 'object' dtype are treated as categoricals.
col_dtypes = train_df.dtypes
obj_cols = col_dtypes[col_dtypes == 'object'].index.tolist()

# Remaining feature columns are numeric. They are ordered by their name
# left-padded with zeros to six characters.
num_cols = sorted(set(all_cols) - set(obj_cols),
                  key = lambda name: '{0:0>6}'.format(name))

## Factorize

In [4]:
# Encode every categorical column as integer codes.
#
# Train and test are stacked first so that a given category value receives
# the same code in both sets; the combined frame is then split back apart at
# the original train length.
r = train_df.shape[0]  # number of training rows, used as the split point

# The test set has no target; add a placeholder so both frames share the
# same columns. assign() returns a new frame, so the loaded test_df is not
# modified before it is rebound below.
z_df = pd.concat([train_df, test_df.assign(**{y_col: 0})], ignore_index = True)

for col in obj_cols:
    # Replace the whole column rather than writing through .loc[:, col]:
    # on pandas >= 2.0 the .loc form sets values in place and keeps the
    # column's object dtype, which would make the feature matrix extracted
    # later an object array instead of a numeric one.
    # sort = True makes the codes follow the sorted order of the categories.
    z_df[col] = pd.factorize(z_df[col], sort = True)[0]

# ignore_index = True gave z_df a fresh 0..n-1 index, so a positional split
# at r recovers exactly the train rows followed by the test rows.
train_df = z_df.iloc[:r]
test_df = z_df.iloc[r:]

## Get numpy arrays

In [5]:
# Train and test feature matrices are built from the same column list, so
# their columns line up one-to-one.
X_train, X_test = (frame[all_cols].values for frame in (train_df, test_df))

# Target vector on its original (untransformed) scale.
y_train = train_df[y_col].values

## Free RAM

In [6]:
# The modelling cells below work only on the extracted numpy arrays, so the
# intermediate DataFrames can be released.
del train_df, test_df, z_df

# Binding the return value keeps the collector's count out of the cell output.
_ = gc.collect()

## Initialize 1st-level models

In [7]:
# First-level learners. Each one is given a fixed seed / random_state of 0.
xgb_l1 = XGBRegressor(
    seed = 0,
    colsample_bytree = 0.7,
    subsample = 0.7,
    learning_rate = 0.075,
    max_depth = 7,
    min_child_weight = 1,
    n_estimators = 400)

et_l1 = ExtraTreesRegressor(
    random_state = 0,
    n_jobs = 4,
    n_estimators = 100,
    max_features = 0.5,
    max_depth = 12,
    min_samples_leaf = 2)

rf_l1 = RandomForestRegressor(
    random_state = 0,
    n_jobs = 4,
    n_estimators = 100,
    max_features = 0.2,
    max_depth = 8,
    min_samples_leaf = 2)

# Order matters: the stacking log reports models by their position here.
models = [xgb_l1, et_l1, rf_l1]

## Get stacking features

In [8]:
# Run every first-level model through 3-fold, shuffled stacking to obtain
# the stacking features for the train and test sets.
# The log / exp pair is handed to vecstack as the target and prediction
# transforms; the matching second-level fit also works on np.log(y_train).
S_train, S_test = stacking(
    models, X_train, y_train, X_test,
    n_folds = 3,
    shuffle = True,
    transform_target = np.log,
    transform_pred = np.exp,
    verbose = 2)

task:   [regression]
metric: [mean_absolute_error]

model 0: [XGBRegressor]
    fold 0: [1149.39250935]
    fold 1: [1156.79667203]
    fold 2: [1141.18669243]
    ----
    MEAN:   [1149.12533343]

model 1: [ExtraTreesRegressor]
    fold 0: [1236.34859807]
    fold 1: [1247.20592610]
    fold 2: [1231.92712626]
    ----
    MEAN:   [1238.49391835]

model 2: [RandomForestRegressor]
    fold 0: [1286.40670066]
    fold 1: [1298.92017554]
    fold 2: [1282.92941276]
    ----
    MEAN:   [1289.41879744]



## Initialize and apply 2nd-level model

In [9]:
# Second-level learner: fitted on the stacking features of the training set
# against the log of the target.
model = XGBRegressor(seed = 0, colsample_bytree = 0.8, subsample = 0.6, 
                     learning_rate = 0.01, max_depth = 4, min_child_weight = 1, 
                     n_estimators = 1028)
model = model.fit(S_train, np.log(y_train))

# The model was fitted on log(y), so exponentiate to return to the
# original target scale.
final_pred = np.exp(model.predict(S_test))

# Overwrite the second column of the submission template (selected by
# position, as before). The column is replaced wholesale instead of written
# through .iloc[:, 1]: an in-place positional write keeps the template
# column's dtype, and if that dtype is integer (NOTE(review): template dtype
# is not visible here - confirm) recent pandas versions warn about, and
# future versions reject, storing float predictions into it.
subm_df[subm_df.columns[1]] = final_pred

# NOTE(review): the file is written next to the input data (dir_path);
# confirm that directory is writable in the target environment.
subm_df.to_csv(dir_path + 'submission.csv', sep = ',', index = False)