In [1]:
%%javascript
$('<div id="toc"></div>').css({position: 'fixed', top: '120px', left: 0}).appendTo(document.body);
$.getScript('https://kmahelona.github.io/ipython_notebook_goodies/ipython_notebook_toc.js');

<IPython.core.display.Javascript object>

# Libraries

In [105]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split

from mpl_toolkits.basemap import Basemap
import matplotlib.pyplot as plt
import numpy as np

from jupyterthemes import jtplot
jtplot.style()

import xgboost as xg
from xgboost import XGBModel
from xgboost import plot_importance
from sklearn.model_selection import GridSearchCV, TimeSeriesSplit, ShuffleSplit
from sklearn.metrics import mean_absolute_error
from sklearn.feature_selection import RFE

from sklearn.linear_model import LinearRegression, BayesianRidge
from sklearn.metrics import r2_score

from sklearn.preprocessing import LabelEncoder
from sklearn import cross_validation

from catboost import CatBoostRegressor
from tqdm import tqdm

%matplotlib inline
%load_ext autotime
%load_ext line_profiler
%matplotlib inline 

The autotime extension is already loaded. To reload it, use:
  %reload_ext autotime
The line_profiler extension is already loaded. To reload it, use:
  %reload_ext line_profiler
time: 21.9 ms


# Processing

In [3]:
def plot_data(test, pred, sample, title, width=40, height=10, linewidth=0.5, color1='white', color2='orange'):
    """ Plotting method. """
    fig = plt.figure(figsize=(width, height))
    plt.plot(pred[:sample], color=color1, zorder=4, linewidth=linewidth, label='%s Prediction'%(title))
    plt.plot(test[:sample], color=color2, zorder=3, linewidth=linewidth, label='%s True Data'%(title))
    plt.title = title
    plt.legend()

# Frequency count
def get_frequency(data):
    # Gets the frequency of a column's values in 'data'. Pass on a series.
    vals = pd.merge(data.to_frame(), data.value_counts().reset_index(), 
                    how='left', left_on=data.to_frame().columns[0], right_on='index').iloc[:, -1:].values
    return vals
  
def time_data(data):
    data['transactiondate'] = pd.to_datetime(data['transactiondate'])
    data['day_of_week']     = data['transactiondate'].dt.dayofweek
    data['month_of_year']   = data['transactiondate'].dt.month
    data['quarter']         = data['transactiondate'].dt.quarter
    data['is_weekend']      = (data['day_of_week'] < 5).astype(int)
    data.drop('transactiondate', axis=1, inplace=True)
    
    print('Added time data')
    print('........')
    
    return data


def column_excluder(data, missing_perc_thresh=0.98):
    # Quick clean from https://www.kaggle.com/seesee/concise-catboost-starter-ensemble-plb-0-06435
    
    exclude_missing = []
    exclude_unique = []
    num_rows = data.shape[0]
    for c in data.columns:
        num_missing = data[c].isnull().sum()
        if num_missing == 0:
            continue
        missing_frac = num_missing / float(num_rows)
        if missing_frac > missing_perc_thresh:
            exclude_missing.append(c)

        num_uniques = len(data[c].unique())
        if data[c].isnull().sum() != 0:
            num_uniques -= 1
        if num_uniques == 1:
            exclude_unique.append(c)
            
    to_exclude = list(set(exclude_missing + exclude_unique))
    
    print('Excluded columns:')
    print(to_exclude)
    print('........')
    
    return to_exclude

def categorical_features(data):
    # Quick categories from https://www.kaggle.com/seesee/concise-catboost-starter-ensemble-plb-0-06435
        
    cat_feature_inds = []
    cat_unique_thresh = 1000
    for i, c in enumerate(data.columns):
        num_uniques = len(data[c].unique())
        if num_uniques < cat_unique_thresh \
            and not 'sqft'   in c \
            and not 'cnt'    in c \
            and not 'nbr'    in c \
            and not 'number' in c:
            cat_feature_inds.append(i)

    print("Categorical features:")
    print([data.columns[ind] for ind in cat_feature_inds])
    print('........')
    
    return cat_feature_inds


def complex_features(data):
    # Gets counts, label encoding and frequency estimates.
    
    # Frequency of occurances | length of codes | check if * is present
    data['propertyzoningdesc_frq'] = get_frequency(data['propertyzoningdesc'])
    data['propertyzoningdesc_len'] = data['propertyzoningdesc'].apply(lambda x: len(x) if pd.notnull(x) else x)
    #transactions_shuffled['propertyzoningdesc_str'] = transactions_shuffled['propertyzoningdesc'].apply(lambda x: (1 if '*' in str(x) else 0) if pd.notnull(x) else x)

    # Label encoding | length of code
    #transactions_shuffled['propertycountylandusecode_enc'] = transactions_shuffled[['propertycountylandusecode']].astype(str).apply(LabelEncoder().fit_transform)
    #transactions_shuffled['propertycountylandusecode_len'] = transactions_shuffled['propertycountylandusecode'].apply(lambda x: x if pd.isnull(x) else len(x))

    # Zip code area extraction
    data['regionidzip_ab']  = data['regionidzip'].apply(lambda x: x if pd.isnull(x) else str(x)[:2]).astype(float)
    data['regionidzip_abc'] = data['regionidzip'].apply(lambda x: x if pd.isnull(x) else str(x)[:3]).astype(float)

    # Region neighbourhood area extraction
    data['regionidneighborhood_ab'] = data['regionidneighborhood'].apply(lambda x: str(x)[:2] if pd.notnull(x) else x).astype(float)

    # Rawcensustractandblock transformed
    data['code_fips_cnt']  = get_frequency(data['rawcensustractandblock'].apply(lambda x: str(x)[:4]))
    data['code_tract_cnt'] = get_frequency(data['rawcensustractandblock'].apply(lambda x: str(x)[4:11]))
    data['code_block_cnt'] = get_frequency(data['rawcensustractandblock'].apply(lambda x: str(x)[11:]))
    data.drop('rawcensustractandblock', axis=1, inplace=True)
    
    # Encode string values
    data[['propertycountylandusecode', 'propertyzoningdesc']] = data[['propertycountylandusecode', 'propertyzoningdesc']].astype(str).apply(LabelEncoder().fit_transform)
    
    print('Generating complex features')
    print('........')
    
    return data

time: 142 ms


In [4]:
def cat_booster(x_train, y_train, x_valid, y_valid, cat_index):
    # Cat booster train and predict
    num_ensembles = 5
    y_pred_valid = 0.0
    y_pred_train = 0.0
    
    print('Initialising CAT Boost Regression')
    for i in tqdm(range(num_ensembles)):
        # Use CV, tune hyperparameters
        catb = CatBoostRegressor(
            iterations=200, learning_rate=0.01,
            depth=3, l2_leaf_reg=3,
            loss_function='MAE',
            eval_metric='MAE',
            random_seed=i)

        catb.fit(x_train, y_train, cat_features=cat_index)

        y_pred_valid += catb.predict(x_valid)
        y_pred_train += catb.predict(x_train)

    y_pred_valid /= num_ensembles
    y_pred_train /= num_ensembles

    print('Train MAE:', mean_absolute_error(y_train, y_pred_train))
    print('Valid MAE:', mean_absolute_error(y_valid, y_pred_valid))
    
    return catb, y_pred_valid

time: 9.72 ms


# Data Load

In [8]:
train = pd.read_csv("../Data/train_2016_v2.csv", parse_dates=["transactiondate"])
prop = pd.read_csv('../Data/properties_2016.csv')
sample = pd.read_csv('../Data/sample_submission.csv')

transactions = pd.merge(train, prop, how='left', on=['parcelid']).sample(frac=1)
#transactions[['propertycountylandusecode', 'propertyzoningdesc']] = transactions[['propertycountylandusecode', 'propertyzoningdesc']].astype(str).apply(LabelEncoder().fit_transform)
#transactions['taxdelinquencyflag'].replace('Y',1, inplace=True)
    
# Clean columns
to_drop = column_excluder(transactions)
transactions.drop(to_drop, axis=1, inplace=True)

# Time data
transactions = time_data(transactions)
transactions = complex_features(transactions)

x_all = transactions.drop(['parcelid', 'propertyzoningdesc', 'propertycountylandusecode', 'fireplacecnt'], axis=1)
y_all = transactions['logerror']
#x_all.drop(['hashottuborspa' 'taxdelinquencyflag' 'fireplaceflag'], axis=1)
#x_all['hashottuborspa'].astype(float, inplace=True)

#x_all.fillna(-1, inplace=True)#.astype(str)#.apply(LabelEncoder().fit_transform)

x_all.fillna(x_all.median(),inplace = True)

ratio = 0.1
x_train, x_valid, y_train, y_valid = train_test_split(x_all, y_all, test_size=ratio, random_state=54)

x_train_label = x_train['logerror'].copy()
x_train_data = x_train.drop(['logerror'], axis=1).copy()

# Drop outliers 
x_train = x_train[(x_train['logerror'] > -0.4) & (x_train['logerror'] < 0.419)]
y_train = x_train['logerror']
x_train.drop('logerror', axis=1, inplace=True)
x_valid.drop('logerror', axis=1, inplace=True)

cat_index = categorical_features(x_train)
best_columns = x_train.columns

y_mean = np.mean(y_train)

del x_all
del y_all

  interactivity=interactivity, compiler=compiler, result=result)


Excluded columns:
['architecturalstyletypeid', 'basementsqft', 'taxdelinquencyyear', 'typeconstructiontypeid', 'pooltypeid7', 'finishedsquarefeet13', 'finishedsquarefeet6', 'pooltypeid10', 'poolcnt', 'storytypeid', 'yardbuildingsqft26', 'hashottuborspa', 'decktypeid', 'poolsizesum', 'taxdelinquencyflag', 'pooltypeid2', 'buildingclasstypeid', 'fireplaceflag']
........
Added time data
........
Generating complex features
........
Categorical features:
['airconditioningtypeid', 'buildingqualitytypeid', 'fips', 'heatingorsystemtypeid', 'propertylandusetypeid', 'regionidcity', 'regionidcounty', 'regionidneighborhood', 'regionidzip', 'yearbuilt', 'assessmentyear', 'day_of_week', 'month_of_year', 'quarter', 'is_weekend', 'propertyzoningdesc_frq', 'propertyzoningdesc_len', 'regionidzip_ab', 'regionidzip_abc', 'regionidneighborhood_ab']
........
time: 27.8 s


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [9]:
models = {}

time: 924 µs


# OLS

In [10]:
#### Base features, includes days, tax flag on.
# Removed: 'propertyzoningdesc','propertycountylandusecode', 'fireplacecnt', 'fireplaceflag'

# OLS
    # OLS MAE LR Valid: 0.0671249660894 Train: 0.0684872590475
# Bayesian Ridge Regression
    # BR MAE LR Valid: 0.0670812710093 Train: 0.0684733515189

# Removes null features and ones with little variance

# Removes all and processes complex

# Best + label encoding

# OLS
model_lr = LinearRegression()
model_lr.fit(x_train, y_train)
y_pred_lr_valid = model_lr.predict(x_valid)
y_pred_lr_train = model_lr.predict(x_train_data)
models['LinearRegression'] = model_lr

# BayesianRidge Regression
model_br = BayesianRidge(compute_score=True)
model_br.fit(x_train, y_train)
y_pred_br_valid = model_br.predict(x_valid)
y_pred_br_train = model_br.predict(x_train_data)
models['LinearRegression'] = model_lr

# Make predictions on both test and validation with OLS and BR
predicted_mae_lr_valid = mean_absolute_error(y_valid, y_pred_lr_valid)
predicted_mae_lr_train = mean_absolute_error(x_train_label, y_pred_lr_train)
predicted_mae_br_valid = mean_absolute_error(y_valid, y_pred_br_valid)
predicted_mae_br_train = mean_absolute_error(x_train_label, y_pred_br_train)

print('OLS MAE LR Valid:', predicted_mae_lr_valid, 'Train:', predicted_mae_lr_train)
print('BR MAE LR Valid:',  predicted_mae_br_valid, 'Train:', predicted_mae_br_train)

OLS MAE LR Valid: 0.066726802458 Train: 0.0680344346439
BR MAE LR Valid: 0.0667426692537 Train: 0.0680417227779
time: 357 ms


# Cat Boost

In [11]:
#### Base features, includes days, tax flag on.
# Removed: 'propertyzoningdesc','propertycountylandusecode', 'fireplacecnt', 'fireplaceflag'
    #Train MAE: 0.0669406345152
    #Valid MAE: 0.0674199299204

# Removes null features and ones with little variance

# Removes all and processes complex

# Best + label encoding
model_cb, preds = cat_booster(x_train, y_train, x_valid, y_valid, cat_index)
models['CatBoost'] = model_cb

  0%|          | 0/5 [00:00<?, ?it/s]

Initialising CAT Boost Regression


100%|██████████| 5/5 [02:31<00:00, 30.31s/it]

Train MAE: 0.0521187878452
Valid MAE: 0.0662922762167
time: 2min 31s





In [None]:
# All scorer objects follow the convention that higher return values are better than lower return 
# values. Thus metrics which measure the distance between the model and the data, like 
# metrics.mean_squared_error, are available as neg_mean_squared_error which return the negated 
# value of the metric.
scores = cross_validation.cross_val_score(model_cb, x_train, y_train, cv=5, scoring='neg_mean_absolute_error', verbose=1)
print("%s MAE: %0.5f (+/- %0.5f)" % (model_cb.__class__.__name__, scores.mean(), scores.std() * 2))

# XGB

In [15]:
#### Base features, includes days, tax flag on.
# Removed: 'propertyzoningdesc','propertycountylandusecode', 'fireplacecnt', 'fireplaceflag'
    #train-mae:0.068375	valid-mae:0.068989


# Removes null features and ones with little variance

# Removes all and processes complex

# Best + label encoding

params_xgb = {
    'max_depth':        5, # shuld be 0.5 to 1% of the examples
    'subsample':        1,  # Ratio of observations to be used as samples for each tree
    'min_child_weight': 10, # Deals with imbalanced data and prevents overfitting as the value >
    'objective':        'reg:linear',
    'n_estimators':     1000, # Sequential trees to be modelled.
    'eta':              0.1, # Shrinkage. Typically between 0.1 - 0.2 - learning rate for gradient boost (D:0.3)
    'eval_metric':      'mae',
    'base_score':       y_mean,
}

d_train = xg.DMatrix(x_train, label=y_train, missing=-1)
d_valid = xg.DMatrix(x_valid, label=y_valid, missing=-1)

watchlist = [(d_train, 'train'), (d_valid, 'valid')]
xgb_gs = xg.train(params_xgb, d_train, len(x_valid), watchlist, early_stopping_rounds=100, verbose_eval=50)
models['XGB'] = xgb_gs

[0]	train-mae:0.053226	valid-mae:0.066893
Multiple eval metrics have been passed: 'valid-mae' will be used for early stopping.

Will train until valid-mae hasn't improved in 100 rounds.
[50]	train-mae:0.051906	valid-mae:0.066233
[100]	train-mae:0.051354	valid-mae:0.066211
[150]	train-mae:0.050832	valid-mae:0.066203
Stopping. Best iteration:
[95]	train-mae:0.051395	valid-mae:0.066191

time: 38.7 s


# LightGBM

In [13]:
def light_gbm_folds(x_train, x_valid, y_train, y_valid, params, num_ensembles):
    # Light gbm n ensambles average predictions

    y_pred_valid = 0.0
    y_pred_train = 0.0
    
    d_train = lgb.Dataset(x_train, label=y_train)
    
    print('Initialising Light GBM')
    for i in tqdm(range(num_ensembles)):
        # Use CV, tune hyperparameters
        params['seed'] = i
        model_lgb = lgb.train(params, d_train, 430)
        
        lg_pred_valid = model_lgb.predict(x_valid)
        lg_pred_train = model_lgb.predict(x_train)

    lg_pred_valid /= num_ensembles
    lg_pred_train /= num_ensembles
    
    print('Train MAE:', mean_absolute_error(y_train, lg_pred_train))
    print('Valid MAE:', mean_absolute_error(y_valid, lg_pred_valid))
    
    return model_lgb

time: 9.01 ms


In [16]:
import random
import lightgbm as lgb

params_lg={
    'max_bin'          : 10,
    'learning_rate'    : 0.0021, # shrinkage_rate
    'boosting_type'    : 'gbdt',
    'objective'        : 'regression',
    'metric'           : 'mae',      
    'sub_feature'      : 0.345 ,   
    'bagging_fraction' : 0.85, 
    'bagging_freq'     : 40,
    'num_leaves'       : 512,       # num_leaf
    'min_data'         : 500,         # min_data_in_leaf
    'min_hessian'      : 0.05,     # min_sum_hessian_in_leaf
    'verbose'          : 1
}

model_lgb = light_gbm_folds(x_train, x_valid, y_train, y_valid, params_lg, num_ensembles=5)
models['LightGBM'] = model_lgb

  0%|          | 0/5 [00:00<?, ?it/s]

Initialising Light GBM


100%|██████████| 5/5 [01:00<00:00, 12.18s/it]

Train MAE: 0.0532560595725
Valid MAE: 0.0669780442807
time: 1min





# DNN

In [30]:
#### Base features, includes days, tax flag on.
# Removed: 'propertyzoningdesc','propertycountylandusecode', 'fireplacecnt', 'fireplaceflag'
# NN

# Removes null features and ones with little variance

# Removes all and processes complex

# Best + label encoding

from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from keras.layers import Dropout, BatchNormalization
from keras.layers.advanced_activations import PReLU
from keras.layers.noise import GaussianDropout
from keras.optimizers import Adam
from sklearn.preprocessing import Imputer

def larger_model():
    # create model
    model = Sequential()
    model.add(Dense(size, input_dim=size, kernel_initializer='normal', activation='relu'))
    model.add(Dense(size*2, kernel_initializer='normal', activation='relu'))
    model.add(Dense(size, kernel_initializer='normal', activation='relu'))
    model.add(Dense(1, kernel_initializer='normal'))
    # Compile model
    model.compile(loss='mae', optimizer=Adam(lr=4e-3, decay=1e-4))
    return model

# define wider model
def wider_model():
    # create model
    model = Sequential()
    model.add(Dense(size*2, input_dim=size, kernel_initializer='normal', activation='relu'))
    model.add(Dense(1, kernel_initializer='normal'))
    # Compile model
    model.compile(loss='mae', optimizer=Adam(lr=4e-3, decay=1e-4))
    return model


# define base model
def baseline_model():
    # create model
    model = Sequential()
    model.add(Dense(size, input_dim=size, kernel_initializer='normal', activation='relu'))
    model.add(Dense(1, kernel_initializer='normal'))
    # Compile model
    model.compile(loss='mae', optimizer=Adam(lr=4e-3, decay=1e-4))
    return model

def prebuilt_nn():
    nn = Sequential()
    nn.add(Dense(units = 400 , kernel_initializer = 'normal', input_dim = size))
    nn.add(PReLU())
    nn.add(Dropout(.4))
    nn.add(Dense(units = 160 , kernel_initializer = 'normal'))
    nn.add(PReLU())
    nn.add(BatchNormalization())
    nn.add(Dropout(.6))
    nn.add(Dense(units = 64 , kernel_initializer = 'normal'))
    nn.add(PReLU())
    nn.add(BatchNormalization())
    nn.add(Dropout(.5))
    nn.add(Dense(units = 26, kernel_initializer = 'normal'))
    nn.add(PReLU())
    nn.add(BatchNormalization())
    nn.add(Dropout(.6))
    nn.add(Dense(1, kernel_initializer='normal'))
    nn.compile(loss='mae', optimizer=Adam(lr=4e-3, decay=1e-4))

    return nn

time: 52.5 ms


In [31]:
## Preprocessing
print("Preprocessing neural network data...")
imputer= Imputer()
imputer.fit(x_train.iloc[:, :])
x_train_nn = imputer.transform(x_train.iloc[:, :])

imputer.fit(x_valid.iloc[:, :])
x_valid_nn = imputer.transform(x_valid.iloc[:, :])

sc = StandardScaler()
x_train_nn = sc.fit_transform(x_train_nn)
x_valid_nn = sc.transform(x_valid_nn)

Preprocessing neural network data...
time: 178 ms


In [52]:
# fix random seed for reproducibility
seed = 7
size = x_train_nn.shape[1]
# Prebuit KAGGLE Kernel
np.random.seed(seed)
estimators = []
estimators.append(('standardize', StandardScaler()))
estimators.append(('mlp', KerasRegressor(build_fn=prebuilt_nn, epochs=5, batch_size=50, verbose=0)))
pipeline = Pipeline(estimators)
pipeline.fit(x_train_nn, y_train)
mean_absolute_error(y_valid, pipeline.predict(x_valid_nn))
models['DNN'] = pipeline

0.066781628587123346

time: 31.6 s


# LSTM

In [82]:
from numpy import concatenate
from matplotlib import pyplot
from pandas import read_csv
from pandas import DataFrame
from pandas import concat
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
 
#x_train = x_train.values
#x_valid = x_valid.values

# reshape input to be 3D [samples, timesteps, features]
x_train_lstm = x_train.reshape((x_train.shape[0], 1, x_train.shape[1]))
x_valid_lstm = x_valid.reshape((x_valid.shape[0], 1, x_valid.shape[1]))
 
# design network
lstm = Sequential()
lstm.add(LSTM(50, input_shape=(x_train_lstm.shape[1], x_train_lstm.shape[2])))
lstm.add(PReLU())
lstm.add(Dropout(.4))
lstm.add(Dense(units = 160 , kernel_initializer = 'normal'))
lstm.add(Dense(1))
lstm.compile(loss='mae', optimizer='adam')
# fit network
lstm.fit(x_train_lstm, y_train, epochs=10, batch_size=50, validation_data=(x_valid_lstm, y_valid), verbose=1, shuffle=False)
 
# make a prediction
yhat = lstm.predict(x_valid_lstm)
models['LSTM'] = lstm
mae = mean_absolute_error(y_valid, yhat)
print('Test MAE: %.3f' % mae)

Train on 79677 samples, validate on 9028 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test MAE: 0.067
time: 2min 14s


# Stacking

In [159]:
import time
from sklearn.metrics import mean_absolute_error, make_scorer
from xgboost import XGBRegressor
from sklearn.cross_validation import KFold
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor, \
        ExtraTreesRegressor, AdaBoostClassifier
from sklearn import grid_search

time: 31.2 ms




In [171]:
def mean_absolute_error_(ground_truth, predictions):
    return mean_absolute_error(ground_truth, predictions)

MAE = make_scorer(mean_absolute_error_, greater_is_better=False)

class Ensemble(object):
    
    def __init__(self, n_folds, stacker, base_models):
        self.n_folds = n_folds
        self.stacker = stacker
        self.base_models = base_models

    def fit(self, X, y):
        X = np.array(X)
        y = np.array(y)
        folds = list(KFold(len(y), n_folds=self.n_folds, shuffle=True, random_state=2016))
        S_train = np.zeros((X.shape[0], len(self.base_models)))
        start_time = time.time()
        for i, clf in enumerate(self.base_models):

            print('Fitting For Base Model #%d / %d ---', i+1, len(self.base_models))
            for j, (train_idx, test_idx) in enumerate(folds):

                print('--- Fitting For Fold %d / %d ---', j+1, self.n_folds)

                X_train = X[train_idx]
                y_train = y[train_idx]
                X_holdout = X[test_idx]
                # y_holdout = y[test_idx]
                clf.fit(X_train, y_train)
                y_pred = clf.predict(X_holdout)[:]
                S_train[test_idx, i] = y_pred

                print('Elapsed: %s minutes ---' % round(((time.time() - start_time) / 60), 2))

            print('Elapsed: %s minutes ---' % round(((time.time() - start_time) / 60), 2))

        print('--- Base Models Trained: %s minutes ---' % round(((time.time() - start_time) / 60), 2))

        clf = self.stacker
        clf.fit(S_train, y)

        print('--- Stacker Trained: %s minutes ---' % round(((time.time() - start_time) / 60), 2))

    def preidct(self, X):
        X = np.array(X)
        folds = list(KFold(len(X), n_folds=self.n_folds, shuffle=True, random_state=2016))
        S_test = np.zeros((X.shape[0], len(self.base_models)))

        for i, clf in enumerate(self.base_models):
            S_test_i = np.zeros((X.shape[0], len(folds)))
            for j, (train_idx, test_idx) in enumerate(folds):
                S_test_i[:, j] = clf.predict(X)[:]
            S_test[:, i] = S_test_i.mean(1)

        clf = self.stacker
        y_pred = clf.predict(S_test)[:]
        return y_pred

    def fit_predict(self, X, y, T):
        X = np.array(X)
        y = np.array(y)
        T = np.array(T)

        start_time = time.time()
        folds = list(KFold(len(y), n_folds=self.n_folds, shuffle=True, random_state=2016))

        S_train = np.zeros((X.shape[0], len(self.base_models)))
        S_test = np.zeros((T.shape[0], len(self.base_models)))

        for i, clf in enumerate(self.base_models):

            print('Fitting For Base Model #{0} / {1} ---'.format(i+1, len(self.base_models)))

            S_test_i = np.zeros((T.shape[0], len(folds)))

            for j, (train_idx, test_idx) in enumerate(folds):

                print('--- Fitting For Fold #{0} / {1} ---'.format(j+1, self.n_folds))

                X_train = X[train_idx]
                y_train = y[train_idx]
                X_holdout = X[test_idx]
                
                clf.fit(X_train, y_train)
                y_pred = clf.predict(X_holdout)[:]
                
                S_train[test_idx, i] = y_pred
                S_test_i[:, j] = clf.predict(T)[:]

                print('Elapsed: %s minutes ---' % round(((time.time() - start_time) / 60), 2))

            S_test[:, i] = S_test_i.mean(1)

            print('Elapsed: %s minutes ---' % round(((time.time() - start_time) / 60), 2))

        print('--- Base Models Trained: %s minutes ---' % round(((time.time() - start_time) / 60), 2))

        param_grid = {
            'n_estimators': [100],
            'learning_rate': [0.05],
            'subsample': [0.75]
        }
        grid = grid_search.GridSearchCV(estimator=self.stacker, param_grid=param_grid, n_jobs=1, cv=5, verbose=20, scoring=MAE)
        grid.fit(S_train, y)

        # a little memo
        message = 'to determine local CV score of #28'

        try:
            print('Param grid:')
            print(param_grid)
            print('Best Params:')
            print(grid.best_params_)
            print('Best CV Score:')
            print(-grid.best_score_)
            print('Best estimator:')
            print(grid.best_estimator_)
            print(message)
        except:
            pass

        print('--- Stacker Trained: %s minutes ---' % round(((time.time() - start_time) / 60), 2))

        y_pred = grid.predict(S_test)[:]

        return y_pred

time: 192 ms


In [179]:
base_models = [
    RandomForestRegressor(
        n_jobs=1, random_state=2016, verbose=1,
        n_estimators=500, max_features=12
    ),
    ExtraTreesRegressor(
        n_jobs=1, random_state=2016, verbose=1,
        n_estimators=500, max_features=12
    ),
    GradientBoostingRegressor(
        random_state=2016, verbose=1,
        n_estimators=500, max_features=12, max_depth=8,
        learning_rate=0.05, subsample=0.8
    ),
    XGBRegressor(
        seed=2016,
        n_estimators=200, max_depth=8,
        learning_rate=0.05, subsample=0.8, colsample_bytree=0.85
    )
]

ensemble = Ensemble(
    n_folds=5,
    stacker=GradientBoostingRegressor(
        random_state=2016, verbose=1
    ),
    base_models=base_models#models
)

time: 19.9 ms


In [None]:
ensemble_prediction = ensemble.fit_predict(x_train, y_train, x_valid)
print('MAE', mean_absolute_error(y_valid, ensemble_prediction))

Fitting For Base Model #1 / 4 ---
--- Fitting For Fold #1 / 5 ---


In [None]:
########## LAYER 1 ##########
# Submodel  1 : OLS                      # Ordinary least squares estimator Sklearn implementation
# Submodel  2 : BR                       # Bayesian ridge regression - Sklearn implementation
# Submodel  3 : DNN                      # Dense Neural Network - Keras - Dense layers 
# Submodel  4 : LightGBM                 # Light Gradient Boosting - https://github.com/Microsoft/LightGBM
# Submodel  5 : XGBoost                  # Extreme Gradient Boosting - http://xgboost.readthedocs.io/en/latest/model.html
# Submodel  6 : CatBoost                 # Categorical Boosting https://github.com/catboost/catboost
# Submodel  7 : LSTM                     # Long Short Term Memory Neural Network - Keras implementation
# Submodel  8 : RandomForestRegressor    # Sklearn implementation
# Submodel  9 : ExtraTreesRegressor      # Sklearn implementation
# Submodel 10 : SVR                      # Support vector machines for regression - Sklearn implementation
# Submodel 11 : AdaBoost                 # Adaptive Boosting Sklearn Implementation

########## LAYER 2 ##########
# https://www.kaggle.com/dragost/boosted-trees-lb-0-0643707/edit

# Save data

In [None]:
x_predict = transactions.copy()

In [None]:
submission_sample[m] = submission_sample['ParcelId'].to_frame().merge(x_predict[['parcelid', m]], how='left', left_on='ParcelId', right_on='parcelid')[m]

In [None]:
# https://www.kaggle.com/c/zillow-prize-1/discussion/33899, Oct,Nov,Dec
test_dates = {
    '201610': pd.Timestamp('2016-09-30'),
    '201611': pd.Timestamp('2016-10-31'),
    '201612': pd.Timestamp('2016-11-30'),
#    '201710': pd.Timestamp('2017-09-30'),
#   '201711': pd.Timestamp('2017-10-31'),
#    '201712': pd.Timestamp('2017-11-30')
}

all_preds = pd.DataFrame()
for m in test_dates.keys():
    # Building predictions.
    print('Processing', m)
    x_predict = transactions.copy()
    x_predict = complex_features(x_predict)
    x_predict['transactiondate'] = test_dates[m]
    x_predict = time_data(x_predict)
    
    print('Cleaning data')
    print('........')
    
    x_predict = x_predict.fillna(-999).astype(str).apply(LabelEncoder().fit_transform)
    
    print('Predicting')
    print('........')
    
    x_predict[m] = xgb_gs2.predict(xg.DMatrix(x_predict[best_columns]))
    all_preds[m] = x_predict[m].copy()
    submission_sample[m] = submission_sample['ParcelId'].to_frame().merge(x_predict[['parcelid', m]], how='left', left_on='ParcelId', right_on='parcelid')[m]
    
#del x_predict

In [None]:
x_predict[list(test_dates.keys())] = all_preds[list(test_dates.keys())]

In [None]:
x_predict

In [None]:
m = '201610'
submission_sample[m] = submission_sample['ParcelId'].to_frame().merge(x_predict[['parcelid', m]], how='left', left_on='ParcelId', right_on='parcelid')[m]

In [None]:
submission_sample

In [None]:
submission_sample.to_csv('submission4.csv',index=False)
submission_sample.head()

In [None]:
# https://www.kaggle.com/c/zillow-prize-1/discussion/33899, Oct,Nov,Dec
test_dates = {'201610': pd.Timestamp('2016-09-30'),
              '201611': pd.Timestamp('2016-10-31'),
              '201612': pd.Timestamp('2016-11-30')}

x_predict = transactions.copy()
x_predict = complex_features(x_predict)

for m in test_dates.keys():
    print('Processing', m)  
    x_predict['transactiondate'] = test_dates[m]
    x_predict = time_data(x_predict)
    x_predict = x_predict[best_columns].fillna(-999).astype(str).apply(LabelEncoder().fit_transform)
    
    print('Predicting')
    print('........')
    
    # 5 iterations cat booster for each prediction.
    x_predict[m] = cat_booster(x_train, y_train, x_predict.values)
    submission_sample[m] = submission_sample['ParcelId'].to_frame().merge(x_predict[['parcelid', m]], how='left', left_on='ParcelId', right_on='parcelid')[m]
    
del x_predict
del predictions

# RFE