In [1]:
%%javascript
$('<div id="toc"></div>').css({position: 'fixed', top: '120px', left: 0}).appendTo(document.body);
$.getScript('https://kmahelona.github.io/ipython_notebook_goodies/ipython_notebook_toc.js');

<IPython.core.display.Javascript object>

# Libraries

In [2]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split

from mpl_toolkits.basemap import Basemap
import matplotlib.pyplot as plt
import numpy as np

from jupyterthemes import jtplot
jtplot.style()

import xgboost as xg
from xgboost import XGBModel
from xgboost import plot_importance
from sklearn.model_selection import GridSearchCV, TimeSeriesSplit, ShuffleSplit
from sklearn.metrics import mean_absolute_error
from sklearn.feature_selection import RFE

from sklearn.linear_model import LinearRegression, BayesianRidge
from sklearn.metrics import r2_score

from sklearn.preprocessing import LabelEncoder
from sklearn import cross_validation

from catboost import CatBoostRegressor
from tqdm import tqdm

%matplotlib inline
%load_ext autotime
%load_ext line_profiler
%matplotlib inline 

  return f(*args, **kwds)


# Processing

In [3]:
def plot_data(test, pred, sample, title, width=40, height=10, linewidth=0.5, color1='white', color2='orange'):
    """ Plotting method. """
    fig = plt.figure(figsize=(width, height))
    plt.plot(pred[:sample], color=color1, zorder=4, linewidth=linewidth, label='%s Prediction'%(title))
    plt.plot(test[:sample], color=color2, zorder=3, linewidth=linewidth, label='%s True Data'%(title))
    plt.title = title
    plt.legend()

# Frequency count
def get_frequency(data):
    # Gets the frequency of a column's values in 'data'. Pass on a series.
    vals = pd.merge(data.to_frame(), data.value_counts().reset_index(), 
                    how='left', left_on=data.to_frame().columns[0], right_on='index').iloc[:, -1:].values
    return vals
  
def time_data(data):
    data['transactiondate'] = pd.to_datetime(data['transactiondate'])
    data['day_of_week']     = data['transactiondate'].dt.dayofweek
    data['month_of_year']   = data['transactiondate'].dt.month
    data['quarter']         = data['transactiondate'].dt.quarter
    data['is_weekend']      = (data['day_of_week'] < 5).astype(int)
    data.drop('transactiondate', axis=1, inplace=True)
    
    print('Added time data')
    print('........')
    
    return data


def column_excluder(data, missing_perc_thresh=0.98):
    # Quick clean from https://www.kaggle.com/seesee/concise-catboost-starter-ensemble-plb-0-06435
    
    exclude_missing = []
    exclude_unique = []
    num_rows = data.shape[0]
    for c in data.columns:
        num_missing = data[c].isnull().sum()
        if num_missing == 0:
            continue
        missing_frac = num_missing / float(num_rows)
        if missing_frac > missing_perc_thresh:
            exclude_missing.append(c)

        num_uniques = len(data[c].unique())
        if data[c].isnull().sum() != 0:
            num_uniques -= 1
        if num_uniques == 1:
            exclude_unique.append(c)
            
    to_exclude = list(set(exclude_missing + exclude_unique))
    
    print('Excluded columns:')
    print(to_exclude)
    print('........')
    
    return to_exclude

def categorical_features(data):
    # Quick categories from https://www.kaggle.com/seesee/concise-catboost-starter-ensemble-plb-0-06435
        
    cat_feature_inds = []
    cat_unique_thresh = 1000
    for i, c in enumerate(data.columns):
        num_uniques = len(data[c].unique())
        if num_uniques < cat_unique_thresh \
            and not 'sqft'   in c \
            and not 'cnt'    in c \
            and not 'nbr'    in c \
            and not 'number' in c:
            cat_feature_inds.append(i)

    print("Categorical features:")
    print([data.columns[ind] for ind in cat_feature_inds])
    print('........')
    
    return cat_feature_inds


def complex_features(data):
    # Gets counts, label encoding and frequency estimates.
    
    # Frequency of occurances | length of codes | check if * is present
    data['propertyzoningdesc_frq'] = get_frequency(data['propertyzoningdesc'])
    data['propertyzoningdesc_len'] = data['propertyzoningdesc'].apply(lambda x: len(x) if pd.notnull(x) else x)
    #transactions_shuffled['propertyzoningdesc_str'] = transactions_shuffled['propertyzoningdesc'].apply(lambda x: (1 if '*' in str(x) else 0) if pd.notnull(x) else x)

    # Label encoding | length of code
    #transactions_shuffled['propertycountylandusecode_enc'] = transactions_shuffled[['propertycountylandusecode']].astype(str).apply(LabelEncoder().fit_transform)
    #transactions_shuffled['propertycountylandusecode_len'] = transactions_shuffled['propertycountylandusecode'].apply(lambda x: x if pd.isnull(x) else len(x))

    # Zip code area extraction
    data['regionidzip_ab']  = data['regionidzip'].apply(lambda x: x if pd.isnull(x) else str(x)[:2]).astype(float)
    data['regionidzip_abc'] = data['regionidzip'].apply(lambda x: x if pd.isnull(x) else str(x)[:3]).astype(float)

    # Region neighbourhood area extraction
    data['regionidneighborhood_ab'] = data['regionidneighborhood'].apply(lambda x: str(x)[:2] if pd.notnull(x) else x).astype(float)

    # Rawcensustractandblock transformed
    data['code_fips_cnt']  = get_frequency(data['rawcensustractandblock'].apply(lambda x: str(x)[:4]))
    data['code_tract_cnt'] = get_frequency(data['rawcensustractandblock'].apply(lambda x: str(x)[4:11]))
    data['code_block_cnt'] = get_frequency(data['rawcensustractandblock'].apply(lambda x: str(x)[11:]))
    data.drop('rawcensustractandblock', axis=1, inplace=True)
    
    # Encode string values
    data[['propertycountylandusecode', 'propertyzoningdesc']] = data[['propertycountylandusecode', 'propertyzoningdesc']].astype(str).apply(LabelEncoder().fit_transform)
    
    print('Generating complex features')
    print('........')
    
    return data

time: 138 ms


In [None]:
models = {}

# Data Load

In [24]:
seed = 11
np.random.seed(seed)

train = pd.read_csv("../Data/train_2016_v2.csv", parse_dates=["transactiondate"])
prop = pd.read_csv('../Data/properties_2016.csv')
sample = pd.read_csv('../Data/sample_submission.csv')

transactions = pd.merge(train, prop, how='left', on=['parcelid']).sample(frac=1)
#transactions[['propertycountylandusecode', 'propertyzoningdesc']] = transactions[['propertycountylandusecode', 'propertyzoningdesc']].astype(str).apply(LabelEncoder().fit_transform)
transactions['taxdelinquencyflag'].replace('Y',1, inplace=True)
    
# Clean columns
to_drop = column_excluder(transactions)
transactions.drop(to_drop, axis=1, inplace=True)

# Time data
transactions = time_data(transactions)
transactions = complex_features(transactions)

x_all = transactions.drop(['parcelid', 'propertyzoningdesc', 'propertycountylandusecode', 'fireplacecnt'], axis=1)
y_all = transactions['logerror']
#x_all.drop(['hashottuborspa' 'taxdelinquencyflag' 'fireplaceflag'], axis=1)
#x_all['hashottuborspa'].astype(float, inplace=True)

#x_all.fillna(-1, inplace=True)#.astype(str)#.apply(LabelEncoder().fit_transform)

x_all.fillna(x_all.median(),inplace = True)

ratio = 0.1
x_train, x_valid, y_train, y_valid = train_test_split(x_all, y_all, test_size=ratio)

x_train_label = x_train['logerror'].copy()
x_train_data = x_train.drop(['logerror'], axis=1).copy()

# Drop outliers 
x_train = x_train[(x_train['logerror'] > -0.4) & (x_train['logerror'] < 0.419)]
y_train = x_train['logerror']
x_train.drop('logerror', axis=1, inplace=True)
x_valid.drop('logerror', axis=1, inplace=True)

cat_index = categorical_features(x_train)
best_columns = x_train.columns

y_mean = np.mean(y_train)

del x_all
del y_all
del transactions

  interactivity=interactivity, compiler=compiler, result=result)


Excluded columns:
['taxdelinquencyyear', 'buildingclasstypeid', 'yardbuildingsqft26', 'poolcnt', 'pooltypeid2', 'finishedsquarefeet13', 'poolsizesum', 'storytypeid', 'pooltypeid10', 'architecturalstyletypeid', 'decktypeid', 'typeconstructiontypeid', 'pooltypeid7', 'finishedsquarefeet6', 'taxdelinquencyflag', 'hashottuborspa', 'basementsqft', 'fireplaceflag']
........
Added time data
........
Generating complex features
........
Categorical features:
['airconditioningtypeid', 'buildingqualitytypeid', 'fips', 'heatingorsystemtypeid', 'propertylandusetypeid', 'regionidcity', 'regionidcounty', 'regionidneighborhood', 'regionidzip', 'yearbuilt', 'assessmentyear', 'day_of_week', 'month_of_year', 'quarter', 'is_weekend', 'propertyzoningdesc_frq', 'propertyzoningdesc_len', 'regionidzip_ab', 'regionidzip_abc', 'regionidneighborhood_ab']
........
time: 31.3 s


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


# OLS

In [25]:
# OLS
model_lr = LinearRegression()
model_lr.fit(x_train, y_train)
y_pred_lr_valid = model_lr.predict(x_valid)
y_pred_lr_train = model_lr.predict(x_train_data)
models['LinearRegression'] = model_lr

# Make predictions on both test and validation with OLS and BR
predicted_mae_lr_valid = mean_absolute_error(y_valid, y_pred_lr_valid)
predicted_mae_lr_train = mean_absolute_error(x_train_label, y_pred_lr_train)

print('OLS MAE LR Valid:', predicted_mae_lr_valid, 'Train:', predicted_mae_lr_train)

del y_pred_lr_valid
del y_pred_lr_train

OLS MAE LR Valid: 0.0655499689458 Train: 0.0681734572472
time: 171 ms


# Bayesian Ridge

In [26]:
# BayesianRidge Regression
model_br = BayesianRidge(compute_score=True)
model_br.fit(x_train, y_train)
y_pred_br_valid = model_br.predict(x_valid)
y_pred_br_train = model_br.predict(x_train_data)
models['BayesianRidge'] = model_br

predicted_mae_br_valid = mean_absolute_error(y_valid,       y_pred_br_valid)
predicted_mae_br_train = mean_absolute_error(x_train_label, y_pred_br_train)

print('BR MAE BayesianRidge Valid: %s \nTrain: %s' % (predicted_mae_br_valid, predicted_mae_br_train))

del y_pred_br_valid
del y_pred_br_train

BR MAE BayesianRidge Valid: 0.065548413949 
Train: 0.0681805346714
time: 215 ms


# Random Forest

In [27]:
from sklearn.ensemble import RandomForestRegressor

model_rf = RandomForestRegressor(n_jobs=1, random_state=2016, verbose=1, n_estimators=500, 
                                 max_features=12)
model_rf.fit(x_train, y_train)
y_pred_rf_valid = model_rf.predict(x_valid)
y_pred_rf_train = model_rf.predict(x_train_data)
models['RandomForest'] = model_rf

predicted_mae_rf_valid = mean_absolute_error(y_valid,       y_pred_rf_valid)
predicted_mae_rf_train = mean_absolute_error(x_train_label, y_pred_rf_train)

print('BR MAE RandomForest Valid: %s \nTrain: %s' % (predicted_mae_rf_valid, predicted_mae_rf_train))

del y_pred_rf_train
del y_pred_rf_valid

[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:  5.7min finished
[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:    3.0s finished
[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:   20.5s finished


BR MAE RandomForest Valid: 0.0656375533565 
Train: 0.035217631714
time: 6min 3s


# Extra Trees

In [28]:
from sklearn.ensemble import ExtraTreesRegressor

model_et = ExtraTreesRegressor(
        n_jobs=1, random_state=2016, verbose=1,
        n_estimators=500, max_features=12)

model_et.fit(x_train, y_train)
y_pred_et_valid = model_et.predict(x_valid)
y_pred_et_train = model_et.predict(x_train_data)
models['ExtraTrees'] = model_et

predicted_mae_et_valid = mean_absolute_error(y_valid,       y_pred_et_valid)
predicted_mae_et_train = mean_absolute_error(x_train_label, y_pred_et_train)

print('BR MAE ExtraTrees Valid: %s \nTrain: %s' % (predicted_mae_et_valid, predicted_mae_et_train))

del y_pred_et_valid
del y_pred_et_train

[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:  2.3min finished
[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:    7.7s finished
[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:   21.6s finished


BR MAE ExtraTrees Valid: 0.0665878837395 
Train: 0.0159463449264
time: 2min 46s


# SVR

In [None]:
model_svr_rbf = SVR(kernel='rbf', C=1e3, gamma=0.1)
model_svr_lin = SVR(kernel='linear', C=1e3)
model_svr_poly = SVR(kernel='poly', C=1e3, degree=2)

y_rbf  =  model_svr_rbf.fit(x_train, y_train).predict(x_valid)
y_lin  =  model_svr_lin.fit(x_train, y_train).predict(x_valid)
y_poly = model_svr_poly.fit(x_train, y_train).predict(x_valid)

scores_r = cross_validation.cross_val_score(model_svr_rbf,  x_train_data, x_train_label, cv=5, scoring='neg_mean_absolute_error', verbose=1)
scores_l = cross_validation.cross_val_score(model_svr_lin,  x_train_data, x_train_label, cv=5, scoring='neg_mean_absolute_error', verbose=1)
scores_p = cross_validation.cross_val_score(model_svr_poly, x_train_data, x_train_label, cv=5, scoring='neg_mean_absolute_error', verbose=1)

print("%s MAE: %0.5f (+/- %0.5f)" % (model_svr_rbf.__class__.__name__,  scores_r.mean(), scores_r.std() * 2))
print("%s MAE: %0.5f (+/- %0.5f)" % (model_svr_lin.__class__.__name__,  scores_l.mean(), scores_l.std() * 2))
print("%s MAE: %0.5f (+/- %0.5f)" % (model_svr_poly.__class__.__name__, scores_p.mean(), scores_p.std() * 2))

print('RBF',   mean_absolute_error(y_valid, y_rbf))
print('Linear',mean_absolute_error(y_valid, y_lin))
print('Poly',  mean_absolute_error(y_valid, y_poly))

# AdaBoost

In [30]:
from sklearn.ensemble import AdaBoostRegressor

model_ab = AdaBoostRegressor()
model_ab.fit(x_train, y_train)
y_pred_ab_valid = model_ab.predict(x_valid)
y_pred_ab_train = model_ab.predict(x_train_data)
models['AdaBoost'] = model_ab

predicted_mae_ab_valid = mean_absolute_error(y_valid,       y_pred_ab_valid)
predicted_mae_ab_train = mean_absolute_error(x_train_label, y_pred_ab_train)

print('BR MAE AdaBoost Valid: %s \nTrain: %s' % (predicted_mae_ab_valid, predicted_mae_ab_train))

del y_pred_ab_valid
del y_pred_ab_train

BR MAE AdaBoost Valid: 0.0668425181751 
Train: 0.0695206978103
time: 8.38 s


# CatBoost

In [63]:
def cat_booster(x_train, y_train, x_valid, y_valid, cat_index, loss='MAE'):
    # Cat booster train and predict
    num_ensembles = 5
    y_pred_valid = 0.0
    y_pred_train = 0.0
    
    print('Initialising CAT Boost Regression')
    for i in tqdm(range(num_ensembles)):
        # Use CV, tune hyperparameters
        catb = CatBoostRegressor(
                iterations=630, learning_rate=0.03,
                depth=6, l2_leaf_reg=3,
                loss_function=loss,
                eval_metric='MAE',
                random_seed=i)

        catb.fit(x_train, y_train, cat_features=cat_index)

        y_pred_valid += catb.predict(x_valid)
        y_pred_train += catb.predict(x_train)

    y_pred_valid /= num_ensembles
    y_pred_train /= num_ensembles

    print('Train MAE:', mean_absolute_error(y_train, y_pred_train))
    print('Valid MAE:', mean_absolute_error(y_valid, y_pred_valid))
    
    return catb, y_pred_valid

time: 13.8 ms


In [64]:
model_cb, preds = cat_booster(x_train, y_train, x_valid, y_valid, cat_index)

models['CatBoost'] = model_cb

print('BR MAE CatBoost Valid: %s' % (mean_absolute_error(y_valid, preds)))

del preds

  0%|          | 0/5 [00:00<?, ?it/s]

Initialising CAT Boost Regression


100%|██████████| 5/5 [02:33<00:00, 30.69s/it]


Train MAE: 0.0522779106976
Valid MAE: 0.0650190201683
BR MAE CatBoost Valid: 0.0650190201683
time: 2min 42s


In [58]:
scores = cross_validation.cross_val_score(model_cb, x_train_data, x_train_label, cv=5, scoring='neg_mean_absolute_error', verbose=1)
print("%s MAE: %0.5f (+/- %0.5f)" % (model_cb.__class__.__name__, scores.mean(), scores.std() * 2))

CatBoostRegressor MAE: -0.06777 (+/- 0.00186)
time: 28.6 s


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   28.6s finished


# GB

In [29]:
from sklearn.ensemble import GradientBoostingRegressor

model_gb = GradientBoostingRegressor(
             random_state=2016, verbose=1,
             n_estimators=500, max_features=12, max_depth=8,
             learning_rate=0.05, subsample=0.8)

model_gb.fit(x_train, y_train)
y_pred_gb_valid = model_gb.predict(x_valid)
y_pred_gb_train = model_gb.predict(x_train_data)
models['GradientBoosting'] = model_gb

predicted_mae_gb_valid = mean_absolute_error(y_valid,       y_pred_gb_valid)
predicted_mae_gb_train = mean_absolute_error(x_train_label, y_pred_gb_train)

print('BR MAE GradientBoosting Valid: %s \nTrain: %s' % (predicted_mae_gb_valid, predicted_mae_gb_train))

del y_pred_gb_valid
del y_pred_gb_train

      Iter       Train Loss      OOB Improve   Remaining Time 
         1           0.0070           0.0000            2.92m
         2           0.0069           0.0000            2.80m
         3           0.0069           0.0000            2.75m
         4           0.0069           0.0000            2.79m
         5           0.0068           0.0000            2.80m
         6           0.0069           0.0000            2.81m
         7           0.0068           0.0000            2.69m
         8           0.0068           0.0000            2.70m
         9           0.0068           0.0000            2.69m
        10           0.0068           0.0000            2.68m
        20           0.0067           0.0000            2.55m
        30           0.0065          -0.0000            2.49m
        40           0.0064           0.0000            2.46m
        50           0.0063          -0.0000            2.38m
        60           0.0063          -0.0000            2.30m
       

# XGB

In [34]:
params_xgb = {
    'max_depth':        5,  # shuld be 0.5 to 1% of the examples
    'subsample':        1,  # Ratio of observations to be used as samples for each tree
    'min_child_weight': 10, # Deals with imbalanced data and prevents overfitting as the value >
    'objective':        'reg:linear',
    'n_estimators':     1000, # Sequential trees to be modelled.
    'eta':              0.1,  # Shrinkage. Typically between 0.1 - 0.2 - learning rate for gradient boost (D:0.3)
    'eval_metric':      'mae',
    'base_score':       y_mean,
}

d_train = xg.DMatrix(x_train, label=y_train, missing=-1)
d_valid = xg.DMatrix(x_valid, label=y_valid, missing=-1)

watchlist = [(d_train, 'train'), (d_valid, 'valid')]
xgb_gs = xg.train(params_xgb, d_train, len(x_valid), watchlist, early_stopping_rounds=100, verbose_eval=50)
models['XGB'] = xgb_gs

del d_train
del d_valid

[0]	train-mae:0.053332	valid-mae:0.065677
Multiple eval metrics have been passed: 'valid-mae' will be used for early stopping.

Will train until valid-mae hasn't improved in 100 rounds.
[50]	train-mae:0.052084	valid-mae:0.06506
[100]	train-mae:0.051548	valid-mae:0.065021
[150]	train-mae:0.051058	valid-mae:0.065019
Stopping. Best iteration:
[94]	train-mae:0.051603	valid-mae:0.065

time: 36.2 s


# LightGBM

In [35]:
def light_gbm_folds(x_train, x_valid, y_train, y_valid, params, num_ensembles):
    # Light gbm n ensambles average predictions

    y_pred_valid = 0.0
    y_pred_train = 0.0
    
    d_train = lgb.Dataset(x_train, label=y_train)
    
    print('Initialising Light GBM')
    for i in tqdm(range(num_ensembles)):
        # Use CV, tune hyperparameters
        params['seed'] = i
        model_lgb = lgb.train(params, d_train, 430)
        
        lg_pred_valid = model_lgb.predict(x_valid)
        lg_pred_train = model_lgb.predict(x_train)

    lg_pred_valid /= num_ensembles
    lg_pred_train /= num_ensembles
    
    print('Train MAE:', mean_absolute_error(y_train, lg_pred_train))
    print('Valid MAE:', mean_absolute_error(y_valid, lg_pred_valid))
    
    return model_lgb

time: 8.6 ms


In [36]:
import random
import lightgbm as lgb

params_lg={
    'max_bin'          : 10,
    'learning_rate'    : 0.0021, # shrinkage_rate
    'boosting_type'    : 'gbdt',
    'objective'        : 'regression',
    'metric'           : 'mae',      
    'sub_feature'      : 0.345 ,   
    'bagging_fraction' : 0.85, 
    'bagging_freq'     : 40,
    'num_leaves'       : 512,       # num_leaf
    'min_data'         : 500,         # min_data_in_leaf
    'min_hessian'      : 0.05,     # min_sum_hessian_in_leaf
    'verbose'          : 1
}

model_lgb = light_gbm_folds(x_train, x_valid, y_train, y_valid, params_lg, num_ensembles=5)
models['LightGBM'] = model_lgb

  0%|          | 0/5 [00:00<?, ?it/s]

Initialising Light GBM


100%|██████████| 5/5 [00:38<00:00,  7.62s/it]

Train MAE: 0.0533712410193
Valid MAE: 0.0658219877093
time: 38.1 s





# DNN

In [37]:
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from keras.layers import Dropout, BatchNormalization
from keras.layers.advanced_activations import PReLU
from keras.layers.noise import GaussianDropout
from keras.optimizers import Adam
from sklearn.preprocessing import Imputer

def larger_model():
    # create model
    model = Sequential()
    model.add(Dense(size, input_dim=size, kernel_initializer='normal', activation='relu'))
    model.add(Dense(size*2, kernel_initializer='normal', activation='relu'))
    model.add(Dense(size, kernel_initializer='normal', activation='relu'))
    model.add(Dense(1, kernel_initializer='normal'))
    # Compile model
    model.compile(loss='mae', optimizer=Adam(lr=4e-3, decay=1e-4))
    return model

# define wider model
def wider_model():
    # create model
    model = Sequential()
    model.add(Dense(size*2, input_dim=size, kernel_initializer='normal', activation='relu'))
    model.add(Dense(1, kernel_initializer='normal'))
    # Compile model
    model.compile(loss='mae', optimizer=Adam(lr=4e-3, decay=1e-4))
    return model


# define base model
def baseline_model():
    # create model
    model = Sequential()
    model.add(Dense(size, input_dim=size, kernel_initializer='normal', activation='relu'))
    model.add(Dense(1, kernel_initializer='normal'))
    # Compile model
    model.compile(loss='mae', optimizer=Adam(lr=4e-3, decay=1e-4))
    return model

def prebuilt_nn():
    nn = Sequential()
    nn.add(Dense(units = 400 , kernel_initializer = 'normal', input_dim = size))
    nn.add(PReLU())
    nn.add(Dropout(.4))
    nn.add(Dense(units = 160 , kernel_initializer = 'normal'))
    nn.add(PReLU())
    nn.add(BatchNormalization())
    nn.add(Dropout(.6))
    nn.add(Dense(units = 64 , kernel_initializer = 'normal'))
    nn.add(PReLU())
    nn.add(BatchNormalization())
    nn.add(Dropout(.5))
    nn.add(Dense(units = 26, kernel_initializer = 'normal'))
    nn.add(PReLU())
    nn.add(BatchNormalization())
    nn.add(Dropout(.6))
    nn.add(Dense(1, kernel_initializer='normal'))
    nn.compile(loss='mae', optimizer=Adam(lr=4e-3, decay=1e-4))

    return nn

time: 136 ms


In [38]:
## Preprocessing
print("Preprocessing neural network data...")
imputer= Imputer()
imputer.fit(x_train.iloc[:, :])
x_train_nn = imputer.transform(x_train.iloc[:, :])

imputer.fit(x_valid.iloc[:, :])
x_valid_nn = imputer.transform(x_valid.iloc[:, :])

sc = StandardScaler()
x_train_nn = sc.fit_transform(x_train_nn)
x_valid_nn = sc.transform(x_valid_nn)

Preprocessing neural network data...
time: 289 ms


In [39]:
# fix random seed for reproducibility
seed = 7
size = x_train_nn.shape[1]
# Prebuit KAGGLE Kernel
np.random.seed(seed)
estimators = []
estimators.append(('standardize', StandardScaler()))
estimators.append(('mlp', KerasRegressor(build_fn=prebuilt_nn, epochs=5, batch_size=50, verbose=0)))
pipeline = Pipeline(estimators)
pipeline.fit(x_train_nn, y_train)
models['DNN'] = pipeline

print(mean_absolute_error(y_valid, pipeline.predict(x_valid_nn)))

time: 1min 5s


# LSTM

In [44]:
from numpy import concatenate
from matplotlib import pyplot
from pandas import read_csv
from pandas import DataFrame
from pandas import concat
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
 
#x_train = x_train.values
#x_valid = x_valid.values

# reshape input to be 3D [samples, timesteps, features]
x_train_lstm = x_train_data.values.reshape((x_train_data.shape[0], 1, x_train_data.shape[1]))
x_valid_lstm = x_valid.values.reshape((x_valid.shape[0], 1, x_valid.shape[1]))
 
# design network
lstm = Sequential()
lstm.add(LSTM(50, input_shape=(x_train_lstm.shape[1], x_train_lstm.shape[2])))
lstm.add(PReLU())
lstm.add(Dropout(.2))
lstm.add(Dense(units = 100 , kernel_initializer = 'normal'))
lstm.add(PReLU())
lstm.add(Dropout(.2))
lstm.add(Dense(units = 50 , kernel_initializer = 'normal'))
lstm.add(PReLU())
lstm.add(Dense(1))
lstm.compile(loss='mae', optimizer='adam')
# fit network
lstm.fit(x_train_lstm, x_train_label, epochs=15, batch_size=50, validation_data=(x_valid_lstm, y_valid), verbose=1, shuffle=False)
 
# make a prediction
yhat = lstm.predict(x_valid_lstm)
models['LSTM'] = lstm
mae = mean_absolute_error(y_valid, yhat)
print('Test MAE: %.3f' % mae)

Train on 81247 samples, validate on 9028 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
Test MAE: 0.066
time: 3min 47s


# Stacking

In [140]:
# https://github.com/dnc1994/Kaggle-Playground/blob/master/home-depot/ensemble.py
import time
from sklearn.metrics import mean_absolute_error, make_scorer
from xgboost import XGBRegressor
from sklearn.cross_validation import KFold
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor, \
        ExtraTreesRegressor, AdaBoostClassifier
from sklearn import grid_search

def mean_absolute_error_(ground_truth, predictions):
    return mean_absolute_error(ground_truth, predictions)

MAE = make_scorer(mean_absolute_error_, greater_is_better=False)

params_xgb = {
    'max_depth':        5,  # shuld be 0.5 to 1% of the examples
    'subsample':        1,  # Ratio of observations to be used as samples for each tree
    'min_child_weight': 10, # Deals with imbalanced data and prevents overfitting as the value >
    'objective':        'reg:linear',
    'n_estimators':     1000, # Sequential trees to be modelled.
    'eta':              0.1,  # Shrinkage. Typically between 0.1 - 0.2 - learning rate for gradient boost (D:0.3)
    'eval_metric':      'mae'
}

class Ensemble(object):
    
    def __init__(self, n_folds, stacker, base_models):
        self.n_folds = n_folds
        self.stacker = stacker
        self.base_models = base_models

    def fit(self, X, y):
        X = np.array(X)
        y = np.array(y)
        folds = list(KFold(len(y), n_folds=self.n_folds, shuffle=True, random_state=2016))
        S_train = np.zeros((X.shape[0], len(self.base_models)))
        start_time = time.time()
        
        for i, c in enumerate(self.base_models):
            print('Fitting For Base Model {} ---'.format(c))       
            clf = self.base_models[c]
            
            for j, (train_idx, test_idx) in enumerate(folds):
                print('--- Fitting For Fold %d / %d ---', j+1, self.n_folds)
                X_train = X[train_idx]
                y_train = y[train_idx]
                X_holdout = X[test_idx]
                
                if c not in ['XGB', 'LightGBM', 'LSTM']:
                    clf.fit(X_train, y_train)
                    y_pred = clf.predict(X_holdout)[:]
                    
                    S_train[test_idx, i] = y_pred
                    
                elif c in ['LSTM']:
                    x_train_lstm = X_train.reshape((X_train.shape[0], 1, X_train.shape[1]))

                    clf.fit(x_train_lstm, y_train, epochs=15, batch_size=50, verbose=1, shuffle=False)
                    y_pred = clf.predict(X_holdout.reshape((X_holdout.shape[0], 1, X_holdout.shape[1])))[:]
                    
                    S_train[test_idx, i] = [i[0] for i in y_pred]

                else:
                    d_train = xg.DMatrix(X_train, label=y_train, missing=-1)
                    d_valid = xg.DMatrix(X_holdout, missing=-1)
                    
                    clf = xg.train(params_xgb, d_train)
                    y_pred = clf.predict(d_valid)[:]
                    S_train[test_idx, i] = y_pred
                    
                print('Elapsed: %s minutes ---' % round(((time.time() - start_time) / 60), 2))

            print('Elapsed: %s minutes ---' % round(((time.time() - start_time) / 60), 2))

        print('--- Base Models Trained: %s minutes ---' % round(((time.time() - start_time) / 60), 2))

        clf = self.stacker
        clf.fit(S_train, y)
        print('--- Stacker Trained: %s minutes ---' % round(((time.time() - start_time) / 60), 2))

    def predict(self, X):
        X = np.array(X)
        folds = list(KFold(len(X), n_folds=self.n_folds, shuffle=True, random_state=2016))
        S_test = np.zeros((X.shape[0], len(self.base_models)))

        for i, c in enumerate(self.base_models):
            clf = self.base_models[c]
            S_test_i = np.zeros((X.shape[0], len(folds)))
            for j, (train_idx, test_idx) in enumerate(folds):
                
                if c not in ['XGB', 'LightGBM', 'LSTM']:
                    S_test_i[:, j] = clf.predict(X)[:]
                    
                elif c in ['LSTM']:
                    S_test_i[:, j] = [i for i in clf.predict(X.reshape((X.shape[0], 1, X.shape[1])))[:]]
                    
                else:
                    S_test_i[:, j] = clf.predict(xg.DMatrix(X, missing=-1))[:]
                
            S_test[:, i] = S_test_i.mean(1)

        clf = self.stacker
        y_pred = clf.predict(S_test)[:]
        
        return y_pred

    def fit_predict(self, X, y, T):
        X = np.array(X)
        y = np.array(y)
        T = np.array(T)

        start_time = time.time()
        folds = list(KFold(len(y), n_folds=self.n_folds, shuffle=True, random_state=2016))

        S_train = np.zeros((X.shape[0], len(self.base_models)))
        S_test  = np.zeros((T.shape[0], len(self.base_models)))

        for i, c in enumerate(self.base_models):
            print('Fitting For Base Model {} ---'.format(c))
            clf = self.base_models[c]
            S_test_i = np.zeros((T.shape[0], len(folds)))

            for j, (train_idx, test_idx) in enumerate(folds):
                print('--- Fitting For Fold #{0} / {1} ---'.format(j+1, self.n_folds))
                X_train = X[train_idx]
                y_train = y[train_idx]
                X_holdout = X[test_idx]
                
                if c not in ['XGB', 'LightGBM', 'LSTM']:
                    clf.fit(X_train, y_train)
                    y_pred = clf.predict(X_holdout)[:]
                    
                    S_train[test_idx, i] = y_pred
                    S_test_i[:, j] = clf.predict(T)[:]
                    
                elif c in ['LSTM']:
                    x_train_lstm = X_train.reshape((X_train.shape[0], 1, X_train.shape[1]))
                    
                    clf.fit(x_train_lstm, y_train, epochs=15, batch_size=50, verbose=1, shuffle=False)
                    y_pred = clf.predict(X_holdout.reshape((X_holdout.shape[0], 1, X_holdout.shape[1])))[:]
                    
                    S_train[test_idx, i] = [i[0] for i in y_pred]
                    S_test_i[:, j] = [i for i in clf.predict(T.reshape((T.shape[0], 1, T.shape[1])))[:]]
                    
                else:
                    d_train = xg.DMatrix(X_train, label=y_train, missing=-1)
                    d_valid = xg.DMatrix(X_holdout, missing=-1)
                    
                    clf = xg.train(params_xgb, d_train)
                    y_pred = clf.predict(d_valid)[:]
                    
                    S_train[test_idx, i] = y_pred
                    S_test_i[:, j] = clf.predict(xg.DMatrix(T, missing=-1))[:]

                print('Elapsed: %s minutes ---' % round(((time.time() - start_time) / 60), 2))

            S_test[:, i] = S_test_i.mean(1)
            print('Elapsed: %s minutes ---' % round(((time.time() - start_time) / 60), 2))

        print('--- Base Models Trained: %s minutes ---' % round(((time.time() - start_time) / 60), 2))

        param_grid = {'n_estimators':  [100],
                      'learning_rate': [0.05],
                      'subsample':     [0.75]}
        
        grid = grid_search.GridSearchCV(estimator=self.stacker, param_grid=param_grid, n_jobs=1, cv=5, verbose=20, scoring=MAE)
        grid.fit(S_train, y)

        try:
            print('Param grid:')
            print(param_grid)
            print('Best Params:')
            print(grid.best_params_)
            print('Best CV Score:')
            print(-grid.best_score_)
            print('Best estimator:')
            print(grid.best_estimator_)
            print(message)
        except:
            pass

        print('--- Stacker Trained: %s minutes ---' % round(((time.time() - start_time) / 60), 2))
        y_pred = grid.predict(S_test)[:]

        return y_pred

time: 362 ms


In [142]:
ensemble = Ensemble(n_folds=5,
                    stacker=GradientBoostingRegressor(random_state=2016, verbose=1),
                    base_models=models)
                    
model_ensemble = ensemble.fit_predict(x_train[:1000], y_train[:1000], x_valid)

#print('MAE', mean_absolute_error(y_valid, ensemble_prediction))

Fitting For Base Model LinearRegression ---
--- Fitting For Fold #1 / 5 ---
Elapsed: 0.0 minutes ---
--- Fitting For Fold #2 / 5 ---
Elapsed: 0.0 minutes ---
--- Fitting For Fold #3 / 5 ---
Elapsed: 0.0 minutes ---
--- Fitting For Fold #4 / 5 ---
Elapsed: 0.0 minutes ---
--- Fitting For Fold #5 / 5 ---
Elapsed: 0.0 minutes ---
Elapsed: 0.0 minutes ---
Fitting For Base Model BayesianRidge ---
--- Fitting For Fold #1 / 5 ---
Elapsed: 0.0 minutes ---
--- Fitting For Fold #2 / 5 ---
Elapsed: 0.0 minutes ---
--- Fitting For Fold #3 / 5 ---
Elapsed: 0.0 minutes ---
--- Fitting For Fold #4 / 5 ---
Elapsed: 0.0 minutes ---
--- Fitting For Fold #5 / 5 ---
Elapsed: 0.0 minutes ---
Elapsed: 0.0 minutes ---
Fitting For Base Model RandomForest ---
--- Fitting For Fold #1 / 5 ---


[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:    1.9s finished
[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:    0.6s finished


Elapsed: 0.05 minutes ---
--- Fitting For Fold #2 / 5 ---


[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:    2.0s finished
[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:    0.6s finished


Elapsed: 0.1 minutes ---
--- Fitting For Fold #3 / 5 ---


[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:    1.8s finished
[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:    0.6s finished


Elapsed: 0.15 minutes ---
--- Fitting For Fold #4 / 5 ---


[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:    1.9s finished
[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:    0.6s finished


Elapsed: 0.19 minutes ---
--- Fitting For Fold #5 / 5 ---


[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:    1.9s finished
[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:    0.5s finished


Elapsed: 0.24 minutes ---
Elapsed: 0.24 minutes ---
Fitting For Base Model ExtraTrees ---
--- Fitting For Fold #1 / 5 ---


[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:    0.9s finished
[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:    0.6s finished


Elapsed: 0.27 minutes ---
--- Fitting For Fold #2 / 5 ---


[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:    0.9s finished
[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:    0.6s finished


Elapsed: 0.3 minutes ---
--- Fitting For Fold #3 / 5 ---


[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:    0.8s finished
[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:    0.5s finished


Elapsed: 0.33 minutes ---
--- Fitting For Fold #4 / 5 ---


[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:    0.9s finished
[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:    0.6s finished


Elapsed: 0.36 minutes ---
--- Fitting For Fold #5 / 5 ---


[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:    0.9s finished
[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:    0.6s finished


Elapsed: 0.4 minutes ---
Elapsed: 0.4 minutes ---
Fitting For Base Model GradientBoosting ---
--- Fitting For Fold #1 / 5 ---
      Iter       Train Loss      OOB Improve   Remaining Time 
         1           0.0074          -0.0001            0.93s
         2           0.0073          -0.0001            0.85s
         3           0.0071          -0.0000            0.82s
         4           0.0072          -0.0000            0.84s
         5           0.0070          -0.0000            0.81s
         6           0.0067          -0.0000            0.80s
         7           0.0063          -0.0000            0.80s
         8           0.0066          -0.0000            0.83s
         9           0.0066           0.0000            0.82s
        10           0.0061          -0.0000            0.83s
        20           0.0043          -0.0000            0.76s
        30           0.0035          -0.0000            0.76s
        40           0.0028          -0.0000            0.76s
     

Elapsed: 0.5 minutes ---
--- Fitting For Fold #2 / 5 ---
Elapsed: 0.5 minutes ---
--- Fitting For Fold #3 / 5 ---
Elapsed: 0.5 minutes ---
--- Fitting For Fold #4 / 5 ---
Elapsed: 0.5 minutes ---
--- Fitting For Fold #5 / 5 ---
Elapsed: 0.5 minutes ---
Elapsed: 0.5 minutes ---
Fitting For Base Model LightGBM ---
--- Fitting For Fold #1 / 5 ---
Elapsed: 0.5 minutes ---
--- Fitting For Fold #2 / 5 ---
Elapsed: 0.5 minutes ---
--- Fitting For Fold #3 / 5 ---
Elapsed: 0.5 minutes ---
--- Fitting For Fold #4 / 5 ---
Elapsed: 0.5 minutes ---
--- Fitting For Fold #5 / 5 ---
Elapsed: 0.5 minutes ---
Elapsed: 0.5 minutes ---
Fitting For Base Model DNN ---
--- Fitting For Fold #1 / 5 ---
Elapsed: 0.82 minutes ---
--- Fitting For Fold #2 / 5 ---
Elapsed: 1.14 minutes ---
--- Fitting For Fold #3 / 5 ---
Elapsed: 1.46 minutes ---
--- Fitting For Fold #4 / 5 ---
Elapsed: 1.78 minutes ---
--- Fitting For Fold #5 / 5 ---
Elapsed: 2.11 minutes ---
Elapsed: 2.11 minutes ---
Fitting For Base Model LSTM -

[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.2s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.2s remaining:    0.0s


        70           0.0053          -0.0000            0.02s
        80           0.0054          -0.0000            0.02s
        90           0.0049          -0.0000            0.01s
       100           0.0048          -0.0000            0.00s
[CV]  learning_rate=0.05, n_estimators=100, subsample=0.75, score=-0.054956 -   0.1s
[CV] learning_rate=0.05, n_estimators=100, subsample=0.75 ............
      Iter       Train Loss      OOB Improve   Remaining Time 
         1           0.0082          -0.0000            0.08s
         2           0.0077          -0.0000            0.08s
         3           0.0077          -0.0000            0.08s
         4           0.0077           0.0000            0.08s
         5           0.0082          -0.0000            0.10s
         6           0.0075           0.0000            0.10s
         7           0.0071           0.0000            0.10s
         8           0.0075          -0.0000            0.10s
         9           0.0074          

[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    0.3s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.4s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.4s finished


In [144]:
model_ensemble.predict(x_valid)

ValueError: Number of features of the model must match the input. Model n_features is 10 and input n_features is 47 

time: 33.7 ms


In [None]:
########## LAYER 1 ##########
# Submodel  1 : OLS                      # Ordinary least squares estimator Sklearn implementation
# Submodel  2 : BR                       # Bayesian ridge regression - Sklearn implementation
# Submodel  3 : DNN                      # Dense Neural Network - Keras - Dense layers 
# Submodel  4 : LightGBM                 # Light Gradient Boosting - https://github.com/Microsoft/LightGBM
# Submodel  5 : XGBoost                  # Extreme Gradient Boosting - http://xgboost.readthedocs.io/en/latest/model.html
# Submodel  6 : CatBoost                 # Categorical Boosting https://github.com/catboost/catboost
# Submodel  7 : LSTM                     # Long Short Term Memory Neural Network - Keras implementation
# Submodel  8 : RandomForestRegressor    # Sklearn implementation
# Submodel  9 : ExtraTreesRegressor      # Sklearn implementation
# Submodel 10 : SVR                      # Support vector machines for regression - Sklearn implementation
# Submodel 11 : AdaBoost                 # Adaptive Boosting Sklearn Implementation

########## LAYER 2 ##########
# https://www.kaggle.com/dragost/boosted-trees-lb-0-0643707/edit

# Save data

In [None]:
x_predict = transactions.copy()

In [None]:
submission_sample[m] = submission_sample['ParcelId'].to_frame().merge(x_predict[['parcelid', m]], how='left', left_on='ParcelId', right_on='parcelid')[m]

In [None]:
# https://www.kaggle.com/c/zillow-prize-1/discussion/33899, Oct,Nov,Dec
test_dates = {
    '201610': pd.Timestamp('2016-09-30'),
    '201611': pd.Timestamp('2016-10-31'),
    '201612': pd.Timestamp('2016-11-30'),
    '201710': pd.Timestamp('2017-09-30'),
    '201711': pd.Timestamp('2017-10-31'),
    '201712': pd.Timestamp('2017-11-30')
}

all_preds = pd.DataFrame()
for m in test_dates.keys():
    # Building predictions.
    print('Processing', m)
    x_predict = transactions.copy()
    x_predict = complex_features(x_predict)
    x_predict['transactiondate'] = test_dates[m]
    x_predict = time_data(x_predict)
    
    print('Cleaning data')
    print('........')
    
    x_predict = x_predict.fillna(-999).astype(str).apply(LabelEncoder().fit_transform)
    
    print('Predicting')
    print('........')
    
    x_predict[m] = xgb_gs2.predict(xg.DMatrix(x_predict[best_columns]))
    all_preds[m] = x_predict[m].copy()
    submission_sample[m] = submission_sample['ParcelId'].to_frame().merge(x_predict[['parcelid', m]], how='left', left_on='ParcelId', right_on='parcelid')[m]
    
#del x_predict

In [None]:
x_predict[list(test_dates.keys())] = all_preds[list(test_dates.keys())]

In [None]:
x_predict

In [None]:
m = '201610'
submission_sample[m] = submission_sample['ParcelId'].to_frame().merge(x_predict[['parcelid', m]], how='left', left_on='ParcelId', right_on='parcelid')[m]

In [None]:
submission_sample

In [None]:
submission_sample.to_csv('submission4.csv',index=False)
submission_sample.head()

In [None]:
# https://www.kaggle.com/c/zillow-prize-1/discussion/33899, Oct,Nov,Dec
test_dates = {'201610': pd.Timestamp('2016-09-30'),
              '201611': pd.Timestamp('2016-10-31'),
              '201612': pd.Timestamp('2016-11-30')}

x_predict = transactions.copy()
x_predict = complex_features(x_predict)

for m in test_dates.keys():
    print('Processing', m)  
    x_predict['transactiondate'] = test_dates[m]
    x_predict = time_data(x_predict)
    x_predict = x_predict[best_columns].fillna(-999).astype(str).apply(LabelEncoder().fit_transform)
    
    print('Predicting')
    print('........')
    
    # 5 iterations cat booster for each prediction.
    x_predict[m] = cat_booster(x_train, y_train, x_predict.values)
    submission_sample[m] = submission_sample['ParcelId'].to_frame().merge(x_predict[['parcelid', m]], how='left', left_on='ParcelId', right_on='parcelid')[m]
    
del x_predict
del predictions

# RFE