In [1]:
import pandas as pd
from pandas import DataFrame,Series
import numpy as np
import os
import datetime
import gc

#Plotting
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
%matplotlib inline

# sklearn stuff
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.metrics import confusion_matrix, mean_squared_error, precision_score, recall_score, f1_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder, Imputer 
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin

import feature_pipelines as pipes

In [2]:
def generate_test_probs(model, col_names=None, type = None):
    model_preds = None
#     change month of properties
#     properties['month'] = month
    for i in range(int(properties.shape[0] / 100000)):   
        # get current test features
        current_test_feats = feature_pipeline.transform(properties.iloc[i*100000:(i+1)*100000])

        # predict on current test obs
        current_preds = DataFrame(model.predict_proba(current_test_feats), columns=col_names,
                              index = np.arange(i*100000,(i+1)*100000))

        if model_preds is not None:
            model_preds = pd.concat([model_preds, current_preds])
        else:
            model_preds = current_preds

    #  fencepost problem
    current_test_feats = feature_pipeline.transform(properties.iloc[2900000:])
    # predict on current test obs
    current_preds = DataFrame(model.predict_proba(current_test_feats), columns=col_names,
                             index = np.arange(2900000,2985217))
    model_preds = pd.concat([model_preds, current_preds])
#     del properties['month']
    return model_preds

In [3]:
def mean_absolute_errors(submission_df, comparison_df):
    """
    This function takes a submission entry for public leaderboard, and returns
    the training error for each month.
    """
    # training error
    trainresults = pd.merge(submission_df[['ParcelId','201610','201611','201612']], comparison_df[['parcelid','logerror','month']],
                           left_on='ParcelId', right_on='parcelid')
    oct_error = abs(trainresults[trainresults['month'] == 10]['201610'] 
                    - trainresults[trainresults['month'] == 10]['logerror']).mean()
    nov_error = abs(trainresults[trainresults['month'] == 11]['201611'] 
                    - trainresults[trainresults['month'] == 11]['logerror']).mean()
    dec_error = abs(trainresults[trainresults['month'] == 12]['201612'] 
                    - trainresults[trainresults['month'] == 12]['logerror']).mean()
    overall_mae = (oct_error*(trainresults['month'] == 10).sum() + nov_error*(trainresults['month'] == 11).sum() 
                        + dec_error*(trainresults['month'] == 12).sum()) / (trainresults['month'].isin([10,11,12])).sum()
    return (oct_error, nov_error, dec_error, overall_mae)

### Reading in data

In [4]:
maindir = "/home/anerdi/Desktop/Zillow"
# create data DataFrame
logerror = pd.read_csv(maindir + "/data/train_2016_v2.csv/train_2016_v2.csv")
logerror['weeknumber'] = logerror['transactiondate'].apply(lambda x: datetime.datetime.strptime(x,'%Y-%m-%d').isocalendar()[1])
logerror['month'] = logerror['transactiondate'].apply(lambda x: datetime.datetime.strptime(x,'%Y-%m-%d').month)
properties = pd.read_csv(maindir + "/data/properties_2016.csv/properties_2016.csv")

  interactivity=interactivity, compiler=compiler, result=result)


In [5]:
#proportion of living area
properties['N-LivingAreaProp'] = properties['calculatedfinishedsquarefeet']/properties['lotsizesquarefeet']

properties['N-NonLivingAreaProp'] = properties['garagetotalsqft']/properties['lotsizesquarefeet']

#Ratio of the built structure value to land area
properties['N-ValueProp'] = properties['structuretaxvaluedollarcnt']/properties['landtaxvaluedollarcnt']

#Ratio of tax of property over parcel
properties['N-ValueRatio'] = properties['taxvaluedollarcnt']/properties['taxamount']

# Pool
properties['poolsizesum'] = properties['poolsizesum'].fillna(0)
# properties['Pool'] = (properties['poolsizesum'] > 0).astype(int)
properties['Pool'] = (properties['pooltypeid2'].fillna(0) + properties['pooltypeid7'].fillna(0)).astype(int)

properties['regionidcounty'] = properties['regionidcounty'].fillna(9999)

# some more feature engineering
properties['age'] = 2017 - properties['yearbuilt']
properties['additional_rooms_count'] = np.maximum((properties['roomcnt'].values 
                                                   - properties['calculatedbathnbr'].values
                                                   - properties['bedroomcnt'].values),0)

### Feature pipeline 

In [6]:
# Setup variables considered in the model

# numerical variables
num_atts = ['bedroomcnt','calculatedbathnbr','age','additional_rooms_count',
           'calculatedfinishedsquarefeet','fullbathcnt','garagecarcnt','garagetotalsqft',
            'latitude','longitude','lotsizesquarefeet', 'roomcnt',
           'numberofstories','structuretaxvaluedollarcnt','taxvaluedollarcnt','landtaxvaluedollarcnt','taxamount',
           'N-ValueRatio', 'N-LivingAreaProp', 'N-NonLivingAreaProp','N-ValueProp']

# categorical varaibles
cat_atts = ['airconditioningtypeid','architecturalstyletypeid',
           'buildingclasstypeid','heatingorsystemtypeid','Pool','propertylandusetypeid','regionidcounty',
           'storytypeid','typeconstructiontypeid','fireplaceflag','taxdelinquencyflag']

# Dictionary of categorical variables and their default levels
cat_dict = {'airconditioningtypeid':[-1] + list(range(1,14)),
           'architecturalstyletypeid':[-1] + list(range(1,28)),
           'buildingclasstypeid':[-1] + list(range(1,6)),
            'heatingorsystemtypeid':[-1] + list(range(1,26)),
            'pooltypeid10': list(range(-1,2)),
            'pooltypeid2': list(range(-1,2)),
            'pooltypeid7': list(range(-1,2)),
            'propertylandusetypeid': [-1, 31,46,47,246,247,248,260,261,262,263,264,265,266,267,268,269,270,271,
                                     273,274,275,276,279,290,291],
            'regionidcounty': [2061,3101,1286],
            'storytypeid':[-1] + list(range(1,36)),
            'typeconstructiontypeid':[-1] + list(range(1,19)),
            'yearbuilt': [-1] + list(range(1885,2018)),
            'fireplaceflag': [-1] + ['True','False'],
            'taxdelinquencyflag': [-1] + ['Y','N']
           }

In [7]:
# Categorical pipeline
cat_pipeline = Pipeline([
        ('select_and_dummify', pipes.DF_Selector_GetDummies(cat_dict)),
    ])

# Numerical pipeline
num_pipeline = Pipeline([
        ('selector', pipes.DataFrameSelector(num_atts)),
        ('imputer', Imputer()),
        ('scaler', StandardScaler())
    ])

# Full pipeline
feature_pipeline = FeatureUnion(transformer_list=[
        ("num_pipeline", num_pipeline),
        ("cat_pipeline", cat_pipeline)
    ])

In [8]:
# impute missing num_atts per regionid
for countyid in properties.regionidcounty.unique():
    # setup condition
    cond = properties['regionidcounty'] == countyid
    indices = np.where(cond)[0]
    # impute values based on region
    if countyid != 9999:
        properties.loc[indices,num_atts] = (properties.loc[indices,num_atts]
                                .fillna(properties.loc[indices,num_atts]
                                .apply(np.mean)))
    else:
        properties.loc[indices,num_atts] = (properties.loc[indices,num_atts]
                                            .fillna(properties[num_atts]
                                            .apply(np.mean)))

In [9]:
assert properties[num_atts].isnull().any().any() == False

In [10]:
# join on parcel id
data = pd.merge(properties,logerror[['parcelid','logerror','month']], on='parcelid')

In [11]:
feature_pipeline.fit(data)

FeatureUnion(n_jobs=1,
       transformer_list=[('num_pipeline', Pipeline(memory=None,
     steps=[('selector', DataFrameSelector(desired_cols=['bedroomcnt', 'calculatedbathnbr', 'age', 'additional_rooms_count', 'calculatedfinishedsquarefeet', 'fullbathcnt', 'garagecarcnt', 'garagetotalsqft', 'latitude', 'longitude', 'lotsizesqu...'typeconstructiontypeid': [-1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18]}))]))],
       transformer_weights=None)

### Reading in level-one data

In [12]:
l1data_rfs = pd.read_csv("/home/anerdi/Desktop/Zillow/levelonedata/l1data_twostage_rfs_age.csv.gz")
# l1data_rfs_last_fold = pd.read_csv("/home/anerdi/Desktop/Zillow/levelonedata/l1data_twostage_rfs_last_fold.csv.gz")
# l1data_rfs[~l1data_rfs_last_fold.isnull().any(axis=1)] = l1data_rfs_last_fold[~l1data_rfs_last_fold.isnull().any(axis=1)]

In [13]:
l1data_linear_models = pd.read_csv("/home/anerdi/Desktop/Zillow/levelonedata/l1data_twostage_linear_models_age.csv.gz")
# l1data_linear_models_last_fold = pd.read_csv("/home/anerdi/Desktop/Zillow/levelonedata/l1data_twostage_linear_models_last_fold.csv.gz")
# l1data_linear_models[~l1data_linear_models_last_fold.isnull().any(axis=1)] = l1data_linear_models_last_fold[~l1data_linear_models_last_fold.isnull().any(axis=1)]

In [14]:
assert l1data_rfs.isnull().any().any() == False
assert l1data_linear_models.isnull().any().any() == False

In [15]:
assert (l1data_linear_models.parcelid == l1data_rfs.parcelid).all()
assert (l1data_linear_models.parcelid == data.parcelid).all()
assert l1data_linear_models.shape[0] == data.shape[0]

In [16]:
l1data = pd.concat([l1data_linear_models, l1data_rfs.iloc[:,1:], data[['logerror']]], axis=1)
# l1data = pd.merge(l1data, data[['parcelid','logerror']], on = 'parcelid')

In [17]:
l1data.head()

Unnamed: 0,parcelid,stacked_rfs_ridge,stacked_rfs_enet,stacked_rfs_lasso,stacked_rfs_larm,stacked_rfs_huber,stacked_annrfs_ridge,stacked_annrfs_enet,stacked_annrfs_lasso,stacked_annrfs_larm,...,stacked_rfs_rf_maxdepth8,stacked_rfs_rf_maxdepth10,stacked_rfs_rf_maxdepth12,stacked_annrfs_rf_maxdepth8,stacked_annrfs_rf_maxdepth10,stacked_annrfs_rf_maxdepth12,logistic_rf_maxdepth8,logistic_rf_maxdepth10,logistic_rf_maxdepth12,logerror
0,17073783,0.026642,0.0265,0.026087,0.027531,0.009768,0.021656,0.021685,0.021387,0.021477,...,0.0269,0.024892,0.026428,0.020637,0.018499,0.019837,0.019777,0.017621,0.018932,0.0953
1,17088994,0.008293,0.009277,0.0092,0.007287,0.003119,0.010055,0.01105,0.010974,0.009521,...,0.006175,0.006116,0.005256,0.008567,0.008466,0.007555,0.011739,0.011584,0.010603,0.0198
2,17100444,0.022774,0.021773,0.020932,0.011867,0.001351,0.027659,0.026147,0.024698,0.016643,...,0.014502,0.012973,0.014187,0.018923,0.017378,0.018522,0.006619,0.00512,0.006457,0.006
3,17102429,0.01775,0.017539,0.017084,0.010695,0.001707,0.026112,0.025017,0.023709,0.019355,...,0.011784,0.011442,0.013102,0.019883,0.019471,0.020995,0.012073,0.011728,0.013383,-0.0566
4,17109604,0.018645,0.018092,0.017843,0.015738,0.013915,0.016518,0.015872,0.015628,0.013343,...,0.015035,0.015901,0.015612,0.012726,0.013686,0.013483,0.017651,0.018409,0.018022,0.0573


In [19]:
for c in l1data.columns:
    print(c)

parcelid
stacked_rfs_ridge
stacked_rfs_enet
stacked_rfs_lasso
stacked_rfs_larm
stacked_rfs_huber
stacked_annrfs_ridge
stacked_annrfs_enet
stacked_annrfs_lasso
stacked_annrfs_larm
stacked_annrfs_huber
logistic_ridge
logistic_enet
logistic_lasso
logistic_larm
logistic_huber
stacked_rfs_rf_maxdepth8
stacked_rfs_rf_maxdepth10
stacked_rfs_rf_maxdepth12
stacked_annrfs_rf_maxdepth8
stacked_annrfs_rf_maxdepth10
stacked_annrfs_rf_maxdepth12
logistic_rf_maxdepth8
logistic_rf_maxdepth10
logistic_rf_maxdepth12
logerror


In [18]:
l1data.shape

(90275, 26)

### Heuristic for finding weights 

#### Stand Alones

In [23]:
# Set up quadratic objective function
X = l1data[['logistic_lasso', 'stacked_rfs_lasso', 'stacked_annrfs_lasso']].values
y = l1data.logerror.values

In [20]:
abserror = abs(X-np.tile(l1data.logerror.values[:,np.newaxis],X.shape[1]))

In [21]:
wins = np.array([(abserror[:,i] == np.min(abserror, axis=1)).sum() for i in range(X.shape[1])])

In [22]:
wins / wins.sum()

array([ 0.33065633,  0.36485184,  0.30449183])

#### All models

In [27]:
X = l1data[['stacked_annrfs_enet',
 'stacked_annrfs_lasso',
 'stacked_annrfs_rf_maxdepth8',
 'stacked_annrfs_ridge',
 'stacked_rfs_ridge']]

In [28]:
abserror = abs(X.values-np.tile(l1data.logerror.values[:,np.newaxis],X.shape[1]))

In [29]:
wins = np.array([(abserror[:,i] == np.min(abserror, axis=1)).sum() for i in range(X.shape[1])])
wins/wins.sum()

array([ 0.07591249,  0.09201883,  0.25784547,  0.16796455,  0.40625865])

In [27]:
# enet_wins = wins[::7].sum()
# ridge_wins = wins[7::7].sum()
# lasso_wins = wins[6:9].sum()
# larm_wins = wins[9:12].sum()
# huber_wins = wins[12:15].sum()
# rf_wins = wins[15:].sum()

In [28]:
# np.array([ridge_wins, enet_wins, lasso_wins, larm_wins, huber_wins, rf_wins]) / wins.sum() 

### Finding weights via classification 

In [30]:
newy = np.argmin(abserror, axis =1)

In [31]:
from sklearn.ensemble import RandomForestClassifier

In [32]:
clf = RandomForestClassifier(max_depth=15, random_state=9)

In [33]:
assert (l1data.parcelid == data.parcelid).all()

In [34]:
clf.fit(feature_pipeline.transform(data),newy)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=15, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=9, verbose=0, warm_start=False)

In [35]:
clf.score(feature_pipeline.transform(data), newy)

0.52674605372473005

In [36]:
clf.predict_proba(feature_pipeline.transform(data)).shape

(90275, 5)

### Using classifier to find weights for test set

In [37]:
test_probs = generate_test_probs(clf, col_names= X.columns)

In [38]:
test_probs.head()

Unnamed: 0,stacked_annrfs_enet,stacked_annrfs_lasso,stacked_annrfs_rf_maxdepth8,stacked_annrfs_ridge,stacked_rfs_ridge
0,0.124644,0.032638,0.090185,0.357573,0.39496
1,0.141738,0.023277,0.098019,0.328742,0.408225
2,0.183933,0.03263,0.261554,0.100823,0.42106
3,0.257326,0.015538,0.330695,0.068147,0.328294
4,0.238848,0.041081,0.156981,0.192375,0.370715


In [39]:
test_probs.describe()

Unnamed: 0,stacked_annrfs_enet,stacked_annrfs_lasso,stacked_annrfs_rf_maxdepth8,stacked_annrfs_ridge,stacked_rfs_ridge
count,2985217.0,2985217.0,2985217.0,2985217.0,2985217.0
mean,0.07765218,0.08948346,0.2532114,0.1652891,0.4143639
std,0.06325371,0.06468272,0.1074407,0.07290686,0.07589662
min,0.0,0.0,0.003030303,0.0,0.02888016
25%,0.03612953,0.04063128,0.1653123,0.1192126,0.3677362
50%,0.05775524,0.071871,0.2326592,0.1681708,0.4255204
75%,0.1005334,0.1232688,0.3199874,0.2051817,0.4647585
max,0.7358734,0.7717611,0.7743199,0.7732723,0.8732251


In [40]:
test_probs.to_csv("/home/anerdi/Desktop/Zillow/levelonedata/super_learner_weights_7.csv.gz", compression='gzip',
                 index=False)

### Assessing performance of models with extreme points

In [146]:
Xextreme = X[np.where((l1data.logerror <= -2) | (l1data.logerror >= 2))[0],:]

In [140]:
extreme_logerrors = l1data.iloc[np.where((l1data.logerror <= -2) | (l1data.logerror >= 2))[0],:].logerror.values[:,np.newaxis]

In [148]:
extreme_abserror = abs(Xextreme-np.tile(extreme_logerrors,Xextreme.shape[1]))

In [151]:
extreme_abserror.mean(axis=0)

array([ 2.55365065,  2.54774629,  2.54912892,  2.5542268 ,  2.54937705,
        2.55064732,  2.55432623,  2.54927759,  2.55057871,  2.55465447,
        2.54918357,  2.55058782,  2.55315009,  2.55187126,  2.55255297,
        2.55306334,  2.54588748])

### Finding optimal weights using NNs

In [13]:
l1data.columns

Index(['parcelid', 'stacked_rfs_ridge', 'stacked_rfs_enet',
       'stacked_rfs_lasso', 'stacked_rfs_larm', 'stacked_rfs_huber',
       'stacked_annrfs_ridge', 'stacked_annrfs_enet', 'stacked_annrfs_lasso',
       'stacked_annrfs_larm', 'stacked_annrfs_huber', 'logistic_ridge',
       'logistic_enet', 'logistic_lasso', 'logistic_larm', 'logistic_huber',
       'stacked_rfs_rf', 'stacked_rfs_rf_overfit', 'stacked_annrfs_rf',
       'stacked_annrfs_rf_overfit', 'logistic_rf', 'logistic_rf_overfit',
       'logerror'],
      dtype='object')

In [23]:
X = l1data[[
    'logistic_ridge',
 'stacked_rfs_ridge',
 'stacked_annrfs_ridge']].values

y = l1data.logerror.values

In [24]:
from sklearn.neural_network import MLPRegressor

In [40]:
stack = MLPRegressor(hidden_layer_sizes=(10,10),random_state=9, max_iter=300, early_stopping=True)

In [41]:
stack.fit(X,y)

MLPRegressor(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=True, epsilon=1e-08,
       hidden_layer_sizes=(10, 10), learning_rate='constant',
       learning_rate_init=0.001, max_iter=300, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=9, shuffle=True,
       solver='adam', tol=0.0001, validation_fraction=0.1, verbose=False,
       warm_start=False)

In [42]:
preds = stack.predict(X)

In [43]:
new_submission = DataFrame({'ParcelId': l1data['parcelid'],
                           '201610':preds,
                           '201611':preds,
                           '201612':preds,
})
new_submission['201710'] = 0
new_submission['201711'] = 0
new_submission['201712'] = 0

In [44]:
mean_absolute_errors(new_submission,data)

(0.06241616765085067,
 0.062246095856333956,
 0.07335353272780508,
 0.064614821512161927)

### Pickle the NN

In [21]:
from sklearn.externals import joblib

In [45]:
joblib.dump(stack, '/home/anerdi/Desktop/Zillow/submissions/stage2_stacked_NN_ridge.pkl') 

['/home/anerdi/Desktop/Zillow/submissions/stage2_stacked_NN_ridge.pkl']