In [1]:
import pandas as pd
from pandas import DataFrame,Series
import numpy as np
import os
import datetime
import gc

#Plotting
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
%matplotlib inline

# sklearn stuff
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.metrics import confusion_matrix, mean_squared_error, precision_score, recall_score, f1_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder, Imputer 
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin

import feature_pipelines as pipes

In [2]:
def generate_test_probs(model, col_names=None, type=None, data=None, pipeline=None,
                        chunk_size=100000):
    """
    Predict class probabilities for every row of a frame, one chunk at a time.

    Each chunk of rows is pushed through the feature pipeline and then through
    ``model.predict_proba``; the per-chunk results are stacked into a single
    DataFrame whose index is the positional row number (0 .. n_rows - 1).

    Parameters
    ----------
    model : fitted estimator exposing ``predict_proba``.
    col_names : optional column labels for the returned probabilities.
    type : unused; kept only so existing calls keep working.
    data : DataFrame to predict on. Defaults to the notebook-level ``properties``.
    pipeline : fitted transformer exposing ``transform``. Defaults to the
        notebook-level ``feature_pipeline``.
    chunk_size : number of rows transformed and predicted per step.

    Returns
    -------
    DataFrame with one row per input row and one column per predicted class.

    Raises
    ------
    ValueError if ``chunk_size`` is not positive.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")

    # Fall back to the notebook globals only when nothing was passed explicitly.
    frame = properties if data is None else data
    transformer = feature_pipeline if pipeline is None else pipeline

    n_rows = frame.shape[0]
    chunk_preds = []
    # Chunk bounds come from the frame itself, so the final partial chunk is
    # handled for any row count rather than for one hard-coded dataset size.
    for start in range(0, n_rows, chunk_size):
        stop = min(start + chunk_size, n_rows)
        current_test_feats = transformer.transform(frame.iloc[start:stop])
        chunk_preds.append(
            DataFrame(model.predict_proba(current_test_feats), columns=col_names,
                      index=np.arange(start, stop)))

    if not chunk_preds:
        return DataFrame(columns=col_names)
    # Concatenate once at the end instead of re-copying the result every step.
    return pd.concat(chunk_preds)

In [3]:
def mean_absolute_errors(submission_df, comparison_df):
    """
    Compare a leaderboard-style submission against known log errors.

    ``submission_df`` must carry ``ParcelId`` plus the prediction columns
    ``201610``, ``201611`` and ``201612``; ``comparison_df`` must carry
    ``parcelid``, ``logerror`` and ``month``. The two are inner-joined on the
    parcel id and each row is scored against the prediction column of its month.

    Returns
    -------
    tuple ``(oct_error, nov_error, dec_error, overall_mae)``: the mean absolute
    error for months 10, 11 and 12, and their row-count-weighted average.
    A month with no matched rows yields NaN for that month and is left out of
    the overall figure; the overall figure is NaN only when no rows from
    months 10-12 matched at all.
    """
    month_columns = [(10, '201610'), (11, '201611'), (12, '201612')]

    trainresults = pd.merge(
        submission_df[['ParcelId'] + [col for _, col in month_columns]],
        comparison_df[['parcelid', 'logerror', 'month']],
        left_on='ParcelId', right_on='parcelid')

    month_errors = []
    weighted_sum = 0.0
    n_rows = 0
    for month, col in month_columns:
        # Filter once per month and reuse the slice for both operands.
        rows = trainresults[trainresults['month'] == month]
        month_error = (rows[col] - rows['logerror']).abs().mean()
        month_errors.append(month_error)
        # An empty month has a NaN mean; multiplying it by its zero count would
        # poison the overall average, so only non-empty months contribute.
        if len(rows):
            weighted_sum += month_error * len(rows)
            n_rows += len(rows)

    overall_mae = weighted_sum / n_rows if n_rows else float('nan')
    oct_error, nov_error, dec_error = month_errors
    return (oct_error, nov_error, dec_error, overall_mae)

### Reading in data

In [4]:
maindir = "/home/anerdi/Desktop/Zillow"

# Training targets: one row per recorded sale, with the sale date as text.
logerror_path = maindir + "/data/train_2016_v2.csv/train_2016_v2.csv"
logerror = pd.read_csv(logerror_path)

# Parse each transaction date once, then derive the ISO week number and the
# calendar month from the parsed value.
sale_dates = [datetime.datetime.strptime(text, '%Y-%m-%d')
              for text in logerror['transactiondate']]
logerror['weeknumber'] = [sale_date.isocalendar()[1] for sale_date in sale_dates]
logerror['month'] = [sale_date.month for sale_date in sale_dates]

# Property attributes for every parcel.
properties_path = maindir + "/data/properties_2016.csv/properties_2016.csv"
properties = pd.read_csv(properties_path)

  interactivity=interactivity, compiler=compiler, result=result)


In [5]:
# Finished living area divided by lot size
properties['N-LivingAreaProp'] = properties['calculatedfinishedsquarefeet'].div(
    properties['lotsizesquarefeet'])

# Structure tax value divided by land tax value
properties['N-ValueProp'] = properties['structuretaxvaluedollarcnt'].div(
    properties['landtaxvaluedollarcnt'])

# Total tax value divided by the tax amount
properties['N-ValueRatio'] = properties['taxvaluedollarcnt'].div(properties['taxamount'])

# Pool: sum of the two pool-type columns, with missing values counted as 0
pool_type_flags = properties[['pooltypeid2', 'pooltypeid7']].fillna(0)
properties['Pool'] = pool_type_flags.sum(axis=1).astype(int)

In [6]:
# Inner-join the property attributes with the target columns on parcel id.
target_columns = logerror[['parcelid', 'logerror', 'month']]
data = properties.merge(target_columns, on='parcelid')

In [7]:
# Preview the first rows of the merged properties / logerror frame.
data.head()

Unnamed: 0,parcelid,airconditioningtypeid,architecturalstyletypeid,basementsqft,bathroomcnt,bedroomcnt,buildingclasstypeid,buildingqualitytypeid,calculatedbathnbr,decktypeid,...,taxamount,taxdelinquencyflag,taxdelinquencyyear,censustractandblock,N-LivingAreaProp,N-ValueProp,N-ValueRatio,Pool,logerror,month
0,17073783,,,,2.5,3.0,,,2.5,,...,2015.06,,,61110020000000.0,0.72853,1.500013,95.188729,0,0.0953,1
1,17088994,,,,1.0,2.0,,,1.0,,...,2581.3,,,61110020000000.0,,1.500042,92.852051,0,0.0198,3
2,17100444,,,,2.0,3.0,,,2.0,,...,591.64,,,61110010000000.0,0.167605,2.36188,80.881955,0,0.006,5
3,17102429,,,,1.5,2.0,,,1.5,,...,682.78,,,61110010000000.0,0.21,2.635597,92.143882,0,-0.0566,6
4,17109604,,,,2.5,4.0,,,2.5,,...,5886.92,,,61110010000000.0,0.381758,1.0,94.106935,0,0.0573,8


### Feature pipeline 

In [8]:
# Feature lists and categorical levels consumed by the feature pipelines.

# Columns treated as numerical features
num_atts = [
    'bathroomcnt', 'bedroomcnt', 'buildingqualitytypeid', 'calculatedbathnbr',
    'finishedfloor1squarefeet', 'calculatedfinishedsquarefeet', 'finishedsquarefeet12',
    'finishedsquarefeet13', 'finishedsquarefeet15', 'finishedsquarefeet50',
    'finishedsquarefeet6', 'fireplacecnt', 'fullbathcnt', 'garagecarcnt',
    'garagetotalsqft', 'latitude', 'longitude', 'lotsizesquarefeet', 'poolcnt',
    'poolsizesum', 'censustractandblock', 'roomcnt', 'threequarterbathnbr', 'unitcnt',
    'yardbuildingsqft17', 'yardbuildingsqft26', 'numberofstories',
    'structuretaxvaluedollarcnt', 'taxvaluedollarcnt', 'landtaxvaluedollarcnt',
    'taxamount', 'N-ValueRatio', 'N-LivingAreaProp', 'N-ValueProp',
]

# Columns treated as categorical features
cat_atts = [
    'airconditioningtypeid', 'architecturalstyletypeid', 'buildingclasstypeid',
    'heatingorsystemtypeid', 'pooltypeid10', 'pooltypeid2', 'pooltypeid7',
    'propertylandusetypeid', 'regionidcounty', 'storytypeid',
    'typeconstructiontypeid', 'yearbuilt', 'fireplaceflag', 'taxdelinquencyflag',
]

# Default levels for every known categorical column. Most lists start with -1
# followed by the column's listed codes.
all_cat_levels = {
    'airconditioningtypeid': [-1] + list(range(1, 14)),
    'architecturalstyletypeid': [-1] + list(range(1, 28)),
    'buildingclasstypeid': [-1] + list(range(1, 6)),
    'heatingorsystemtypeid': [-1] + list(range(1, 26)),
    'pooltypeid10': [-1, 0, 1],
    'pooltypeid2': [-1, 0, 1],
    'pooltypeid7': [-1, 0, 1],
    'Pool': [0, 1],
    'propertylandusetypeid': [
        -1, 31, 46, 47, 246, 247, 248, 260, 261, 262, 263, 264, 265, 266, 267,
        268, 269, 270, 271, 273, 274, 275, 276, 279, 290, 291,
    ],
    'regionidcounty': [2061, 3101, 1286],
    'storytypeid': [-1] + list(range(1, 36)),
    'typeconstructiontypeid': [-1] + list(range(1, 19)),
    'yearbuilt': [-1] + list(range(1885, 2018)),
    'fireplaceflag': [-1, 'True', 'False'],
    'taxdelinquencyflag': [-1, 'Y', 'N'],
}

# Keep only the columns actually selected in cat_atts (this drops 'Pool').
cat_dict = {name: all_cat_levels[name] for name in all_cat_levels if name in cat_atts}

In [9]:
# Numerical branch: select the numeric columns, impute, then standardize.
numeric_steps = [
    ('selector', pipes.DataFrameSelector(num_atts)),
    ('imputer', Imputer()),
    ('scaler', StandardScaler()),
]
num_pipeline = Pipeline(numeric_steps)

# Categorical branch: a single select-and-dummify step driven by cat_dict.
categorical_steps = [
    ('select_and_dummify', pipes.DF_Selector_GetDummies(cat_dict)),
]
cat_pipeline = Pipeline(categorical_steps)

# Combined pipeline: numerical output first, categorical output second.
feature_pipeline = FeatureUnion(transformer_list=[
    ("num_pipeline", num_pipeline),
    ("cat_pipeline", cat_pipeline),
])

In [10]:
# Fit the combined feature pipeline on the merged training frame.
feature_pipeline.fit(data)

FeatureUnion(n_jobs=1,
       transformer_list=[('num_pipeline', Pipeline(memory=None,
     steps=[('selector', DataFrameSelector(desired_cols=['bathroomcnt', 'bedroomcnt', 'buildingqualitytypeid', 'calculatedbathnbr', 'finishedfloor1squarefeet', 'calculatedfinishedsquarefeet', 'finishedsquarefeet12', 'finishedsquarefeet13', 'fi... 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27]}))]))],
       transformer_weights=None)

### Reading in level-one data

In [10]:
# Load the random-forest level-one predictions, then overwrite every row that is
# fully populated in the last-fold file with that file's values.
l1data_rfs = pd.read_csv("/home/anerdi/Desktop/Zillow/levelonedata/l1data_twostage_rfs.csv.gz")
l1data_rfs_last_fold = pd.read_csv("/home/anerdi/Desktop/Zillow/levelonedata/l1data_twostage_rfs_last_fold.csv.gz")
rfs_last_fold_complete = l1data_rfs_last_fold.notnull().all(axis=1)
l1data_rfs[rfs_last_fold_complete] = l1data_rfs_last_fold[rfs_last_fold_complete]

In [11]:
# Load the linear-model level-one predictions, then overwrite every row that is
# fully populated in the last-fold file with that file's values.
l1data_linear_models = pd.read_csv("/home/anerdi/Desktop/Zillow/levelonedata/l1data_twostage_linear_models.csv.gz")
l1data_linear_models_last_fold = pd.read_csv("/home/anerdi/Desktop/Zillow/levelonedata/l1data_twostage_linear_models_last_fold.csv.gz")
linear_last_fold_complete = l1data_linear_models_last_fold.notnull().all(axis=1)
l1data_linear_models[linear_last_fold_complete] = l1data_linear_models_last_fold[linear_last_fold_complete]

In [12]:
# After the last-fold patch neither level-one frame may contain a missing value.
assert not l1data_rfs.isnull().any().any()
assert not l1data_linear_models.isnull().any().any()

In [13]:
# Assemble the level-one table column-wise: linear-model predictions, the
# random-forest predictions minus their first column, and the logerror target.
# The concat aligns on the row index; a merge on 'parcelid' would be the
# key-based alternative.
level_one_parts = [l1data_linear_models, l1data_rfs.iloc[:, 1:], data[['logerror']]]
l1data = pd.concat(level_one_parts, axis=1)

In [14]:
# Preview the first rows of the assembled level-one table.
l1data.head()

Unnamed: 0,parcelid,stacked_rfs_ridge,stacked_rfs_enet,stacked_rfs_lasso,stacked_rfs_larm,stacked_rfs_huber,stacked_annrfs_ridge,stacked_annrfs_enet,stacked_annrfs_lasso,stacked_annrfs_larm,...,logistic_lasso,logistic_larm,logistic_huber,stacked_rfs_rf,stacked_rfs_rf_overfit,stacked_annrfs_rf,stacked_annrfs_rf_overfit,logistic_rf,logistic_rf_overfit,logerror
0,17073783,0.025002,0.025769,0.025975,0.027605,0.014742,0.019534,0.0199,0.020049,0.021077,...,0.019235,0.02018,0.010799,0.027581,0.010398,0.02153,0.001875,0.0207,0.000704,0.0953
1,17088994,0.006159,0.005356,0.005382,0.005274,0.001147,0.008189,0.007427,0.007508,0.007728,...,0.010329,0.010983,0.002344,0.00336,0.002142,0.005626,0.004083,0.008633,0.006657,0.0198
2,17100444,0.01256,0.011415,0.011473,0.012517,0.006989,0.016708,0.015655,0.015737,0.017069,...,0.003872,0.004402,0.002371,0.017957,0.021037,0.022626,0.0279,0.009633,0.008801,0.006
3,17102429,0.012187,0.010573,0.010593,0.010734,0.008559,0.020839,0.018793,0.018779,0.019272,...,0.010884,0.011038,0.008804,0.016854,0.023223,0.025292,0.032122,0.017155,0.02354,-0.0566
4,17109604,0.018297,0.017662,0.017683,0.015205,0.014209,0.015999,0.015345,0.015373,0.01265,...,0.020298,0.018098,0.016877,0.015216,0.023369,0.013065,0.021301,0.017652,0.025711,0.0573


In [15]:
# Check that l1data and data list the same parcel ids in the same row order.
(l1data.parcelid == data.parcelid).all()

True

In [16]:
# (rows, columns) of the level-one table.
l1data.shape

(90275, 23)

### Heuristic for finding weights 

#### Stand Alones

In [18]:
# Predictions of the three lasso stackers, plus the true log errors.
standalone_models = ['logistic_lasso', 'stacked_rfs_lasso', 'stacked_annrfs_lasso']
X = l1data[standalone_models].values
y = l1data['logerror'].values

In [19]:
# Absolute error of every model (column) on every row; the target column is
# broadcast across the model columns.
abserror = np.abs(X - l1data['logerror'].values[:, np.newaxis])

In [20]:
# Per model, count the rows on which its absolute error equals the row minimum
# (ties count for every tied model).
row_best = abserror.min(axis=1, keepdims=True)
wins = (abserror == row_best).sum(axis=0)

In [21]:
# Share of wins per model (each count divided by the total number of wins).
wins / wins.sum()

array([ 0.332473  ,  0.36443091,  0.3030961 ])

#### All models

In [18]:
# Candidate columns grouped by second-stage learner. The order matters: later
# cells slice the win counts by position (three per linear learner, then two RFs).
ridge_cols = ['logistic_ridge', 'stacked_rfs_ridge', 'stacked_annrfs_ridge']
enet_cols = ['logistic_enet', 'stacked_rfs_enet', 'stacked_annrfs_enet']
lasso_cols = ['logistic_lasso', 'stacked_rfs_lasso', 'stacked_annrfs_lasso']
larm_cols = ['logistic_larm', 'stacked_rfs_larm', 'stacked_annrfs_larm']
huber_cols = ['logistic_huber', 'stacked_rfs_huber', 'stacked_annrfs_huber']
rf_cols = ['logistic_rf', 'stacked_annrfs_rf']

X = l1data[ridge_cols + enet_cols + lasso_cols + larm_cols + huber_cols + rf_cols]

In [19]:
# Absolute error of every candidate model on every row; the target column is
# broadcast across the model columns.
abserror = np.abs(X.values - l1data['logerror'].values[:, np.newaxis])

In [20]:
# Per model, count the rows on which its absolute error equals the row minimum,
# then show each model's share of all wins.
row_best = abserror.min(axis=1, keepdims=True)
wins = (abserror == row_best).sum(axis=0)
wins / wins.sum()

array([ 0.0548103 ,  0.07566879,  0.05380227,  0.00758793,  0.01014677,
        0.01029078,  0.00775408,  0.01422321,  0.01433398,  0.06360565,
        0.08515093,  0.04684575,  0.11919136,  0.15139297,  0.10489061,
        0.08592634,  0.09437829])

In [21]:
# The first 15 counts are five consecutive groups of three (ridge, enet, lasso,
# larm, huber); whatever follows belongs to the random-forest stackers.
ridge_wins, enet_wins, lasso_wins, larm_wins, huber_wins = wins[:15].reshape(5, 3).sum(axis=1)
rf_wins = wins[15:].sum()

In [22]:
# Share of all wins captured by each second-stage learner family.
np.array([ridge_wins, enet_wins, lasso_wins, larm_wins, huber_wins, rf_wins]) / wins.sum() 

array([ 0.18428136,  0.02802548,  0.03631127,  0.19560233,  0.37547494,
        0.18030462])

### Finding weights via classification 

In [23]:
# Classification target: column position of the model with the smallest
# absolute error on each row.
newy = abserror.argmin(axis=1)

In [24]:
from sklearn.ensemble import RandomForestClassifier

In [25]:
# Random forest with trees capped at depth 15 and a fixed seed; every other
# hyperparameter is left at the library default.
clf = RandomForestClassifier(max_depth=15, random_state=9)

In [26]:
# The labels come from l1data and the features from data, so the two frames
# must list the same parcels in the same row order.
assert (l1data.parcelid == data.parcelid).all()

In [28]:
# Fit the classifier to predict the best-model label from the pipeline features.
clf.fit(feature_pipeline.transform(data),newy)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=15, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=9, verbose=0, warm_start=False)

In [29]:
# Score on the same rows the classifier was fit on, so this is a training
# score rather than a held-out estimate.
clf.score(feature_pipeline.transform(data), newy)

0.49331487122680701

In [30]:
# Shape of the predicted class-probability matrix for the training frame.
clf.predict_proba(feature_pipeline.transform(data)).shape

(90275, 17)

### Using classifier to find weights for test set

In [31]:
# Predict class probabilities with generate_test_probs, labelling the output
# columns with X's column names (X must still be the DataFrame version here).
test_probs = generate_test_probs(clf, col_names= X.columns)

In [32]:
# Preview the first rows of the predicted probabilities.
test_probs.head()

Unnamed: 0,logistic_ridge,stacked_rfs_ridge,stacked_annrfs_ridge,logistic_enet,stacked_rfs_enet,stacked_annrfs_enet,logistic_lasso,stacked_rfs_lasso,stacked_annrfs_lasso,logistic_larm,stacked_rfs_larm,stacked_annrfs_larm,logistic_huber,stacked_rfs_huber,stacked_annrfs_huber,logistic_rf,stacked_annrfs_rf
0,0.007355,0.100739,0.158308,0.001051,0.000631,0.007289,0.00042,0.001051,0.010492,0.217057,0.139652,0.025602,0.0409,0.093722,0.010023,0.030267,0.155442
1,0.004067,0.184438,0.259696,0.000631,0.002805,0.009463,0.0,0.000631,0.015891,0.115598,0.015985,0.017721,0.054912,0.066187,0.012354,0.031405,0.208218
2,0.029282,0.154869,0.11813,0.005509,0.01255,0.010865,0.000977,0.007131,0.021389,0.061594,0.047243,0.108395,0.068472,0.109226,0.048469,0.121715,0.074182
3,0.034645,0.38911,0.120437,0.0,0.003903,0.005722,0.000897,0.009448,0.004376,0.038493,0.049261,0.046813,0.054961,0.127891,0.030066,0.045489,0.03849
4,0.06517,0.096882,0.070753,0.01012,0.002928,0.005813,0.00172,0.017875,0.012238,0.104917,0.058063,0.023602,0.139482,0.186229,0.085185,0.054248,0.064776


In [33]:
# Summary statistics of the predicted probabilities, per column.
test_probs.describe()

Unnamed: 0,logistic_ridge,stacked_rfs_ridge,stacked_annrfs_ridge,logistic_enet,stacked_rfs_enet,stacked_annrfs_enet,logistic_lasso,stacked_rfs_lasso,stacked_annrfs_lasso,logistic_larm,stacked_rfs_larm,stacked_annrfs_larm,logistic_huber,stacked_rfs_huber,stacked_annrfs_huber,logistic_rf,stacked_annrfs_rf
count,2985217.0,2985217.0,2985217.0,2985217.0,2985217.0,2985217.0,2985217.0,2985217.0,2985217.0,2985217.0,2985217.0,2985217.0,2985217.0,2985217.0,2985217.0,2985217.0,2985217.0
mean,0.05775269,0.08331603,0.05766916,0.007339208,0.009988314,0.009923883,0.007431505,0.01360287,0.01417965,0.05537879,0.07836422,0.04342092,0.1161741,0.1535175,0.09665806,0.08788004,0.1074031
std,0.06131835,0.06437139,0.04374424,0.01041509,0.01322385,0.01263183,0.01110726,0.02026178,0.02135837,0.05504736,0.06546399,0.05006003,0.08966077,0.08974504,0.06242869,0.05980993,0.09739627
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.01882056,0.03765338,0.02772986,0.003151278,0.0040228,0.004030665,0.003274133,0.005091901,0.004074169,0.01956505,0.03322622,0.01260307,0.04899401,0.08224593,0.04606598,0.04546019,0.04068885
50%,0.03710211,0.0623146,0.04671575,0.005080467,0.006759626,0.006651474,0.005115391,0.008281845,0.007321175,0.03553183,0.06101468,0.02337199,0.08714901,0.1325674,0.09046875,0.07097932,0.06593021
75%,0.07337125,0.1090162,0.07343286,0.007808747,0.0112653,0.01088747,0.007814584,0.01426276,0.01499253,0.06856758,0.09807472,0.05500596,0.1563572,0.217883,0.1352005,0.1147402,0.1411197
max,0.6637873,0.7425046,0.6887326,0.3693994,0.6266189,0.5149061,0.5053965,0.6773827,0.5902781,0.6683476,0.7776557,0.8123815,0.7601706,0.7876841,0.5949216,0.8579327,0.8378932


In [34]:
# Write the probabilities as a gzip-compressed CSV; the row index is not written.
test_probs.to_csv("/home/anerdi/Desktop/Zillow/levelonedata/super_learner_weights_2.csv.gz", compression='gzip',
                 index=False)

### Assessing performance of models with extreme points

In [146]:
# Rows whose true log error is at or beyond +/-2.
extreme_rows = np.where((l1data.logerror <= -2) | (l1data.logerror >= 2))[0]
# X is a DataFrame in the "All models" section and an ndarray elsewhere;
# `X[rows, :]` only works on an ndarray, so coerce before indexing.
Xextreme = np.asarray(X)[extreme_rows, :]

In [140]:
# True log errors of the extreme rows (absolute value of at least 2), as a column vector.
is_extreme = l1data['logerror'].abs() >= 2
extreme_logerrors = l1data.loc[is_extreme, 'logerror'].values[:, np.newaxis]

In [148]:
# Absolute error of every model on the extreme rows; the column vector of true
# values is broadcast across the model columns.
extreme_abserror = np.abs(Xextreme - extreme_logerrors)

In [151]:
# Mean absolute error per model (column) over the extreme rows.
extreme_abserror.mean(axis=0)

array([ 2.55365065,  2.54774629,  2.54912892,  2.5542268 ,  2.54937705,
        2.55064732,  2.55432623,  2.54927759,  2.55057871,  2.55465447,
        2.54918357,  2.55058782,  2.55315009,  2.55187126,  2.55255297,
        2.55306334,  2.54588748])

### Finding optimal weights using NNs

In [13]:
# List the column names of the level-one table.
l1data.columns

Index(['parcelid', 'stacked_rfs_ridge', 'stacked_rfs_enet',
       'stacked_rfs_lasso', 'stacked_rfs_larm', 'stacked_rfs_huber',
       'stacked_annrfs_ridge', 'stacked_annrfs_enet', 'stacked_annrfs_lasso',
       'stacked_annrfs_larm', 'stacked_annrfs_huber', 'logistic_ridge',
       'logistic_enet', 'logistic_lasso', 'logistic_larm', 'logistic_huber',
       'stacked_rfs_rf', 'stacked_rfs_rf_overfit', 'stacked_annrfs_rf',
       'stacked_annrfs_rf_overfit', 'logistic_rf', 'logistic_rf_overfit',
       'logerror'],
      dtype='object')

In [23]:
# Inputs for the neural-network stacker: the three ridge stackers' predictions.
ridge_models = ['logistic_ridge', 'stacked_rfs_ridge', 'stacked_annrfs_ridge']
X = l1data[ridge_models].values

# Target: the true log error.
y = l1data['logerror'].values

In [24]:
from sklearn.neural_network import MLPRegressor

In [40]:
# MLP regressor with two hidden layers of 10 units each, a fixed seed, at most
# 300 iterations, and early stopping enabled.
stack = MLPRegressor(hidden_layer_sizes=(10,10),random_state=9, max_iter=300, early_stopping=True)

In [41]:
# Fit the stacker on the current X and y.
stack.fit(X,y)

MLPRegressor(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=True, epsilon=1e-08,
       hidden_layer_sizes=(10, 10), learning_rate='constant',
       learning_rate_init=0.001, max_iter=300, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=9, shuffle=True,
       solver='adam', tol=0.0001, validation_fraction=0.1, verbose=False,
       warm_start=False)

In [42]:
# Predict on the same X the stacker was fit on (in-sample predictions).
preds = stack.predict(X)

In [43]:
# Submission-shaped frame: the same predictions fill every 2016 month, and the
# 2017 months are set to zero.
submission_columns = {'ParcelId': l1data['parcelid']}
for month_2016 in ('201610', '201611', '201612'):
    submission_columns[month_2016] = preds
new_submission = DataFrame(submission_columns)

for month_2017 in ('201710', '201711', '201712'):
    new_submission[month_2017] = 0

In [44]:
# Per-month and overall mean absolute error of the submission frame against
# the log errors in `data`.
mean_absolute_errors(new_submission,data)

(0.06241616765085067,
 0.062246095856333956,
 0.07335353272780508,
 0.064614821512161927)

### Pickle the NN

In [21]:
from sklearn.externals import joblib

In [45]:
# Persist the fitted stacker to disk with joblib.
joblib.dump(stack, '/home/anerdi/Desktop/Zillow/submissions/stage2_stacked_NN_ridge.pkl') 

['/home/anerdi/Desktop/Zillow/submissions/stage2_stacked_NN_ridge.pkl']