In [1]:
import pandas as pd
from pandas import DataFrame,Series
import numpy as np
import os
import datetime

#Plotting
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
%matplotlib inline

# sklearn stuff
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.metrics import confusion_matrix, mean_squared_error, precision_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder, Imputer 
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin

import feature_pipelines as pipes

### Reading in data

In [2]:
# Root folder of the local Zillow project; the data files below hang off it.
maindir = "/home/anerdi/Desktop/Zillow"

# Transaction-level training labels (one row per sale, with its log error).
logerror = pd.read_csv(maindir + "/data/train_2016_v2.csv/train_2016_v2.csv")

# Parse every 'YYYY-MM-DD' transaction date once, then derive both calendar
# features (ISO week number and month number) from the parsed values.
parsed_dates = [datetime.datetime.strptime(raw_date, '%Y-%m-%d')
                for raw_date in logerror['transactiondate']]
logerror['weeknumber'] = [parsed.isocalendar()[1] for parsed in parsed_dates]
logerror['month'] = [parsed.month for parsed in parsed_dates]

# Only the parcel ids are needed from the (large) properties file.
properties = pd.read_csv(maindir + "/data/properties_2016.csv/properties_2016.csv",
                         usecols=['parcelid'])

In [3]:
# Inner-join the parcel ids with their transaction labels on parcelid, keeping
# just the label columns that the error computation reads.
data = properties.merge(
    logerror[['parcelid', 'logerror', 'month']],
    how='inner',
    on='parcelid',
)

In [4]:
def mean_absolute_errors(submission_df, comparison_df):
    """
    Compute the training mean absolute error of a public-leaderboard submission.

    Parameters
    ----------
    submission_df : DataFrame
        Must contain 'ParcelId' and the prediction columns '201610', '201611'
        and '201612'.
    comparison_df : DataFrame
        Must contain 'parcelid', 'logerror' and 'month' (month number of the
        transaction).

    Returns
    -------
    tuple
        (oct_error, nov_error, dec_error, overall_mae). A monthly error is NaN
        when no row of that month could be scored. overall_mae is the average
        of the monthly errors weighted by the number of scored rows per month;
        months without scored rows are skipped, so it is NaN only when nothing
        could be scored at all.
    """
    # (month number, prediction column) pairs, in the order they are returned.
    month_columns = [(10, '201610'), (11, '201611'), (12, '201612')]
    prediction_columns = [column for _, column in month_columns]

    # Line up each prediction with the observed log error of the same parcel.
    trainresults = pd.merge(submission_df[['ParcelId'] + prediction_columns],
                            comparison_df[['parcelid', 'logerror', 'month']],
                            left_on='ParcelId', right_on='parcelid')

    monthly_errors = []
    weighted_total = 0.0
    scored_rows = 0
    for month, column in month_columns:
        month_rows = trainresults[trainresults['month'] == month]
        absolute_errors = (month_rows[column] - month_rows['logerror']).abs()
        month_error = absolute_errors.mean()
        monthly_errors.append(month_error)

        # Weight by the rows that actually contributed to the mean. An empty
        # month has a NaN mean; multiplying it by a zero count would still
        # poison the overall figure, so it is left out instead.
        n_scored = int(absolute_errors.count())
        if n_scored:
            weighted_total += month_error * n_scored
            scored_rows += n_scored

    overall_mae = weighted_total / scored_rows if scored_rows else float('nan')
    oct_error, nov_error, dec_error = monthly_errors
    return (oct_error, nov_error, dec_error, overall_mae)

### Read in stage 1 classification results

In [5]:
# Load the gzip-compressed stage-1 probabilities and expose the stacked
# prediction under the name the later cells read ('overestimate_prob').
overestimate_probabilities = pd.read_csv(
    "/home/anerdi/Desktop/Zillow/twostagemodel/overestimate_probs_stacked_ann_rfs_xgbs_201617.csv.gz",
    compression='gzip',
).rename(columns={'stacked_pred': 'overestimate_prob'})
overestimate_probabilities.head()

Unnamed: 0,parcelid,ann_overestimate_prob,rf2_overestimate_prob,rf3_overestimate_prob,xgb1_overestimate_prob,xgb2_overestimate_prob,overestimate_prob
0,10754147,0.501589,0.510296,0.571558,0.532681,0.469636,0.508358
1,10759547,0.495938,0.488298,0.444372,0.562934,0.517057,0.536716
2,10843547,0.527465,0.678772,0.54786,0.529671,0.491906,0.514194
3,10859147,0.654785,0.662637,0.471027,0.570774,0.556556,0.573179
4,10879947,0.469003,0.490839,0.467619,0.539413,0.495418,0.513103


### Read in two-stage linear model predictions

In [6]:
# Per-parcel predictions of the two-stage linear models; the preview below
# shows one '<model>_<month>_under' and one '<model>_<month>_over' column per
# model and month.
two_stage_linear_models = pd.read_csv(
    "/home/anerdi/Desktop/Zillow/twostagemodel/two_stage_preds_linear_models_age.csv.gz",
    compression='gzip',
)
two_stage_linear_models.head()

Unnamed: 0,parcelid,ridge_10_under,ridge_11_under,ridge_12_under,ridge_10_over,ridge_11_over,ridge_12_over,enet_10_under,enet_11_under,enet_12_under,...,larm_12_under,larm_10_over,larm_11_over,larm_12_over,huber_10_under,huber_11_under,huber_12_under,huber_10_over,huber_11_over,huber_12_over
0,10754147,-0.083343,-0.08346,-0.083549,0.081872,0.081947,0.082185,-0.073147,-0.073147,-0.073147,...,-0.067742,0.073268,0.073268,0.073268,-0.030622,-0.030622,-0.030622,0.028524,0.028524,0.028524
1,10759547,-0.080735,-0.080853,-0.080942,0.080371,0.080446,0.080683,-0.073147,-0.073147,-0.073147,...,-0.067742,0.073268,0.073268,0.073268,-0.030622,-0.030622,-0.030622,0.028524,0.028524,0.028524
2,10843547,-0.820392,-0.82051,-0.820599,1.631166,1.631242,1.631479,-0.092616,-0.092616,-0.092616,...,-0.067742,0.073268,0.073268,0.073268,-0.117576,-0.117576,-0.117576,0.109521,0.109521,0.109521
3,10859147,-0.113977,-0.114095,-0.114183,0.109437,0.109512,0.109749,-0.09843,-0.09843,-0.09843,...,-0.071982,0.075447,0.075447,0.075447,-0.103229,-0.103229,-0.103229,0.096156,0.096156,0.096156
4,10879947,-0.09455,-0.094668,-0.094757,0.091331,0.091406,0.091643,-0.083955,-0.083955,-0.083955,...,-0.072357,0.07564,0.07564,0.07564,-0.035014,-0.035014,-0.035014,0.032615,0.032615,0.032615


### Read in original linear models (i.e., one-stage models)

In [7]:
# Previously written single-stage submissions, one file per linear model.
# The files are read in the same order as the names on the left-hand side.
ridge, lasso, enet, larm, huber = (
    pd.read_csv("/home/anerdi/Desktop/Zillow/submissions/%s.gz" % file_stem)
    for file_stem in ("Ridge", "Lasso", "ElasticNet", "LARM", "Huber_noweight")
)

In [8]:
# (name, single-stage submission) pairs; the name doubles as the column prefix
# of the matching two-stage predictions.
models = list(zip(
    ('ridge', 'lasso', 'enet', 'larm', 'huber'),
    (ridge, lasso, enet, larm, huber),
))

### Combine preds & overestimate probabilities to generate one prediction 

In [9]:
# Attach the stage-1 overestimate probability to every parcel's stage-2
# predictions (inner join on parcelid).
test_predictions = two_stage_linear_models.merge(
    overestimate_probabilities,
    how='inner',
    on='parcelid',
)

In [10]:
# The stage-1 probability is the same for every model and month, so look it up
# once and reuse it inside the loops.
prob_over = test_predictions['overestimate_prob']
prob_under = 1 - prob_over

# For each model and month, blend the "over" and "under" predictions into one
# column '<model>_<month>', weighting each by its stage-1 probability.
for current_model_name, _single_stage_results in models:
    for month in (10, 11, 12):
        over_pred = test_predictions['%s_%d_over' % (current_model_name, month)]
        under_pred = test_predictions['%s_%d_under' % (current_model_name, month)]
        test_predictions['%s_%d' % (current_model_name, month)] = (
            over_pred*prob_over + under_pred*prob_under)

In [11]:
# Preview the first five rows, now including the blended '<model>_<month>' columns.
test_predictions.head(n=5)

Unnamed: 0,parcelid,ridge_10_under,ridge_11_under,ridge_12_under,ridge_10_over,ridge_11_over,ridge_12_over,enet_10_under,enet_11_under,enet_12_under,...,lasso_12,enet_10,enet_11,enet_12,larm_10,larm_11,larm_12,huber_10,huber_11,huber_12
0,10754147,-0.083343,-0.08346,-0.083549,0.081872,0.081947,0.082185,-0.073147,-0.073147,-0.073147,...,0.005031,0.005096,0.005096,0.005096,0.003942,0.003942,0.003942,-0.000555,-0.000555,-0.000555
1,10759547,-0.080735,-0.080853,-0.080942,0.080371,0.080446,0.080683,-0.073147,-0.073147,-0.073147,...,0.009403,0.009461,0.009461,0.009461,0.00794,0.00794,0.00794,0.001123,0.001123,0.001123
2,10843547,-0.820392,-0.82051,-0.820599,1.631166,1.631242,1.631479,-0.092616,-0.092616,-0.092616,...,0.005223,0.005326,0.005326,0.005326,0.004765,0.004765,0.004765,-0.000804,-0.000804,-0.000804
3,10859147,-0.113977,-0.114095,-0.114183,0.109437,0.109512,0.109749,-0.09843,-0.09843,-0.09843,...,0.016852,0.017226,0.017226,0.017226,0.012521,0.012521,0.012521,0.011054,0.011054,0.011054
4,10879947,-0.09455,-0.094668,-0.094757,0.091331,0.091406,0.091643,-0.083955,-0.083955,-0.083955,...,0.005561,0.005648,0.005648,0.005648,0.003581,0.003581,0.003581,-0.000313,-0.000313,-0.000313


In [12]:
# For every linear model: assemble the two-stage submission, print its training
# MAE (October, November, December, overall), write it to disk, then print the
# same figures for the matching single-stage submission for comparison.
for model_name, single_stage_model in models:
    new_submission = DataFrame({'ParcelId': test_predictions['parcelid'],
                           '201610':test_predictions['%s_10' % model_name],
                           '201611':test_predictions['%s_11' % model_name],
                           '201612':test_predictions['%s_12' % model_name],
    })
    # The 2017 columns are filled with 0 as placeholders.
    new_submission['201710'] = 0
    new_submission['201711'] = 0
    new_submission['201712'] = 0

    print("%s:" % (model_name))
    print(mean_absolute_errors(new_submission, data))
    print("")

    # Build the path with os.path.join. The previous backslash continuation
    # inside the string literal carried the next line's leading spaces into the
    # file name ("submissions/    two_stage_...").
    submission_path = os.path.join(
        "/home/anerdi/Desktop/Zillow/submissions",
        "two_stage_stage1_stacked_annrfsxgbs201617_stage2_%s_age.csv.gz" % model_name)
    new_submission.round(4).to_csv(submission_path, index=False,
                 compression='gzip')

    print("%s using single-stage model:" % (model_name))
    print(mean_absolute_errors(single_stage_model, data))
    print("")

ridge:
(0.061774922453393284, 0.06080958530486789, 0.0733088711453135, 0.06391667276269343)

ridge using single-stage model:
(0.06332056539983931, 0.06250535370657166, 0.07484831374353078, 0.065493145336361525)

lasso:
(0.061769914395407535, 0.060740444181935076, 0.07332302521242472, 0.063901856224135259)

lasso using single-stage model:
(0.06333524733775354, 0.06244184118291356, 0.07476208625646924, 0.065470568485132233)

enet:
(0.061763559347896076, 0.060745254556995476, 0.07329752134759357, 0.063893989618241373)

enet using single-stage model:
(0.06332380630902144, 0.06246366413472067, 0.07474843300747566, 0.0654657878377429)

larm:
(0.061811440747343425, 0.060781431449706526, 0.07342321505721346, 0.063955210186277978)

larm using single-stage model:
(0.06345773598553339, 0.06251769879518071, 0.07483130304772855, 0.065572243736829725)

huber:
(0.06327475302686743, 0.06212541300192359, 0.07454798441039431, 0.065324092114950522)

huber using single-stage model:
(0.06360898332328704, 0

In [12]:
# NOTE(review): this cell repeats the code of the preceding submission cell;
# the printed two-stage errors differ slightly between the two, presumably
# because an upstream input changed between runs. Consider keeping only one.
#
# For every linear model: assemble the two-stage submission, print its training
# MAE (October, November, December, overall), write it to disk, then print the
# same figures for the matching single-stage submission for comparison.
for model_name, single_stage_model in models:
    new_submission = DataFrame({'ParcelId': test_predictions['parcelid'],
                           '201610':test_predictions['%s_10' % model_name],
                           '201611':test_predictions['%s_11' % model_name],
                           '201612':test_predictions['%s_12' % model_name],
    })
    # The 2017 columns are filled with 0 as placeholders.
    new_submission['201710'] = 0
    new_submission['201711'] = 0
    new_submission['201712'] = 0

    print("%s:" % (model_name))
    print(mean_absolute_errors(new_submission, data))
    print("")

    # Build the path with os.path.join. The previous backslash continuation
    # inside the string literal carried the next line's leading spaces into the
    # file name ("submissions/    two_stage_...").
    submission_path = os.path.join(
        "/home/anerdi/Desktop/Zillow/submissions",
        "two_stage_stage1_stacked_annrfsxgbs201617_stage2_%s_age.csv.gz" % model_name)
    new_submission.round(4).to_csv(submission_path, index=False,
                 compression='gzip')

    print("%s using single-stage model:" % (model_name))
    print(mean_absolute_errors(single_stage_model, data))
    print("")

ridge:
(0.06181535350035905, 0.06083767226635893, 0.07335709342412129, 0.063956051205128234)

ridge using single-stage model:
(0.06332056539983931, 0.06250535370657166, 0.07484831374353078, 0.065493145336361525)

lasso:
(0.06181682692624957, 0.06078072923399728, 0.07338800676376753, 0.063951030549685664)

lasso using single-stage model:
(0.06333524733775354, 0.06244184118291356, 0.07476208625646924, 0.065470568485132233)

enet:
(0.061810133402709556, 0.06078300893482561, 0.07335899048433357, 0.063941710689830608)

enet using single-stage model:
(0.06332380630902144, 0.06246366413472067, 0.07474843300747566, 0.0654657878377429)

larm:
(0.06192584852771982, 0.06086291065295617, 0.07356815709805206, 0.064068795149645516)

larm using single-stage model:
(0.06345773598553339, 0.06251769879518071, 0.07483130304772855, 0.065572243736829725)

huber:
(0.0632415657313987, 0.06211554658111831, 0.07460008325182128, 0.065313252806978528)

huber using single-stage model:
(0.06360898332328704, 0.0626