In [1]:
import pandas as pd
from pandas import DataFrame,Series
import numpy as np
import os
import datetime

#Plotting
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
%matplotlib inline

# sklearn stuff
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.metrics import confusion_matrix, mean_squared_error, precision_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder, Imputer 
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin

import feature_pipelines as pipes

### Reading in data

In [2]:
maindir = "/home/anerdi/Desktop/Zillow"

# Training labels: one row per transaction, including its date and log error.
logerror = pd.read_csv(maindir + "/data/train_2016_v2.csv/train_2016_v2.csv")

# Parse the 'YYYY-MM-DD' transaction dates once, then derive the ISO week
# number and the calendar month from the parsed values.
transaction_dates = logerror['transactiondate'].map(
    lambda date_string: datetime.datetime.strptime(date_string, '%Y-%m-%d'))
logerror['weeknumber'] = transaction_dates.map(lambda parsed: parsed.isocalendar()[1])
logerror['month'] = transaction_dates.map(lambda parsed: parsed.month)

# Only the parcel ids are needed from the (large) properties table.
properties = pd.read_csv(
    maindir + "/data/properties_2016.csv/properties_2016.csv",
    usecols=['parcelid'],
)

In [3]:
# Attach each transaction's log error and month to its parcel id
# (default inner join on 'parcelid').
labelled_columns = ['parcelid', 'logerror', 'month']
data = properties.merge(logerror[labelled_columns], on='parcelid')

In [4]:
def mean_absolute_errors(submission_df, comparison_df):
    """
    Score a public-leaderboard style submission against known log errors.

    Parameters
    ----------
    submission_df : DataFrame
        Must contain 'ParcelId' plus the prediction columns '201610',
        '201611' and '201612'.
    comparison_df : DataFrame
        Must contain 'parcelid', 'logerror' and 'month' (integer month of
        the transaction).

    Returns
    -------
    tuple
        (oct_error, nov_error, dec_error, overall_mae). Each monthly value is
        the mean absolute difference between that month's prediction column
        and 'logerror' over the rows whose 'month' matches; it is NaN when no
        row matches. overall_mae is the mean absolute error over all scored
        October-December rows, and NaN only when there are none at all.
    """
    # (transaction month, submission column holding that month's prediction)
    month_columns = ((10, '201610'), (11, '201611'), (12, '201612'))

    # Inner join: only parcels present in both frames are scored.
    trainresults = pd.merge(
        submission_df[['ParcelId'] + [column for _, column in month_columns]],
        comparison_df[['parcelid', 'logerror', 'month']],
        left_on='ParcelId', right_on='parcelid')

    monthly_errors = []
    total_abs_error = 0.0
    total_rows = 0
    for month, column in month_columns:
        month_rows = trainresults[trainresults['month'] == month]
        abs_errors = (month_rows[column] - month_rows['logerror']).abs()
        monthly_errors.append(abs_errors.mean())  # NaN when the month is empty
        # Accumulate sums/counts rather than mean * count so that an empty
        # month contributes nothing instead of turning the total into NaN.
        total_abs_error += abs_errors.sum()
        total_rows += abs_errors.count()

    overall_mae = total_abs_error / total_rows if total_rows else float('nan')
    oct_error, nov_error, dec_error = monthly_errors
    return (oct_error, nov_error, dec_error, overall_mae)

### Reading in stage 1 classification results

In [7]:
# Stage-1 output: a gzip-compressed CSV with an 'overestimate_prob' per parcel.
overestimate_probabilities = pd.read_csv(
    "/home/anerdi/Desktop/Zillow/twostagemodel/overestimate_probs.csv.gz",
    compression='gzip',
)
overestimate_probabilities.head(n=5)

Unnamed: 0,parcelid,overestimate_prob
0,10754147,0.509328
1,10759547,0.507312
2,10843547,1.0
3,10859147,0.689618
4,10879947,0.543763


### Reading in two-stage linear model predictions

In [9]:
# Stage-2 output: per-parcel '<model>_under' / '<model>_over' predictions
# for each linear model, stored as a gzip-compressed CSV.
two_stage_linear_models = pd.read_csv(
    "/home/anerdi/Desktop/Zillow/twostagemodel/two_stage_preds_linear_models.csv.gz",
    compression='gzip',
)
two_stage_linear_models.head(n=5)

Unnamed: 0.1,Unnamed: 0,parcelid,ridge_under,ridge_over,enet_under,enet_over,lasso_under,lasso_over,larm_under,larm_over,huber_under,huber_over
0,0,10754147,-0.082879,0.084871,-0.068505,0.073578,-0.068225,0.073431,-0.064499,0.071395,-0.062623,0.063218
1,1,10759547,-0.082642,0.085475,-0.068505,0.073578,-0.068225,0.073431,-0.064499,0.071395,-0.062623,0.063218
2,2,10843547,-0.876458,0.930269,-0.414612,0.558336,-0.301205,0.457031,-0.064499,0.071395,-2.651653,2.897348
3,3,10859147,-0.124558,0.121058,-0.091456,0.098928,-0.087705,0.095834,-0.064499,0.071395,-0.171291,0.16929
4,4,10879947,-0.084374,0.083832,-0.068765,0.073467,-0.068667,0.073493,-0.064499,0.071395,-0.060274,0.059819


### Reading in original linear models (i.e., one-stage models)

In [13]:
# Load the five single-stage submission files in one pass; the order of the
# paths matches the order of the names being assigned.
ridge, lasso, enet, larm, huber = [
    pd.read_csv(submission_path)
    for submission_path in (
        "/home/anerdi/Desktop/Zillow/submissions/Ridge.gz",
        "/home/anerdi/Desktop/Zillow/submissions/Lasso.gz",
        "/home/anerdi/Desktop/Zillow/submissions/ElasticNet.gz",
        "/home/anerdi/Desktop/Zillow/submissions/LARM.gz",
        "/home/anerdi/Desktop/Zillow/submissions/Huber_noweight.gz",
    )
]

In [16]:
# (name, single-stage submission frame) pairs; the name is also the prefix of
# the matching '<name>_under' / '<name>_over' columns used further down.
models = list(zip(
    ['ridge', 'lasso', 'enet', 'larm', 'huber'],
    [ridge, lasso, enet, larm, huber],
))

### Combine preds & overestimate probabilities to generate one prediction 

In [17]:
# Summary statistics of the stage-1 probabilities, used to choose the cutoffs.
overestimate_probabilities['overestimate_prob'].describe()

count    2.985217e+06
mean     5.605187e-01
std      5.892519e-02
min      6.101577e-42
25%      5.224003e-01
50%      5.547780e-01
75%      5.938315e-01
max      1.000000e+00
Name: overestimate_prob, dtype: float64

In [76]:
# Probability thresholds for trusting the two-stage blend: below the lower
# cutoff or above the upper cutoff the under/over models are used.
underestimate_cutoff, overestimate_cutoff = 0.45, 0.55

In [77]:
# One row per parcel: stage-2 under/over predictions plus the stage-1 probability.
test_predictions = two_stage_linear_models.merge(overestimate_probabilities, on='parcelid')

In [78]:
for model_name, single_stage_model_results in models:
    prob_over = test_predictions['overestimate_prob']
    m1_column = "m1_%s" % model_name
    m2_column = "m2_%s" % model_name

    # Method 1 (M1): use only the under/over models, blended by the estimated
    # probability that the parcel is overestimated.
    test_predictions[m1_column] = (
        test_predictions['%s_over' % model_name] * prob_over
        + test_predictions['%s_under' % model_name] * (1 - prob_over))

    # Look up the single-stage prediction by parcel id. np.where combines its
    # arguments purely by position, so the single-stage frame must be aligned
    # to test_predictions explicitly instead of relying on both frames
    # happening to share the same row order.
    single_stage_by_parcel = (single_stage_model_results
                              .drop_duplicates(subset='ParcelId')
                              .set_index('ParcelId')['201610'])
    single_stage_prediction = test_predictions['parcelid'].map(single_stage_by_parcel)

    # Method 2 (M2): keep the M1 blend when the classifier is confident
    # (probability outside the [underestimate_cutoff, overestimate_cutoff]
    # band); otherwise fall back to the single-stage model's prediction.
    is_confident = (prob_over < underestimate_cutoff) | (prob_over > overestimate_cutoff)
    test_predictions[m2_column] = np.where(
        is_confident, test_predictions[m1_column], single_stage_prediction)

In [79]:
# Preview the first rows, including the new m1_* / m2_* columns.
test_predictions.head(n=5)

Unnamed: 0.1,Unnamed: 0,parcelid,ridge_under,ridge_over,enet_under,enet_over,lasso_under,lasso_over,larm_under,larm_over,...,m1_ridge,m2_ridge,m1_lasso,m2_lasso,m1_enet,m2_enet,m1_larm,m2_larm,m1_huber,m2_huber
0,0,10754147,-0.082879,0.084871,-0.068505,0.073578,-0.068225,0.073431,-0.064499,0.071395,...,0.002561,0.01341,0.003925,0.01227,0.003862,0.01243,0.004716,0.01146,0.001471,0.01112
1,1,10759547,-0.082642,0.085475,-0.068505,0.073578,-0.068225,0.073431,-0.064499,0.071395,...,0.002646,0.01338,0.003639,0.01223,0.003575,0.0124,0.004442,0.01146,0.001217,0.01112
2,2,10843547,-0.876458,0.930269,-0.414612,0.558336,-0.301205,0.457031,-0.064499,0.071395,...,0.930269,0.930269,0.457031,0.457031,0.558336,0.558336,0.071395,0.071395,2.897348,2.897348
3,3,10859147,-0.124558,0.121058,-0.091456,0.098928,-0.087705,0.095834,-0.064499,0.071395,...,0.044823,0.044823,0.038867,0.038867,0.039836,0.039836,0.029216,0.029216,0.06358,0.06358
4,4,10879947,-0.084374,0.083832,-0.068765,0.073467,-0.068667,0.073493,-0.064499,0.071395,...,0.00709,0.01138,0.008634,0.01171,0.008575,0.01166,0.009395,0.01146,0.005028,0.01111


In [80]:
# Every submission month receives the same prediction; only the 2016 months
# are scored by mean_absolute_errors.
submission_months = ['201610', '201611', '201612', '201710', '201711', '201712']

for model_name, single_stage_model in models:
    for method in ('m1', 'm2'):
        combined_prediction = test_predictions["%s_%s" % (method, model_name)]

        # Same key order as a hand-written dict: ParcelId first, then months.
        submission_columns = {'ParcelId': test_predictions['parcelid']}
        for month_column in submission_months:
            submission_columns[month_column] = combined_prediction
        new_submission = DataFrame(submission_columns)

        print("%s using %s:" % (model_name, method))
        print(mean_absolute_errors(new_submission, data))
        print("")

    # Baseline: the original single-stage submission for the same model.
    print("%s using single-stage model:" % (model_name))
    print(mean_absolute_errors(single_stage_model, data))
    print("")

ridge using m1:
(0.06321414914041831, 0.06228283305293353, 0.07486968327968478, 0.065387924683901949)

ridge using m2:
(0.06331441537626807, 0.06240150496898414, 0.0748224191314452, 0.065462090876918094)

ridge using single-stage model:
(0.06332056539983931, 0.06250535370657166, 0.07484831374353078, 0.065493145336361525)

lasso using m1:
(0.0632147075835896, 0.062161583884227004, 0.07480863987082337, 0.065349903600033454)

lasso using m2:
(0.06335959311236088, 0.062336798787411746, 0.07481908942573882, 0.065473903771645262)

lasso using single-stage model:
(0.06333524733775354, 0.06244184118291356, 0.07476208625646924, 0.065470568485132233)

enet using m1:
(0.06320760532770525, 0.06216216112227742, 0.07480489632595513, 0.065345126742695339)

enet using m2:
(0.06335601699098352, 0.06233357852848602, 0.0748064604284509, 0.065468560716719332)

enet using single-stage model:
(0.06332380630902144, 0.06246366413472067, 0.07474843300747566, 0.0654657878377429)

larm using m1:
(0.0632492218761