In [1]:
import pandas as pd
from pandas import DataFrame,Series
import numpy as np
import os
import datetime

#Plotting
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
%matplotlib inline

# sklearn stuff
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.metrics import confusion_matrix, mean_squared_error, precision_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder, Imputer 
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin

import feature_pipelines as pipes

### Reading in data

In [2]:
maindir = "/home/anerdi/Desktop/Zillow"

# Transactions with their log errors.
logerror = pd.read_csv(maindir + "/data/train_2016_v2.csv/train_2016_v2.csv")

# Parse every 'YYYY-MM-DD' transaction date once, then derive both calendar
# features (ISO week number and month) from the parsed values.
transaction_dates = [datetime.datetime.strptime(date_string, '%Y-%m-%d')
                     for date_string in logerror['transactiondate']]
logerror['weeknumber'] = [parsed.isocalendar()[1] for parsed in transaction_dates]
logerror['month'] = [parsed.month for parsed in transaction_dates]

# Only the parcel ids are needed from the properties file.
properties = pd.read_csv(maindir + "/data/properties_2016.csv/properties_2016.csv", usecols=['parcelid'])

In [3]:
# Inner join on parcel id: keep the log error and transaction month for every
# parcel that appears in both tables.
logerror_by_parcel = logerror[['parcelid', 'logerror', 'month']]
data = properties.merge(logerror_by_parcel, on='parcelid')

In [4]:
def mean_absolute_errors(submission_df, comparison_df):
    """
    Score a leaderboard-style submission against known log errors.

    Parameters
    ----------
    submission_df : DataFrame
        Must contain 'ParcelId' and the prediction columns '201610',
        '201611' and '201612'.
    comparison_df : DataFrame
        Must contain 'parcelid', 'logerror' and 'month' (month number of the
        transaction).

    Returns
    -------
    tuple
        (oct_error, nov_error, dec_error, overall_mae). Each monthly value is
        the mean absolute error over the matched rows of that month, and is
        NaN when the month has no matched rows. overall_mae is the mean
        absolute error over all matched October-December rows; it is NaN only
        when there are none.
    """
    month_columns = [(10, '201610'), (11, '201611'), (12, '201612')]
    prediction_columns = [column for _, column in month_columns]

    # training error
    trainresults = pd.merge(submission_df[['ParcelId'] + prediction_columns],
                            comparison_df[['parcelid', 'logerror', 'month']],
                            left_on='ParcelId', right_on='parcelid')

    monthly_errors = []
    total_abs_error = 0.0
    total_count = 0
    for month, column in month_columns:
        month_rows = trainresults[trainresults['month'] == month]
        abs_errors = (month_rows[column] - month_rows['logerror']).abs()
        monthly_errors.append(abs_errors.mean())
        # Accumulate sums and counts rather than re-weighting the monthly means:
        # a month without rows has a NaN mean, and NaN * 0 would otherwise turn
        # the overall figure into NaN as well.
        total_abs_error += abs_errors.sum()
        total_count += abs_errors.count()

    overall_mae = total_abs_error / total_count if total_count else float('nan')
    oct_error, nov_error, dec_error = monthly_errors
    return (oct_error, nov_error, dec_error, overall_mae)

### Read in stage 1 classification results

In [5]:
# Stage-1 classifier output; the 'stacked_pred' column is exposed under the
# name 'overestimate_prob' for the rest of the notebook.
overestimate_probs_path = "/home/anerdi/Desktop/Zillow/twostagemodel/overestimate_probs_stacked_ann_rfs.csv.gz"
overestimate_probabilities = (
    pd.read_csv(overestimate_probs_path, compression='gzip')
    .rename(columns={'stacked_pred': 'overestimate_prob'})
)
overestimate_probabilities.head()

Unnamed: 0,parcelid,ann_overestimate_prob,rf2_overestimate_prob,rf3_overestimate_prob,overestimate_prob
0,10754147,0.469632,0.466012,0.495383,0.479207
1,10759547,0.406392,0.555562,0.524675,0.475759
2,10843547,0.880731,0.548264,0.548752,0.732975
3,10859147,0.569588,0.663067,0.543329,0.6021
4,10879947,0.540791,0.519636,0.485341,0.531297


### Read in two-stage linear model predictions

In [6]:
# Per-parcel "<model>_under" / "<model>_over" predictions from the stage-2 linear models.
two_stage_preds_path = "/home/anerdi/Desktop/Zillow/twostagemodel/two_stage_preds_linear_models.csv.gz"
two_stage_linear_models = pd.read_csv(two_stage_preds_path, compression='gzip')
two_stage_linear_models.head()

Unnamed: 0,parcelid,ridge_under,ridge_over,enet_under,enet_over,lasso_under,lasso_over,larm_under,larm_over,huber_under,huber_over
0,10754147,-0.082879,0.084871,-0.068505,0.073578,-0.068225,0.073431,-0.064499,0.071395,-0.062623,0.063218
1,10759547,-0.082642,0.085475,-0.068505,0.073578,-0.068225,0.073431,-0.064499,0.071395,-0.062623,0.063218
2,10843547,-0.876458,0.930269,-0.414612,0.558336,-0.301205,0.457031,-0.064499,0.071395,-2.651653,2.897348
3,10859147,-0.124558,0.121058,-0.091456,0.098928,-0.087705,0.095834,-0.064499,0.071395,-0.171291,0.16929
4,10879947,-0.084374,0.083832,-0.068765,0.073467,-0.068667,0.073493,-0.064499,0.071395,-0.060274,0.059819


### Read in original linear models (i.e., one-stage models)

In [7]:
# Previously written single-stage submissions, loaded in the order
# ridge, lasso, elastic net, LARM, Huber.
ridge, lasso, enet, larm, huber = (
    pd.read_csv(submission_path)
    for submission_path in (
        "/home/anerdi/Desktop/Zillow/submissions/Ridge.gz",
        "/home/anerdi/Desktop/Zillow/submissions/Lasso.gz",
        "/home/anerdi/Desktop/Zillow/submissions/ElasticNet.gz",
        "/home/anerdi/Desktop/Zillow/submissions/LARM.gz",
        "/home/anerdi/Desktop/Zillow/submissions/Huber_noweight.gz",
    )
)

In [8]:
# (name, single-stage submission) pairs; each name is also the column prefix
# used for the "<name>_under" / "<name>_over" two-stage predictions.
model_names = ['ridge', 'lasso', 'enet', 'larm', 'huber']
model_frames = [ridge, lasso, enet, larm, huber]
models = list(zip(model_names, model_frames))

### Combine preds & overestimate probabilities to generate one prediction 

In [9]:
# Distribution of the stage-1 probabilities, used to pick the cutoffs below.
overestimate_probabilities['overestimate_prob'].describe()

count    2.985217e+06
mean     5.628201e-01
std      6.665927e-02
min      2.171894e-01
25%      5.216461e-01
50%      5.640104e-01
75%      6.057195e-01
max      8.389546e-01
Name: overestimate_prob, dtype: float64

In [10]:
# set cutoffs for when to use over/under estimate models
underestimate_cutoff = 0.20
overestimate_cutoff = 0.80

In [11]:
# Attach the stage-1 probabilities to the stage-2 predictions, parcel by parcel.
test_predictions = two_stage_linear_models.merge(overestimate_probabilities, on='parcelid')

In [12]:
for model_name, single_stage_model_results in models:
    overestimate_prob = test_predictions['overestimate_prob']

    # Method 1 (M1): use only the under/over models and the estimated
    # probabilities -- the probability-weighted average of the two predictions.
    test_predictions["m1_%s" % model_name] = (
        test_predictions['%s_over' % model_name] * overestimate_prob
        + test_predictions['%s_under' % model_name] * (1 - overestimate_prob))

    # The single-stage submission is a separate file whose row order is not
    # guaranteed to match the merged frame, so look its predictions up by
    # parcel id instead of relying on row position. Parcels missing from the
    # single-stage file come back as NaN.
    single_stage_preds = (single_stage_model_results
                          .drop_duplicates(subset='ParcelId')
                          .set_index('ParcelId')['201610']
                          .reindex(test_predictions['parcelid'])
                          .values)

    # Method 2 (M2): use the under/over models, the estimated probabilities and
    # the single-stage model -- take the M1 blend where stage 1 is confident
    # (probability outside the cutoffs), otherwise the single-stage prediction.
    # (An alternative, not used here, is a hard switch to the pure under/over
    # prediction in the confident regions instead of the M1 blend.)
    stage1_confident = ((overestimate_prob < underestimate_cutoff)
                        | (overestimate_prob > overestimate_cutoff))
    test_predictions["m2_%s" % model_name] = np.where(
        stage1_confident,
        test_predictions['m1_%s' % model_name],
        single_stage_preds)

In [13]:
# Preview the combined frame, including the new m1_* / m2_* columns.
test_predictions.head(5)

Unnamed: 0,parcelid,ridge_under,ridge_over,enet_under,enet_over,lasso_under,lasso_over,larm_under,larm_over,huber_under,...,m1_ridge,m2_ridge,m1_lasso,m2_lasso,m1_enet,m2_enet,m1_larm,m2_larm,m1_huber,m2_huber
0,10754147,-0.082879,0.084871,-0.068505,0.073578,-0.068225,0.073431,-0.064499,0.071395,-0.062623,...,-0.002492,0.01341,-0.000342,0.01227,-0.000418,0.01243,0.000622,0.01146,-0.002319,0.01112
1,10759547,-0.082642,0.085475,-0.068505,0.073578,-0.068225,0.073431,-0.064499,0.071395,-0.062623,...,-0.002658,0.01338,-0.000831,0.01223,-0.000908,0.0124,0.000154,0.01146,-0.002753,0.01112
2,10843547,-0.876458,0.930269,-0.414612,0.558336,-0.301205,0.457031,-0.064499,0.071395,-2.651653,...,0.447828,0.6096,0.254563,0.3717,0.298535,0.4817,0.035108,0.01146,1.415627,0.01105
3,10859147,-0.124558,0.121058,-0.091456,0.098928,-0.087705,0.095834,-0.064499,0.071395,-0.171291,...,0.023327,0.03578,0.022804,0.02874,0.023174,0.03203,0.017323,0.01146,0.033773,0.01111
4,10879947,-0.084374,0.083832,-0.068765,0.073467,-0.068667,0.073493,-0.064499,0.071395,-0.060274,...,0.004993,0.01138,0.006862,0.01171,0.006802,0.01166,0.007701,0.01146,0.003531,0.01111


In [18]:
# The same prediction is submitted for every month the submission format asks for.
submission_months = ['201610', '201611', '201612', '201710', '201711', '201712']

for model_name, single_stage_model in models:

    for method in ['m1']:
        combined_prediction = test_predictions["%s_%s" % (method, model_name)]
        submission_columns = {'ParcelId': test_predictions['parcelid']}
        submission_columns.update((month, combined_prediction) for month in submission_months)
        new_submission = DataFrame(submission_columns)

        # Report the monthly and overall errors of the two-stage prediction ...
        print("%s using %s:" % (model_name, method))
        print(mean_absolute_errors(new_submission, data))
        print("")

        # ... and write it out rounded to four decimals.
        submission_path = "/home/anerdi/Desktop/Zillow/submissions/two_stage_stage1_stacked_annrfs_stage2_%s.csv.gz" % model_name
        new_submission.round(4).to_csv(submission_path, index=False, compression='gzip')

    # Same report for the single-stage model, for comparison.
    print("%s using single-stage model:" % (model_name))
    print(mean_absolute_errors(single_stage_model, data))
    print("")

ridge using m1:
(0.06211399558542294, 0.06122071392359969, 0.07382398381835442, 0.064306987533746349)

ridge using single-stage model:
(0.06332056539983931, 0.06250535370657166, 0.07484831374353078, 0.065493145336361525)

lasso using m1:
(0.06213346767380986, 0.0611854643155412, 0.07377276216939664, 0.064300369921015066)

lasso using single-stage model:
(0.06333524733775354, 0.06244184118291356, 0.07476208625646924, 0.065470568485132233)

enet using m1:
(0.062129846787910566, 0.06118977201656746, 0.07377336814846552, 0.06429930442236767)

enet using single-stage model:
(0.06332380630902144, 0.06246366413472067, 0.07474843300747566, 0.0654657878377429)

larm using m1:
(0.06218541582658408, 0.061202278146372975, 0.07380443048359252, 0.064340678889622277)

larm using single-stage model:
(0.06345773598553339, 0.06251769879518071, 0.07483130304772855, 0.065572243736829725)

huber using m1:
(0.0623502486865228, 0.06161960194877311, 0.07401535038017781, 0.064568868553314548)

huber using sing

In [15]:
# Rebuild the submission frame for a single (method, model) combination; the
# chosen prediction column is repeated for every submission month.
method = 'm1'
model_name = 'ridge'
chosen_prediction = test_predictions["%s_%s" % (method, model_name)]
submission_columns = {'ParcelId': test_predictions['parcelid']}
for month in ['201610', '201611', '201612', '201710', '201711', '201712']:
    submission_columns[month] = chosen_prediction
new_submission = DataFrame(submission_columns)

In [16]:
# Score the submission after rounding to four decimals, i.e. as it is written to disk.
rounded_submission = new_submission.round(4)
mean_absolute_errors(rounded_submission, data)

(0.06211394414305826,
 0.061221522453450215,
 0.07382415181138596,
 0.064307164598454841)

In [17]:
# Persist the rounded ridge submission as a gzip-compressed CSV without the index.
ridge_submission_path = "/home/anerdi/Desktop/Zillow/submissions/two_stage_stage1_stacked_annrfs_stage2_ridge.csv.gz"
rounded_submission = new_submission.round(4)
rounded_submission.to_csv(ridge_submission_path, index=False, compression='gzip')