In [1]:
import pandas as pd
from pandas import DataFrame,Series
import numpy as np
import os
import datetime

#Plotting
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
%matplotlib inline

# sklearn stuff
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.metrics import confusion_matrix, mean_squared_error, precision_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder, Imputer 
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin

import feature_pipelines as pipes

### Reading in data

In [2]:
# Root directory that the data files below are read from.
maindir = "/home/anerdi/Desktop/Zillow"

# Training table; the cells below use its parcelid, transactiondate and logerror columns.
logerror = pd.read_csv(maindir + "/data/train_2016_v2.csv/train_2016_v2.csv")

# Parse every transaction date once, then derive the ISO week number and the
# calendar month from the parsed value.
transaction_dates = [datetime.datetime.strptime(raw_date, '%Y-%m-%d')
                     for raw_date in logerror['transactiondate']]
logerror['weeknumber'] = [parsed.isocalendar()[1] for parsed in transaction_dates]
logerror['month'] = [parsed.month for parsed in transaction_dates]

# Only the parcel id column is loaded from the properties table.
properties = pd.read_csv(maindir + "/data/properties_2016.csv/properties_2016.csv", usecols=['parcelid'])

In [3]:
# Inner join on parcel id: keep each property's logerror and transaction month.
logerror_columns = logerror[['parcelid', 'logerror', 'month']]
data = properties.merge(logerror_columns, on='parcelid')

In [4]:
def mean_absolute_errors(submission_df, comparison_df):
    """
    Score a leaderboard-style submission against known logerrors.

    The submission is inner-joined to the comparison data on parcel id, and
    every matched row is scored against the prediction column of the month
    recorded for it in ``comparison_df`` (10 -> '201610', 11 -> '201611',
    12 -> '201612').

    Parameters
    ----------
    submission_df : DataFrame
        Must contain the columns 'ParcelId', '201610', '201611' and '201612'.
    comparison_df : DataFrame
        Must contain the columns 'parcelid', 'logerror' and 'month'.

    Returns
    -------
    tuple
        ``(oct_error, nov_error, dec_error, overall_mae)``. Each monthly value
        is the mean absolute error over the matched rows of that month, or NaN
        when the month has no matched rows. ``overall_mae`` is the row-count
        weighted average of the monthly errors over the months that do have
        rows, or NaN when none of the three months has any matched row.
    """
    month_columns = [(10, '201610'), (11, '201611'), (12, '201612')]

    # training error
    trainresults = pd.merge(
        submission_df[['ParcelId'] + [column for _, column in month_columns]],
        comparison_df[['parcelid', 'logerror', 'month']],
        left_on='ParcelId', right_on='parcelid')

    monthly_errors = []
    weighted_error_sum = 0.0
    scored_rows = 0
    for month, column in month_columns:
        month_rows = trainresults[trainresults['month'] == month]
        month_error = (month_rows[column] - month_rows['logerror']).abs().mean()
        monthly_errors.append(month_error)

        # An empty month has a NaN mean; leaving it out of the weighted sum
        # keeps it from turning the overall figure into NaN (NaN * 0 is NaN).
        n_rows = len(month_rows)
        if n_rows:
            weighted_error_sum += month_error * n_rows
            scored_rows += n_rows

    overall_mae = weighted_error_sum / scored_rows if scored_rows else float('nan')
    oct_error, nov_error, dec_error = monthly_errors
    return (oct_error, nov_error, dec_error, overall_mae)

### Reading in stage 1 classification results

In [5]:
# Stage-1 classifier output: per-parcel probabilities of an overestimate.
# The path is built from `maindir` (defined with the other data paths) so the
# project root is configured in a single place, and the stacked prediction is
# renamed without mutating the frame in place.
overestimate_probabilities = pd.read_csv(
    os.path.join(maindir, "twostagemodel", "overestimate_probs_stacked_ann_rfs.csv.gz"),
    compression='gzip',
).rename(columns={'stacked_pred': 'overestimate_prob'})
overestimate_probabilities.head()

Unnamed: 0,parcelid,ann_overestimate_prob,rf2_overestimate_prob,rf3_overestimate_prob,overestimate_prob
0,10754147,0.469632,0.466012,0.495383,0.479207
1,10759547,0.406392,0.555562,0.524675,0.475759
2,10843547,0.880731,0.548264,0.548752,0.732975
3,10859147,0.569588,0.663067,0.543329,0.6021
4,10879947,0.540791,0.519636,0.485341,0.531297


### Reading in two-stage linear model predictions

In [6]:
# Stage-2 predictions: one `<model>_under` and one `<model>_over` column per
# linear model, keyed by parcelid. The path is built from `maindir` so the
# project root is configured in a single place.
two_stage_linear_models = pd.read_csv(
    os.path.join(maindir, "twostagemodel", "two_stage_preds_linear_models.csv.gz"),
    compression='gzip',
)
two_stage_linear_models.head()

Unnamed: 0,parcelid,ridge_under,ridge_over,enet_under,enet_over,lasso_under,lasso_over,larm_under,larm_over,huber_under,huber_over
0,10754147,0.027747,5.170251,-0.068398,0.073391,-0.068158,0.073282,-0.064499,0.071395,-0.062325,0.062914
1,10759547,-0.08381,0.085823,-0.068398,0.073391,-0.068158,0.073282,-0.064499,0.071395,-0.06162,0.062129
2,10843547,-0.874368,0.932315,-0.413808,0.560253,-0.300357,0.458923,-0.064499,0.071395,-2.651445,2.897989
3,10859147,-0.124433,0.121149,-0.091428,0.098976,-0.087674,0.095881,-0.064499,0.071395,-0.171291,0.1693
4,10879947,-0.084294,0.083876,-0.068759,0.073458,-0.068661,0.073484,-0.064499,0.071395,-0.060274,0.059823


### Reading in original linear models (i.e., one-stage models)

In [7]:
# Single-stage submission files, one per linear model. The directory is derived
# from `maindir` instead of repeating the absolute path on every line.
submissions_dir = os.path.join(maindir, "submissions")

ridge = pd.read_csv(os.path.join(submissions_dir, "Ridge.gz"))
lasso = pd.read_csv(os.path.join(submissions_dir, "Lasso.gz"))
enet = pd.read_csv(os.path.join(submissions_dir, "ElasticNet.gz"))
larm = pd.read_csv(os.path.join(submissions_dir, "LARM.gz"))
huber = pd.read_csv(os.path.join(submissions_dir, "Huber_noweight.gz"))

In [8]:
# (name, single-stage submission frame) pairs; the name is also the column
# prefix used for that model's `<name>_under` / `<name>_over` predictions.
models = list(zip(
    ['ridge', 'lasso', 'enet', 'larm', 'huber'],
    [ridge, lasso, enet, larm, huber],
))

### Combine preds & overestimate probabilities to generate one prediction 

In [9]:
# Distribution of the stacked stage-1 probability, used to pick the cutoffs below.
overestimate_probabilities['overestimate_prob'].describe()

count    2.985217e+06
mean     5.628201e-01
std      6.665927e-02
min      2.171894e-01
25%      5.216461e-01
50%      5.640104e-01
75%      6.057195e-01
max      8.389546e-01
Name: overestimate_prob, dtype: float64

In [10]:
# Probability thresholds outside of which the over/under-estimate models are used.
# NOTE(review): in the recorded describe() output the probabilities range from
# about 0.217 to 0.839, so on that run nothing falls below the lower cutoff.
underestimate_cutoff, overestimate_cutoff = 0.20, 0.80

In [11]:
# Attach the stage-1 probabilities to the stage-2 predictions (inner join on parcel id).
test_predictions = two_stage_linear_models.merge(overestimate_probabilities, on='parcelid')

In [12]:
for model_name, single_stage_model_results in models:
    under_pred = test_predictions['%s_under' % model_name]
    over_pred = test_predictions['%s_over' % model_name]
    overestimate_prob = test_predictions['overestimate_prob']

    # Method 1 (M1): blend the over/under models, weighting the over-estimate
    # model by the stage-1 probability of an overestimate.
    test_predictions["m1_%s" % model_name] = (over_pred * overestimate_prob
                                               + under_pred * (1 - overestimate_prob))

    # np.where combines its inputs by position, so the single-stage predictions
    # are first looked up by parcel id. This removes the dependency on the
    # submission file having the same row order (and length) as
    # `test_predictions`; a parcel absent from the submission yields NaN.
    single_stage_pred = (single_stage_model_results
                         .set_index('ParcelId')['201610']
                         .reindex(test_predictions['parcelid'])
                         .values)

    # Method 2 (M2): use the M1 blend only when the stage-1 probability lies
    # outside the cutoffs; otherwise fall back to the single-stage model's
    # '201610' prediction.
    outside_cutoffs = ((overestimate_prob < underestimate_cutoff)
                       | (overestimate_prob > overestimate_cutoff))
    test_predictions["m2_%s" % model_name] = np.where(
        outside_cutoffs, test_predictions['m1_%s' % model_name], single_stage_pred)

In [13]:
# Preview the first rows, including the new m1_* / m2_* columns.
test_predictions.head(n=5)

Unnamed: 0,parcelid,ridge_under,ridge_over,enet_under,enet_over,lasso_under,lasso_over,larm_under,larm_over,huber_under,...,m1_ridge,m2_ridge,m1_lasso,m2_lasso,m1_enet,m2_enet,m1_larm,m2_larm,m1_huber,m2_huber
0,10754147,0.027747,5.170251,-0.068398,0.073391,-0.068158,0.073282,-0.064499,0.071395,-0.062325,...,2.49207,0.01341,-0.000379,0.01227,-0.000451,0.01243,0.000622,0.01146,-0.00231,0.01112
1,10759547,-0.08381,0.085823,-0.068398,0.073391,-0.068158,0.073282,-0.064499,0.071395,-0.06162,...,-0.003106,0.01338,-0.000867,0.01223,-0.00094,0.0124,0.000154,0.01146,-0.002745,0.01112
2,10843547,-0.874368,0.932315,-0.413808,0.560253,-0.300357,0.458923,-0.064499,0.071395,-2.651445,...,0.449886,0.6096,0.256176,0.3717,0.300154,0.4817,0.035108,0.01146,1.416152,0.01105
3,10859147,-0.124433,0.121149,-0.091428,0.098976,-0.087674,0.095881,-0.064499,0.071395,-0.171291,...,0.023432,0.03578,0.022844,0.02874,0.023215,0.03203,0.017323,0.01146,0.033779,0.01111
4,10879947,-0.084294,0.083876,-0.068759,0.073458,-0.068661,0.073484,-0.064499,0.071395,-0.060274,...,0.005054,0.01138,0.00686,0.01171,0.0068,0.01166,0.007701,0.01146,0.003534,0.01111


In [14]:
# Every submission month receives the same prediction column.
submission_months = ['201610', '201611', '201612', '201710', '201711', '201712']
# Output directory derived from `maindir` rather than a repeated absolute path.
submissions_dir = os.path.join(maindir, "submissions")

for model_name, single_stage_model in models:

    for method in ['m1']:
        prediction = test_predictions["%s_%s" % (method, model_name)]
        submission_columns = {'ParcelId': test_predictions['parcelid']}
        for month_column in submission_months:
            submission_columns[month_column] = prediction
        new_submission = DataFrame(submission_columns)

        # The error is reported on the unrounded predictions; the file written
        # below holds the predictions rounded to 4 decimals.
        print("%s using %s:" % (model_name, method))
        print(mean_absolute_errors(new_submission, data))
        print("")
        new_submission.round(4).to_csv(
            os.path.join(submissions_dir,
                         "two_stage_stage1_stacked_annrfs_stage2_%s_imputebycounty.csv.gz" % model_name),
            index=False, compression='gzip')

    # Baseline for comparison: the original single-stage submission.
    print("%s using single-stage model:" % (model_name))
    print(mean_absolute_errors(single_stage_model, data))
    print("")

ridge using m1:
(0.06211662330266915, 0.06122311707674782, 0.07381781199062934, 0.064307775814941501)

ridge using single-stage model:
(0.06332056539983931, 0.06250535370657166, 0.07484831374353078, 0.065493145336361525)

lasso using m1:
(0.062133533239734355, 0.0611860000097644, 0.07377258721151689, 0.064300487018592312)

lasso using single-stage model:
(0.06333524733775354, 0.06244184118291356, 0.07476208625646924, 0.065470568485132233)

enet using m1:
(0.06213008492740844, 0.061190336988136564, 0.07377312088293247, 0.064299513607992118)

enet using single-stage model:
(0.06332380630902144, 0.06246366413472067, 0.07474843300747566, 0.0654657878377429)

larm using m1:
(0.06218541582658408, 0.061202278146372975, 0.07380443048359252, 0.064340678889622277)

larm using single-stage model:
(0.06345773598553339, 0.06251769879518071, 0.07483130304772855, 0.065572243736829725)

huber using m1:
(0.0623504836252892, 0.06162000539505469, 0.07401439467636677, 0.064568897119718568)

huber using si

In [15]:
# Rebuild the ridge / M1 submission: every month column carries the same prediction.
method, model_name = 'm1', 'ridge'
ridge_m1_prediction = test_predictions["%s_%s" % (method, model_name)]

ridge_m1_columns = {'ParcelId': test_predictions['parcelid']}
for month_column in ['201610', '201611', '201612', '201710', '201711', '201712']:
    ridge_m1_columns[month_column] = ridge_m1_prediction

new_submission = DataFrame(ridge_m1_columns)

In [16]:
# Score the submission after rounding to 4 decimals, i.e. the values that get written to disk.
rounded_submission = new_submission.round(4)
mean_absolute_errors(rounded_submission, data)

(0.06211394414305826,
 0.061221522453450215,
 0.07382415181138596,
 0.064307164598454841)

In [17]:
# Write the rounded ridge submission as a gzip-compressed CSV. The path is built
# from `maindir` so the project root is configured in a single place.
new_submission.round(4).to_csv(
    os.path.join(maindir, "submissions", "two_stage_stage1_stacked_annrfs_stage2_ridge.csv.gz"),
    index=False, compression='gzip')