In [1]:
import pandas as pd
from pandas import DataFrame,Series
import numpy as np
import os
import datetime

#Plotting
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
%matplotlib inline

# sklearn stuff
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.metrics import confusion_matrix, mean_squared_error, precision_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder, Imputer 
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin

import feature_pipelines as pipes

### Reading in data

In [2]:
# Root folder of the Zillow project; the CSV paths in this cell are built from it.
maindir = "/home/anerdi/Desktop/Zillow"

# Training targets: one row per transaction with its logerror and transaction date.
logerror = pd.read_csv(maindir + "/data/train_2016_v2.csv/train_2016_v2.csv")

# Derive the ISO week number and the calendar month from the 'YYYY-MM-DD' date string.
logerror['weeknumber'] = logerror['transactiondate'].map(
    lambda date_str: datetime.datetime.strptime(date_str, '%Y-%m-%d').isocalendar()[1]
)
logerror['month'] = logerror['transactiondate'].map(
    lambda date_str: datetime.datetime.strptime(date_str, '%Y-%m-%d').month
)

# Only the parcel ids are needed from the properties table.
properties = pd.read_csv(
    maindir + "/data/properties_2016.csv/properties_2016.csv",
    usecols=['parcelid'],
)

In [3]:
# Inner join on parcelid: keep each property that has a training transaction,
# together with that transaction's logerror and month.
target_columns = ['parcelid', 'logerror', 'month']
data = properties.merge(logerror[target_columns], on='parcelid')

In [4]:
def mean_absolute_errors(submission_df, comparison_df):
    """
    Score a public-leaderboard style submission against known log errors.

    Parameters
    ----------
    submission_df : DataFrame
        Must contain the columns 'ParcelId', '201610', '201611' and '201612'
        (the predicted log error for October, November and December).
    comparison_df : DataFrame
        Must contain the columns 'parcelid', 'logerror' and 'month', where
        'month' holds the numeric month (10, 11 or 12 are the ones scored).

    Returns
    -------
    tuple
        (oct_error, nov_error, dec_error, overall_mae). Each monthly value is
        the mean absolute error over the rows of that month and is NaN when
        the month has no matched rows. overall_mae is the mean absolute error
        over all matched October-December rows, and is NaN only when there
        are none at all.
    """
    # training error: pair every prediction with the observed logerror
    trainresults = pd.merge(submission_df[['ParcelId', '201610', '201611', '201612']],
                            comparison_df[['parcelid', 'logerror', 'month']],
                            left_on='ParcelId', right_on='parcelid')

    monthly_errors = []
    total_abs_error = 0.0
    total_rows = 0
    # Each month is scored against its own prediction column.
    for month, column in ((10, '201610'), (11, '201611'), (12, '201612')):
        month_rows = trainresults[trainresults['month'] == month]
        abs_error = (month_rows[column] - month_rows['logerror']).abs()
        monthly_errors.append(abs_error.mean())
        # Accumulate sums and counts instead of multiplying the monthly mean by
        # the row count: an empty month has a NaN mean, and NaN * 0 would turn
        # the overall figure into NaN as well.
        total_abs_error += abs_error.sum()
        total_rows += abs_error.count()

    overall_mae = total_abs_error / total_rows if total_rows else float('nan')
    return (monthly_errors[0], monthly_errors[1], monthly_errors[2], overall_mae)

### Reading in stage 1 classification results

In [6]:
# Stage-1 classifier output: per-parcel probabilities that the estimate is an
# overestimate. The stacked prediction column is renamed to 'overestimate_prob'.
overestimate_probabilities = (
    pd.read_csv(
        "/home/anerdi/Desktop/Zillow/twostagemodel/overestimate_probs_stacked_ann_rfs_xgbs.csv.gz",
        compression='gzip',
    )
    .rename(columns={'stacked_pred': 'overestimate_prob'})
)
overestimate_probabilities.head()

Unnamed: 0,parcelid,ann_overestimate_prob,rf2_overestimate_prob,rf3_overestimate_prob,xgb1_overestimate_prob,xgb2_overestimate_prob,overestimate_prob
0,10754147,0.469632,0.466012,0.495383,0.658496,0.665265,0.616614
1,10759547,0.406392,0.555562,0.524675,0.536781,0.519523,0.499032
2,10843547,0.880731,0.548264,0.548752,0.430116,0.485831,0.546513
3,10859147,0.569588,0.663067,0.543329,0.67905,0.54776,0.620737
4,10879947,0.540791,0.519636,0.485341,0.50273,0.516258,0.510328


In [6]:
# overestimate_probabilities['overestimate_prob'] = (overestimate_probabilities['overestimate_prob'] 
#                                                    + overestimate_probabilities_logistic['overestimate_prob']) / 2

### Reading in two-stage linear model predictions

In [7]:
# Stage-2 predictions: for every model and month there is one "_over" and one
# "_under" column per parcel.
two_stage_linear_models = pd.read_csv(
    "/home/anerdi/Desktop/Zillow/twostagemodel/RF-two-stage-preds.csv.gz",
    compression='gzip',
)
two_stage_linear_models.head()

Unnamed: 0,parcelid,rf_maxdepth8_10_over,rf_maxdepth8_11_over,rf_maxdepth8_12_over,rf_maxdepth8_10_under,rf_maxdepth8_11_under,rf_maxdepth8_12_under,rf_maxdepth10_10_over,rf_maxdepth10_11_over,rf_maxdepth10_12_over,rf_maxdepth10_10_under,rf_maxdepth10_11_under,rf_maxdepth10_12_under,rf_maxdepth12_10_over,rf_maxdepth12_11_over,rf_maxdepth12_12_over,rf_maxdepth12_10_under,rf_maxdepth12_11_under,rf_maxdepth12_12_under
0,10754147,0.165587,0.163376,0.162021,-0.071983,-0.07267,-0.072235,0.244577,0.244539,0.267524,-0.07329,-0.072027,-0.071472,0.249669,0.246563,0.244811,-0.074718,-0.074185,-0.075998
1,10759547,0.080274,0.078295,0.079266,-0.072101,-0.072962,-0.072391,0.078782,0.079321,0.078649,-0.072819,-0.072706,-0.072459,0.081675,0.084531,0.084038,-0.072558,-0.073656,-0.074533
2,10843547,0.115475,0.115126,0.115178,-0.077697,-0.07947,-0.07801,0.168102,0.159052,0.169841,-0.083726,-0.082242,-0.082912,0.167159,0.164662,0.162798,-0.081459,-0.087766,-0.090421
3,10859147,0.115453,0.114598,0.11406,-0.075649,-0.077353,-0.076625,0.152831,0.140125,0.155747,-0.081129,-0.083072,-0.084025,0.164347,0.164613,0.165065,-0.079782,-0.087065,-0.091875
4,10879947,0.106098,0.104459,0.105421,-0.155137,-0.129332,-0.127548,0.102826,0.110514,0.112231,-0.247456,-0.225792,-0.168658,0.114469,0.112775,0.103238,-0.123749,-0.142528,-0.162635


### Reading in single-stage model predictions

In [14]:
# Baseline submission used for comparison; gzip compression is inferred from
# the ".gz" extension because no compression argument is passed.
# NOTE(review): the file is named "two_stage_rf" although the variable says
# single-stage -- confirm this is the intended baseline.
single_stage_model = pd.read_csv(
    "/home/anerdi/Desktop/Zillow/submissions/two_stage_rf.csv.gz"
)
# Monthly and overall training MAE of the baseline.
mean_absolute_errors(single_stage_model, data)

(0.0631669479606187,
 0.06216046002190593,
 0.07476497987349064,
 0.065312947787403389)

### Combine preds & overestimate probabilities to generate one prediction 

In [8]:
# Put the stage-2 over/under predictions and the stage-1 probabilities side by
# side, matched on parcelid (inner join).
test_predictions = two_stage_linear_models.merge(overestimate_probabilities, on='parcelid')

In [9]:
# Column-name prefixes of the stage-2 models to combine and evaluate.
models = [
    "rf_maxdepth8",
    "rf_maxdepth10",
    "rf_maxdepth12",
]

In [10]:
# Preview the merged frame before the combined per-model columns are added.
test_predictions.head()

Unnamed: 0,parcelid,rf_maxdepth8_10_over,rf_maxdepth8_11_over,rf_maxdepth8_12_over,rf_maxdepth8_10_under,rf_maxdepth8_11_under,rf_maxdepth8_12_under,rf_maxdepth10_10_over,rf_maxdepth10_11_over,rf_maxdepth10_12_over,...,rf_maxdepth12_12_over,rf_maxdepth12_10_under,rf_maxdepth12_11_under,rf_maxdepth12_12_under,ann_overestimate_prob,rf2_overestimate_prob,rf3_overestimate_prob,xgb1_overestimate_prob,xgb2_overestimate_prob,overestimate_prob
0,10754147,0.165587,0.163376,0.162021,-0.071983,-0.07267,-0.072235,0.244577,0.244539,0.267524,...,0.244811,-0.074718,-0.074185,-0.075998,0.469632,0.466012,0.495383,0.658496,0.665265,0.616614
1,10759547,0.080274,0.078295,0.079266,-0.072101,-0.072962,-0.072391,0.078782,0.079321,0.078649,...,0.084038,-0.072558,-0.073656,-0.074533,0.406392,0.555562,0.524675,0.536781,0.519523,0.499032
2,10843547,0.115475,0.115126,0.115178,-0.077697,-0.07947,-0.07801,0.168102,0.159052,0.169841,...,0.162798,-0.081459,-0.087766,-0.090421,0.880731,0.548264,0.548752,0.430116,0.485831,0.546513
3,10859147,0.115453,0.114598,0.11406,-0.075649,-0.077353,-0.076625,0.152831,0.140125,0.155747,...,0.165065,-0.079782,-0.087065,-0.091875,0.569588,0.663067,0.543329,0.67905,0.54776,0.620737
4,10879947,0.106098,0.104459,0.105421,-0.155137,-0.129332,-0.127548,0.102826,0.110514,0.112231,...,0.103238,-0.123749,-0.142528,-0.162635,0.540791,0.519636,0.485341,0.50273,0.516258,0.510328


In [11]:
# Blend the "over" and "under" stage-2 predictions into a single prediction per
# model and month, weighting them by the stage-1 overestimate probability:
#     prediction = over * p + under * (1 - p)
prob_over = test_predictions['overestimate_prob']
prob_under = 1 - prob_over

for stage2_model in models:
    for month_number in (10, 11, 12):
        over_part = test_predictions['%s_%d_over' % (stage2_model, month_number)] * prob_over
        under_part = test_predictions['%s_%d_under' % (stage2_model, month_number)] * prob_under
        # Stored as e.g. "rf_maxdepth8_10"
        test_predictions['{0}_{1}'.format(stage2_model, month_number)] = over_part + under_part

In [12]:
# Preview again: the frame now also holds one combined column per model and month.
test_predictions.head()

Unnamed: 0,parcelid,rf_maxdepth8_10_over,rf_maxdepth8_11_over,rf_maxdepth8_12_over,rf_maxdepth8_10_under,rf_maxdepth8_11_under,rf_maxdepth8_12_under,rf_maxdepth10_10_over,rf_maxdepth10_11_over,rf_maxdepth10_12_over,...,overestimate_prob,rf_maxdepth8_10,rf_maxdepth8_11,rf_maxdepth8_12,rf_maxdepth10_10,rf_maxdepth10_11,rf_maxdepth10_12,rf_maxdepth12_10,rf_maxdepth12_11,rf_maxdepth12_12
0,10754147,0.165587,0.163376,0.162021,-0.071983,-0.07267,-0.072235,0.244577,0.244539,0.267524,...,0.616614,0.074506,0.072879,0.07221,0.122711,0.123172,0.137558,0.125303,0.123593,0.121818
1,10759547,0.080274,0.078295,0.079266,-0.072101,-0.072962,-0.072391,0.078782,0.079321,0.078649,...,0.499032,0.003939,0.00252,0.003291,0.002835,0.00316,0.002949,0.004409,0.005284,0.004599
2,10843547,0.115475,0.115126,0.115178,-0.077697,-0.07947,-0.07801,0.168102,0.159052,0.169841,...,0.546513,0.027874,0.026879,0.02757,0.053901,0.049628,0.055221,0.054414,0.050189,0.047967
3,10859147,0.115453,0.114598,0.11406,-0.075649,-0.077353,-0.076625,0.152831,0.140125,0.155747,...,0.620737,0.042975,0.041798,0.04174,0.064098,0.055475,0.06481,0.071758,0.069161,0.067617
4,10879947,0.106098,0.104459,0.105421,-0.155137,-0.129332,-0.127548,0.102826,0.110514,0.112231,...,0.510328,-0.021822,-0.010022,-0.008658,-0.068698,-0.054166,-0.025313,-0.00218,-0.01224,-0.026953


In [15]:
# Build, score and save one submission per stage-2 model, printing the
# single-stage baseline next to each result for comparison.

# The baseline does not depend on the loop variable, so score it only once.
single_stage_errors = mean_absolute_errors(single_stage_model, data)

for model_name in models:

    new_submission = DataFrame({'ParcelId': test_predictions['parcelid'],
                                '201610': test_predictions['%s_10' % model_name],
                                '201611': test_predictions['%s_11' % model_name],
                                '201612': test_predictions['%s_12' % model_name],
                                })
    # No 2017 predictions are produced here; the columns are filled with 0
    # (presumably required by the submission format -- confirm).
    new_submission['201710'] = 0
    new_submission['201711'] = 0
    new_submission['201712'] = 0

    print("%s:" % (model_name))
    print(mean_absolute_errors(new_submission, data))
    print("")
    # The path is built from adjacent string literals. A backslash continuation
    # *inside* a string literal keeps the next line's leading whitespace, which
    # previously produced file names starting with four spaces.
    new_submission.round(4).to_csv(
        "/home/anerdi/Desktop/Zillow/submissions/"
        "two_stage_stage1_stacked_annrfsxgbs_stage2_%s_age.csv.gz" % model_name,
        index=False,
        compression='gzip')

    print("%s using single-stage model:" % (model_name))
    print(single_stage_errors)
    print("")

rf_maxdepth8:
(0.061236963418852444, 0.060359636388199196, 0.07287444549170666, 0.063418604974310261)

rf_maxdepth8 using single-stage model:
(0.0631669479606187, 0.06216046002190593, 0.07476497987349064, 0.065312947787403389)

rf_maxdepth10:
(0.06114340405470921, 0.06028706587833531, 0.0729048157894437, 0.063354762225704825)

rf_maxdepth10 using single-stage model:
(0.0631669479606187, 0.06216046002190593, 0.07476497987349064, 0.065312947787403389)

rf_maxdepth12:
(0.06113073642662708, 0.060252683202254034, 0.07276164507478927, 0.063310884512725055)

rf_maxdepth12 using single-stage model:
(0.0631669479606187, 0.06216046002190593, 0.07476497987349064, 0.065312947787403389)



In [20]:
# Build a submission from the 'rf_oct' / 'rf_nov' / 'rf_dec' columns.
# NOTE(review): no cell visible in this notebook creates these three columns on
# test_predictions, so on a fresh kernel (Restart & Run All) this cell raises
# KeyError. They presumably came from a cell that was since removed -- confirm
# which model they held, or point this at one of the "<model>_<month>" columns.
new_submission = DataFrame({'ParcelId': test_predictions['parcelid'],
                           '201610':test_predictions['rf_oct'],
                           '201611':test_predictions['rf_nov'],
                           '201612':test_predictions['rf_dec']})
# The 2017 columns are filled with 0; no 2017 predictions are produced here.
new_submission['201710'] = 0
new_submission['201711'] = 0
new_submission['201712'] = 0

In [21]:
# Score the predictions after rounding to 4 decimals, i.e. with the same
# rounding that is applied when the submission is written to disk.
mean_absolute_errors(new_submission.round(4), data)

(0.062045469158127406,
 0.06109260679079965,
 0.07364278320874057,
 0.064202786232732401)

In [22]:
# Save the submission, rounded to 4 decimals, as a gzip-compressed CSV without
# the index column.
new_submission.round(4).to_csv(
    "/home/anerdi/Desktop/Zillow/submissions/two_stage_stage1_stacked_annrfs_stage2_rf.csv.gz",
    index=False,
    compression='gzip',
)