In [1]:
import pandas as pd
from pandas import DataFrame,Series
import numpy as np
import os
import datetime

#Plotting
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
%matplotlib inline

# sklearn stuff
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.metrics import confusion_matrix, mean_squared_error, precision_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder, Imputer 
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin

import feature_pipelines as pipes

### Reading in data

In [2]:
maindir = "/home/anerdi/Desktop/Zillow"

# Load the 2016 training file; it supplies the parcelid, logerror and
# transactiondate columns used below.
logerror = pd.read_csv(maindir + "/data/train_2016_v2.csv/train_2016_v2.csv")

# Parse each 'YYYY-MM-DD' transaction date once, then derive both calendar
# features (ISO week number and month) from the parsed values.
transaction_dates = [datetime.datetime.strptime(date_str, '%Y-%m-%d')
                     for date_str in logerror['transactiondate']]
logerror['weeknumber'] = [parsed.isocalendar()[1] for parsed in transaction_dates]
logerror['month'] = [parsed.month for parsed in transaction_dates]

# Only the parcel ids are read from the properties file.
properties = pd.read_csv(maindir + "/data/properties_2016.csv/properties_2016.csv", usecols=['parcelid'])

In [3]:
# Join the parcel ids with their log errors and sale month (pandas' default
# inner join on 'parcelid'); only the columns needed for scoring are kept.
data = properties.merge(logerror[['parcelid', 'logerror', 'month']], on='parcelid')

In [4]:
def mean_absolute_errors(submission_df, comparison_df):
    """
    Score a public-leaderboard submission against known log errors.

    The submission is joined to the comparison frame on parcel id. Each joined
    row is scored with the prediction column that matches the row's month
    (10 -> '201610', 11 -> '201611', 12 -> '201612').

    Parameters
    ----------
    submission_df : DataFrame
        Must contain the columns 'ParcelId', '201610', '201611' and '201612'.
    comparison_df : DataFrame
        Must contain the columns 'parcelid', 'logerror' and 'month'.

    Returns
    -------
    tuple
        (oct_error, nov_error, dec_error, overall_mae). Each monthly value is
        the mean absolute error over that month's rows, or NaN when the month
        has no rows. overall_mae is the mean absolute error over all scored
        rows of the three months pooled together; a month without rows does
        not affect it, and it is NaN only when no row can be scored at all.
    """
    month_columns = ((10, '201610'), (11, '201611'), (12, '201612'))

    trainresults = pd.merge(
        submission_df[['ParcelId'] + [column for _, column in month_columns]],
        comparison_df[['parcelid', 'logerror', 'month']],
        left_on='ParcelId', right_on='parcelid')

    monthly_errors = []
    abs_error_total = 0.0
    scored_rows = 0
    for month, column in month_columns:
        in_month = trainresults[trainresults['month'] == month]
        abs_errors = (in_month[column] - in_month['logerror']).abs()
        monthly_errors.append(abs_errors.mean())
        # Pool sums and counts rather than re-weighting the monthly means:
        # an empty month has a NaN mean, and NaN * 0 would otherwise turn the
        # overall figure into NaN. sum()/count() both skip NaN errors, so the
        # numerator and denominator stay consistent.
        abs_error_total += abs_errors.sum()
        scored_rows += abs_errors.count()

    overall_mae = abs_error_total / scored_rows if scored_rows else float('nan')
    oct_error, nov_error, dec_error = monthly_errors
    return (oct_error, nov_error, dec_error, overall_mae)

### Reading in stage 1 classification results

In [5]:
# Stage-1 output: per-parcel overestimate probabilities. The path is built from
# `maindir` (defined in the data-loading cell) instead of repeating the root.
# The 'stacked_pred' column is renamed to 'overestimate_prob', the name the
# blending cell reads; chaining the rename keeps this cell safe to re-run.
overestimate_probabilities = pd.read_csv(
    maindir + "/twostagemodel/overestimate_probs_stacked_ann_rfs.csv.gz",
    compression='gzip'
).rename(columns={'stacked_pred': 'overestimate_prob'})
overestimate_probabilities.head()

Unnamed: 0,parcelid,ann_overestimate_prob,rf2_overestimate_prob,rf3_overestimate_prob,overestimate_prob
0,10754147,0.469632,0.466012,0.495383,0.479207
1,10759547,0.406392,0.555562,0.524675,0.475759
2,10843547,0.880731,0.548264,0.548752,0.732975
3,10859147,0.569588,0.663067,0.543329,0.6021
4,10879947,0.540791,0.519636,0.485341,0.531297


### Reading in two-stage linear model predictions

In [6]:
# Stage-2 output: per-parcel '<model>_under' / '<model>_over' predictions that
# the blending cell combines. The path is built from `maindir` (defined in the
# data-loading cell) instead of repeating the absolute project root.
two_stage_linear_models = pd.read_csv(maindir + "/twostagemodel/XGB-two-stage-preds.csv.gz",
                                      compression='gzip')
two_stage_linear_models.head()

Unnamed: 0.1,Unnamed: 0,parcelid,xgb_oct_under,xgb_oct_over,xgb_nov_under,xgb_nov_over,xgb_dec_under,xgb_dec_over
0,0,10754147,-0.198943,0.087975,-0.209733,0.085679,-0.193651,0.087517
1,1,10759547,-0.11094,0.081205,-0.114238,0.079867,-0.110725,0.081849
2,2,10843547,-0.269981,0.624938,-0.287432,0.675193,-0.259808,0.618806
3,3,10859147,-0.224659,0.487692,-0.223898,0.5115,-0.19966,0.479048
4,4,10879947,-0.282351,0.271146,-0.28656,0.225149,-0.333289,0.256966


### Reading in single-stage model predictions

In [7]:
# Baseline: a previously saved single-stage submission, scored on the known
# Oct-Dec log errors. The path is built from `maindir` (defined in the
# data-loading cell) instead of repeating the absolute project root.
single_stage_model = pd.read_csv(maindir + "/submissions/XGB_600.gz")
mean_absolute_errors(single_stage_model, data)

(0.06211727600763514,
 0.06136520946002188,
 0.07414289916043706,
 0.064404712807773379)

### Combine preds & overestimate probabilities to generate one prediction 

In [8]:
# Put the stage-2 under/over predictions and the stage-1 overestimate
# probabilities side by side, matched on parcel id.
test_predictions = two_stage_linear_models.merge(overestimate_probabilities, on='parcelid')

In [11]:
# Column-name prefixes of the stage-2 models; each has '_under' and '_over' columns.
models = list("xgb_%s" % month_abbr for month_abbr in ("oct", "nov", "dec"))

In [12]:
for model in models:
    # Expected prediction: the "over" and "under" estimates weighted by the
    # stage-1 probability that the parcel is overestimated.
    over_pred = test_predictions[model + '_over']
    under_pred = test_predictions[model + '_under']
    over_prob = test_predictions['overestimate_prob']
    test_predictions[model] = over_pred * over_prob + under_pred * (1 - over_prob)

In [13]:
# Preview the first rows: inputs alongside the blended xgb_oct/xgb_nov/xgb_dec columns.
test_predictions.head(n=5)

Unnamed: 0.1,Unnamed: 0,parcelid,xgb_oct_under,xgb_oct_over,xgb_nov_under,xgb_nov_over,xgb_dec_under,xgb_dec_over,ann_overestimate_prob,rf2_overestimate_prob,rf3_overestimate_prob,overestimate_prob,xgb_oct,xgb_nov,xgb_dec
0,0,10754147,-0.198943,0.087975,-0.209733,0.085679,-0.193651,0.087517,0.469632,0.466012,0.495383,0.479207,-0.06145,-0.06817,-0.058914
1,1,10759547,-0.11094,0.081205,-0.114238,0.079867,-0.110725,0.081849,0.406392,0.555562,0.524675,0.475759,-0.019525,-0.021891,-0.019106
2,2,10843547,-0.269981,0.624938,-0.287432,0.675193,-0.259808,0.618806,0.880731,0.548264,0.548752,0.732975,0.385972,0.418148,0.384194
3,3,10859147,-0.224659,0.487692,-0.223898,0.5115,-0.19966,0.479048,0.569588,0.663067,0.543329,0.6021,0.204247,0.218885,0.20899
4,4,10879947,-0.282351,0.271146,-0.28656,0.225149,-0.333289,0.256966,0.540791,0.519636,0.485341,0.531297,0.01172,-0.014691,-0.019689


In [14]:
# Assemble the submission frame from the blended per-month predictions.
# The column order is pinned explicitly: when a DataFrame is built from a dict
# alone, the order depends on the pandas/Python version (older versions sort
# the keys, which would put 'ParcelId' after the month columns).
new_submission = DataFrame({'ParcelId': test_predictions['parcelid'],
                            '201610': test_predictions['xgb_oct'],
                            '201611': test_predictions['xgb_nov'],
                            '201612': test_predictions['xgb_dec']},
                           columns=['ParcelId', '201610', '201611', '201612'])
# The 2017 columns are filled with a constant 0
# (presumably placeholders for months not scored here — TODO confirm).
for month_2017 in ('201710', '201711', '201712'):
    new_submission[month_2017] = 0

In [15]:
# Score the two-stage submission at the same 4-decimal rounding that is
# written to disk, so the reported errors describe the exported file.
mean_absolute_errors(submission_df=new_submission.round(4), comparison_df=data)

(0.06253124372111714,
 0.06132256297918948,
 0.07422489936745252,
 0.064653488644345586)

In [16]:
# Export the submission, rounded to 4 decimals, as a gzip-compressed CSV
# without the index. The path is built from `maindir` (defined in the
# data-loading cell) instead of repeating the absolute project root.
new_submission.round(4).to_csv(
    maindir + "/submissions/two_stage_stage1_stacked_annrfs_stage2_xgb.csv.gz",
    index=False, compression='gzip')