In [1]:
import pandas as pd
from pandas import DataFrame,Series
import numpy as np
import os
import datetime

#Plotting
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
%matplotlib inline

# sklearn stuff
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.metrics import confusion_matrix, mean_squared_error, precision_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder, Imputer 
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin

import feature_pipelines as pipes

### Reading in data

In [2]:
maindir = "/home/anerdi/Desktop/Zillow"

# Training labels: parcel id, log error and the transaction date as a 'YYYY-MM-DD' string.
logerror = pd.read_csv(maindir + "/data/train_2016_v2.csv/train_2016_v2.csv")

# Parse every transaction date once, then derive the ISO week number and the
# calendar month from the parsed value.
transaction_dates = [datetime.datetime.strptime(stamp, '%Y-%m-%d')
                     for stamp in logerror['transactiondate']]
logerror['weeknumber'] = [parsed.isocalendar()[1] for parsed in transaction_dates]
logerror['month'] = [parsed.month for parsed in transaction_dates]

# Only the parcel id column is loaded from the properties file.
properties = pd.read_csv(maindir + "/data/properties_2016.csv/properties_2016.csv", usecols=['parcelid'])

In [3]:
# Join the parcel ids with their log error and transaction month on 'parcelid'
# (pandas' default merge keeps only ids present on both sides).
label_columns = ['parcelid', 'logerror', 'month']
data = properties.merge(logerror[label_columns], on='parcelid')

In [4]:
def mean_absolute_errors(submission_df, comparison_df):
    """
    Score a public-leaderboard style submission against known log errors.

    The submission is inner-joined to the comparison data on parcel id, and each
    joined row is scored with the prediction column of its own transaction month
    ('201610', '201611' or '201612').

    Parameters
    ----------
    submission_df : DataFrame
        Must contain the columns 'ParcelId', '201610', '201611' and '201612'.
    comparison_df : DataFrame
        Must contain the columns 'parcelid', 'logerror' and 'month'.

    Returns
    -------
    tuple
        (oct_error, nov_error, dec_error, overall_mae). A monthly value is NaN
        when the join has no rows for that month. overall_mae is the
        row-count-weighted average of the monthly errors over the months that
        do have rows, and NaN when none of the three months has any.
    """
    # (month number, submission column holding that month's prediction)
    month_columns = ((10, '201610'), (11, '201611'), (12, '201612'))

    # training error
    trainresults = pd.merge(submission_df[['ParcelId', '201610', '201611', '201612']],
                            comparison_df[['parcelid', 'logerror', 'month']],
                            left_on='ParcelId', right_on='parcelid')

    monthly_errors = []
    weighted_error_sum = 0.0
    scored_rows = 0
    for month, column in month_columns:
        month_rows = trainresults[trainresults['month'] == month]
        month_error = (month_rows[column] - month_rows['logerror']).abs().mean()
        monthly_errors.append(month_error)
        # An empty month has a NaN mean; leaving it out of the weighted sum keeps
        # NaN * 0 from turning the overall figure into NaN.
        if len(month_rows) > 0:
            weighted_error_sum += month_error * len(month_rows)
            scored_rows += len(month_rows)

    overall_mae = weighted_error_sum / scored_rows if scored_rows else float('nan')
    oct_error, nov_error, dec_error = monthly_errors
    return (oct_error, nov_error, dec_error, overall_mae)

In [8]:
# Load the stage-one over-estimate probabilities. The path is built from
# `maindir` so the project root is configured in one place, and the
# 'stacked_pred' column is renamed to 'overestimate_prob' without mutating in place.
overestimate_probabilities = pd.read_csv(
    os.path.join(maindir, "twostagemodel", "overestimate_probs_stacked_ann_rfs.csv.gz")
).rename(columns={"stacked_pred": "overestimate_prob"})
overestimate_probabilities.head()

Unnamed: 0,parcelid,ann_overestimate_prob,rf2_overestimate_prob,rf3_overestimate_prob,overestimate_prob
0,10754147,0.469632,0.466012,0.495383,0.479207
1,10759547,0.406392,0.555562,0.524675,0.475759
2,10843547,0.880731,0.548264,0.548752,0.732975
3,10859147,0.569588,0.663067,0.543329,0.6021
4,10879947,0.540791,0.519636,0.485341,0.531297


In [6]:
# Load the two-stage LME predictions ('lme_over' / 'lme_under' per parcel id).
# The path is built from `maindir` so the project root is configured in one place.
lme_preds = pd.read_csv(os.path.join(maindir, "twostagemodel", "LME-nlme-two-stage-preds.csv.gz"),
                        compression='gzip')
lme_preds.head()

Unnamed: 0,parcelid,lme_over,lme_under
0,10754147,0.110718,-0.107523
1,10759547,0.110718,-0.107523
2,10843547,1.44871,-1.356412
3,10859147,0.172072,-0.165112
4,10879947,0.110752,-0.107566


In [9]:
# Attach the over-estimate probabilities to the LME predictions.
# Only the raw LME columns are taken from `lme_preds` before merging: the cell
# overwrites `lme_preds`, so without the selection a second run would merge the
# already-merged frame, create '_x'/'_y' duplicate columns and fail on
# 'overestimate_prob'.
# NOTE(review): re-running is only row-stable if 'parcelid' is unique in both
# frames -- confirm.
lme_preds = pd.merge(overestimate_probabilities,
                     lme_preds[['parcelid', 'lme_over', 'lme_under']],
                     on='parcelid')

# Blend the two conditional predictions, weighting each by its probability.
lme_preds['lme'] = (lme_preds['lme_over']*lme_preds['overestimate_prob'] +
                    lme_preds['lme_under']*(1 - lme_preds['overestimate_prob']))
lme_preds.head()

Unnamed: 0,parcelid,ann_overestimate_prob,rf2_overestimate_prob,rf3_overestimate_prob,overestimate_prob,lme_over,lme_under,lme
0,10754147,0.469632,0.466012,0.495383,0.479207,0.110718,-0.107523,-0.00294
1,10759547,0.406392,0.555562,0.524675,0.475759,0.110718,-0.107523,-0.003693
2,10843547,0.880731,0.548264,0.548752,0.732975,1.44871,-1.356412,0.699673
3,10859147,0.569588,0.663067,0.543329,0.6021,0.172072,-0.165112,0.037907
4,10879947,0.540791,0.519636,0.485341,0.531297,0.110752,-0.107566,0.008426


In [10]:
# Build the submission frame: the same blended prediction is used for all three
# 2016 months. `columns=` pins the column order explicitly, because older pandas
# releases sort dict keys and would place 'ParcelId' after the 2016 columns.
new_submission = DataFrame({'ParcelId': lme_preds['parcelid'],
                            '201610': lme_preds['lme'],
                            '201611': lme_preds['lme'],
                            '201612': lme_preds['lme']},
                           columns=['ParcelId', '201610', '201611', '201612'])

# The 2017 months are filled with a constant 0.
for month_2017 in ('201710', '201711', '201712'):
    new_submission[month_2017] = 0

In [11]:
# Score the new submission, rounded to 4 decimals exactly as it is written to disk.
mean_absolute_errors(submission_df=new_submission.round(4), comparison_df=data)

(0.06218806509945767,
 0.06125673603504926,
 0.07380603795284651,
 0.064354191055958895)

In [15]:
# Score the LME submission file on the same data for comparison.
# The path is built from `maindir` so the project root is configured in one place.
LME = pd.read_csv(os.path.join(maindir, "submissions", "LME.csv.gz"), compression="gzip")
mean_absolute_errors(LME, data)

(0.06335799788964462,
 0.06250493564507772,
 0.07474658919966297,
 0.065494156708368903)

In [12]:
# Write the submission, rounded to 4 decimals, as a gzip-compressed CSV without
# the index. The path is built from `maindir` so the project root is configured
# in one place.
new_submission.round(4).to_csv(
    os.path.join(maindir, "submissions", "two_stage_stage1_stacked_annrfs_stage_lme.csv.gz"),
    index=False, compression='gzip')