In [1]:
import pandas as pd
from pandas import DataFrame,Series
import numpy as np
import os
import datetime

#Plotting
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
%matplotlib inline

# sklearn stuff
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.metrics import confusion_matrix, mean_squared_error, precision_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder, Imputer 
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin

import feature_pipelines as pipes

### Reading in data

In [2]:
maindir = "/home/anerdi/Desktop/Zillow"

# Training table: carries 'transactiondate' here and 'parcelid'/'logerror' used further down.
logerror = pd.read_csv(maindir + "/data/train_2016_v2.csv/train_2016_v2.csv")

# Parse every 'YYYY-MM-DD' transaction date once, then derive both calendar
# features (ISO week number and month) from the parsed values.
transaction_dates = [datetime.datetime.strptime(date_text, '%Y-%m-%d')
                     for date_text in logerror['transactiondate']]
logerror['weeknumber'] = [parsed.isocalendar()[1] for parsed in transaction_dates]
logerror['month'] = [parsed.month for parsed in transaction_dates]

# Only the parcel id column is read from the properties file.
properties = pd.read_csv(maindir + "/data/properties_2016.csv/properties_2016.csv",
                         usecols=['parcelid'])

In [3]:
# Inner join on parcel id: keep the log-error and month of every training
# transaction whose parcel also appears in the properties table.
data = properties.merge(
    logerror[['parcelid', 'logerror', 'month']],
    on='parcelid',
)

In [4]:
def mean_absolute_errors(submission_df, comparison_df):
    """
    Compute the training mean absolute error of a public-leaderboard
    submission for October, November and December.

    Parameters
    ----------
    submission_df : DataFrame
        Must contain 'ParcelId' and the prediction columns '201610',
        '201611' and '201612'.
    comparison_df : DataFrame
        Must contain 'parcelid', 'logerror' and 'month'. Only rows whose
        month is 10, 11 or 12 are scored.

    Returns
    -------
    tuple
        (oct_error, nov_error, dec_error, overall_mae). A month with no
        matched rows has a NaN error and is left out of overall_mae, which
        is the row-count-weighted average of the monthly errors. overall_mae
        is NaN only when none of the three months has any matched row.
    """
    # (month number, submission column holding that month's prediction)
    month_columns = [(10, '201610'), (11, '201611'), (12, '201612')]

    # training error: pair each prediction with the observed log-error
    trainresults = pd.merge(submission_df[['ParcelId', '201610', '201611', '201612']],
                            comparison_df[['parcelid', 'logerror', 'month']],
                            left_on='ParcelId', right_on='parcelid')

    monthly_errors = []
    weighted_total = 0.0
    n_scored = 0
    for month, column in month_columns:
        month_rows = trainresults[trainresults['month'] == month]
        month_error = (month_rows[column] - month_rows['logerror']).abs().mean()
        monthly_errors.append(month_error)
        # An empty month has a NaN mean; NaN * 0 would poison the overall
        # figure, so only months with at least one row contribute.
        if len(month_rows) > 0:
            weighted_total += month_error * len(month_rows)
            n_scored += len(month_rows)

    overall_mae = weighted_total / n_scored if n_scored > 0 else float('nan')
    oct_error, nov_error, dec_error = monthly_errors
    return (oct_error, nov_error, dec_error, overall_mae)

### Reading in stage 1 classification results

In [15]:
# Load the stage-1 probabilities and expose the 'stacked_pred' column under
# the name the blending step reads: 'overestimate_prob'.
overestimate_probabilities = pd.read_csv(
    "/home/anerdi/Desktop/Zillow/twostagemodel/overestimate_probs_stacked_ann_rfs.csv.gz",
    compression='gzip',
).rename(columns={'stacked_pred': 'overestimate_prob'})
overestimate_probabilities.head()

Unnamed: 0,parcelid,ann_overestimate_prob,rf2_overestimate_prob,rf3_overestimate_prob,overestimate_prob
0,10754147,0.469632,0.466012,0.495383,0.479207
1,10759547,0.406392,0.555562,0.524675,0.475759
2,10843547,0.880731,0.548264,0.548752,0.732975
3,10859147,0.569588,0.663067,0.543329,0.6021
4,10879947,0.540791,0.519636,0.485341,0.531297


In [9]:
# overestimate_probabilities['overestimate_prob'] = (overestimate_probabilities['overestimate_prob'] 
#                                                    + overestimate_probabilities_logistic['overestimate_prob']) / 2

### Reading in two-stage linear model predictions

In [8]:
# Stage-2 predictions: per parcel, an "over" and an "under" estimate for each
# of October, November and December (columns rf_<month>_over / rf_<month>_under).
two_stage_linear_models = pd.read_csv(
    "/home/anerdi/Desktop/Zillow/twostagemodel/RF-two-stage-preds.csv.gz",
    compression='gzip',
)
two_stage_linear_models.head()

Unnamed: 0,parcelid,rf_oct_over,rf_oct_under,rf_nov_over,rf_nov_under,rf_dec_over,rf_dec_under
0,10754147,0.253206,-0.089342,0.279583,-0.075154,0.279143,-0.077137
1,10759547,0.083172,-0.079168,0.085873,-0.078729,0.085967,-0.079182
2,10843547,0.14541,-0.110451,0.142955,-0.105511,0.142864,-0.124257
3,10859147,0.116956,-0.092979,0.111848,-0.093563,0.112627,-0.093089
4,10879947,0.120668,-0.129706,0.117671,-0.177729,0.121505,-0.164241


### Reading in single-stage model predictions

In [9]:
# Reference submission to score against the training data.
# NOTE(review): the variable says "single stage" but the file read is
# two_stage_rf.csv.gz -- confirm which model this baseline really is.
single_stage_model = pd.read_csv(
    "/home/anerdi/Desktop/Zillow/submissions/two_stage_rf.csv.gz"
)
mean_absolute_errors(submission_df=single_stage_model, comparison_df=data)

(0.0631669479606187,
 0.06216046002190593,
 0.07476497987349064,
 0.065312947787403389)

### Combine preds & overestimate probabilities to generate one prediction 

In [16]:
# One row per parcel present in both tables: stage-2 over/under predictions
# alongside the stage-1 overestimate probabilities.
test_predictions = two_stage_linear_models.merge(overestimate_probabilities, on='parcelid')

In [17]:
# Column-name prefixes of the per-month stage-2 predictions to blend.
models = [
    "rf_oct",
    "rf_nov",
    "rf_dec",
]

In [18]:
for model in models:
    # Blend the two stage-2 predictions, weighting the "over" prediction by
    # the overestimate probability and the "under" prediction by its complement.
    test_predictions[model] = (
        test_predictions['overestimate_prob'] * test_predictions[model + '_over']
        + (1 - test_predictions['overestimate_prob']) * test_predictions[model + '_under']
    )

In [19]:
# Sanity check: the blended rf_oct / rf_nov / rf_dec columns should now sit
# next to the over/under predictions and the probabilities they came from.
test_predictions.head()

Unnamed: 0,parcelid,rf_oct_over,rf_oct_under,rf_nov_over,rf_nov_under,rf_dec_over,rf_dec_under,ann_overestimate_prob,rf2_overestimate_prob,rf3_overestimate_prob,overestimate_prob,rf_oct,rf_nov,rf_dec
0,10754147,0.253206,-0.089342,0.279583,-0.075154,0.279143,-0.077137,0.469632,0.466012,0.495383,0.479207,0.074809,0.094838,0.093595
1,10759547,0.083172,-0.079168,0.085873,-0.078729,0.085967,-0.079182,0.406392,0.555562,0.524675,0.475759,-0.001933,-0.000418,-0.000611
2,10843547,0.14541,-0.110451,0.142955,-0.105511,0.142864,-0.124257,0.880731,0.548264,0.548752,0.732975,0.077088,0.076608,0.071536
3,10859147,0.116956,-0.092979,0.111848,-0.093563,0.112627,-0.093089,0.569588,0.663067,0.543329,0.6021,0.033423,0.030115,0.030772
4,10879947,0.120668,-0.129706,0.117671,-0.177729,0.121505,-0.164241,0.540791,0.519636,0.485341,0.531297,0.003317,-0.020784,-0.012425


In [20]:
# Build the submission frame from the blended predictions.
# `columns=` pins the column order; without it the order of a dict-built
# DataFrame depends on the Python/pandas version (older versions sort the
# keys, which would put 'ParcelId' after the month columns).
new_submission = DataFrame({'ParcelId': test_predictions['parcelid'],
                            '201610': test_predictions['rf_oct'],
                            '201611': test_predictions['rf_nov'],
                            '201612': test_predictions['rf_dec']},
                           columns=['ParcelId', '201610', '201611', '201612'])
# No 2017 predictions are produced in this notebook; those columns are filled with 0.
new_submission['201710'] = 0
new_submission['201711'] = 0
new_submission['201712'] = 0

In [21]:
# Score the submission after rounding to 4 decimals, i.e. the same values
# that are written to the CSV below, so the reported MAE matches the file.
mean_absolute_errors(new_submission.round(4), data)

(0.062045469158127406,
 0.06109260679079965,
 0.07364278320874057,
 0.064202786232732401)

In [22]:
# Write the rounded submission as a gzip-compressed CSV without the index column.
new_submission.round(4).to_csv(
    "/home/anerdi/Desktop/Zillow/submissions/two_stage_stage1_stacked_annrfs_stage2_rf.csv.gz",
    compression='gzip',
    index=False,
)