In [1]:
import pandas as pd
from pandas import DataFrame,Series
import numpy as np
import os
import datetime

#Plotting
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
%matplotlib inline

# sklearn stuff
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.metrics import confusion_matrix, mean_squared_error, precision_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder, Imputer 
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin

In [2]:
def mean_absolute_errors(submission_df, comparison_df):
    """
    Score a public-leaderboard submission against known log errors.

    Parameters
    ----------
    submission_df : DataFrame
        Must contain 'ParcelId' and the prediction columns '201610',
        '201611' and '201612'.
    comparison_df : DataFrame
        Must contain 'parcelid', 'logerror' and 'month'.

    Returns
    -------
    tuple
        (oct_error, nov_error, dec_error, overall_mae): the mean absolute
        error over the merged rows whose month is 10, 11 and 12 respectively,
        followed by the row-count-weighted average of those monthly errors.
        A month with no matching rows yields NaN for that month and is left
        out of the overall figure; the overall figure is NaN only when none of
        the three months has any rows.
    """
    # (month number, prediction column scored against that month's rows)
    month_columns = [(10, '201610'), (11, '201611'), (12, '201612')]

    # Inner join: only parcels present in both frames are scored.
    trainresults = pd.merge(
        submission_df[['ParcelId'] + [column for _, column in month_columns]],
        comparison_df[['parcelid', 'logerror', 'month']],
        left_on='ParcelId', right_on='parcelid')

    month_errors = []
    weighted_total = 0.0
    row_total = 0
    for month, column in month_columns:
        in_month = trainresults['month'] == month
        rows = trainresults[in_month]
        month_error = abs(rows[column] - rows['logerror']).mean()
        month_errors.append(month_error)
        n_rows = in_month.sum()
        # An empty month has a NaN mean; NaN * 0 would poison the overall
        # average, so only months that actually have rows contribute.
        if n_rows:
            weighted_total += month_error * n_rows
            row_total += n_rows

    overall_mae = weighted_total / row_total if row_total else float('nan')
    oct_error, nov_error, dec_error = month_errors
    return (oct_error, nov_error, dec_error, overall_mae)

### Reading in data 

In [3]:
# Project root; the raw CSV files are read from <maindir>/data.
maindir = "/home/anerdi/Desktop/Zillow"

# Training table; the columns used in this notebook are 'parcelid',
# 'logerror' and 'transactiondate'.
logerror = pd.read_csv(maindir + "/data/train_2016_v2.csv/train_2016_v2.csv")

# Parse each 'YYYY-MM-DD' transaction date once, then derive the ISO week
# number and the calendar month from the parsed value.
sale_dates = logerror['transactiondate'].map(
    lambda text: datetime.datetime.strptime(text, '%Y-%m-%d'))
logerror['weeknumber'] = sale_dates.map(lambda day: day.isocalendar()[1])
logerror['month'] = sale_dates.map(lambda day: day.month)

# Only the parcel id column of the properties file is needed here.
properties = pd.read_csv(
    maindir + "/data/properties_2016.csv/properties_2016.csv",
    usecols=['parcelid'],
)

In [4]:
# Inner-join the parcel list with the training labels on parcel id,
# keeping only the label columns that the scoring function reads.
data = properties.merge(logerror[['parcelid', 'logerror', 'month']], on='parcelid')

# Drop the two source frames; the cells below work from `data`.
del logerror
del properties

### Loading in predictions from the models 

In [5]:
# Work from the submissions folder under the project root so the relative
# file names used by the cells below resolve. Built from `maindir` so the
# project location is defined in one place only.
os.chdir(os.path.join(maindir, "submissions"))

In [6]:
# Load the BART submission and display its (Oct, Nov, Dec, overall) mean absolute errors.
BART = pd.read_csv(
    "BART_submission.csv.gz",
    compression="gzip",
)
mean_absolute_errors(submission_df=BART, comparison_df=data)

(0.06315269012412131,
 0.06226942536365466,
 0.0750168754592673,
 0.065379215158680754)

In [7]:
# Load the stacked-stage-1 / LME-stage-2 submission and display its monthly and overall MAE.
LME = pd.read_csv(
    "two_stage_stage1_stacked_annrfs_stage_lme.csv.gz",
    compression="gzip",
)
mean_absolute_errors(submission_df=LME, comparison_df=data)

(0.06218806509945767,
 0.06125673603504926,
 0.07380603795284651,
 0.064354191055958895)

In [102]:
# Load the RF-stage-1 / LME-stage-2 submission and display its monthly and overall MAE.
rf_lme = pd.read_csv(
    "two_stage_stage1_rf_stage2_lme.csv.gz",
    compression="gzip",
)
mean_absolute_errors(submission_df=rf_lme, comparison_df=data)

(0.061984790034157,
 0.06130268346111716,
 0.0732554341575618,
 0.064133481620229374)

In [8]:
# Load the two_stage_lme submission and display its monthly and overall MAE.
logistic_LME = pd.read_csv(
    "two_stage_lme.csv.gz",
    compression="gzip",
)
mean_absolute_errors(submission_df=logistic_LME, comparison_df=data)

(0.06328571428571401,
 0.062285596933187234,
 0.07491650373778051,
 0.065439744790447077)

In [9]:
# Load the stacked-stage-1 / RF-stage-2 submission and display its monthly and overall MAE.
RF = pd.read_csv(
    "two_stage_stage1_stacked_annrfs_stage2_rf.csv.gz",
    compression="gzip",
)
mean_absolute_errors(submission_df=RF, comparison_df=data)

(0.062045469158127406,
 0.06109260679079965,
 0.07364278320874057,
 0.064202786232732401)

In [10]:
# Load the two_stage_rf submission and display its monthly and overall MAE.
logistic_RF = pd.read_csv(
    "two_stage_rf.csv.gz",
    compression="gzip",
)
mean_absolute_errors(submission_df=logistic_RF, comparison_df=data)

(0.0631669479606187,
 0.06216046002190593,
 0.07476497987349064,
 0.065312947787403389)

In [11]:
# Load the single-stage random-forest submission and display its monthly and overall MAE.
RF_singlestage = pd.read_csv(
    "RF_n100_maxfeat5_maxdepth8.gz",
    compression="gzip",
)
mean_absolute_errors(submission_df=RF_singlestage, comparison_df=data)

(0.0627264215390796,
 0.06182163198247547,
 0.07473668775158138,
 0.064978084757667934)

In [12]:
# Load the RF_n100_maxfeat10_maxdepth20_extreme submission and display its monthly and overall MAE.
RF_2 = pd.read_csv(
    "RF_n100_maxfeat10_maxdepth20_extreme.gz",
    compression="gzip",
)
mean_absolute_errors(submission_df=RF_2, comparison_df=data)

(0.06195625879043588,
 0.06436095290251921,
 0.07321730879815982,
 0.064762854132521586)

In [13]:
# Load the stacked-stage-1 / ridge-stage-2 submission and display its monthly and overall MAE.
Ridge = pd.read_csv(
    "two_stage_stage1_stacked_annrfs_stage2_ridge.csv.gz",
    compression="gzip",
)
mean_absolute_errors(submission_df=Ridge, comparison_df=data)

(0.06211394414305826,
 0.061221522453450215,
 0.07382415181138596,
 0.064307164598454841)

In [93]:
# Load the RF-stage-1 / ridge-stage-2 submission and display its monthly and overall MAE.
rf_ridge = pd.read_csv(
    "two_stage_stage1_rf_stage2_ridge.csv.gz",
    compression="gzip",
)
mean_absolute_errors(submission_df=rf_ridge, comparison_df=data)

(0.061900200924251476,
 0.06124611171960577,
 0.07323162737205288,
 0.064067255911964383)

In [14]:
# Load the two_stage_ridge submission and display its monthly and overall MAE.
logistic_ridge = pd.read_csv(
    "two_stage_ridge.csv.gz",
    compression="gzip",
)
mean_absolute_errors(submission_df=logistic_ridge, comparison_df=data)

(0.06321432589913599,
 0.06228148959474268,
 0.07486952271420345,
 0.06538770779676889)

In [15]:
# Load the stacked-stage-1 / elastic-net-stage-2 submission and display its monthly and overall MAE.
Enet = pd.read_csv(
    "two_stage_stage1_stacked_annrfs_stage2_enet.csv.gz",
    compression="gzip",
)
mean_absolute_errors(submission_df=Enet, comparison_df=data)

(0.06212993771348221,
 0.06119145673603508,
 0.07377395054629106,
 0.06429983610395705)

In [95]:
# Load the RF-stage-1 / elastic-net-stage-2 submission and display its monthly and overall MAE.
# The gzip compression is stated explicitly, as in the other loading cells,
# rather than left to pandas' file-extension inference.
rf_enet = pd.read_csv("two_stage_stage1_rf_stage2_enet.csv.gz", compression="gzip")
mean_absolute_errors(rf_enet, data)

(0.061908358448864956,
 0.06118674698795178,
 0.07323622771707874,
 0.064060255209552885)

In [16]:
# Load the two_stage_enet submission and display its monthly and overall MAE.
logistic_enet = pd.read_csv(
    "two_stage_enet.csv.gz",
    compression="gzip",
)
mean_absolute_errors(submission_df=logistic_enet, comparison_df=data)

(0.06320811733976288,
 0.06216155531215774,
 0.07480471535365159,
 0.065345258721610872)

In [17]:
# Load the stacked-stage-1 / lasso-stage-2 submission and display its monthly and overall MAE.
Lasso = pd.read_csv(
    "two_stage_stage1_stacked_annrfs_stage2_lasso.csv.gz",
    compression="gzip",
)
mean_absolute_errors(submission_df=Lasso, comparison_df=data)

(0.062133273056057994,
 0.06118707557502739,
 0.07377216791259353,
 0.064300479981269115)

In [96]:
# Load the RF-stage-1 / lasso-stage-2 submission and display its monthly and overall MAE.
# The gzip compression is stated explicitly, as in the other loading cells,
# rather than left to pandas' file-extension inference.
rf_lasso = pd.read_csv("two_stage_stage1_rf_stage2_lasso.csv.gz", compression="gzip")
mean_absolute_errors(rf_lasso, data)

(0.061914707655214174,
 0.06119074479737131,
 0.07324531339850505,
 0.06406665886209334)

In [18]:
# Load the two_stage_lasso submission and display its monthly and overall MAE.
logistic_lasso = pd.read_csv(
    "two_stage_lasso.csv.gz",
    compression="gzip",
)
mean_absolute_errors(submission_df=logistic_lasso, comparison_df=data)

(0.06321498894916613,
 0.06216084337349388,
 0.07480948821161598,
 0.065350081948021527)

In [19]:
# Load the two_stage_xgb submission and display its monthly and overall MAE.
XGB = pd.read_csv(
    "two_stage_xgb.csv.gz",
    compression="gzip",
)
mean_absolute_errors(submission_df=XGB, comparison_df=data)

(0.06357010247136825,
 0.062275520262869734,
 0.07545290396779751,
 0.065712491219854813)

In [20]:
# Load the XGB_600 submission and display its monthly and overall MAE.
XGB600 = pd.read_csv(
    "XGB_600.gz",
    compression="gzip",
)
mean_absolute_errors(submission_df=XGB600, comparison_df=data)

(0.06211727600763514,
 0.06136520946002188,
 0.07414289916043706,
 0.064404712807773379)

In [21]:
# Load the XGB_3000_RF submission and display its monthly and overall MAE.
XGB3000 = pd.read_csv(
    "XGB_3000_RF.gz",
    compression='gzip',
)
mean_absolute_errors(submission_df=XGB3000, comparison_df=data)

(0.06228837162257941,
 0.056873301559340145,
 0.06987688277262188,
 0.062632190538725402)

In [22]:
# Load the stacked-stage-1 / Huber-stage-2 submission and display its monthly and overall MAE.
Huber = pd.read_csv(
    "two_stage_stage1_stacked_annrfs_stage2_huber.csv.gz",
    compression="gzip",
)
mean_absolute_errors(submission_df=Huber, comparison_df=data)

(0.06235075346594324,
 0.06162031763417309,
 0.07401564117308788,
 0.06456937485366418)

In [97]:
# Load the RF-stage-1 / Huber-stage-2 submission and display its monthly and overall MAE.
# The gzip compression is stated explicitly, as in the other loading cells,
# rather than left to pandas' file-extension inference.
rf_huber = pd.read_csv("two_stage_stage1_rf_stage2_huber.csv.gz", compression="gzip")
mean_absolute_errors(rf_huber, data)

(0.062133735181836354,
 0.061565826944140145,
 0.0734212190914319,
 0.064310266916412964)

In [23]:
# Load the two_stage_huber submission and display its monthly and overall MAE.
logistic_huber = pd.read_csv(
    "two_stage_huber.csv.gz",
    compression="gzip",
)
mean_absolute_errors(submission_df=logistic_huber, comparison_df=data)

(0.0633122161944946,
 0.06246286966046,
 0.07484818861414623,
 0.065479173495668438)

In [24]:
# Load the stacked-stage-1 / LARM-stage-2 submission and display its monthly and overall MAE.
LARM = pd.read_csv(
    "two_stage_stage1_stacked_annrfs_stage2_larm.csv.gz",
    compression="gzip",
)
mean_absolute_errors(submission_df=LARM, comparison_df=data)

(0.062185292344785925,
 0.06120224534501639,
 0.07380408280621042,
 0.064340529150081871)

In [98]:
# Load the RF-stage-1 / LARM-stage-2 submission and display its monthly and overall MAE.
# The gzip compression is stated explicitly, as in the other loading cells,
# rather than left to pandas' file-extension inference.
rf_larm = pd.read_csv("two_stage_stage1_rf_stage2_larm.csv.gz", compression="gzip")
mean_absolute_errors(rf_larm, data)

(0.061992927466345146,
 0.06124435925520252,
 0.07329637722829221,
 0.06413409037696087)

In [25]:
# Load the two_stage_larm submission and display its monthly and overall MAE.
logistic_larm = pd.read_csv(
    "two_stage_larm.csv.gz",
    compression="gzip",
)
mean_absolute_errors(submission_df=logistic_larm, comparison_df=data)

(0.06324942736588297,
 0.06221018619934285,
 0.07484307073030456,
 0.065387532193865519)

In [26]:
# Load the Adp-lasso-af submission and display its monthly and overall MAE.
Adaptive_LASSO = pd.read_csv(
    "Adp-lasso-af.gz",
    compression="gzip",
)
mean_absolute_errors(submission_df=Adaptive_LASSO, comparison_df=data)

(0.06346130419931673,
 0.062492389375684604,
 0.07486073950546275,
 0.06557490517443211)

In [142]:
# Ensemble members as (submission frame, blend weight) pairs.
# The order is kept as-is because the blend is accumulated in list order.
models = [
    # ridge variants
    (Ridge, 0.07),
    (logistic_ridge, 0.02),
    (rf_ridge, 0.05),
    # elastic-net variants
    (Enet, 0.05),
    (logistic_enet, 0.02),
    (rf_enet, 0.05),
    # lasso variants
    (Lasso, 0.05),
    (logistic_lasso, 0.02),
    (rf_lasso, 0.05),
    # Huber variants
    (Huber, 0.05),
    (logistic_huber, 0.02),
    (rf_huber, 0.05),
    # LARM variants
    (LARM, 0.05),
    (logistic_larm, 0.02),
    (rf_larm, 0.05),
    # LME variants
    (LME, 0.05),
    (logistic_LME, 0.06),
    (rf_lme, 0.05),
    # remaining members
    (XGB600, 0.09),
    (RF, 0.10),
    (XGB3000, 0.02),
    (RF_2, 0.01),
]


# # current best ensemble
# models = [(Ridge,0.08),(logistic_ridge,0.03),(rf_ridge,0.03),
#     (Enet,0.06),(logistic_enet,0.03),(rf_enet,0.03),
#     (Lasso,0.06),(logistic_lasso,0.03),(rf_lasso,0.03),
#     (Huber, 0.06),(logistic_huber,0.03),(rf_huber,0.03),
#     (LARM,0.06),(logistic_larm,0.03),(rf_larm,0.03),
#     (LME, 0.05),(logistic_LME,0.08),(rf_lme,0.03),
#     (XGB600,0.09),
#     (RF,0.10),
#     (XGB3000, 0.02),
#     (RF_2, 0.01)]


# models = [(Ridge,0.09),(logistic_ridge,0.05),
#     (Enet,0.07),(logistic_enet,0.05),
#     (Lasso,0.07),(logistic_lasso,0.05),
#     (Huber, 0.07),(logistic_huber,0.05),
#     (LARM,0.07),(logistic_larm,0.05),
#     (LME, 0.02),(logistic_LME,0.11),
#     (XGB600,0.11),
#     (RF,0.10),
#     (XGB3000, 0.03),
#     (RF_2, 0.01)]

# scale = 0.90

# models = [(Ridge,0.82279736 / 3 * scale),
#          (LME, 0.82279736 / 3 * scale),
#          (LARM,0.82279736 / 3 * scale),
#          (Enet,0.15298894 * scale),
#          (Huber, 0.02112733 * scale),
#          (RF, 1.09076218 * (1-scale)),
#          (XGB3000, 0.05),
#           (RF_2, 0.05)
# ]

In [143]:
# Sanity check: the blend weights should total 1 (up to floating-point rounding).
sum(weight for _, weight in models)

1.0000000000000002

In [144]:
# Start the ensemble frame from the parcel ids of the Ridge submission.
ensemble = Ridge[['ParcelId']].copy()
cols = ['201610','201611','201612','201710','201711','201712']

# DataFrame addition aligns rows on the index, not on ParcelId. Confirm that
# every submission shares Ridge's index and lists the same parcels in the same
# row order; otherwise predictions for different parcels would be blended
# together without any error.
reference_ids = Ridge['ParcelId'].values
for model, wt in models:
    model_ids = model['ParcelId'].values
    assert model.index.equals(Ridge.index), \
        "submission index differs from Ridge; rows would be misaligned"
    assert len(model_ids) == len(reference_ids) and (model_ids == reference_ids).all(), \
        "submission ParcelId order differs from Ridge; rows would be misaligned"

# Weighted sum of the prediction columns, accumulated in list order.
foo = models[0][0][cols]*models[0][1]
for model, wt in models[1:]:
    foo = foo + model[cols]*wt

In [145]:
# Attach the rounded blend to the parcel ids. Rebuilding from the ParcelId
# column alone keeps this cell safe to re-run: concatenating onto the previous
# `ensemble` would append a second copy of every prediction column.
ensemble = pd.concat([ensemble[['ParcelId']], foo.round(4)], axis=1)
ensemble['ParcelId'] = ensemble['ParcelId'].astype(int)
# np.array_equal also returns False (instead of failing) when the lengths differ.
assert np.array_equal(ensemble.ParcelId.unique(), Ridge.ParcelId.unique())
mean_absolute_errors(ensemble, data)

(0.061930239099859404,
 0.061154709748083215,
 0.07352139160437032,
 0.064124209786935157)

In [134]:
# current best
ensemble = pd.concat([ensemble,foo.round(4)], axis=1)
ensemble['ParcelId'] = ensemble['ParcelId'].astype(int)
assert all(ensemble.ParcelId.unique() == Ridge.ParcelId.unique())
mean_absolute_errors(ensemble, data)

(0.062031906771147276,
 0.061217579408543304,
 0.07365583668775155,
 0.06422425661437603)

In [146]:
# Write the blended submission as a gzip-compressed CSV, without the index
# column and with floats formatted using '%.4g'.
ensemble.to_csv(
    "new_ensemble.gz",
    compression='gzip',
    float_format='%.4g',
    index=False,
)