In [1]:
import pandas as pd
from pandas import DataFrame,Series
import numpy as np
import os
import datetime

#Plotting
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
%matplotlib inline

# sklearn stuff
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.metrics import confusion_matrix, mean_squared_error, precision_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder, Imputer 
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin

In [2]:
def mean_absolute_errors(submission_df, comparison_df):
    """
    Score a public-leaderboard submission against known log errors.

    Parameters
    ----------
    submission_df : DataFrame
        Must contain 'ParcelId' and the prediction columns '201610',
        '201611' and '201612'.
    comparison_df : DataFrame
        Must contain 'parcelid', 'logerror' and 'month' (the month number
        of the transaction).

    Returns
    -------
    tuple
        (oct_error, nov_error, dec_error, overall_mae). Each monthly value is
        the mean absolute difference between that month's prediction column
        and 'logerror' over the rows whose 'month' matches; it is NaN when no
        rows fall in that month. overall_mae is the row-count-weighted mean of
        the monthly errors over the months that have at least one row, and NaN
        only if none of the three months has any row.
    """
    # Inner join: only parcels present in both frames are scored.
    trainresults = pd.merge(
        submission_df[['ParcelId', '201610', '201611', '201612']],
        comparison_df[['parcelid', 'logerror', 'month']],
        left_on='ParcelId', right_on='parcelid')

    monthly_errors = []
    weighted_sum = 0.0
    n_scored = 0
    for month, column in ((10, '201610'), (11, '201611'), (12, '201612')):
        rows = trainresults[trainresults['month'] == month]
        error = (rows[column] - rows['logerror']).abs().mean()
        monthly_errors.append(error)
        # An empty month has a NaN mean; leave it out of the weighted average
        # so it cannot turn the overall figure into NaN (NaN * 0 is NaN).
        if len(rows):
            weighted_sum += error * len(rows)
            n_scored += len(rows)

    overall_mae = weighted_sum / n_scored if n_scored else float('nan')
    return (monthly_errors[0], monthly_errors[1], monthly_errors[2], overall_mae)

### Reading in data 

In [3]:
maindir = "/home/anerdi/Desktop/Zillow"

# Transaction log errors; the ISO week number and calendar month are derived
# from the 'YYYY-MM-DD' transaction date string, parsing each date once.
logerror = pd.read_csv(maindir + "/data/train_2016_v2.csv/train_2016_v2.csv")
transaction_dates = [datetime.datetime.strptime(stamp, '%Y-%m-%d')
                     for stamp in logerror['transactiondate']]
logerror['weeknumber'] = [when.isocalendar()[1] for when in transaction_dates]
logerror['month'] = [when.month for when in transaction_dates]

# Only the parcel ids are read from the properties table.
properties = pd.read_csv(maindir + "/data/properties_2016.csv/properties_2016.csv", usecols=['parcelid'])

In [4]:
# join on parcel id
# Join on parcel id (inner join by default), keeping only the columns used
# for scoring.
data = properties.merge(logerror[['parcelid', 'logerror', 'month']], on='parcelid')
# The source frames are not needed once joined.
del logerror, properties

### Loading in predictions from the models 

In [5]:
# Work from the submissions folder so the files below can be read by bare name.
# Derived from `maindir` instead of repeating the absolute path, so the two
# locations cannot drift apart.
os.chdir(os.path.join(maindir, "submissions"))

In [6]:
# Load the BART submission and display its (Oct, Nov, Dec, overall) mean
# absolute errors against `data`.
BART = pd.read_csv("BART_submission.csv.gz", compression="gzip")
mean_absolute_errors(BART, data)

(0.06315269012412131,
 0.06226942536365466,
 0.0750168754592673,
 0.065379215158680754)

In [8]:
# Load the two-stage LME submission and display its (Oct, Nov, Dec, overall)
# mean absolute errors against `data`.
LME = pd.read_csv("two_stage_lme.csv.gz", compression="gzip")
mean_absolute_errors(LME, data)

(0.06328571428571401,
 0.062285596933187234,
 0.07491650373778051,
 0.065439744790447077)

In [9]:
# Load the two-stage RF submission and display its (Oct, Nov, Dec, overall)
# mean absolute errors against `data`.
RF = pd.read_csv("two_stage_rf.csv.gz", compression="gzip")
mean_absolute_errors(RF, data)

(0.0631669479606187,
 0.06216046002190593,
 0.07476497987349064,
 0.065312947787403389)

In [10]:
# Load the single-stage RF submission and display its (Oct, Nov, Dec, overall)
# mean absolute errors against `data`.
RF_singlestage = pd.read_csv("RF_n100_maxfeat5_maxdepth8.gz", compression="gzip")
mean_absolute_errors(RF_singlestage, data)

(0.0627264215390796,
 0.06182163198247547,
 0.07473668775158138,
 0.064978084757667934)

In [11]:
# Load the second RF submission and display its (Oct, Nov, Dec, overall)
# mean absolute errors against `data`.
RF_2 = pd.read_csv("RF_n100_maxfeat10_maxdepth20_extreme.gz", compression="gzip")
mean_absolute_errors(RF_2, data)

(0.06195625879043588,
 0.06436095290251921,
 0.07321730879815982,
 0.064762854132521586)

In [12]:
# Load the two-stage ridge submission and display its (Oct, Nov, Dec, overall)
# mean absolute errors against `data`.
Ridge = pd.read_csv("two_stage_ridge.csv.gz", compression="gzip")
mean_absolute_errors(Ridge, data)

(0.06321432589913599,
 0.06228148959474268,
 0.07486952271420345,
 0.06538770779676889)

In [13]:
# Load the two-stage elastic-net submission and display its (Oct, Nov, Dec,
# overall) mean absolute errors against `data`.
Enet = pd.read_csv("two_stage_enet.csv.gz", compression="gzip")
mean_absolute_errors(Enet, data)

(0.06320811733976288,
 0.06216155531215774,
 0.07480471535365159,
 0.065345258721610872)

In [14]:
# Load the two-stage lasso submission and display its (Oct, Nov, Dec, overall)
# mean absolute errors against `data`.
Lasso = pd.read_csv("two_stage_lasso.csv.gz", compression="gzip")
mean_absolute_errors(Lasso, data)

(0.06321498894916613,
 0.06216084337349388,
 0.07480948821161598,
 0.065350081948021527)

In [15]:
# NOTE(review): this reads "two_stage_lme.csv.gz", the same file that is loaded
# into `LME` earlier, and the errors displayed under this cell are identical to
# the LME cell's. The filename looks like a copy-paste slip -- confirm which
# submission file `XGB` was meant to load. `XGB` is not used in the `models`
# blend further down.
XGB = pd.read_csv("two_stage_lme.csv.gz", compression="gzip")
mean_absolute_errors(XGB, data)

(0.06328571428571401,
 0.062285596933187234,
 0.07491650373778051,
 0.065439744790447077)

In [19]:
# Load the XGB_600 submission and display its (Oct, Nov, Dec, overall) mean
# absolute errors against `data`.
XGB600 = pd.read_csv("XGB_600.gz", compression="gzip")
mean_absolute_errors(XGB600, data)

(0.06211727600763514,
 0.06136520946002188,
 0.07414289916043706,
 0.064404712807773379)

In [20]:
# Load the XGB_3000_RF submission and display its (Oct, Nov, Dec, overall) mean
# absolute errors against `data`.
XGB3000 = pd.read_csv("XGB_3000_RF.gz", compression='gzip')
mean_absolute_errors(XGB3000, data)

(0.06228837162257941,
 0.056873301559340145,
 0.06987688277262188,
 0.062632190538725402)

In [21]:
# Load the two-stage Huber submission and display its (Oct, Nov, Dec, overall)
# mean absolute errors against `data`.
Huber = pd.read_csv("two_stage_huber.csv.gz", compression="gzip")
mean_absolute_errors(Huber, data)

(0.0633122161944946,
 0.06246286966046,
 0.07484818861414623,
 0.065479173495668438)

In [22]:
# Load the two-stage LARM submission and display its (Oct, Nov, Dec, overall)
# mean absolute errors against `data`.
LARM =  pd.read_csv("two_stage_larm.csv.gz", compression="gzip")
mean_absolute_errors(LARM, data)

(0.06324942736588297,
 0.06221018619934285,
 0.07484307073030456,
 0.065387532193865519)

In [23]:
# Load the adaptive-lasso submission and display its (Oct, Nov, Dec, overall)
# mean absolute errors against `data`.
Adaptive_LASSO =  pd.read_csv("Adp-lasso-af.gz", compression="gzip")
mean_absolute_errors(Adaptive_LASSO, data)

(0.06346130419931673,
 0.062492389375684604,
 0.07486073950546275,
 0.06557490517443211)

In [293]:
# (submission frame, blend weight) pairs used to build the ensemble; the
# weights are checked to sum to 1 in the next cell.
models = [
    (Ridge, 0.12),
    (Enet, 0.12),
    (Lasso, 0.12),
    (Huber, 0.12),
    (LARM, 0.12),
    (LME, 0.12),
    (XGB600, 0.10),
    (RF_singlestage, 0.10),
    (XGB3000, 0.02),
    (RF_2, 0.06),
]


# current best ensemble
# models = [(Ridge,0.12),
#     (Enet,0.12),
#     (Lasso,0.12),
#     (Huber, 0.12),
#     (LARM,0.12),
#     (LME, 0.12),
#     (XGB600,0.10),
#     (RF_singlestage,0.10),
#     (XGB3000, 0.04),
#     (RF_2, 0.04)]

# scale = 0.90

# models = [(Ridge,0.82279736 / 3 * scale),
#          (LME, 0.82279736 / 3 * scale),
#          (LARM,0.82279736 / 3 * scale),
#          (Enet,0.15298894 * scale),
#          (Huber, 0.02112733 * scale),
#          (RF, 1.09076218 * (1-scale)),
#          (XGB3000, 0.05),
#           (RF_2, 0.05)
# ]

In [294]:
# Total blend weight across all models.
sum(weight for _, weight in models)

1.0

In [295]:
# Weighted blend of the prediction columns across all models.
ensemble = Ridge[['ParcelId']].copy()
cols = ['201610','201611','201612','201710','201711','201712']

# The sum below combines frames row by row (aligned on the index), so every
# model must list the same parcels in the same order as Ridge; otherwise the
# blend would silently mix predictions for different parcels.
for model, _ in models:
    assert np.array_equal(model['ParcelId'].values, Ridge['ParcelId'].values), \
        "model rows are not aligned with Ridge's ParcelId order"

foo = models[0][0][cols]*models[0][1]
for model, wt in models[1:]:
    foo = foo + model[cols]*wt

In [296]:
# Attach the blended predictions (rounded to 4 decimal places) to the parcel
# ids. The frame is rebuilt from Ridge's ParcelId column rather than appended
# to `ensemble`, so re-running this cell cannot accumulate duplicate prediction
# columns.
ensemble = pd.concat([Ridge[['ParcelId']], foo.round(4)], axis=1)
ensemble['ParcelId'] = ensemble['ParcelId'].astype(int)
assert all(ensemble.ParcelId.unique() == Ridge.ParcelId.unique())
mean_absolute_errors(ensemble, data)

(0.06235097448262017,
 0.06150602409638539,
 0.07393709028177108,
 0.064529079840786735)

In [35]:
# previous best
# NOTE(review): this cell repeats the ensemble-assembly cell before it and has
# an older execution count (35), so the output shown under it presumably came
# from an earlier `models` weighting -- confirm before relying on it.
# The frame is rebuilt from Ridge's ParcelId column rather than appended to
# `ensemble`; appending here would add a second copy of every prediction
# column, which would then be scored and written to the submission file.
ensemble = pd.concat([Ridge[['ParcelId']], foo.round(4)], axis=1)
ensemble['ParcelId'] = ensemble['ParcelId'].astype(int)
assert all(ensemble.ParcelId.unique() == Ridge.ParcelId.unique())
mean_absolute_errors(ensemble, data)

(0.06244358047016284,
 0.06159731653888284,
 0.07403536515238629,
 0.064622559119644138)

In [297]:
# Write the blended submission as a gzip-compressed CSV without the index.
# Note: float_format='%.4g' prints floats with 4 significant digits, which is
# not the same as 4 decimal places for values of magnitude 1 or more.
ensemble.to_csv("new_ensemble.gz", index=False, float_format='%.4g', compression='gzip')