In [1]:
import pandas as pd
from pandas import DataFrame,Series
import numpy as np
import os
import datetime

#Plotting
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
%matplotlib inline

# sklearn stuff
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.metrics import confusion_matrix, mean_squared_error, precision_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder, Imputer 
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin

In [2]:
def mean_absolute_errors(submission_df, comparison_df):
    """
    Score a leaderboard-style submission against known log-errors.

    The submission is inner-joined to the comparison frame on parcel id.
    For each of the months 10, 11 and 12 the mean absolute difference between
    the matching prediction column ('201610', '201611', '201612') and
    ``logerror`` is computed over the rows whose ``month`` equals that month.

    Parameters
    ----------
    submission_df : DataFrame
        Must contain the columns 'ParcelId', '201610', '201611', '201612'.
    comparison_df : DataFrame
        Must contain the columns 'parcelid', 'logerror', 'month'.

    Returns
    -------
    tuple
        ``(oct_error, nov_error, dec_error, overall_mae)``.
        A month with no matched rows yields NaN for that month and is left
        out of ``overall_mae``, which is the row-count-weighted average of
        the monthly errors. ``overall_mae`` is NaN only when no row falls in
        months 10-12 (or when a populated month has no usable predictions).
    """
    month_columns = ((10, '201610'), (11, '201611'), (12, '201612'))

    trainresults = pd.merge(
        submission_df[['ParcelId'] + [col for _, col in month_columns]],
        comparison_df[['parcelid', 'logerror', 'month']],
        left_on='ParcelId', right_on='parcelid')

    monthly_errors = []
    weighted_sum = 0.0
    n_rows = 0
    for month, col in month_columns:
        rows = trainresults[trainresults['month'] == month]
        error = (rows[col] - rows['logerror']).abs().mean()
        monthly_errors.append(error)
        # An empty month has error NaN; multiplying it by a zero count would
        # still give NaN and wipe out the overall figure, so skip it.
        if len(rows) > 0:
            weighted_sum = weighted_sum + error * len(rows)
            n_rows += len(rows)

    overall_mae = weighted_sum / n_rows if n_rows else float('nan')
    return (monthly_errors[0], monthly_errors[1], monthly_errors[2], overall_mae)

### Reading in data 

In [3]:
# Root directory of the Zillow data. Defaults to the path originally used in
# this notebook; set the ZILLOW_DIR environment variable to run elsewhere.
maindir = os.environ.get("ZILLOW_DIR", "/home/anerdi/Desktop/Zillow")

# Training file; the columns used below are parcelid, logerror, transactiondate.
logerror = pd.read_csv(maindir + "/data/train_2016_v2.csv/train_2016_v2.csv")

# Parse the 'YYYY-MM-DD' strings once and derive both calendar features from
# the parsed values. The 'transactiondate' column itself stays a string.
transaction_dates = pd.to_datetime(logerror['transactiondate'], format='%Y-%m-%d')
logerror['weeknumber'] = transaction_dates.apply(lambda ts: ts.isocalendar()[1])  # ISO week number
logerror['month'] = transaction_dates.dt.month

# Only the parcel ids are read from the properties table.
properties = pd.read_csv(maindir + "/data/properties_2016.csv/properties_2016.csv", usecols=['parcelid'])

In [4]:
# Inner join on 'parcelid': keep the transactions whose parcel also appears in
# `properties`, carrying the log-error and month columns along.
data = properties.merge(logerror[['parcelid', 'logerror', 'month']], on='parcelid')

# Drop the references to the two source frames now that `data` is built.
del properties, logerror

### Loading in predictions from the models 

In [6]:
# Load the BART submission and report its (Oct, Nov, Dec, overall) mean
# absolute errors against `data`.
BART = pd.read_csv(
    "BART_submission.csv.gz",
    compression="gzip",
)
mean_absolute_errors(submission_df=BART, comparison_df=data)

(0.06315269012412131,
 0.06226942536365466,
 0.0750168754592673,
 0.065379215158680754)

In [14]:
# Load the lme4 variant of the LME submission and score it.
# It is bound to its own name so it cannot be confused with the `LME` frame
# read from "LME.csv.gz", which is the one the ensemble cell picks up when the
# notebook is run top to bottom.
LME_lme4 = pd.read_csv("LME-lme4.csv.gz", compression="gzip")
mean_absolute_errors(LME_lme4, data)

(0.06381314296520732,
 0.06296542002479723,
 0.07527978655081258,
 0.065966333214116088)

In [12]:
# Re-load and score the lme4 variant of the LME submission.
# NOTE(review): this cell reads the same file as an earlier cell; the differing
# printed results suggest the file changed between runs — confirm which one
# is current. The frame gets its own name so it does not overwrite `LME`
# (read from "LME.csv.gz"), which the ensemble uses in top-to-bottom order.
LME_lme4 = pd.read_csv("LME-lme4.csv.gz", compression="gzip")
mean_absolute_errors(LME_lme4, data)

(0.06340905986589798,
 0.06242550273177568,
 0.07500259138195367,
 0.065559045346992986)

In [7]:
# Load the LME submission and report its (Oct, Nov, Dec, overall) mean
# absolute errors against `data`.
LME = pd.read_csv(
    "LME.csv.gz",
    compression="gzip",
)
mean_absolute_errors(submission_df=LME, comparison_df=data)

(0.0633024491005806,
 0.06252642013941097,
 0.07472870881317234,
 0.065462743733816525)

In [8]:
# Load the first RF submission and score it against `data`.
# (The file name suggests n=100, max_features=5, max_depth=8 — unverified.)
RF = pd.read_csv(
    "RF_n100_maxfeat5_maxdepth8.gz",
    compression="gzip",
)
mean_absolute_errors(submission_df=RF, comparison_df=data)

(0.0627264215390796,
 0.06182163198247547,
 0.07473668775158138,
 0.064978084757667934)

In [5]:
# Load the second RF submission ("RF.gz") and score it against `data`.
RF_2 = pd.read_csv(
    "RF.gz",
    compression="gzip",
)
mean_absolute_errors(submission_df=RF_2, comparison_df=data)

(0.056695840867992744,
 0.056348083242059234,
 0.06729453709028171,
 0.058779208616249121)

In [9]:
# Load the Ridge submission and score it against `data`.
Ridge = pd.read_csv(
    "Ridge.gz",
    compression="gzip",
)
mean_absolute_errors(submission_df=Ridge, comparison_df=data)

(0.06332056539983931,
 0.06250535370657166,
 0.07484831374353078,
 0.065493145336361525)

In [10]:
# Load the ElasticNet submission and score it against `data`.
Enet = pd.read_csv(
    "ElasticNet.gz",
    compression="gzip",
)
mean_absolute_errors(submission_df=Enet, comparison_df=data)

(0.06332380630902144,
 0.06246366413472067,
 0.07474843300747566,
 0.0654657878377429)

In [11]:
# Load the Lasso submission and score it against `data`.
Lasso = pd.read_csv(
    "Lasso.gz",
    compression="gzip",
)
mean_absolute_errors(submission_df=Lasso, comparison_df=data)

(0.06333524733775354,
 0.06244184118291356,
 0.07476208625646924,
 0.065470568485132233)

In [122]:
# Load the "XGB_600" submission and score it against `data`.
XGB = pd.read_csv(
    "XGB_600.gz",
    compression="gzip",
)
mean_absolute_errors(submission_df=XGB, comparison_df=data)

(0.06211727600763514,
 0.06136520946002188,
 0.07414289916043706,
 0.064404712807773379)

In [121]:
# Load the "XGB_3000" submission (a plain, uncompressed CSV) and score it
# against `data`.
xgb3000_path = "XGB_3000.csv"
XGB3000 = pd.read_csv(xgb3000_path)
mean_absolute_errors(submission_df=XGB3000, comparison_df=data)

(0.05998706659714675,
 0.05918412810186208,
 0.07170160296434737,
 0.06220029687696086)

In [14]:
# Load the HuberRegressor submission and score it against `data`.
Huber = pd.read_csv(
    "HuberRegressor.gz",
    compression="gzip",
)
mean_absolute_errors(submission_df=Huber, comparison_df=data)

(0.06361133815551537,
 0.06264460021905807,
 0.0749035882691202,
 0.065703583469913368)

In [15]:
# Load the LARM submission and score it against `data`.
LARM = pd.read_csv(
    "LARM.gz",
    compression="gzip",
)
mean_absolute_errors(submission_df=LARM, comparison_df=data)

(0.06345773598553339,
 0.06251769879518071,
 0.07483130304772855,
 0.065572243736829725)

In [16]:
# Load the adaptive-LASSO submission ("Adp-lasso-af.gz") and score it
# against `data`.
Adaptive_LASSO = pd.read_csv(
    "Adp-lasso-af.gz",
    compression="gzip",
)
mean_absolute_errors(submission_df=Adaptive_LASSO, comparison_df=data)

(0.06346130419931673,
 0.062492389375684604,
 0.07486073950546275,
 0.06557490517443211)

In [184]:
# Ensemble definition: a list of (submission DataFrame, blend weight) pairs.
# Only the frames listed here need to be loaded for this cell to run.
# NOTE(review): the coefficients 0.82279736, 0.15298894, 0.02112733 and
# 1.09076218 are presumably fitted elsewhere; their origin is not shown in
# this notebook.

scale = 0.90  # multiplier on the first five weights; RF is multiplied by (1 - scale)

# One coefficient split evenly between Ridge, LME and LARM.
shared_linear_wt = 0.82279736 / 3 * scale

models = [(Ridge, shared_linear_wt),
          (LME, shared_linear_wt),
          (LARM, shared_linear_wt),
          (Enet, 0.15298894 * scale),
          (Huber, 0.02112733 * scale),
          (RF, 1.09076218 * (1 - scale)),
          # Fixed weights that are not multiplied by `scale`.
          # NOTE(review): with these two the weights total about 1.106 rather
          # than 1 — confirm that this is intended.
          (XGB3000, 0.05),
          (RF_2, 0.05)
]

In [185]:
# Total of the blend weights, shown as a sanity check.
sum(weight for _model, weight in models)

1.106298485

In [186]:
# Start the ensemble frame from Ridge's ParcelId column.
ensemble = Ridge[['ParcelId']].copy()
cols = ['201610','201611','201612','201710','201711','201712']

# The prediction frames are added together by row index, not by ParcelId, so
# every model must list the same parcels in the same order as Ridge.
# Fail loudly instead of silently blending misaligned rows.
reference_ids = Ridge['ParcelId'].values
for model, _wt in models:
    model_ids = model['ParcelId'].values
    assert len(model_ids) == len(reference_ids) and (model_ids == reference_ids).all(), \
        "submission rows are not aligned with Ridge on ParcelId"

# Weighted sum of the prediction columns across all models.
foo = models[0][0][cols] * models[0][1]
for model, wt in models[1:]:
    foo = foo + model[cols] * wt

In [187]:
# Attach the blended prediction columns to the ParcelId column.
# Selecting only 'ParcelId' first makes this cell safe to re-run: concatenating
# `foo` onto a frame that already holds the predictions would duplicate every
# prediction column and break both the scoring below and the written CSV.
ensemble = pd.concat([ensemble[['ParcelId']], foo], axis=1)
ensemble['ParcelId'] = ensemble['ParcelId'].astype(int)
assert all(ensemble.ParcelId.unique() == Ridge.ParcelId.unique())
mean_absolute_errors(ensemble, data)

(0.06259861175946599,
 0.06173762320486744,
 0.0740465838799026,
 0.064745165074467426)

In [32]:
# Repeat of the assembly-and-score step.
# `ensemble` may already carry the prediction columns at this point, so only
# its 'ParcelId' column is kept before `foo` is attached; otherwise every
# prediction column would appear twice.
ensemble = pd.concat([ensemble[['ParcelId']], foo], axis=1)
ensemble['ParcelId'] = ensemble['ParcelId'].astype(int)
assert all(ensemble.ParcelId.unique() == Ridge.ParcelId.unique())
mean_absolute_errors(ensemble, data)

(0.06259217287302513,
 0.0617576086180706,
 0.0740689090448652,
 0.064750230690080013)

In [188]:
# Write the blended submission as a gzip-compressed CSV, without the index
# and with floats formatted to 4 significant digits.
ensemble.to_csv(
    "new_ensemble.gz",
    index=False,
    float_format='%.4g',
    compression='gzip',
)