In [1]:
import pandas as pd
from pandas import DataFrame,Series
import numpy as np
import os
import datetime
import gc
import re

#Plotting
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
%matplotlib inline

# sklearn stuff
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.metrics import confusion_matrix, mean_squared_error, precision_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder, Imputer 
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin

In [2]:
def mean_absolute_errors(submission_df, comparison_df):
    """
    Score a public-leaderboard submission against the known log errors.

    Parameters
    ----------
    submission_df : DataFrame
        Must contain 'ParcelId' and the prediction columns '201610',
        '201611' and '201612'.
    comparison_df : DataFrame
        Must contain 'parcelid', 'logerror' and 'month' (integer month of
        the transaction).

    Returns
    -------
    tuple
        (oct_error, nov_error, dec_error, overall_months_mae, overall_mae)
        * oct/nov/dec_error: mean absolute error of the matching prediction
          column on the rows transacted in that month (NaN if the month has
          no rows).
        * overall_months_mae: row-count-weighted average of the three
          monthly errors; months without rows are ignored instead of
          turning the whole figure into NaN. NaN only when none of the
          three months has any row.
        * overall_mae: mean absolute error of the '201612' column against
          every merged row, regardless of month.
    """
    month_columns = ((10, '201610'), (11, '201611'), (12, '201612'))
    prediction_cols = [col for _month, col in month_columns]

    # Inner join: only parcels present in both frames are scored.
    trainresults = pd.merge(submission_df[['ParcelId'] + prediction_cols],
                            comparison_df[['parcelid', 'logerror', 'month']],
                            left_on='ParcelId', right_on='parcelid')

    month_maes = []
    weighted_total = 0.0
    n_rows = 0
    for month, col in month_columns:
        rows = trainresults[trainresults['month'] == month]
        mae = (rows[col] - rows['logerror']).abs().mean()
        month_maes.append(mae)
        # An empty month has mae == NaN; NaN * 0 would poison the weighted
        # sum, so only months that actually have rows contribute.
        if len(rows) > 0:
            weighted_total += mae * len(rows)
            n_rows += len(rows)

    overall_months_mae = weighted_total / n_rows if n_rows else float('nan')

    overall_mae = (trainresults['201612'] - trainresults['logerror']).abs().mean()

    oct_error, nov_error, dec_error = month_maes
    return (oct_error, nov_error, dec_error, overall_months_mae, overall_mae)

### Reading in data 

In [3]:
# Project root. Defaults to the original location; set ZILLOW_DIR to run the
# notebook on another machine without editing the cell.
maindir = os.environ.get("ZILLOW_DIR", "/home/anerdi/Desktop/Zillow")

# Training labels: one row per transaction with parcelid, logerror and transactiondate.
logerror = pd.read_csv(os.path.join(maindir, "data", "train_2016_v2.csv", "train_2016_v2.csv"))

# Parse the date strings once (vectorised) instead of calling strptime twice per row.
transaction_dates = pd.to_datetime(logerror['transactiondate'], format='%Y-%m-%d')
# ISO week number; Timestamp.isocalendar() is used per row because the
# vectorised week accessor differs between pandas versions.
logerror['weeknumber'] = transaction_dates.apply(lambda ts: ts.isocalendar()[1])
logerror['month'] = transaction_dates.dt.month

# Only the parcel ids are needed from the (large) properties file.
properties = pd.read_csv(os.path.join(maindir, "data", "properties_2016.csv", "properties_2016.csv"),
                         usecols=['parcelid'])

In [4]:
# Inner join on parcelid: keeps the parcels that have a recorded log error,
# carrying along the logerror and the transaction month.
data = properties.merge(
    logerror[['parcelid', 'logerror', 'month']],
    on='parcelid',
)
# The source frames are not needed once `data` exists.
del properties, logerror

### Loading in predictions from the models 

In [5]:
# The prediction files below are opened by bare file name, so the working
# directory must be the submissions folder. Derive it from maindir rather than
# repeating the absolute project path, so the two cannot drift apart.
os.chdir(os.path.join(maindir, "submissions"))

### Manual model ensembling

In [6]:
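# Disabled exploration: would build a {"<stage1>_<stage2>": filename} lookup of the
# two-stage submission files found in the submissions folder.
# NOTE(review): `submission_dir` is not defined in the visible cells; define it before re-enabling.
# available_base_learners = {'{0}_{1}'.format(re.search(r'stage1_([a-z,0-9_]+)_stage2',f).group(1)
#                     ,re.search(r'stage2_([a-z,0-9_]+)_',f).group(1)): f
#           for f in os.listdir(submission_dir) if re.match(r'^    two_stage_stage1_[a-z,0-9,_]+',f) is not None}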



In [15]:
# Load the per-model prediction files. Paths are relative, so they resolve
# against the working directory set by the earlier os.chdir call.
# Of the frames loaded here, only stacked_final, annrfsxgbs_LME, annrfs_LME,
# XGB3000 and RF_2 are used by the blend in the visible cells.

# Two-stage models whose second stage is "lme".
annrfsxgbs_LME = pd.read_csv("two_stage_stage1_stacked_annrfsxgbs_stage2_lme.csv.gz", compression="gzip")
# NOTE(review): this name has "stage_lme" where its siblings have "stage2_lme" — confirm it matches the file on disk.
annrfs_LME = pd.read_csv("two_stage_stage1_stacked_annrfs_stage_lme.csv.gz", compression="gzip")
rf_lme = pd.read_csv("two_stage_stage1_rf_stage2_lme.csv.gz", compression="gzip")
logistic_LME = pd.read_csv("two_stage_lme.csv.gz", compression="gzip")

# Random-forest based predictions.
RF_2 = pd.read_csv("RF_n100_maxfeat10_maxdepth20_extreme.gz", compression="gzip")
# NOTE(review): the file name below starts with four spaces — confirm this is the real name and not a paste artifact.
# No compression argument here: pandas' default ('infer') selects gzip from the .gz suffix.
rf_overfit = pd.read_csv("    two_stage_stage1_stacked_annrfsxgbs_stage2_rf_maxdepth12_age.csv.gz")

# XGBoost based predictions.
XGB = pd.read_csv("two_stage_xgb.csv.gz", compression="gzip")
XGB600 = pd.read_csv("XGB_600.gz", compression="gzip")
XGB3000 = pd.read_csv("XGB_3000_RF.gz", compression='gzip')

# Stacked ("super learner") predictions; compression is again inferred from the suffix.
stacked_final = pd.read_csv("super_learner_preds.csv.gz")

In [28]:
# Blend recipe: (prediction frame, weight) pairs. The first entry seeds the
# weighted sum in the next cell.
models = [
    (stacked_final, 0.80),
    (annrfsxgbs_LME, 0.09),
    (annrfs_LME, 0.06),
    (XGB3000, 0.03),
    (RF_2, 0.02),
]

# Cell output: total weight, shown so it can be checked by eye.
sum(weight for _frame, weight in models)

1.0

In [29]:
# Seed the submission frame with the ParcelId column of the stacked model.
ensemble = stacked_final[['ParcelId']].copy()
cols = ['201610','201611','201612','201710','201711','201712']

# The weighted sum below adds frames row by row on their index, not on
# ParcelId. Verify that every model file lists the parcels in the same order
# as stacked_final, otherwise predictions of different parcels would be mixed.
for model, wt in models:
    assert np.array_equal(model['ParcelId'].values, stacked_final['ParcelId'].values), \
        "model frame rows are not in the same ParcelId order as stacked_final"

# foo = sum over models of weight * predictions, for each prediction column.
foo = models[0][0][cols]*models[0][1]
for model, wt in models[1:]:
    foo = foo + model[cols]*wt

In [30]:
# Assemble the submission: ParcelId plus the blended predictions rounded to
# four decimals. The frame is rebuilt from stacked_final instead of from
# `ensemble` itself, so running this cell more than once cannot append the
# prediction columns a second time.
ensemble = pd.concat([stacked_final[['ParcelId']], foo.round(4)], axis=1)
# Integer ids: the float_format used when writing the CSV must not touch them.
ensemble['ParcelId'] = ensemble['ParcelId'].astype(int)

# concat aligns on the index; extra rows or reordered ids would mean foo and
# stacked_final were not aligned.
assert len(ensemble) == len(stacked_final), "concat changed the number of rows"
assert np.array_equal(ensemble['ParcelId'].values, stacked_final['ParcelId'].values), \
    "ParcelId column no longer matches stacked_final"

# Cell output: (oct, nov, dec, weighted oct-dec, overall) mean absolute errors.
mean_absolute_errors(ensemble, data)

(0.06132045408880847,
 0.06046500547645126,
 0.07294134560092015,
 0.063503394989463821,
 0.06627135751869448)

In [13]:
# current best
ensemble = pd.concat([ensemble,foo.round(4)], axis=1)
ensemble['ParcelId'] = ensemble['ParcelId'].astype(int)
assert all(ensemble.ParcelId.unique() == stacked_final.ParcelId.unique())
mean_absolute_errors(ensemble, data)

(0.06132716495881037,
 0.06047294633077763,
 0.0729488211615872,
 0.063510524467337778,
 0.06627859207975909)

In [31]:
# Write the blended submission as a gzip-compressed CSV without the index.
# The path is relative, so the file lands in the current working directory.
# '%.4g' prints floats with four significant digits.
ensemble.to_csv(
    "new_ensemble.gz",
    index=False,
    float_format='%.4g',
    compression='gzip',
)