In [1]:
import pandas as pd
from pandas import DataFrame,Series
import numpy as np
import os
import datetime

#Plotting
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
%matplotlib inline

# sklearn stuff
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.metrics import confusion_matrix, mean_squared_error, precision_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder, Imputer 
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin

In [2]:
def mean_absolute_errors(submission_df, comparison_df):
    """
    Score a public-leaderboard submission against known log errors.

    Parameters
    ----------
    submission_df : DataFrame
        Must contain the columns 'ParcelId', '201610', '201611' and '201612'.
    comparison_df : DataFrame
        Must contain the columns 'parcelid', 'logerror' and 'month'.

    Returns
    -------
    tuple
        (oct_error, nov_error, dec_error, overall_mae). Each monthly value is
        the mean absolute difference between that month's prediction column
        and 'logerror' over the merged rows whose 'month' matches; it is NaN
        when no rows fall in that month. overall_mae is the row-count-weighted
        average of the monthly errors over the months that have rows, and is
        NaN only when none of the three months has any rows.
    """
    # (month number, prediction column) pairs, in the order they are returned.
    month_columns = [(10, '201610'), (11, '201611'), (12, '201612')]
    prediction_cols = [column for _, column in month_columns]

    # Only parcels present in both frames survive the merge and get scored.
    trainresults = pd.merge(submission_df[['ParcelId'] + prediction_cols],
                            comparison_df[['parcelid', 'logerror', 'month']],
                            left_on='ParcelId', right_on='parcelid')

    monthly_errors = []
    weighted_error_sum = 0.0
    scored_rows = 0
    for month, column in month_columns:
        in_month = trainresults[trainresults['month'] == month]
        month_error = (in_month[column] - in_month['logerror']).abs().mean()
        monthly_errors.append(month_error)
        # An empty month has a NaN mean; leaving it out of the weighted total
        # keeps NaN * 0 from turning the overall figure into NaN.
        if len(in_month) > 0:
            weighted_error_sum += month_error * len(in_month)
            scored_rows += len(in_month)

    overall_mae = weighted_error_sum / scored_rows if scored_rows else float('nan')
    oct_error, nov_error, dec_error = monthly_errors
    return (oct_error, nov_error, dec_error, overall_mae)

### Reading in data 

In [3]:
# Root of the local Zillow project; the raw CSVs are read from <maindir>/data.
maindir = "/home/anerdi/Desktop/Zillow"

# Training transactions (one row per sale, with a 'transactiondate' string column).
logerror = pd.read_csv(maindir + "/data/train_2016_v2.csv/train_2016_v2.csv")

# Parse each 'YYYY-MM-DD' date once and derive both calendar fields from it.
transaction_dates = [datetime.datetime.strptime(date_string, '%Y-%m-%d')
                     for date_string in logerror['transactiondate']]
logerror['weeknumber'] = [parsed.isocalendar()[1] for parsed in transaction_dates]  # ISO week number
logerror['month'] = [parsed.month for parsed in transaction_dates]

# Only the parcel ids are read from the properties table.
properties = pd.read_csv(maindir + "/data/properties_2016.csv/properties_2016.csv", usecols=['parcelid'])

In [4]:
# join on parcel id
# Join the parcel ids with their log errors and transaction months on parcel id.
scored_columns = ['parcelid', 'logerror', 'month']
data = properties.merge(logerror[scored_columns], on='parcelid')

# Remove the source frames from the namespace now that `data` has been built.
del logerror
del properties

### Loading in predictions from the models 

In [5]:
# Switch to the submissions directory so the files below can be opened by bare filename.
submissions_dir = "/home/anerdi/Desktop/Zillow/submissions/"
os.chdir(submissions_dir)

In [6]:
def read_submission(filename):
    """Read one gzip-compressed submission CSV, resolved against the current directory."""
    return pd.read_csv(filename, compression="gzip")


BART = read_submission("BART_submission.csv.gz")

# Ridge second stage
Ridge = read_submission("two_stage_stage1_stacked_annrfs_stage2_ridge.csv.gz")
rf_ridge = read_submission("two_stage_stage1_rf_stage2_ridge.csv.gz")
logistic_ridge = read_submission("two_stage_ridge.csv.gz")

# Elastic-net second stage
Enet = read_submission("two_stage_stage1_stacked_annrfs_stage2_enet.csv.gz")
rf_enet = read_submission("two_stage_stage1_rf_stage2_enet.csv.gz")
logistic_enet = read_submission("two_stage_enet.csv.gz")

# Lasso second stage
Lasso = read_submission("two_stage_stage1_stacked_annrfs_stage2_lasso.csv.gz")
rf_lasso = read_submission("two_stage_stage1_rf_stage2_lasso.csv.gz")
logistic_lasso = read_submission("two_stage_lasso.csv.gz")

# Huber second stage
Huber = read_submission("two_stage_stage1_stacked_annrfs_stage2_huber.csv.gz")
rf_huber = read_submission("two_stage_stage1_rf_stage2_huber.csv.gz")
logistic_huber = read_submission("two_stage_huber.csv.gz")

# LARM second stage
LARM = read_submission("two_stage_stage1_stacked_annrfs_stage2_larm.csv.gz")
rf_larm = read_submission("two_stage_stage1_rf_stage2_larm.csv.gz")
logistic_larm = read_submission("two_stage_larm.csv.gz")

# LME second stage
# NOTE(review): this filename says "stage_lme" where its siblings say "stage2_..." -- confirm it matches the file on disk.
LME = read_submission("two_stage_stage1_stacked_annrfs_stage_lme.csv.gz")
rf_lme = read_submission("two_stage_stage1_rf_stage2_lme.csv.gz")
logistic_LME = read_submission("two_stage_lme.csv.gz")

Adaptive_LASSO = read_submission("Adp-lasso-af.gz")

# Random-forest second stage
RF = read_submission("two_stage_stage1_stacked_annrfs_stage2_rf.csv.gz")
logistic_RF = read_submission("two_stage_rf.csv.gz")

# Random forests from the RF_* submission files
RF_singlestage = read_submission("RF_n100_maxfeat5_maxdepth8.gz")
RF_2 = read_submission("RF_n100_maxfeat10_maxdepth20_extreme.gz")

# XGBoost submissions
XGB = read_submission("two_stage_xgb.csv.gz")
XGB600 = read_submission("XGB_600.gz")
XGB3000 = read_submission("XGB_3000_RF.gz")

KeyboardInterrupt: 

### Determining Order for Stacked Model (ANN)

In [7]:
# Maps each stacker input name to the submission frame loaded for it.
# Entries are listed in the same order as before; the keys are later turned
# into the column order of the stacker's input matrix.
model_dict = dict([
    ('stacked_rfs_ridge', rf_ridge),
    ('stacked_rfs_enet', rf_enet),
    ('stacked_rfs_lasso', rf_lasso),
    ('stacked_rfs_larm', rf_larm),
    ('stacked_rfs_huber', rf_huber),
    ('stacked_annrfs_ridge', Ridge),
    ('stacked_annrfs_enet', Enet),
    ('stacked_annrfs_lasso', Lasso),
    ('stacked_annrfs_larm', LARM),
    ('stacked_annrfs_huber', Huber),
    ('logistic_ridge', logistic_ridge),
    ('logistic_enet', logistic_enet),
    ('logistic_lasso', logistic_lasso),
    ('logistic_larm', logistic_larm),
    ('logistic_huber', logistic_huber),
    # disabled: 'stacked_rfs_rf'
    # disabled: 'stacked_rfs_rf_overfit'
    ('stacked_annrfs_rf', RF),
    # disabled: 'stacked_annrfs_rf_overfit'
    ('logistic_rf', logistic_RF),
    # disabled: 'logistic_rf_overfit'
])

In [8]:
# Fix the model order once; it becomes the column order of the stacker's input matrix.
# NOTE(review): this follows dict iteration order, which older Python versions do not
# keep stable between sessions -- confirm it matches the order the stacker was trained with.
order_of_models = list(model_dict.keys())
order_of_models

['stacked_rfs_huber',
 'stacked_rfs_lasso',
 'stacked_annrfs_rf',
 'logistic_lasso',
 'stacked_annrfs_huber',
 'logistic_rf',
 'logistic_huber',
 'stacked_rfs_enet',
 'stacked_rfs_larm',
 'stacked_annrfs_enet',
 'logistic_ridge',
 'stacked_annrfs_larm',
 'stacked_annrfs_lasso',
 'stacked_rfs_ridge',
 'logistic_enet',
 'stacked_annrfs_ridge',
 'logistic_larm']

In [142]:
# Current best ensemble, as (submission frame, blend weight) pairs.
# This definition has to be live: the blending cell further down indexes
# `models` directly and fails with a NameError on a fresh kernel without it.
models = [(Ridge,0.08),(logistic_ridge,0.03),(rf_ridge,0.03),
    (Enet,0.06),(logistic_enet,0.03),(rf_enet,0.03),
    (Lasso,0.06),(logistic_lasso,0.03),(rf_lasso,0.03),
    (Huber, 0.06),(logistic_huber,0.03),(rf_huber,0.03),
    (LARM,0.06),(logistic_larm,0.03),(rf_larm,0.03),
    (LME, 0.05),(logistic_LME,0.08),(rf_lme,0.03),
    (XGB600,0.09),
    (RF,0.10),
    (XGB3000, 0.02),
    (RF_2, 0.01)]

# Display the total blend weight (the listed weights add up to 1, up to float rounding).
sum(weight for _, weight in models)

### Loading the Stacked Model (ANN)

In [10]:
from sklearn.externals import joblib

# Location of the pickled stacking model.
stack_path = '/home/anerdi/Desktop/Zillow/submissions/stage2_stacked_NN.pkl'

# NOTE(review): joblib.load unpickles the file, which can run arbitrary code --
# only load pickles from a trusted source.
stack = joblib.load(stack_path)

In [13]:
# Peek at one model's '201610' predictions as a NumPy array.
# `.values` is used instead of `as_matrix()`, which is deprecated and absent
# from newer pandas releases; both return the same underlying array.
model_dict['stacked_annrfs_ridge']['201610'].values

array([-0.0025, -0.0027,  0.4478, ...,  0.0159,  0.0159,  0.0159])

In [15]:
# Dimensions of the stacker's input matrix.
nmodels = len(order_of_models)  # one column per model
npatterns = 2985217             # number of test observations (rows) -- TODO confirm against the loaded frames

In [18]:
# Assemble the stacker input for one prediction column:
# one row per observation, one column per model, in `order_of_models` order.
col = '201610'
X = np.zeros((npatterns, nmodels))
for column_index, model_name in enumerate(order_of_models):
    model_predictions = model_dict[model_name]
    X[:, column_index] = model_predictions[col]

In [2]:
# Try the loaded stacker on the first ten rows only.
first_rows = X[:10]
stack.predict(first_rows)

NameError: name 'stack' is not defined

In [144]:
# Prediction columns that get blended.
cols = ['201610','201611','201612','201710','201711','201712']

# Start the output frame from Ridge's parcel ids.
ensemble = Ridge[['ParcelId']].copy()

# Weighted sum of every model's predictions, seeded with the first (frame, weight) pair.
# NOTE(review): frames are combined by row index, not by ParcelId -- assumes all
# submissions list parcels in the same order; confirm.
first_model, first_wt = models[0]
foo = first_model[cols] * first_wt
for model, wt in models[1:]:
    foo = foo + model[cols] * wt

In [145]:
# Build the submission frame from Ridge's ids rather than from `ensemble` itself,
# so re-running this cell cannot append a second copy of the prediction columns.
ensemble = pd.concat([Ridge[['ParcelId']], foo.round(4)], axis=1)
ensemble['ParcelId'] = ensemble['ParcelId'].astype(int)

# Sanity check: the ids match the source submission (also safe if the lengths differ).
assert np.array_equal(ensemble.ParcelId.unique(), Ridge.ParcelId.unique())

# (October, November, December, overall) mean absolute error against the training data.
mean_absolute_errors(ensemble, data)

(0.061930239099859404,
 0.061154709748083215,
 0.07352139160437032,
 0.064124209786935157)

In [134]:
# current best
ensemble = pd.concat([ensemble,foo.round(4)], axis=1)
ensemble['ParcelId'] = ensemble['ParcelId'].astype(int)
assert all(ensemble.ParcelId.unique() == Ridge.ParcelId.unique())
mean_absolute_errors(ensemble, data)

(0.062031906771147276,
 0.061217579408543304,
 0.07365583668775155,
 0.06422425661437603)

In [146]:
# Write the blended submission as a gzip-compressed CSV, without the index column.
# NOTE(review): '%.4g' keeps four significant digits, not four decimal places, so
# values with magnitude >= 1 are written more coarsely than the round(4) applied
# earlier -- confirm this is intended.
output_name = "new_ensemble.gz"
ensemble.to_csv(output_name, index=False, float_format='%.4g', compression='gzip')