In [1]:
import pandas as pd
from pandas import DataFrame,Series
import numpy as np
import os
import datetime
import gc

#Plotting
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
%matplotlib inline

# sklearn stuff
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.metrics import confusion_matrix, mean_squared_error, precision_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder, Imputer 
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin

In [2]:
def mean_absolute_errors(submission_df, comparison_df):
    """
    Score a leaderboard-style submission against known log errors.

    Parameters
    ----------
    submission_df : DataFrame
        Must contain 'ParcelId' and the prediction columns
        '201610', '201611' and '201612'.
    comparison_df : DataFrame
        Must contain 'parcelid', 'logerror' and 'month'.

    Returns
    -------
    tuple
        (oct_error, nov_error, dec_error, overall_months_mae, overall_mae)

        * oct/nov/dec_error: mean absolute error of the matching monthly
          column over rows whose 'month' is 10/11/12. NaN when that month
          has no rows.
        * overall_months_mae: row-weighted average of the monthly errors.
          Months with no rows are skipped instead of turning the whole
          average into NaN (nan * 0 is nan). NaN only when none of the
          three months has any row.
        * overall_mae: mean absolute error of the '201612' column over every
          merged row, regardless of month.
    """
    # Inner join: only parcels present in both frames are scored.
    trainresults = pd.merge(
        submission_df[['ParcelId', '201610', '201611', '201612']],
        comparison_df[['parcelid', 'logerror', 'month']],
        left_on='ParcelId', right_on='parcelid')

    monthly_errors = []
    weighted_total = 0.0
    n_scored = 0
    for month, pred_col in ((10, '201610'), (11, '201611'), (12, '201612')):
        month_rows = trainresults[trainresults['month'] == month]
        month_error = (month_rows[pred_col] - month_rows['logerror']).abs().mean()
        monthly_errors.append(month_error)
        n_month = len(month_rows)
        if n_month:
            # Empty months are left out: their mean is NaN and would poison the sum.
            weighted_total += month_error * n_month
            n_scored += n_month
    oct_error, nov_error, dec_error = monthly_errors

    overall_months_mae = weighted_total / n_scored if n_scored else float('nan')

    overall_mae = (trainresults['201612'] - trainresults['logerror']).abs().mean()
    return (oct_error, nov_error, dec_error, overall_months_mae, overall_mae)

### Reading in data 

In [3]:
maindir = "/home/anerdi/Desktop/Zillow"


def _to_date(text):
    """Parse a 'YYYY-MM-DD' transaction date string into a datetime."""
    return datetime.datetime.strptime(text, '%Y-%m-%d')


# Training targets; two calendar features are derived from 'transactiondate'.
logerror = pd.read_csv(maindir + "/data/train_2016_v2.csv/train_2016_v2.csv")
# isocalendar() yields (ISO year, ISO week, ISO weekday); index 1 is the week number.
logerror['weeknumber'] = logerror['transactiondate'].map(lambda stamp: _to_date(stamp).isocalendar()[1])
logerror['month'] = logerror['transactiondate'].map(lambda stamp: _to_date(stamp).month)
# Only the parcel ids are read from the properties table.
properties = pd.read_csv(maindir + "/data/properties_2016.csv/properties_2016.csv", usecols=['parcelid'])

In [4]:
# Inner join on parcel id: keeps one row per training transaction whose parcel
# appears in the properties table, carrying its log error and month.
data = properties.merge(logerror[['parcelid', 'logerror', 'month']], on='parcelid')
# The two source frames are not referenced again in this notebook; drop the names.
del logerror
del properties

### Loading in predictions from the models 

In [5]:
# Switch the working directory to the submissions folder: the relative filenames
# used by the read_csv / to_csv calls below resolve against it.
os.chdir("/home/anerdi/Desktop/Zillow/submissions/")

In [6]:
def _read_submission(filename):
    """Read one gzip-compressed submission file into a DataFrame.

    Every submission goes through this helper so that all files are opened the
    same way (explicit gzip) instead of some calls relying on extension inference.
    """
    return pd.read_csv(filename, compression="gzip")


BART = _read_submission("BART_submission.csv.gz")

# Ridge second stage, one frame per first-stage variant.
Ridge = _read_submission("two_stage_stage1_stacked_annrfs_stage2_ridge.csv.gz")
rf_ridge = _read_submission("two_stage_stage1_rf_stage2_ridge.csv.gz")
logistic_ridge = _read_submission("two_stage_ridge.csv.gz")

# Elastic net second stage.
Enet = _read_submission("two_stage_stage1_stacked_annrfs_stage2_enet.csv.gz")
rf_enet = _read_submission("two_stage_stage1_rf_stage2_enet.csv.gz")
logistic_enet = _read_submission("two_stage_enet.csv.gz")

# Lasso second stage.
Lasso = _read_submission("two_stage_stage1_stacked_annrfs_stage2_lasso.csv.gz")
rf_lasso = _read_submission("two_stage_stage1_rf_stage2_lasso.csv.gz")
logistic_lasso = _read_submission("two_stage_lasso.csv.gz")

# Huber second stage.
Huber = _read_submission("two_stage_stage1_stacked_annrfs_stage2_huber.csv.gz")
rf_huber = _read_submission("two_stage_stage1_rf_stage2_huber.csv.gz")
logistic_huber = _read_submission("two_stage_huber.csv.gz")

# LARM second stage.
LARM = _read_submission("two_stage_stage1_stacked_annrfs_stage2_larm.csv.gz")
rf_larm = _read_submission("two_stage_stage1_rf_stage2_larm.csv.gz")
logistic_larm = _read_submission("two_stage_larm.csv.gz")

# LME second stage.
# NOTE(review): the first filename reads "stage_lme" where its siblings use
# "stage2_..." — left unchanged; confirm it matches the file on disk.
LME = _read_submission("two_stage_stage1_stacked_annrfs_stage_lme.csv.gz")
rf_lme = _read_submission("two_stage_stage1_rf_stage2_lme.csv.gz")
logistic_LME = _read_submission("two_stage_lme.csv.gz")

Adaptive_LASSO = _read_submission("Adp-lasso-af.gz")

# Random forest second stage.
RF = _read_submission("two_stage_stage1_stacked_annrfs_stage2_rf.csv.gz")
logistic_RF = _read_submission("two_stage_rf.csv.gz")

RF_singlestage = _read_submission("RF_n100_maxfeat5_maxdepth8.gz")
RF_2 = _read_submission("RF_n100_maxfeat10_maxdepth20_extreme.gz")

XGB = _read_submission("two_stage_xgb.csv.gz")
XGB600 = _read_submission("XGB_600.gz")
XGB3000 = _read_submission("XGB_3000_RF.gz")

### Determining Order for Stacked Model (ANN)

In [7]:
# Lookup from stacked-model name to its submission frame.
# Keys that were already commented out of this mapping (and so are not part of it):
# stacked_rfs_rf, stacked_rfs_rf_overfit, stacked_annrfs_rf_overfit,
# logistic_rf_overfit, stacked_rfs_xgb600, stacked_annrfs_xgb600, logistic_xgb600.
model_dict = dict(
    # random-forest first stage
    stacked_rfs_ridge=rf_ridge,
    stacked_rfs_enet=rf_enet,
    stacked_rfs_lasso=rf_lasso,
    stacked_rfs_larm=rf_larm,
    stacked_rfs_huber=rf_huber,
    # stacked ANN + random-forest first stage
    stacked_annrfs_ridge=Ridge,
    stacked_annrfs_enet=Enet,
    stacked_annrfs_lasso=Lasso,
    stacked_annrfs_larm=LARM,
    stacked_annrfs_huber=Huber,
    # logistic first stage
    logistic_ridge=logistic_ridge,
    logistic_enet=logistic_enet,
    logistic_lasso=logistic_lasso,
    logistic_larm=logistic_larm,
    logistic_huber=logistic_huber,
    # random-forest second stage
    stacked_annrfs_rf=RF,
    logistic_rf=logistic_RF,
)

In [8]:
# Column order of the stacking design matrix: for each second-stage learner the
# logistic, stacked_rfs and stacked_annrfs variants in that order, followed by
# the two random-forest second-stage models.
order_of_models = [
    '{}_{}'.format(first_stage, second_stage)
    for second_stage in ('ridge', 'enet', 'lasso', 'larm', 'huber')
    for first_stage in ('logistic', 'stacked_rfs', 'stacked_annrfs')
] + ['logistic_rf', 'stacked_annrfs_rf']

# Display the resulting order as the cell output.
order_of_models

['logistic_ridge',
 'stacked_rfs_ridge',
 'stacked_annrfs_ridge',
 'logistic_enet',
 'stacked_rfs_enet',
 'stacked_annrfs_enet',
 'logistic_lasso',
 'stacked_rfs_lasso',
 'stacked_annrfs_lasso',
 'logistic_larm',
 'stacked_rfs_larm',
 'stacked_annrfs_larm',
 'logistic_huber',
 'stacked_rfs_huber',
 'stacked_annrfs_huber',
 'logistic_rf',
 'stacked_annrfs_rf']

### Loading test weights 

In [9]:
# Weights used by the stacking cell below, which multiplies them row by row with
# the model predictions.
# NOTE(review): presumably one row per test parcel and one column per entry of
# order_of_models, in that order — confirm against the job that wrote this file.
test_weights = pd.read_csv("/home/anerdi/Desktop/Zillow/levelonedata/super_learner_weights_2.csv.gz")

In [10]:
# Dimensions of the stacking design matrix built in the next cell.
# NOTE(review): npatterns is hard-coded — presumably equal to len(Ridge); confirm
# if the test set ever changes.
npatterns = 2985217 # number of test observations
nmodels = len(order_of_models)  # one design-matrix column per stacked model

In [11]:
# Blend the level-one predictions into one stacked prediction per month.
# For every row r:  stacked[r] = sum_m X[r, m] * test_weights[r, m]
# (each row carries its own weight vector).
# NOTE(review): assumes every frame in model_dict and test_weights share Ridge's
# row order, and that test_weights' columns follow order_of_models — confirm upstream.
stacked_final = Ridge[['ParcelId']].copy()

# Materialise the weights once instead of calling .values for every chunk.
weight_matrix = test_weights.values
assert weight_matrix.shape == (npatterns, nmodels), "test_weights must be npatterns x nmodels"

chunk_size = 100000  # rows scored per einsum call

for col in ['201610','201611','201612']:
    # Design matrix: one column of predictions per model, in order_of_models order.
    X = np.zeros((npatterns, nmodels))
    for i, model_name in enumerate(order_of_models):
        X[:, i] = model_dict[model_name][col]

    month_preds = np.zeros(npatterns)
    # Chunk bounds are derived from npatterns, so the trailing partial chunk is
    # handled by the same loop (no hard-coded chunk count or separate fencepost pass).
    for start in range(0, npatterns, chunk_size):
        stop = min(start + chunk_size, npatterns)
        # 'ij,ji->i' against the transposed weights is the row-wise dot product.
        month_preds[start:stop] = np.einsum('ij,ji->i', X[start:stop, :], weight_matrix[start:stop, :].T)

    # X was filled from frames of length npatterns, so this aligns positionally
    # with the ParcelId column copied from Ridge.
    stacked_final[col] = month_preds

In [12]:
# The three 2017 columns are filled with a constant 0 rather than predicted.
# NOTE(review): presumably present only because the submission format expects
# all six month columns — confirm.
for placeholder_col in ('201710', '201711', '201712'):
    stacked_final[placeholder_col] = 0

In [13]:
# Sanity check that the stacked frame still lists the same parcel ids, in the
# same order, as the Ridge submission it was copied from; then score it.
ids_match = stacked_final['ParcelId'].unique() == Ridge['ParcelId'].unique()
assert all(ids_match)
mean_absolute_errors(stacked_final, data)

(0.06198401966890031,
 0.06125177848256764,
 0.07355242281041613,
 0.064182612581198673,
 0.06687094848229157)

In [15]:
# NOTE(review): this cell repeats the check-and-score cell directly above it.
# The recorded outputs of the two cells differ, so they were evidently run
# against different states of stacked_final; consider keeping only one.
ids_match = stacked_final['ParcelId'].unique() == Ridge['ParcelId'].unique()
assert all(ids_match)
mean_absolute_errors(stacked_final, data)

(0.06168418150557012,
 0.060981272449663664,
 0.07328333524176578,
 0.063895304944010647,
 0.06659304264059372)

In [14]:
# Write the stacked submission as a gzip-compressed CSV (relative path, so it
# lands in the current working directory), without the index column.
# '%.4g' writes floats with 4 significant digits.
stacked_final.to_csv("new_stacked.gz", index=False, float_format='%.4g', compression='gzip')

### Manual model ensembling

In [17]:
# overall_prop = 0.80

# # stacking ridge
# ridge_models = [logistic_ridge,rf_ridge,Ridge]
# ridge_weights = [0.33127665,0.36622542,0.30249792]
# ridge_win_prop = 0.18428136
# ridge_combinations = [tuple([ridge_models[i],ridge_weights[i]*ridge_win_prop*overall_prop]) for i in range(len(ridge_models))]

# # stacking enet
# enet_models = [logistic_enet,rf_enet,Enet]
# enet_weights = [0.33258377,0.36457491,0.30284132]
# enet_win_prop = 0.02802548
# enet_combinations = [tuple([enet_models[i],enet_weights[i]*enet_win_prop*overall_prop]) for i in range(len(enet_models))]

# # stacking lasso
# lasso_models = [logistic_lasso,rf_lasso,Lasso]
# lasso_weights = [0.332473,0.36443091,0.3030961]
# lasso_win_prop = 0.03631127
# lasso_combinations = [tuple([lasso_models[i],lasso_weights[i]*lasso_win_prop*overall_prop]) for i in range(len(lasso_models))]

# # stacking larm
# larm_models = [logistic_larm,rf_larm,LARM]
# larm_weights = [0.3308668,0.36512877,0.30400443]
# larm_win_prop = 0.19560233
# larm_combinations = [tuple([larm_models[i],larm_weights[i]*larm_win_prop*overall_prop]) for i in range(len(larm_models))]

# # stacking huber
# huber_models = [logistic_huber,rf_huber,Huber]
# huber_weights = [0.32970368,0.36645804,0.30383827]
# huber_win_prop = 0.37547494
# huber_combinations = [tuple([huber_models[i],huber_weights[i]*huber_win_prop*overall_prop]) for i in range(len(huber_models))]

# # stacking rfs
# rf_models = [RF,logistic_RF]
# rf_weights = [0.554838,0.445162]
# rf_win_prop = 0.18030462
# rf_combinations = [tuple([rf_models[i],rf_weights[i]*rf_win_prop*overall_prop]) for i in range(len(rf_models))]

# models = (ridge_combinations 
#           + enet_combinations 
#           + lasso_combinations 
#           + larm_combinations 
#           + huber_combinations 
#           + rf_combinations
#           + [(LME, 0.30*0.09),(logistic_LME,0.33*0.09),(rf_lme,0.37*0.09),(XGB600,0.05),(XGB3000, 0.03),(RF_2, 0.03)]
#          )


# huber_combinations = [tuple([huber_models[i],huber_weights[i]*0.35]) for i in range(len(huber_models))]
# ridge_combinations = [tuple([ridge_models[i],ridge_weights[i]*0.25]) for i in range(len(ridge_models))]
# rf_combinations = [tuple([rf_models[i],rf_weights[i]*0.25]) for i in range(len(rf_models))]

# models = (ridge_combinations + huber_combinations  + rf_combinations
#           + [(LME, 0.30*0.04),(logistic_LME,0.33*0.04),(rf_lme,0.37*0.04),(XGB600,0.05),(XGB3000, 0.03),(RF_2, 0.03)]
#          )

# models = [
#         logistic_ridge,rf_ridge,Ridge,
#         logistic_enet,rf_enet,Enet,
#         logistic_lasso,rf_lasso,Lasso,
#         logistic_larm,rf_larm,LARM,
#         logistic_huber,rf_huber,Huber,
#         RF,logistic_RF
#         ]

# weights = [ 0.0548103 ,  0.07566879,  0.05380227,  0.00758793,  0.01014677,
#         0.01029078,  0.00775408,  0.01422321,  0.01433398,  0.06360565,
#         0.08515093,  0.04684575,  0.11919136,  0.15139297,  0.10489061,
#         0.08592634,  0.09437829]

# models = ([tuple([models[i],weights[i]*0.80]) for i in range(len(models))] +
#     [(LME, 0.30*0.15),(logistic_LME,0.33*0.15),(rf_lme,0.37*0.15),
#     (XGB3000, 0.03),(RF_2, 0.02)])


# current best ensemble
# models = [(Ridge,0.08),(logistic_ridge,0.03),(rf_ridge,0.03),
#     (Enet,0.06),(logistic_enet,0.03),(rf_enet,0.03),
#     (Lasso,0.06),(logistic_lasso,0.03),(rf_lasso,0.03),
#     (Huber, 0.06),(logistic_huber,0.03),(rf_huber,0.03),
#     (LARM,0.06),(logistic_larm,0.03),(rf_larm,0.03),
#     (LME, 0.05),(logistic_LME,0.08),(rf_lme,0.03),
#     (XGB600,0.09),
#     (RF,0.10),
#     (XGB3000, 0.02),
#     (RF_2, 0.01)]

# Final manual blend: (submission frame, weight) pairs. The stacked model gets
# most of the weight; the rest is spread over the LME variants, the two XGB
# submissions and RF_2.
models = [
    (stacked_final, 0.70),
    (LME, 0.05),
    (logistic_LME, 0.08),
    (rf_lme, 0.03),
    (XGB600, 0.09),
    (XGB3000, 0.03),
    (RF_2, 0.02),
]

# Displayed as the cell output so the total weight can be checked by eye.
sum(weight for _frame, weight in models)

1.0

In [18]:
# Start the ensemble frame from Ridge's id column; predictions are attached in the next cell.
ensemble = Ridge[['ParcelId']].copy()
cols = ['201610','201611','201612','201710','201711','201712']

# Weighted sum of the six prediction columns across all (frame, weight) pairs,
# accumulated in list order.
# NOTE(review): DataFrame addition aligns on the index, not on ParcelId — this
# presumably relies on every submission sharing the same row order; confirm.
first_model, first_weight = models[0]
foo = first_model[cols] * first_weight
for model, wt in models[1:]:
    foo = foo + model[cols] * wt

In [19]:
# Attach the blended predictions (rounded to 4 decimals) to the id column.
# Only 'ParcelId' is taken from the existing `ensemble`, so this cell is
# idempotent: running it again replaces the prediction columns instead of
# appending a second copy of them (duplicate column names would break the
# column selection inside mean_absolute_errors).
ensemble = pd.concat([ensemble[['ParcelId']], foo.round(4)], axis=1)
ensemble['ParcelId'] = ensemble['ParcelId'].astype(int)
# Sanity check: ids still match the Ridge submission, in the same order.
assert all(ensemble.ParcelId.unique() == Ridge.ParcelId.unique())
mean_absolute_errors(ensemble, data)

(0.06163584488647778,
 0.06093236582694411,
 0.07323018976423241,
 0.063845867478342311,
 0.06654205594018466)

In [134]:
# current best
# NOTE(review): this cell repeats the concat-and-score cell above. Its recorded
# output has four values while mean_absolute_errors returns five, so that output
# predates the current function.
# Only 'ParcelId' is taken from the existing `ensemble`, so running this after
# the earlier cell replaces the prediction columns instead of appending a
# duplicate set (which would break the scoring call and the CSV written below).
ensemble = pd.concat([ensemble[['ParcelId']], foo.round(4)], axis=1)
ensemble['ParcelId'] = ensemble['ParcelId'].astype(int)
# Sanity check: ids still match the Ridge submission, in the same order.
assert all(ensemble.ParcelId.unique() == Ridge.ParcelId.unique())
mean_absolute_errors(ensemble, data)

(0.062031906771147276,
 0.061217579408543304,
 0.07365583668775155,
 0.06422425661437603)

In [20]:
# Write the final ensemble as a gzip-compressed CSV (relative path, so it lands
# in the current working directory), without the index column.
# '%.4g' writes floats with 4 significant digits.
ensemble.to_csv("new_ensemble.gz", index=False, float_format='%.4g', compression='gzip')