In [1]:
import pandas as pd
from pandas import DataFrame,Series
import numpy as np
import os
import datetime

#Plotting
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
%matplotlib inline

# sklearn stuff
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.metrics import confusion_matrix, mean_squared_error, precision_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder, Imputer 
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin

In [2]:
def generate_submissions(oct_model,nov_model,dec_model,name='new_submission',logy=True,chunk_size=100000):
    """
    Create the submission file for the public leaderboard predictions.

    Three already fitted models, one for each of the predicting time points, are required.
    The rows of the notebook-level `properties` frame are run through the notebook-level
    `full_pipeline` in batches of `chunk_size` rows, so the full feature matrix never has
    to be materialised at once. Every batch, including the last partial one, goes through
    the same code path, so `logy` is honoured for all rows.

    Parameters
    ----------
    oct_model, nov_model, dec_model : fitted estimators exposing ``predict``
        Supply the '201610', '201611' and '201612' columns respectively.
    name : str
        Output path without extension; the file is written to ``name + ".gz"``.
    logy : bool
        If True the model outputs are written unchanged; if False ``np.log`` is
        applied to them first.
    chunk_size : int
        Number of `properties` rows transformed and predicted per batch.

    Returns
    -------
    DataFrame
        Columns ParcelId, 201610, 201611, 201612, 201710, 201711, 201712
        (the three 2017 columns are filled with 0).

    Raises
    ------
    ValueError
        If `chunk_size` is smaller than 1.
    AssertionError
        If, after dropping duplicate rows, the row count differs from `properties`.
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be a positive integer")

    month_models = [('201610', oct_model), ('201611', nov_model), ('201612', dec_model)]

    # Collect the per-batch frames and concatenate once at the end; growing a
    # DataFrame with concat inside the loop copies all previous rows every time.
    chunks = []
    for start in range(0, properties.shape[0], chunk_size):
        batch = properties.iloc[start:start + chunk_size]
        all_feats = full_pipeline.transform(batch)
        preds = {}
        for column, model in month_models:
            values = model.predict(all_feats)
            preds[column] = values if logy else np.log(values)
        ids = batch[['parcelid']].reset_index(drop=True)
        chunks.append(pd.concat([ids, DataFrame(preds)], axis=1))

    if chunks:
        submission_df = pd.concat(chunks, ignore_index=True)
    else:
        # No rows in `properties`: still return a frame with the expected columns.
        submission_df = DataFrame(columns=['parcelid', '201610', '201611', '201612'])

    submission_df['201710'] = 0
    submission_df['201711'] = 0
    submission_df['201712'] = 0

    submission_df = submission_df.rename(columns={'parcelid':'ParcelId'})

    # unit test: every row of `properties` must appear exactly once
    submission_df = submission_df.drop_duplicates()
    assert submission_df.shape[0] == properties.shape[0]

    # write to .csv
    submission_df[['ParcelId','201610','201611','201612',
                  '201710','201711','201712']].to_csv(name + ".gz", index=False, float_format='%.4g', compression='gzip')
    return submission_df

In [3]:
def mean_absolute_errors(submission_df, comparison_df):
    """
    Score a public-leaderboard submission against known log errors.

    The submission is inner-joined to `comparison_df` on parcel id. For each of
    the months 10, 11 and 12 the matching prediction column ('201610', '201611',
    '201612') is compared with `logerror`.

    Parameters
    ----------
    submission_df : DataFrame
        Must contain 'ParcelId', '201610', '201611' and '201612'.
    comparison_df : DataFrame
        Must contain 'parcelid', 'logerror' and 'month'.

    Returns
    -------
    tuple
        (oct_error, nov_error, dec_error, overall_mae). A month without any
        matched rows yields NaN for that month but no longer poisons
        `overall_mae`, which is the mean absolute error over all matched
        rows of months 10-12 (NaN only if there are none).
    """
    # training error
    trainresults = pd.merge(submission_df[['ParcelId','201610','201611','201612']],
                            comparison_df[['parcelid','logerror','month']],
                            left_on='ParcelId', right_on='parcelid')

    month_errors = []
    total_abs_error = 0.0
    total_rows = 0
    for month, column in [(10, '201610'), (11, '201611'), (12, '201612')]:
        rows = trainresults[trainresults['month'] == month]
        abs_error = (rows[column] - rows['logerror']).abs()
        month_errors.append(abs_error.mean())  # NaN when the month has no rows
        # sum()/count() both skip NaN, so the weights match the values averaged.
        total_abs_error += abs_error.sum()
        total_rows += abs_error.count()

    overall_mae = total_abs_error / total_rows if total_rows else float('nan')
    oct_error, nov_error, dec_error = month_errors
    return (oct_error, nov_error, dec_error, overall_mae)

### Reading in data 

In [4]:
# Root folder holding the competition files.
maindir = "/home/anerdi/Desktop/Zillow"

# Training targets; `transactiondate` is parsed once and reused for both derived columns.
logerror = pd.read_csv("{}/data/train_2016_v2.csv/train_2016_v2.csv".format(maindir))
transaction_dates = [datetime.datetime.strptime(stamp, '%Y-%m-%d')
                     for stamp in logerror['transactiondate']]
logerror['weeknumber'] = [day.isocalendar()[1] for day in transaction_dates]  # ISO week number
logerror['month'] = [day.month for day in transaction_dates]

# Property features for every parcel.
properties = pd.read_csv("{}/data/properties_2016.csv/properties_2016.csv".format(maindir))

# Only the id column of the sample submission is needed; rename it to match the other frames.
test_parcels = (
    pd.read_csv("{}/data/sample_submission.csv".format(maindir), usecols=['ParcelId'])
    .rename(columns={'ParcelId': 'parcelid'})
)

  interactivity=interactivity, compiler=compiler, result=result)


In [5]:
# Attach each training row's log error and month to its property features
# (inner join on parcel id, so only parcels present in both frames are kept).
target_columns = logerror[['parcelid', 'logerror', 'month']]
data = properties.merge(target_columns, on='parcelid')

### Loading in predictions from the models 

In [7]:
# Load the BART submission and score it: (Oct, Nov, Dec, overall) mean absolute error.
BART = pd.read_csv(filepath_or_buffer="BART_submission.csv.gz", compression="gzip")
mean_absolute_errors(submission_df=BART, comparison_df=data)

(0.06315269012412131,
 0.06226942536365466,
 0.0750168754592673,
 0.065379215158680754)

In [8]:
# Load the LME submission and score it: (Oct, Nov, Dec, overall) mean absolute error.
LME = pd.read_csv(filepath_or_buffer="LME.csv.gz", compression="gzip")
mean_absolute_errors(submission_df=LME, comparison_df=data)

(0.0633024491005806,
 0.06252642013941097,
 0.07472870881317234,
 0.065462743733816525)

In [9]:
# Load the first random-forest submission and score it: (Oct, Nov, Dec, overall) mean absolute error.
RF = pd.read_csv(filepath_or_buffer="RF_n100_maxfeat5_maxdepth8.gz", compression="gzip")
mean_absolute_errors(submission_df=RF, comparison_df=data)

(0.0627264215390796,
 0.06182163198247547,
 0.07473668775158138,
 0.064978084757667934)

In [10]:
# Load the second random-forest submission and score it: (Oct, Nov, Dec, overall) mean absolute error.
RF_2 = pd.read_csv(filepath_or_buffer="RF_n100_maxfeat10_maxdepth20_extreme.gz", compression="gzip")
mean_absolute_errors(submission_df=RF_2, comparison_df=data)

(0.06195625879043588,
 0.06436095290251921,
 0.07321730879815982,
 0.064762854132521586)

In [50]:
# Load the Ridge submission and score it: (Oct, Nov, Dec, overall) mean absolute error.
Ridge = pd.read_csv(filepath_or_buffer="Ridge.gz", compression="gzip")
mean_absolute_errors(submission_df=Ridge, comparison_df=data)

(0.06329828003054072,
 0.06249673358159914,
 0.07483452948821169,
 0.065475511825333801)

In [58]:
# Load the ElasticNet submission and score it: (Oct, Nov, Dec, overall) mean absolute error.
Enet = pd.read_csv(filepath_or_buffer="ElasticNet.gz", compression="gzip")
mean_absolute_errors(submission_df=Enet, comparison_df=data)

(0.06332393369499698,
 0.06246643373493972,
 0.07474747383553765,
 0.065466258838679459)

In [59]:
# Load the Lasso submission and score it: (Oct, Nov, Dec, overall) mean absolute error.
Lasso = pd.read_csv(filepath_or_buffer="Lasso.gz", compression="gzip")
mean_absolute_errors(submission_df=Lasso, comparison_df=data)

(0.0633293628691983,
 0.062449004381161084,
 0.07475657504312816,
 0.065467549168812908)

In [14]:
# Load the XGB_600 submission and score it: (Oct, Nov, Dec, overall) mean absolute error.
XGB = pd.read_csv(filepath_or_buffer="XGB_600.gz", compression="gzip")
mean_absolute_errors(submission_df=XGB, comparison_df=data)

(0.06211727600763514,
 0.06136520946002188,
 0.07414289916043706,
 0.064404712807773379)

In [15]:
# Load the XGB_3000 submission (a plain, uncompressed CSV) and score it:
# (Oct, Nov, Dec, overall) mean absolute error.
XGB3000 = pd.read_csv(filepath_or_buffer="XGB_3000.csv")
mean_absolute_errors(submission_df=XGB3000, comparison_df=data)

(0.05998706659714675,
 0.05918412810186208,
 0.07170160296434737,
 0.06220029687696086)

In [16]:
# Load the HuberRegressor submission and score it: (Oct, Nov, Dec, overall) mean absolute error.
Huber = pd.read_csv(filepath_or_buffer="HuberRegressor.gz", compression="gzip")
mean_absolute_errors(submission_df=Huber, comparison_df=data)

(0.06361133815551537,
 0.06264460021905807,
 0.0749035882691202,
 0.065703583469913368)

In [35]:
# Load the LARM submission and score it: (Oct, Nov, Dec, overall) mean absolute error.
LARM = pd.read_csv(filepath_or_buffer="LARM.gz", compression="gzip")
mean_absolute_errors(submission_df=LARM, comparison_df=data)

(0.06345773598553339,
 0.06251769879518071,
 0.07483130304772855,
 0.065572243736829725)

In [18]:
# Load the adaptive-lasso submission and score it: (Oct, Nov, Dec, overall) mean absolute error.
Adaptive_LASSO = pd.read_csv(filepath_or_buffer="Adp-lasso-af.gz", compression="gzip")
mean_absolute_errors(submission_df=Adaptive_LASSO, comparison_df=data)

(0.06346130419931673,
 0.062492389375684604,
 0.07486073950546275,
 0.06557490517443211)

In [60]:
# (submission frame, blend weight) pairs used for the ensemble; the weights add up to 1.
# BART and Huber are loaded in this notebook but are not part of the blend.
models = [
    # weight 0.13 each
    (Ridge, 0.13),
    (Enet, 0.13),
    (Lasso, 0.13),
    (LME, 0.13),
    # weight 0.10 each
    (LARM, 0.10),
    (Adaptive_LASSO, 0.10),
    (XGB, 0.10),
    (RF, 0.10),
    # weight 0.04 each
    (XGB3000, 0.04),
    (RF_2, 0.04),
]

In [61]:
# Total blend weight across all (frame, weight) pairs.
sum(weight for _frame, weight in models)

1.0

In [62]:
# Weighted blend of the model predictions, keyed on Ridge's parcel ids.
# Each model frame is aligned to that id order before weighting; blending purely
# by row position would silently mix different parcels if any file were ordered
# differently.
ensemble = Ridge[['ParcelId']].copy()
cols = ['201610','201611','201612','201710','201711','201712']
parcel_ids = ensemble['ParcelId'].values

foo = None
for model, wt in models:
    if np.array_equal(model['ParcelId'].values, parcel_ids):
        # Same parcels in the same order: use the rows as they are.
        preds = model[cols].values
    else:
        # Different row order: look every prediction up by its ParcelId.
        aligned = model.set_index('ParcelId')[cols].reindex(parcel_ids)
        assert not aligned.isnull().values.any(), \
            "a model has missing (or NaN) predictions for some ParcelIds of the reference frame"
        preds = aligned.values
    weighted = preds * wt
    foo = weighted if foo is None else foo + weighted

if foo is None:
    raise ValueError("`models` is empty; nothing to blend")

# Same row index as `ensemble`, so the two can be concatenated column-wise.
foo = DataFrame(foo, columns=cols, index=ensemble.index)

In [63]:
# Rebuild from the id column only, so re-running this cell cannot append a second
# copy of the prediction columns to `ensemble`.
ensemble = pd.concat([ensemble[['ParcelId']], foo], axis=1)
# Misaligned indexes (extra rows) or repeated columns would change the shape.
assert ensemble.shape == (Ridge.shape[0], 1 + foo.shape[1])
ensemble['ParcelId'] = ensemble['ParcelId'].astype(int)
assert all(ensemble.ParcelId.unique() == Ridge.ParcelId.unique())
# (Oct, Nov, Dec, overall) mean absolute error of the blend.
mean_absolute_errors(ensemble, data)

(0.06245490627730066,
 0.06164479633280732,
 0.07392136658756347,
 0.0646160996419579)

In [32]:
# NOTE(review): this cell repeats the previous one (its execution count is older).
# It is written to be idempotent: `ensemble` is rebuilt from the id column only,
# so running it after the cell above does not duplicate the prediction columns.
ensemble = pd.concat([ensemble[['ParcelId']], foo], axis=1)
# Misaligned indexes (extra rows) or repeated columns would change the shape.
assert ensemble.shape == (Ridge.shape[0], 1 + foo.shape[1])
ensemble['ParcelId'] = ensemble['ParcelId'].astype(int)
assert all(ensemble.ParcelId.unique() == Ridge.ParcelId.unique())
# (Oct, Nov, Dec, overall) mean absolute error of the blend.
mean_absolute_errors(ensemble, data)

(0.06259217287302513,
 0.0617576086180706,
 0.0740689090448652,
 0.064750230690080013)

In [64]:
# Write the blended submission as a gzip-compressed CSV (no index column,
# floats formatted to 4 significant digits).
ensemble.to_csv(
    path_or_buf="new_ensemble.gz",
    compression='gzip',
    float_format='%.4g',
    index=False,
)