In [3]:
import pandas as pd
from pandas import DataFrame,Series
import numpy as np
import os
import datetime

#Plotting
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
%matplotlib inline

# sklearn stuff
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.metrics import confusion_matrix, mean_squared_error, precision_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder, Imputer 
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin

In [4]:
def generate_submissions(oct_model,nov_model,dec_model,name='new_submission',logy=True,chunk_size=100000):
    """
    Build and write the public-leaderboard submission file.

    The notebook-level ``properties`` frame is pushed through the notebook-level
    ``full_pipeline`` in row chunks, and each fitted model predicts its month.

    Parameters
    ----------
    oct_model, nov_model, dec_model : fitted estimators exposing ``predict``
        Models for the 201610, 201611 and 201612 columns respectively.
    name : str
        Output path without extension; the file is written to ``name + ".gz"``.
    logy : bool
        When True the model outputs are written as-is; when False ``np.log`` is
        applied to them first (presumably for models fitted on the
        exponentiated target -- confirm against the training code).
    chunk_size : int
        Number of ``properties`` rows transformed and predicted at a time.

    Returns
    -------
    DataFrame
        Columns ``ParcelId``, 201610-201612 (predictions) and 201710-201712 (zeros).

    Raises
    ------
    ValueError
        If ``chunk_size`` is not positive.
    AssertionError
        If, after dropping duplicate rows, the row count differs from ``properties``.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")

    month_models = (('201610', oct_model), ('201611', nov_model), ('201612', dec_model))
    n_rows = properties.shape[0]

    # One loop covers every row, including the final partial chunk, so the tail
    # gets the same logy handling as the rest and no row offset is hard-coded.
    chunks = []
    for start in range(0, n_rows, chunk_size):
        batch = properties.iloc[start:start + chunk_size]
        all_feats = full_pipeline.transform(batch)
        predictions = {}
        for column, model in month_models:
            predicted = model.predict(all_feats)
            predictions[column] = predicted if logy else np.log(predicted)
        chunks.append(pd.concat([batch[['parcelid']].reset_index(drop=True),
                                 DataFrame(predictions)], axis=1))

    # Concatenate once at the end instead of growing the frame inside the loop.
    if chunks:
        submission_df = pd.concat(chunks, ignore_index=True)
    else:
        submission_df = DataFrame(columns=['parcelid','201610','201611','201612'])

    # The 2017 columns are filled with zeros as placeholders.
    for column in ('201710','201711','201712'):
        submission_df[column] = 0

    submission_df = submission_df.rename(columns={'parcelid':'ParcelId'})

    # sanity check: one row per row of properties once exact duplicates are removed
    submission_df = submission_df.drop_duplicates()
    assert submission_df.shape[0] == properties.shape[0], \
        "submission row count does not match properties"

    # write to .csv (gzip-compressed, 4 significant digits)
    submission_df[['ParcelId','201610','201611','201612',
                  '201710','201711','201712']].to_csv(name + ".gz", index=False, float_format='%.4g', compression='gzip')
    return submission_df

In [5]:
def mean_absolute_errors(submission_df, comparison_df):
    """
    Score a leaderboard-style submission against known log errors.

    Parameters
    ----------
    submission_df : DataFrame
        Must contain ``ParcelId`` and the prediction columns ``201610``,
        ``201611`` and ``201612``.
    comparison_df : DataFrame
        Must contain ``parcelid``, ``logerror`` and ``month``.

    Returns
    -------
    tuple
        ``(oct_error, nov_error, dec_error, overall_mae)``: the mean absolute
        error for months 10, 11 and 12, followed by their row-count-weighted
        average. A month with no matching rows yields NaN for that month and is
        left out of the overall figure; if no month has rows, the overall
        figure is NaN as well.
    """
    # Inner join: only parcels present in both frames are scored.
    trainresults = pd.merge(submission_df[['ParcelId','201610','201611','201612']], comparison_df[['parcelid','logerror','month']],
                           left_on='ParcelId', right_on='parcelid')

    monthly_errors = []
    weighted_sum = 0.0
    n_scored = 0
    for month, column in ((10, '201610'), (11, '201611'), (12, '201612')):
        in_month = trainresults['month'] == month
        n_rows = in_month.sum()
        if n_rows == 0:
            # NaN * 0 is NaN, so an empty month must be skipped rather than
            # folded into the weighted average.
            monthly_errors.append(float('nan'))
            continue
        month_rows = trainresults[in_month]
        month_error = (month_rows[column] - month_rows['logerror']).abs().mean()
        monthly_errors.append(month_error)
        weighted_sum = weighted_sum + month_error * n_rows
        n_scored = n_scored + n_rows

    overall_mae = weighted_sum / n_scored if n_scored else float('nan')
    oct_error, nov_error, dec_error = monthly_errors
    return (oct_error, nov_error, dec_error, overall_mae)

### Reading in data 

In [6]:
# Root directory of the competition files. Set the ZILLOW_DIR environment
# variable to run on another machine; the original location is the fallback.
maindir = os.environ.get("ZILLOW_DIR", "/home/anerdi/Desktop/Zillow")

logerror = pd.read_csv(maindir + "/data/train_2016_v2.csv/train_2016_v2.csv")
# Parse every transaction date once, then derive both calendar features from it.
transaction_dates = [datetime.datetime.strptime(x,'%Y-%m-%d') for x in logerror['transactiondate']]
logerror['weeknumber'] = [d.isocalendar()[1] for d in transaction_dates]  # ISO week number
logerror['month'] = [d.month for d in transaction_dates]
properties = pd.read_csv(maindir + "/data/properties_2016.csv/properties_2016.csv")
# Only the parcel ids of the sample submission are read.
test_parcels = pd.read_csv(maindir + "/data/sample_submission.csv", usecols = ['ParcelId'])

# Use the same id column name as the other frames.
test_parcels = test_parcels.rename(columns={'ParcelId':'parcelid'})

  interactivity=interactivity, compiler=compiler, result=result)


In [7]:
# Derived features, added as new "N-" columns on the properties frame.

# Life of property (years between yearbuilt and 2018)
properties['N-life'] = 2018 - properties['yearbuilt']

# Error in calculation of the finished living area of home
properties['N-LivingAreaError'] = properties['calculatedfinishedsquarefeet']/properties['finishedsquarefeet12']

# Proportion of living area
properties['N-LivingAreaProp'] = properties['calculatedfinishedsquarefeet']/properties['lotsizesquarefeet']
properties['N-LivingAreaProp2'] = properties['finishedsquarefeet12']/properties['finishedsquarefeet15']

# Amount of extra space
properties['N-ExtraSpace'] = properties['lotsizesquarefeet'] - properties['calculatedfinishedsquarefeet'] 
properties['N-ExtraSpace-2'] = properties['finishedsquarefeet15'] - properties['finishedsquarefeet12'] 

# Total number of rooms
# NOTE(review): this is the product of bathroom and bedroom counts, not their
# sum -- confirm that is intended before relying on the column name.
properties['N-TotalRooms'] = properties['bathroomcnt']*properties['bedroomcnt']

# Average room size
properties['N-AvRoomSize'] = properties['calculatedfinishedsquarefeet']/properties['roomcnt'] 

# Number of extra rooms
properties['N-ExtraRooms'] = properties['roomcnt'] - properties['N-TotalRooms'] 

# Ratio of the built structure value to land value
properties['N-ValueProp'] = properties['structuretaxvaluedollarcnt']/properties['landtaxvaluedollarcnt']

# 1 when garagecarcnt > 0, pooltypeid10 > 0 and airconditioningtypeid != 5, else 0
properties['N-GarPoolAC'] = ((properties['garagecarcnt']>0) & (properties['pooltypeid10']>0) & (properties['airconditioningtypeid']!=5))*1 

# Location combinations; round(-4) rounds to the nearest 10,000
properties["N-location"] = properties["latitude"] + properties["longitude"]
properties["N-location-2"] = properties["latitude"]*properties["longitude"]
properties["N-location-2round"] = properties["N-location-2"].round(-4)

properties["N-latitude-round"] = properties["latitude"].round(-4)
properties["N-longitude-round"] = properties["longitude"].round(-4)

# Ratio of assessed value to tax amount
properties['N-ValueRatio'] = properties['taxvaluedollarcnt']/properties['taxamount']

# Total tax score
properties['N-TaxScore'] = properties['taxvaluedollarcnt']*properties['taxamount']

# Polynomials of tax delinquency year
properties["N-taxdelinquencyyear-2"] = properties["taxdelinquencyyear"] ** 2
properties["N-taxdelinquencyyear-3"] = properties["taxdelinquencyyear"] ** 3

# Length of time since unpaid taxes.
# Stored under its own name: writing it to 'N-life' silently replaced the
# property-age feature computed at the top of this cell.
# NOTE(review): confirm taxdelinquencyyear holds four-digit years; otherwise
# 2018 - year is not an elapsed time.
properties['N-taxdelinquency-life'] = 2018 - properties['taxdelinquencyyear']

# Number of properties in the zip
zip_count = properties['regionidzip'].value_counts().to_dict()
properties['N-zip_count'] = properties['regionidzip'].map(zip_count)

# Number of properties in the city
city_count = properties['regionidcity'].value_counts().to_dict()
properties['N-city_count'] = properties['regionidcity'].map(city_count)

# Number of properties in the county
region_count = properties['regionidcounty'].value_counts().to_dict()
properties['N-county_count'] = properties['regionidcounty'].map(region_count)

# Average structuretaxvaluedollarcnt by city
group = properties.groupby('regionidcity')['structuretaxvaluedollarcnt'].aggregate('mean').to_dict()
properties['N-Avg-structuretaxvaluedollarcnt'] = properties['regionidcity'].map(group)

# Absolute deviation from the city average, relative to that average
properties['N-Dev-structuretaxvaluedollarcnt'] = (abs((properties['structuretaxvaluedollarcnt'] 
                                                       - properties['N-Avg-structuretaxvaluedollarcnt']))
                                                  /properties['N-Avg-structuretaxvaluedollarcnt'])

In [8]:
# Inner join on parcelid: keep the property rows that have a recorded
# transaction, and attach that transaction's log error and month.
transaction_columns = ['parcelid', 'logerror', 'month']
data = properties.merge(logerror[transaction_columns], on='parcelid')

### Loading in predictions from the models 

In [114]:
# Load the stored predictions and score them against the known log errors.
LME = pd.read_csv(filepath_or_buffer="test_predictions.csv.gz", compression="gzip")
mean_absolute_errors(submission_df=LME, comparison_df=data)

(0.06335799788964462,
 0.06250493564507772,
 0.07474658919966297,
 0.065494156708368903)

In [20]:
# Load the stored predictions and score them against the known log errors.
RF = pd.read_csv(filepath_or_buffer="RF_n100_maxfeat5_maxdepth8.gz", compression="gzip")
mean_absolute_errors(submission_df=RF, comparison_df=data)

(0.0627264215390796,
 0.06182163198247547,
 0.07473668775158138,
 0.064978084757667934)

In [21]:
# Load the stored predictions and score them against the known log errors.
RF_2 = pd.read_csv(filepath_or_buffer="RF_n100_maxfeat10_maxdepth20_extreme.gz", compression="gzip")
mean_absolute_errors(submission_df=RF_2, comparison_df=data)

(0.06195625879043588,
 0.06436095290251921,
 0.07321730879815982,
 0.064762854132521586)

In [22]:
# Load the stored predictions and score them against the known log errors.
Ridge = pd.read_csv(filepath_or_buffer="Ridge.gz", compression="gzip")
mean_absolute_errors(submission_df=Ridge, comparison_df=data)

(0.06360390734619255,
 0.06268171032858727,
 0.07535184221736627,
 0.065798443401779524)

In [23]:
# Load the stored predictions and score them against the known log errors.
Enet = pd.read_csv(filepath_or_buffer="ElasticNet.gz", compression="gzip")
mean_absolute_errors(submission_df=Enet, comparison_df=data)

(0.06356341482419092,
 0.06265443740963851,
 0.07534662269695212,
 0.065767957756965326)

In [24]:
# Load the stored predictions and score them against the known log errors.
Lasso = pd.read_csv(filepath_or_buffer="Lasso.gz", compression="gzip")
mean_absolute_errors(submission_df=Lasso, comparison_df=data)

(0.06354550414788025,
 0.06262862377984654,
 0.07529440752731466,
 0.065741373900257555)

In [25]:
# Load the stored predictions and score them against the known log errors.
XGB = pd.read_csv(filepath_or_buffer="XGB_600.gz", compression="gzip")
mean_absolute_errors(submission_df=XGB, comparison_df=data)

(0.06211727600763514,
 0.06136520946002188,
 0.07414289916043706,
 0.064404712807773379)

In [26]:
# Load the stored predictions (plain CSV, unlike the gzip-compressed files)
# and score them against the known log errors.
XGB3000 = pd.read_csv(filepath_or_buffer="XGB_3000.csv")
mean_absolute_errors(submission_df=XGB3000, comparison_df=data)

(0.05998706659714675,
 0.05918412810186208,
 0.07170160296434737,
 0.06220029687696086)

In [27]:
# Load the stored predictions and score them against the known log errors.
Huber = pd.read_csv(filepath_or_buffer="HuberRegressor.gz", compression="gzip")
mean_absolute_errors(submission_df=Huber, comparison_df=data)

(0.06361133815551537,
 0.06264460021905807,
 0.0749035882691202,
 0.065703583469913368)

In [28]:
# Load the stored predictions and score them against the known log errors.
LARM_LASSO = pd.read_csv(filepath_or_buffer="LARM_LASSO.gz", compression="gzip")
mean_absolute_errors(submission_df=LARM_LASSO, comparison_df=data)

(0.06361498894916652,
 0.06264090909090905,
 0.07490658999424941,
 0.065705532662140173)

In [125]:
# (predictions frame, blend weight) pairs used to build the ensemble.
# The first entry seeds the running weighted sum; the rest are added in order.
models = [
    (Ridge, 0.13),
    (Enet, 0.13),
    (Lasso, 0.13),
    (LARM_LASSO, 0.14),
    (LME, 0.14),
    (XGB, 0.13),
    (RF, 0.13),
    (XGB3000, 0.035),
    (RF_2, 0.035),
]

In [126]:
# Total blend weight across all models.
sum(weight for _frame, weight in models)

1.0

In [127]:
# Prediction columns to blend, and the id column the ensemble is built around.
cols = ['201610','201611','201612','201710','201711','201712']
ensemble = Ridge[['ParcelId']].copy()

# Weighted sum of the prediction columns: start from the first model's
# contribution and add the others in list order. The frames are combined by
# index, so rows are matched by position label rather than by ParcelId.
foo = sum((model[cols]*wt for model, wt in models[1:]),
          models[0][0][cols]*models[0][1])

In [128]:
# Build from Ridge's id column rather than from the previous value of
# `ensemble`, so re-running this cell does not append the prediction columns
# a second time.
ensemble = pd.concat([Ridge[['ParcelId']], foo], axis=1)
ensemble['ParcelId'] = ensemble['ParcelId'].astype(int)

# The blend in `foo` combined the model frames row by row, so every input must
# list the parcels in the same order as Ridge. Comparing `ensemble` with Ridge
# could never fail, because the ensemble ids were copied from Ridge.
assert all(np.array_equal(model['ParcelId'].values, Ridge['ParcelId'].values)
           for model, _weight in models), "model files do not share the same ParcelId order"

mean_absolute_errors(ensemble, data)

(0.06253640058108395,
 0.06168207707014895,
 0.074036725003617,
 0.06469503666628855)

In [124]:
# Persist the blended predictions: gzip-compressed CSV, no index column,
# floats written with 4 significant digits.
ensemble.to_csv(
    "new_ensemble.gz",
    compression='gzip',
    float_format='%.4g',
    index=False,
)