In [1]:
import pandas as pd
from pandas import DataFrame,Series
import numpy as np
import os
import datetime

#Plotting
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
%matplotlib inline

# sklearn stuff
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.metrics import confusion_matrix, mean_squared_error, precision_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder, Imputer 
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin

### Submission Functions 

In [2]:
def generate_submissions(oct_model,nov_model,dec_model,name='new_submission',logy=True):
    """
    This function creates the submission file for the public leaderboard predictions.
    Three already fitted models, one for each of the predicting time points, is required.
    """
    submission_df = DataFrame()
    for i in range(int(properties.shape[0] / 100000)):
        all_feats = full_pipeline.transform(properties.iloc[i*100000:(i+1)*100000])
        foo = properties.iloc[i*100000:(i+1)*100000][['parcelid']].reset_index(drop=True)
        if logy:
            foo = pd.concat([foo, DataFrame({'201610': oct_model.predict(all_feats),
                                                            '201611': nov_model.predict(all_feats),
                                                            '201612': dec_model.predict(all_feats)})], axis=1)
        else:
            foo = pd.concat([foo, DataFrame({'201610': np.log(oct_model.predict(all_feats)),
                                                            '201611': np.log(nov_model.predict(all_feats)),
                                                            '201612': np.log(dec_model.predict(all_feats))})], axis=1)
        submission_df = pd.concat([submission_df, foo], ignore_index=True)

    #  fencepost problem
    all_feats = full_pipeline.transform(properties.iloc[2900000:])
    foo = properties.iloc[2900000:][['parcelid']].reset_index(drop=True)
    foo = pd.concat([foo, DataFrame({'201610': oct_model.predict(all_feats),
                                                    '201611': nov_model.predict(all_feats),
                                                    '201612': dec_model.predict(all_feats)})], axis=1)
    submission_df = pd.concat([submission_df, foo], ignore_index=True)
    
    submission_df['201710'] = 0
    submission_df['201711'] = 0
    submission_df['201712'] = 0
    
    submission_df.rename(columns={'parcelid':'ParcelId'}, inplace=True)    
#     submission_df[['201610','201611','201612','201710','201711','201712']]= submission_df[['201610','201611','201612',
#                                                                                            '201710','201711','201712']].round(4)
    # unit test
    submission_df.drop_duplicates(inplace=True)
    assert submission_df.shape[0] == properties.shape[0]
    # write to .csv
    submission_df[['ParcelId','201610','201611','201612',
                  '201710','201711','201712']].to_csv(name + ".gz", index=False, float_format='%.4g', compression='gzip')
    return submission_df

In [3]:
def mean_absolute_errors(submission_df, comparison_df):
    """
    This function takes a submission entry for public leaderboard, and returns
    the training error for each month.
    """
    # training error
    trainresults = pd.merge(submission_df[['ParcelId','201610','201611','201612']], comparison_df[['parcelid','logerror','month']],
                           left_on='ParcelId', right_on='parcelid')
    oct_error = abs(trainresults[trainresults['month'] == 10]['201610'] 
                    - trainresults[trainresults['month'] == 10]['logerror']).mean()
    nov_error = abs(trainresults[trainresults['month'] == 11]['201611'] 
                    - trainresults[trainresults['month'] == 11]['logerror']).mean()
    dec_error = abs(trainresults[trainresults['month'] == 12]['201612'] 
                    - trainresults[trainresults['month'] == 12]['logerror']).mean()
    overall_mae = (oct_error*(trainresults['month'] == 10).sum() + nov_error*(trainresults['month'] == 11).sum() 
                        + dec_error*(trainresults['month'] == 12).sum()) / (trainresults['month'].isin([10,11,12])).sum()
    return (oct_error, nov_error, dec_error, overall_mae)

### Reading in data 

In [4]:
#maindir = "/home/anerdi/Desktop/Zillow"
maindir = "/users/ccheung/Desktop/Zillow"

logerror = pd.read_csv(maindir + "/data/train_2016_v2.csv/train_2016_v2.csv")
logerror['weeknumber'] = logerror['transactiondate'].apply(lambda x: datetime.datetime.strptime(x,'%Y-%m-%d').isocalendar()[1])
logerror['month'] = logerror['transactiondate'].apply(lambda x: datetime.datetime.strptime(x,'%Y-%m-%d').month)
properties = pd.read_csv(maindir + "/data/properties_2016.csv/properties_2016.csv")
test_parcels = pd.read_csv(maindir + "/data/sample_submission.csv", usecols = ['ParcelId'])

test_parcels.rename(columns={'ParcelId':'parcelid'}, inplace=True)

  interactivity=interactivity, compiler=compiler, result=result)


### Additional Features

In [5]:
#life of property
properties['N-life'] = 2018 - properties['yearbuilt']

#error in calculation of the finished living area of home
properties['N-LivingAreaError'] = properties['calculatedfinishedsquarefeet']/properties['finishedsquarefeet12']

#proportion of living area
properties['N-LivingAreaProp'] = properties['calculatedfinishedsquarefeet']/properties['lotsizesquarefeet']
properties['N-LivingAreaProp2'] = properties['finishedsquarefeet12']/properties['finishedsquarefeet15']

#Amout of extra space
properties['N-ExtraSpace'] = properties['lotsizesquarefeet'] - properties['calculatedfinishedsquarefeet'] 
properties['N-ExtraSpace-2'] = properties['finishedsquarefeet15'] - properties['finishedsquarefeet12'] 

#Total number of rooms
properties['N-TotalRooms'] = properties['bathroomcnt']*properties['bedroomcnt']

#Average room size
properties['N-AvRoomSize'] = properties['calculatedfinishedsquarefeet']/properties['roomcnt'] 

# Number of Extra rooms
properties['N-ExtraRooms'] = properties['roomcnt'] - properties['N-TotalRooms'] 

#Ratio of the built structure value to land area
properties['N-ValueProp'] = properties['structuretaxvaluedollarcnt']/properties['landtaxvaluedollarcnt']

#Does property have a garage, pool or hot tub and AC?
properties['N-GarPoolAC'] = ((properties['garagecarcnt']>0) & (properties['pooltypeid10']>0) & (properties['airconditioningtypeid']!=5))*1 

properties["N-location"] = properties["latitude"] + properties["longitude"]
properties["N-location-2"] = properties["latitude"]*properties["longitude"]
properties["N-location-2round"] = properties["N-location-2"].round(-4)

properties["N-latitude-round"] = properties["latitude"].round(-4)
properties["N-longitude-round"] = properties["longitude"].round(-4)

#Ratio of tax of property over parcel
properties['N-ValueRatio'] = properties['taxvaluedollarcnt']/properties['taxamount']

#TotalTaxScore
properties['N-TaxScore'] = properties['taxvaluedollarcnt']*properties['taxamount']

#polnomials of tax delinquency year
properties["N-taxdelinquencyyear-2"] = properties["taxdelinquencyyear"] ** 2
properties["N-taxdelinquencyyear-3"] = properties["taxdelinquencyyear"] ** 3

#Length of time since unpaid taxes
properties['N-life'] = 2018 - properties['taxdelinquencyyear']

#Number of properties in the zip
zip_count = properties['regionidzip'].value_counts().to_dict()
properties['N-zip_count'] = properties['regionidzip'].map(zip_count)

#Number of properties in the city
city_count = properties['regionidcity'].value_counts().to_dict()
properties['N-city_count'] = properties['regionidcity'].map(city_count)

#Number of properties in the city
region_count = properties['regionidcounty'].value_counts().to_dict()
properties['N-county_count'] = properties['regionidcounty'].map(region_count)

#Average structuretaxvaluedollarcnt by city
group = properties.groupby('regionidcity')['structuretaxvaluedollarcnt'].aggregate('mean').to_dict()
properties['N-Avg-structuretaxvaluedollarcnt'] = properties['regionidcity'].map(group)

#Deviation away from average
properties['N-Dev-structuretaxvaluedollarcnt'] = (abs((properties['structuretaxvaluedollarcnt'] 
                                                       - properties['N-Avg-structuretaxvaluedollarcnt']))
                                                  /properties['N-Avg-structuretaxvaluedollarcnt'])

In [6]:
# join on parcel id
data = pd.merge(properties,logerror[['parcelid','logerror','month']], on='parcelid')

### Data  Preprocessing Pipeline

In [7]:
# Setup variables considered in the model

# numerical variables
num_atts = ['bathroomcnt','bedroomcnt','buildingqualitytypeid','calculatedbathnbr','finishedfloor1squarefeet',
           'calculatedfinishedsquarefeet','finishedsquarefeet12','finishedsquarefeet13',
           'finishedsquarefeet15','finishedsquarefeet50','finishedsquarefeet6','fireplacecnt',
           'fullbathcnt','garagecarcnt','garagetotalsqft','latitude','longitude','lotsizesquarefeet',
           'poolcnt','poolsizesum','censustractandblock','roomcnt','threequarterbathnbr','unitcnt',
           'yardbuildingsqft17','yardbuildingsqft26','numberofstories',
            'structuretaxvaluedollarcnt','taxvaluedollarcnt','landtaxvaluedollarcnt','taxamount',
           'N-ValueRatio', 'N-LivingAreaProp', 'N-ValueProp', 'N-Dev-structuretaxvaluedollarcnt', 
            'N-TaxScore', 'N-zip_count', 'N-Avg-structuretaxvaluedollarcnt', 'N-city_count',
           'N-LivingAreaProp2', 'N-location-2round', 'N-TotalRooms','N-AvRoomSize']

# categorical varaibles
cat_atts = ['airconditioningtypeid','architecturalstyletypeid',
           'buildingclasstypeid','heatingorsystemtypeid','pooltypeid10','pooltypeid2',
            'pooltypeid7','propertylandusetypeid', 'regionidcounty',
           'storytypeid','typeconstructiontypeid','yearbuilt','fireplaceflag',
           'taxdelinquencyflag']

# Dictionary of categorical variables and their default levels
cat_dict = {'airconditioningtypeid':[-1] + list(range(1,14)),
           'architecturalstyletypeid':[-1] + list(range(1,28)),
           'buildingclasstypeid':[-1] + list(range(1,6)),
            'heatingorsystemtypeid':[-1] + list(range(1,26)),
            'pooltypeid10': list(range(-1,2)),
            'pooltypeid2': list(range(-1,2)),
            'pooltypeid7': list(range(-1,2)),
            'propertylandusetypeid': [-1, 31,46,47,246,247,248,260,261,262,263,264,265,266,267,268,269,270,271,
                                     273,274,275,276,279,290,291],
            'regionidcounty': [2061,3101,1286],
            'storytypeid':[-1] + list(range(1,36)),
            'typeconstructiontypeid':[-1] + list(range(1,19)),
            'yearbuilt': [-1] + list(range(1885,2018)),
            'fireplaceflag': [-1] + ['True','False'],
            'taxdelinquencyflag': [-1] + ['Y','N']
           }

In [8]:
# A custom transformer, which selects certain variables
class DataFrameSelector(BaseEstimator, TransformerMixin):
    def __init__(self, desired_cols):
        self.desired_cols = desired_cols
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return X[self.desired_cols].values

# A custom transformer, which first selects the categorical variables
# from the DataFrame and then performs the dummification
class DF_Selector_GetDummies(BaseEstimator, TransformerMixin):
    def __init__(self, cat_dict):
        self.cat_dict = cat_dict
        self.ndummies = sum(len(c) - 1  for c in cat_dict.values()) 
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        X = X.fillna(-1) # missing values are given -1 missing label
        foo = np.zeros((X.shape[0],self.ndummies))
        start = 0
        end = 0
        for c in sorted(self.cat_dict.keys()):
            end += len(self.cat_dict[c]) - 1
            foo[:, start:end] = pd.get_dummies(X[c].astype('category', categories=self.cat_dict[c]))[self.cat_dict[c][1:]]
            start += len(self.cat_dict[c]) - 1
        return foo

In [9]:
# Categorical pipeline
cat_pipeline = Pipeline([
        ('select_and_dummify', DF_Selector_GetDummies(cat_dict)),
    ])

# Numerical pipeline
num_pipeline = Pipeline([
        ('selector', DataFrameSelector(num_atts)),
        ('imputer', Imputer()),
    ])

# Full pipeline
full_pipeline = FeatureUnion(transformer_list=[
        ("num_pipeline", num_pipeline),
        ("cat_pipeline", cat_pipeline)
    ])

In [10]:
import gc

In [11]:
gc.collect()

7

## Training KNeighborsRegressor

In [12]:
from sklearn.neighbors import KNeighborsRegressor

In [13]:
data['wts_oct'] = np.where(data['month'] == 10, 1.5, 1)
data['wts_nov'] = np.where(data['month'] == 11, 1.5, 1)
data['wts_dec'] = np.where(data['month'] == 12, 1.5, 1)

## Creating a train/test set

In [14]:
X_train = full_pipeline.fit_transform(data)

In [15]:
Y_train = data['logerror'].values

# Neighbor Regression

In [22]:
neigh = KNeighborsRegressor(n_neighbors = 10)

print("Training")
print("training oct model...")
neigh_o = neigh.fit(X_train, Y_train) 
print("training nov model...")
neigh_n = neigh.fit(X_train, Y_train) 
print("training dec model...")
neigh_d = neigh.fit(X_train, Y_train) 

print("Writing to excel...")
submission_df = generate_submissions(neigh_o, neigh_n, neigh_d, name = "KNeighborsRegressor_10")
mean_absolute_errors(submission_df, data)

Training
training oct model...
training nov model...
training dec model...
Writing to excel...


(0.06899624472573807,
 0.06845635268346104,
 0.07824080506037953,
 0.070762862327323608)

# Radius Regression

In [17]:
from sklearn.neighbors import RadiusNeighborsRegressor
radius = RadiusNeighborsRegressor(radius = 0.5)

print("Training")
print("training oct model...")
radius_o = radius.fit(X_train, Y_train) 
print("training nov model...")
radius_n = radius.fit(X_train, Y_train) 
print("training dec model...")
radius_d = radius.fit(X_train, Y_train) 

print("Writing to excel...")
submission_df = generate_submissions(radius_o, radius_n, radius_d, name = "Radius_5")
mean_absolute_errors(submission_df, data)

Training
training oct model...
training nov model...
training dec model...
Writing to excel...


  out=out, **kwargs)
  ret, rcount, out=ret, casting='unsafe', subok=False)


(0.00015505575647980707,
 3.231106243154435e-05,
 0.00020411155836687754,
 0.00013880385155701238)

# Least Angle Regression model

In [18]:
from sklearn import linear_model
reg = linear_model.Lars(n_nonzero_coefs = 1)

print("Training")
print("training oct model...")
reg_o = reg.fit(X_train, Y_train) 
print("training nov model...")
reg_n = reg.fit(X_train, Y_train) 
print("training dec model...")
reg_d = reg.fit(X_train, Y_train) 

print("Writing to excel...")
submission_df = generate_submissions(reg_o, reg_n, reg_d, name = "LARM")
mean_absolute_errors(submission_df, data)

Training
training oct model...
training nov model...
training dec model...
Writing to excel...


(0.0634578084219202,
 0.06251777597632119,
 0.07483130487282763,
 0.065572302812281269)

# Least Angle Regression LASSO model

In [19]:
from sklearn import linear_model
reglasso = linear_model.LassoLars(alpha = 0.03)

print("Training")
print("training oct model...")
reglasso_o = reglasso.fit(X_train, Y_train) 
print("training nov model...")
reglasso_n = reglasso.fit(X_train, Y_train) 
print("training dec model...")
reglasso_d = reglasso.fit(X_train, Y_train) 

print("Writing to excel...")
submission_df = generate_submissions(reglasso_o, reglasso_n, reglasso_d, name = "LARM_LASSO")
mean_absolute_errors(submission_df, data)

Training
training oct model...
training nov model...
training dec model...
Writing to excel...


(0.06361494146404957,
 0.06264093954428532,
 0.07490654043002301,
 0.065705501414452097)

# Mixed effect linear regression model

In [191]:
from statsmodels.genmod.generalized_estimating_equations import GEE
from statsmodels.genmod.dependence_structures import Exchangeable, Independence
from statsmodels.regression.mixed_linear_model import MixedLM

ind = Independence()
mod_gee = GEE.from_formula(Y_train, X_train[:,1:], groups = X_train[:,0], cov_struct = ind)
mod_gee = mod_gee.fit()


ModuleNotFoundError: No module named 'statsmodels.genmod.dependence_structures'

In [41]:
Ridge.head()

Unnamed: 0,ParcelId,201610,201611,201612,201710,201711,201712
0,10754147,-0.019446,-0.019446,-0.019446,0,0,0
1,10759547,-0.019573,-0.019573,-0.019573,0,0,0
2,10843547,0.402314,0.402314,0.402314,0,0,0
3,10859147,0.016314,0.016314,0.016314,0,0,0
4,10879947,0.002653,0.002653,0.002653,0,0,0


In [40]:
Enet.head()

Unnamed: 0,ParcelId,201610,201611,201612,201710,201711,201712
0,10754147,-0.019564,-0.019564,-0.019564,0,0,0
1,10759547,-0.019629,-0.019629,-0.019629,0,0,0
2,10843547,0.212738,0.212738,0.212738,0,0,0
3,10859147,0.016817,0.016817,0.016817,0,0,0
4,10879947,0.010969,0.010969,0.010969,0,0,0


In [38]:
Lasso.head()

Unnamed: 0,ParcelId,201610,201611,201612,201710,201711,201712
0,10754147,-0.018543,-0.018543,-0.018543,0,0,0
1,10759547,-0.018589,-0.018589,-0.018589,0,0,0
2,10843547,0.203752,0.203752,0.203752,0,0,0
3,10859147,0.018183,0.018183,0.018183,0,0,0
4,10879947,0.010949,0.010949,0.010949,0,0,0


## Model Averaging 

In [42]:
RF = pd.read_csv("RF_n100_maxfeat5_maxdepth8.gz", compression="gzip")
mean_absolute_errors(RF, data)

(0.0627264215390796,
 0.06182163198247547,
 0.07473668775158138,
 0.064978084757667934)

In [50]:
RF_2 = pd.read_csv("RF_n100_maxfeat10_maxdepth20_extreme.gz", compression="gzip")
mean_absolute_errors(RF_2, data)

(0.06195625879043588,
 0.06436095290251921,
 0.07321730879815982,
 0.064762854132521586)

In [43]:
XGB = pd.read_csv("XGB_600.gz", compression="gzip")
mean_absolute_errors(XGB, data)

(0.06211727600763514,
 0.06136520946002188,
 0.07414289916043706,
 0.064404712807773379)

In [47]:
XGB3000 = pd.read_csv("XGB_3000.csv")
mean_absolute_errors(XGB3000, data)

(0.05998706659714675,
 0.05918412810186208,
 0.07170160296434737,
 0.06220029687696086)

In [51]:
blend3 = submission_df[['ParcelId']].copy()
cols = ['201610','201611','201612','201710','201711','201712']
blend3 = pd.concat([blend3,(20/100)*Ridge[cols] + (20/100)*Enet[cols] + (20/100)*Lasso[cols] +
                    (18/100)*XGB[cols] + (18/100)*RF[cols] + (2/100)*XGB3000[cols]
                   +(2/100)*RF_2[cols]], axis=1)
blend3['ParcelId'] = blend3['ParcelId'].astype(int)
assert all(blend3.ParcelId.unique() == submission_df.ParcelId.unique())
mean_absolute_errors(blend3, data)

(0.06268473765862069,
 0.061797918439336595,
 0.07437674098281555,
 0.064875449656555853)

In [52]:
blend3.to_csv("new_ensemble_2.gz", index=False, float_format='%.4g', compression='gzip')