| Feature | Description |
|---------|:-----------:|
|'airconditioningtypeid' |	 Type of cooling system present in the home (if any) |
|'architecturalstyletypeid' |	 Architectural style of the home (i.e. ranch, colonial, split-level, etc…) |
|'basementsqft' |	 Finished living area below or partially below ground level |
|'bathroomcnt' |	 Number of bathrooms in home including fractional bathrooms |
|'bedroomcnt' |	 Number of bedrooms in home  |
|'buildingqualitytypeid' |	 Overall assessment of condition of the building from best (lowest) to worst (highest) |
|'buildingclasstypeid' |	The building framing type (steel frame, wood frame, concrete/brick)  |
|'calculatedbathnbr' |	 Number of bathrooms in home including fractional bathroom |
|'decktypeid' |	Type of deck (if any) present on parcel |
|'threequarterbathnbr' |	 Number of 3/4 bathrooms in house (shower + sink + toilet) |
|'finishedfloor1squarefeet' |	 Size of the finished living area on the first (entry) floor of the home |
|'calculatedfinishedsquarefeet' |	 Calculated total finished living area of the home  |
|'finishedsquarefeet6' |	Base unfinished and finished area |
|'finishedsquarefeet12' |	Finished living area |
|'finishedsquarefeet13' |	Perimeter  living area |
|'finishedsquarefeet15' |	Total area |
|'finishedsquarefeet50' |	 Size of the finished living area on the first (entry) floor of the home |
|'fips' |	 Federal Information Processing Standard code -  see https://en.wikipedia.org/wiki/FIPS_county_code for more details |
|'fireplacecnt' |	 Number of fireplaces in a home (if any) |
|'fireplaceflag' |	 Is a fireplace present in this home  |
|'fullbathcnt' |	 Number of full bathrooms (sink, shower + bathtub, and toilet) present in home |
|'garagecarcnt' |	 Total number of garages on the lot including an attached garage |
|'garagetotalsqft' |	 Total number of square feet of all garages on lot including an attached garage |
|'hashottuborspa' |	 Does the home have a hot tub or spa |
|'heatingorsystemtypeid' |	 Type of home heating system |
|'latitude' |	 Latitude of the middle of the parcel multiplied by 10e6 |
|'longitude' |	 Longitude of the middle of the parcel multiplied by 10e6 |
|'lotsizesquarefeet' |	 Area of the lot in square feet |
|'numberofstories' |	 Number of stories or levels the home has |
|'parcelid' |	 Unique identifier for parcels (lots)  |
|'poolcnt' |	 Number of pools on the lot (if any) |
|'poolsizesum' |	 Total square footage of all pools on property |
|'pooltypeid10' |	 Spa or Hot Tub |
|'pooltypeid2' |	 Pool with Spa/Hot Tub |
|'pooltypeid7' |	 Pool without hot tub |
|'propertycountylandusecode' |	 County land use code i.e. it's zoning at the county level |
|'propertylandusetypeid' |	 Type of land use the property is zoned for |
|'propertyzoningdesc' |	 Description of the allowed land uses (zoning) for that property |
|'rawcensustractandblock' |	 Census tract and block ID combined - also contains blockgroup assignment by extension |
|'censustractandblock' |	 Census tract and block ID combined - also contains blockgroup assignment by extension |
|'regionidcounty' |	County in which the property is located |
|'regionidcity' |	 City in which the property is located (if any) |
|'regionidzip' |	 Zip code in which the property is located |
|'regionidneighborhood' |	Neighborhood in which the property is located |
|'roomcnt' |	 Total number of rooms in the principal residence |
|'storytypeid' |	 Type of floors in a multi-story house (i.e. basement and main level, split-level, attic, etc.).  See tab for details. |
|'typeconstructiontypeid' |	 What type of construction material was used to construct the home |
|'unitcnt' |	 Number of units the structure is built into (i.e. 2 = duplex, 3 = triplex, etc...) |
|'yardbuildingsqft17' |	Patio in  yard |
|'yardbuildingsqft26' |	Storage shed/building in yard |
|'yearbuilt' |	 The Year the principal residence was built  |
|'taxvaluedollarcnt' |	The total tax assessed value of the parcel |
|'structuretaxvaluedollarcnt' |	The assessed value of the built structure on the parcel |
|'landtaxvaluedollarcnt' |	The assessed value of the land area of the parcel |
|'taxamount' |	The total property tax assessed for that assessment year |
|'assessmentyear' |	The year of the property tax assessment  |
|'taxdelinquencyflag' |	Property taxes for this parcel are past due as of 2015 |
|'taxdelinquencyyear' |	Year for which the unpaid propert taxes were due  |
| | |

In [1]:
# pred.to_csv('zillow_prediction.csv', index=False, float_format='%.4g')
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib notebook

In [2]:
data_y = pd.DataFrame.from_csv('data/train_2016.csv', parse_dates=['transactiondate'], index_col=None)

In [3]:
data_X = pd.read_csv(
    'data/properties_2016.csv',
    index_col=None,
    dtype={
        'hashottuborspa': np.bool,
        'propertycountylandusecode': np.str,
        'propertyzoningdesc': np.str,
        'fireplaceflag': np.bool,
        'taxdelinquencyflag': np.str}
)

In [4]:
print(data_X['taxdelinquencyflag'].unique())
data_X['taxdelinquencyflag'] = data_X['taxdelinquencyflag'].map({'Y': True})

[nan 'Y']


In [5]:
merged_data = data_y.merge(data_X, on='parcelid')

In [6]:
pd.isnull(merged_data).sum() > 0

parcelid                        False
logerror                        False
transactiondate                 False
airconditioningtypeid            True
architecturalstyletypeid         True
basementsqft                     True
bathroomcnt                      True
bedroomcnt                       True
buildingclasstypeid              True
buildingqualitytypeid            True
calculatedbathnbr                True
decktypeid                       True
finishedfloor1squarefeet         True
calculatedfinishedsquarefeet     True
finishedsquarefeet12             True
finishedsquarefeet13             True
finishedsquarefeet15             True
finishedsquarefeet50             True
finishedsquarefeet6              True
fips                             True
fireplacecnt                     True
fullbathcnt                      True
garagecarcnt                     True
garagetotalsqft                  True
hashottuborspa                   True
heatingorsystemtypeid            True
latitude    

In [7]:
merged_data['basementsqft'].unique()

array([   nan,   814.,   330.,   168.,   671.,  1555.,  1528.,   771.,
         700.,   831.,  1048.,  1312.,   690.,   564.,   198.,   676.,
         184.,   540.,   260.,   557.,   312.,   913.,  1350.,   802.,
         162.,   616.,  1551.,   485.,   238.,   493.,   760.,   515.,
         196.,   585.,   234.,   510.,   579.,   100.,   608.,  1210.])

In [17]:
print(merged_data['hashottuborspa'].unique())
print(merged_data['fireplaceflag'].unique())
print(merged_data['taxdelinquencyflag'].unique())
for f in ['hashottuborspa', 'fireplaceflag', 'taxdelinquencyflag']:
    merged_data[f] = merged_data[f].astype(np.bool)

[ True]
[ True]
[ True]


In [24]:
train = merged_data.sample(frac=0.9)
test = merged_data[~merged_data.index.isin(train.index)]
print(len(train))
print(len(test))
print(len(train) + len(test))
print(len(merged_data))

81730
9081
90811
90811


In [25]:
train_X = train.drop(
        ['parcelid', 'logerror', 'transactiondate', 'propertyzoningdesc', 'propertycountylandusecode'],
        axis=1
    )
test_X = test.drop(
        ['parcelid', 'logerror', 'transactiondate', 'propertyzoningdesc', 'propertycountylandusecode'],
        axis=1
    )

train_y = train['logerror']
test_y = test['logerror']

In [26]:
from xgboost.sklearn import XGBRegressor

hyper_params = {
    'base_score': 0.5,
    'colsample_bylevel': 1,
    'colsample_bytree': 0.7,
    'gamma': 0.3,
    'learning_rate': 0.1,
    'max_delta_step': 0,
    'max_depth': 5,
    'min_child_weight': 5,
    'missing': None,
    'n_estimators': 800,
    'n_jobs': 2,
    'objective': 'reg:linear',
    'reg_alpha': 1.7,
    'reg_lambda': 1,
    'scale_pos_weight': 1,
    'silent': 0,
    'subsample': 0.95
}
clf=None
clf = XGBRegressor(**hyper_params)

In [54]:
from sklearn.metrics import mean_squared_error
eval_set = [(train_X.values, train_y.values), (test_X.values, test_y.values)]


In [58]:
clf.fit(
    X=train_X.values,
    y=train_y.values,
    early_stopping_rounds=10,
    eval_set=eval_set,
    verbose=True,
    eval_metric='mae'
)

[0]	validation_0-mae:0.449683	validation_1-mae:0.449957
Multiple eval metrics have been passed: 'validation_1-mae' will be used for early stopping.

Will train until validation_1-mae hasn't improved in 10 rounds.
[1]	validation_0-mae:0.406673	validation_1-mae:0.406882
[2]	validation_0-mae:0.368152	validation_1-mae:0.368203
[3]	validation_0-mae:0.33367	validation_1-mae:0.33357
[4]	validation_0-mae:0.302804	validation_1-mae:0.302578
[5]	validation_0-mae:0.275256	validation_1-mae:0.274912
[6]	validation_0-mae:0.250657	validation_1-mae:0.250176
[7]	validation_0-mae:0.228744	validation_1-mae:0.22812
[8]	validation_0-mae:0.209192	validation_1-mae:0.20846
[9]	validation_0-mae:0.191802	validation_1-mae:0.191
[10]	validation_0-mae:0.176346	validation_1-mae:0.175518
[11]	validation_0-mae:0.162604	validation_1-mae:0.161755
[12]	validation_0-mae:0.150391	validation_1-mae:0.149619
[13]	validation_0-mae:0.139613	validation_1-mae:0.138858
[14]	validation_0-mae:0.130124	validation_1-mae:0.129385
[15]	

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.7, gamma=0.3, learning_rate=0.1,
       max_delta_step=0, max_depth=5, min_child_weight=5, missing=None,
       n_estimators=800, n_jobs=2, nthread=2, objective='reg:linear',
       random_state=0, reg_alpha=1.7, reg_lambda=1, scale_pos_weight=1,
       seed=0, silent=0, subsample=0.95)

In [59]:
pred = clf.predict(test_X.values)

In [60]:
mean_squared_error(test_y, pred)

0.026327383887581608

In [61]:
i = 0
for t, p in zip(test_y, pred):
    print('{}\t{}'.format(t, p))
    i+=1
    if i>20:
        break

0.0276	0.0049159228801727295
0.008	0.004741460084915161
-0.0294	0.005183398723602295
-0.0222	0.015986382961273193
0.0526	0.0122147798538208
0.0411	0.021196722984313965
0.0402	0.030481994152069092
-0.044000000000000004	0.009989768266677856
0.003	0.010458201169967651
0.0208	-0.0037955045700073242
-0.6368	-0.03178513050079346
0.0554	0.00976794958114624
0.179	0.0017482340335845947
0.0344	0.01780661940574646
-0.0263	-0.029414892196655273
-0.0661	-0.17247837781906128
-0.0545	-0.016630589962005615
0.006999999999999999	0.007565706968307495
-0.408	-0.027644693851470947
-0.006	0.02229270339012146
-0.006	0.006953537464141846


In [53]:
clf.fit?

In [79]:
target_data = pd.read_csv('data/sample_submission.csv')

In [80]:
print(len(target_data))
target_data.head()

2985217


Unnamed: 0,ParcelId,201610,201611,201612,201710,201711,201712
0,10754147,0,0,0,0,0,0
1,10759547,0,0,0,0,0,0
2,10843547,0,0,0,0,0,0
3,10859147,0,0,0,0,0,0
4,10879947,0,0,0,0,0,0


In [81]:
target_props = data_X.loc[data_X['parcelid'].isin(target_data.ParcelId)]

In [86]:
for f in ['hashottuborspa', 'fireplaceflag', 'taxdelinquencyflag']:
    target_props[f] = target_props[f].astype(np.bool)

In [88]:
target_pred = clf.predict(target_props.drop(
        ['parcelid', 'propertyzoningdesc', 'propertycountylandusecode'],
        axis=1
    ).values)

In [89]:
for c in target_data.columns[target_data.columns != 'ParcelId']:
    target_data[c] = target_pred