In [6]:
import pandas as pd
import numpy as np
import pickle
import gc

import warnings
warnings.filterwarnings('ignore')

# load data

In [98]:
df_properties_2016 = pd.read_csv('../data/properties_2016.csv')
df_properties_2017 = pd.read_csv('../data/properties_2017.csv')

In [99]:
df_transactions_2016 = pd.read_csv('../data/train_2016_v2.csv')
df_transactions_2017 = pd.read_csv('../data/train_2017.csv')

In [100]:
sample_submission = pd.read_csv('../data/sample_submission.csv', low_memory = False)

In [101]:
df_train_2016 = pd.merge(df_transactions_2016, df_properties_2016, how = 'left', on = 'parcelid')
df_train_2017 = pd.merge(df_transactions_2017, df_properties_2017, how = 'left', on = 'parcelid')

# assign 2017 tax data to NULL due to info leak
# df_train_2017[['structuretaxvaluedollarcnt', 'landtaxvaluedollarcnt', 'taxvaluedollarcnt', 'taxamount']] = np.nan

# merge the two set 
df_train = pd.concat([df_train_2016, df_train_2017], axis = 0)

In [78]:
df_pred_2016 = pd.merge(sample_submission[['ParcelId']], df_properties_2016.rename(columns = {'parcelid': 'ParcelId'}), 
                how = 'left', on = 'ParcelId')
df_pred_2017 = pd.merge(sample_submission[['ParcelId']], df_properties_2017.rename(columns = {'parcelid': 'ParcelId'}), 
                how = 'left', on = 'ParcelId')

In [102]:
# get some memory back
del df_properties_2016, df_properties_2017, df_train_2016, df_train_2017
gc.collect()

200

# Pre-process data

In [19]:
# load all features generated from the sql file

## categorical features

In [8]:
# # add censustract
# t = pd.DataFrame(df['rawcensustractandblock'].astype(str).str.split('.',1).tolist(),columns = ['censustrack','censusblock'])
# df['censustrack'] = t['censustrack']

# t = pd.DataFrame(df_properties['rawcensustractandblock'].astype(str).str.split('.',1).tolist(),columns = ['censustrack','censusblock'])
# df_properties['censustrack'] = t['censustrack']


## feature engineering

### 1.location related features

In [56]:
def feature_engineer(df):
    ###################################
    ####### DIMENSION: SPACE ##########
    ###################################

    parcel_location_variables = ['regionidneighborhood',
                                 'regionidzip', 
                                 'regionidcity', 
                                 #'censustrack',
                                 'rawcensustractandblock']

    # iterate through all above regions
    for region in parcel_location_variables:

        #### COUNT OF PROPERTIES ####
        # number of properties in the zipcode
        df['f_num_n_prop_in_'+region] = df[region].map(df[region].value_counts().to_dict())

        ##### HOW NEW IS THIS BUILDING COMPARING TO OTHER BUILDINGS #####
        df['f_cat_median_year_in_'+region] = df[region].map(df.groupby(region)['yearbuilt'].aggregate('median').to_dict())
        df['f_num_how_new_in_'+region] = df['yearbuilt'] - df['f_cat_median_year_in_'+region]

        # Neighborhood latitude and longitude
        df['f_num_median_lat_in_'+region] = df[region].map(df.groupby(region)['latitude'].aggregate('median').to_dict())
        df['f_num_median_lon_in_'+region] = df[region].map(df.groupby(region)['longitude'].aggregate('median').to_dict())

#         #### TRANSACTION RELATED ####
#         # how many transaction made in this region per observed properties
#         # this tells us how active a region is
#         df['f_num_pct_trans_in_'+region] = df.groupby(region)['logerror'].aggregate('count') * 1.0 / df['f_num_n_prop_in_'+region] 

#         #### LOG ERROR RELATED ####
#         df['f_num_error_std_in_'+region] = df[region].map(df[df.transactiondate < '2016-10-01'].groupby(region)['logerror'].aggregate("std").to_dict())
#         df['f_num_error_mean_in_'+region] = df[region].map(df[df.transactiondate < '2016-10-01'].groupby(region)['logerror'].aggregate("mean").to_dict())

    return df

### 2. time related features
Here the features are generated from SQL.

In [49]:
!psql -d zillow -a -f ../scripts/feature_engineering_time.sql

DROP TABLE IF EXISTS transactions_2016;
DROP TABLE
CREATE TABLE transactions_2016 (
	parcelid bigint, 
	logerror double precision,
	transactiondate varchar
);
CREATE TABLE
DROP TABLE IF EXISTS transactions_2017;
DROP TABLE
COPY transactions_2016
FROM '/Users/dai_li/Workspace/personal/Competitions/zillow/data/train_2016_v2.csv' DELIMITER ',' CSV HEADER;
COPY 90275
CREATE TABLE transactions_2017 (
	parcelid bigint, 
	logerror double precision,
	transactiondate varchar
);
CREATE TABLE
COPY transactions_2017
FROM '/Users/dai_li/Workspace/personal/Competitions/zillow/data/train_2017.csv' DELIMITER ',' CSV HEADER;
COPY 77613
DROP TABLE IF EXISTS transactions;
DROP TABLE
CREATE TABLE transactions AS
SELECT * FROM transactions_2016
UNION ALL
SELECT * FROM transactions_2017;
SELECT 167888
DROP TABLE IF EXISTS tmp_additional_temporal_information;
DROP TABLE
CREATE TABLE tmp_additional_temporal_information AS
SELECT 
    t.parcelid
    , t.logerror
    , t.transactiondate
    , substring(transact

In [103]:
d_features = pd.read_csv('../data/monthly_transactions_features.csv')

In [104]:
d_features.head()

Unnamed: 0,year_and_month,avg_logerror,avg_abs_logerror,std_dev_logerror,std_dev_abs_logerror,avg_logerror_last_1_month,avg_logerror_last_2_month,avg_logerror_last_3_month,avg_std_dev_logerror_last_1_month,avg_std_dev_logerror_last_2_month,avg_std_dev_logerror_last_3_month,avg_std_dev_abs_logerror_last_1_month,avg_std_dev_abs_logerror_last_2_month,avg_std_dev_abs_logerror_last_3_month
0,2016-01,0.01587,0.072695,0.171525,0.156164,,,,,,,,,
1,2016-02,0.016082,0.077434,0.198599,0.183584,0.01587,0.01587,0.01587,0.171525,0.171525,0.171525,0.156164,0.156164,0.156164
2,2016-03,0.009867,0.072044,0.172171,0.156682,0.016082,0.015976,0.015976,0.198599,0.185062,0.185062,0.183584,0.169874,0.169874
3,2016-04,0.006605,0.069972,0.16656,0.151292,0.009867,0.012974,0.013939,0.172171,0.185385,0.180765,0.156682,0.170133,0.165477
4,2016-05,0.006926,0.066241,0.150861,0.135716,0.006605,0.008236,0.010851,0.16656,0.169365,0.17911,0.151292,0.153987,0.163853


In [105]:
d_features.shape

(21, 14)

## Combine existing data with new features

In [114]:
t = df_train
t['year_and_month'] = t['transactiondate'].str[:7]
t = pd.merge(left=t, right=d_features, on=['year_and_month'], how = 'inner', suffixes=('', '_y'))
del t['year_and_month']
df_train = t

In [None]:
# df_pred_2016 = pd.merge(sample_submission[['ParcelId']], df_properties_2016.rename(columns = {'parcelid': 'ParcelId'}), 
#                 how = 'left', on = 'ParcelId')
# df_pred_2017 = pd.merge(sample_submission[['ParcelId']], df_properties_2017.rename(columns = {'parcelid': 'ParcelId'}), 
#                 how = 'left', on = 'ParcelId')

# save the cleaned data sets

In [134]:
df_train.to_csv('../tmp/train_full.csv', index=False, header=True)

In [120]:
df_pred_2016.to_csv('../tmp/pred_2016.csv', index=False, header=True)

In [121]:
df_pred_2017.to_csv('../tmp/pred_2017.csv', index=False, header=True)

# Generate masks

In [122]:
df = df_train

In [153]:
all_fields = set(df.columns)

# these are fields that are used to identify fields
identifiers = set(['transactiondate', 'parcelid'])

# log error that we want to model
label = set(['logerror'])

# the following are categorical features
feats_objects = set(
 ['taxdelinquencyflag',
 'propertycountylandusecode',
 'propertyzoningdesc',
 'fireplaceflag',
 'hashottuborspa']
)

# the following are numerical features that should be treated as categorical features
feats_categorical_as_numeric = set([
    'airconditioningtypeid',
    'architecturalstyletypeid',
    'buildingqualitytypeid',
    'buildingclasstypeid',
    'decktypeid',
    'fips',
    'heatingorsystemtypeid',
    'propertylandusetypeid',
    'regionidcounty',
    'regionidcity',
    'regionidzip',
    'regionidneighborhood',
    'storytypeid',
    'typeconstructiontypeid',
])


# the rest are numeric features
feats_numeric = set([
    'basementsqft',
    'bathroomcnt',
    'bedroomcnt',
    'calculatedbathnbr',
    'threequarterbathnbr',
    'finishedfloor1squarefeet',
    'calculatedfinishedsquarefeet',
    'finishedsquarefeet6',
    'finishedsquarefeet12',
    'finishedsquarefeet13',
    'finishedsquarefeet15',
    'finishedsquarefeet50',
    'fireplacecnt',
    'fullbathcnt',
    'garagecarcnt',
    'garagetotalsqft',
    'hashottuborspa',
    'lotsizesquarefeet',
    'numberofstories',
    'poolcnt',
    'poolsizesum',
    'pooltypeid10',
    'pooltypeid2',
    'pooltypeid7',
    'roomcnt',
    'unitcnt',
    'yardbuildingsqft17',
    'yardbuildingsqft26',
    'taxvaluedollarcnt',
    'structuretaxvaluedollarcnt',
    'landtaxvaluedollarcnt',
    'taxamount',
    'latitude',
    'longitude',
    'yearbuilt',
    'assessmentyear',
    'taxdelinquencyyear',
    'rawcensustractandblock',
    'censustractandblock',
])

feats_numerics_feature_engineered = set([col for col in df.columns if 'f_num' in col or '_logerror' in col])


feats_categorical_feature_engineered = set([col for col in df.columns if 'f_cat' in col])


# fields that are thrown away for now
feats_for_consideration_later = set([
])

In [154]:
feats_categorical = feats_objects | feats_categorical_feature_engineered
feats_numeric = feats_numeric | feats_numerics_feature_engineered 
feats = feats_categorical | feats_numeric | feats_categorical_as_numeric

feats_categorical_as_numeric = list(feats_categorical_as_numeric)
feats_categorical = list(feats_categorical)
feats_numeric = list(feats_numeric)
feats = list(feats)

# save results to pick files

In [155]:
pickle.dump(feats_categorical_as_numeric, open('../tmp/feats_categorical_as_numeric.pkl', 'w'))
pickle.dump(feats_categorical, open('../tmp/feats_categorical.pkl', 'w'))
pickle.dump(feats_numeric, open('../tmp/feats_numeric.pkl', 'w'))
pickle.dump(feats, open('../tmp/feats.pkl', 'w'))

In [156]:
mask_train = (df.transactiondate < '2019-01-01')
mask_validation = (df.transactiondate >= '2016-10-01') & (df.transactiondate < '2017-01-01')
# mask_prediction = ~df.parcelid.isnull()

In [157]:
pickle.dump(mask_train, open('../tmp/mask_train.pkl', 'w'))
pickle.dump(mask_validation, open('../tmp/mask_validation.pkl', 'w'))
# pickle.dump(mask_prediction, open('../tmp/mask_prediction.pkl', 'w'))