In [99]:
import numpy as np
import pandas as pd

import xgboost as xgb

from sklearn.feature_selection import VarianceThreshold

from sklearn.feature_selection import RFE
from sklearn.feature_selection import RFECV
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_absolute_error

import time

In [154]:
# Training inputs, one CSV per year. Later cells read them with the first CSV
# column as the index, drop 'parcelid', and treat the first remaining column as
# the label.
train_datapath_2016 = "./zillow-data/merged_2016 v2.csv"
train_datapath_2017 = "./zillow-data/merged_2017 v2.csv"

# Prediction inputs, one CSV per year. A constant 'transactiondate' column is
# prepended to these frames before they are passed to the fitted models.
test_datapath_2016 = "./zillow-data/properties_2016 v2.csv"
test_datapath_2017 = "./zillow-data/properties_2017 v2.csv"

In [125]:
# modify 'transactiondate' of df to keep only the month
def parse_transactiondate(df):
    """Replace df['transactiondate'] with the integer month, in place.

    The column is expected to hold dash-separated date strings whose second
    field is the month (e.g. "2016-10-03" becomes 10).

    The function is idempotent: if the column is already numeric (for example
    because the cell was run a second time) it is left untouched instead of
    failing on values that have no ``split`` method.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'transactiondate' column. It is modified in place.

    Returns
    -------
    None
    """
    dates = df['transactiondate']

    # Already converted on an earlier run: nothing to do.
    if pd.api.types.is_numeric_dtype(dates):
        return

    # Vectorised "take the second dash-separated field and cast to int".
    df['transactiondate'] = dates.astype(str).str.split("-").str[1].astype(int)

# return label index, feature index list
# assumes that df label_is_first
def split_on_label(df):
    """Split the column index of df into (label, features).

    The first column is taken to be the label; every remaining column is a
    feature. Returns the label's column name and the Index of feature names.
    """
    columns = df.columns
    label_name = columns[0]
    feature_names = columns[1:]
    return label_name, feature_names

def gen_testdata(df, new_transactiondate):
    """Return a copy of df with 'transactiondate' inserted as the first column.

    The input frame is not modified. new_transactiondate supplies the values
    of the new column.
    """
    with_date = df.copy()
    with_date.insert(loc=0, column='transactiondate', value=new_transactiondate)
    return with_date

def get_low_var_feature(support_list):
    """Partition positions of a boolean support mask.

    Returns a pair of lists: first the positions whose mask entry is falsy
    (the rejected, low-variance features), then the positions whose entry is
    truthy (the retained features).
    """
    rejected = [pos for pos, keep in enumerate(support_list) if not keep]
    retained = [pos for pos, keep in enumerate(support_list) if keep]
    return rejected, retained

def handle_low_var(sel, train_data, feature_list):
    """Fit a copy of a feature selector and report the features it rejects.

    A private copy of ``sel`` is fitted, so the selector passed in is left
    unfitted and every call returns an independent fitted object. Fitting
    ``sel`` itself would make the results of two calls that share one selector
    aliases of each other, with the second fit overwriting the first.

    Parameters
    ----------
    sel : selector exposing ``fit(X)`` and ``get_support()``
        Template selector; not modified.
    train_data : array-like
        Data the selector is fitted on.
    feature_list : sequence
        Feature names, positionally aligned with the columns of train_data.
        The names of rejected features are printed, one per line.

    Returns
    -------
    (fitted_selector, support_list)
        The fitted copy and the list of integer positions of retained features.
    """
    import copy  # local import: keeps this helper self-contained

    fitted_sel = copy.deepcopy(sel)
    fitted_sel.fit(train_data)

    support_mask = np.asarray(fitted_sel.get_support(), dtype=bool)
    support_list = np.flatnonzero(support_mask).tolist()

    # Report what is being dropped.
    for i in np.flatnonzero(~support_mask):
        print(feature_list[i])
    return fitted_sel, support_list

    

In [96]:
# read training data; parcelid (index of properties features) is dropped right away
train_data16 = pd.read_csv(train_datapath_2016, index_col=0).drop(columns='parcelid')
train_data17 = pd.read_csv(train_datapath_2017, index_col=0).drop(columns='parcelid')

# first remaining column is the label, everything after it is a feature
label16, feature16 = split_on_label(train_data16)
label17, feature17 = split_on_label(train_data17)

print("[2016] num of features:", len(feature16))
print("[2016] num of instances:", len(train_data16))

print("[2017] num of features:", len(feature17))
print("[2017] num of instances:", len(train_data17))

# reduce 'transactiondate' to the month (modifies both frames in place)
for yearly_frame in (train_data16, train_data17):
    parse_transactiondate(yearly_frame)

# training data and label for model 2016
X_train16, y_train16 = train_data16[feature16], train_data16[label16]

# training data and label for model 2017
X_train17, y_train17 = train_data17[feature17], train_data17[label17]

[2016] num of features: 58
[2016] num of instances: 90275
[2017] num of features: 58
[2017] num of instances: 77613


In [104]:
# True if any 2016 training feature value is NaN
np.isnan(X_train16.to_numpy()).any()

False

In [105]:
# True if any 2017 training feature value is NaN
np.isnan(X_train17.to_numpy()).any()

False

In [133]:
# Removing Features with low variance
# it should remove features with many missing values that were filled by imputation

# p * (1 - p) with p = 0.8: the variance of a binary feature that takes the
# same value in 80% of the rows.
# NOTE(review): the cut-off is applied to every column on its raw scale --
# confirm that is intended for the non-binary features.
low_var_threshold = .8 * (1 - .8)

# Unfitted template, kept under its original name.
sel = VarianceThreshold(threshold=low_var_threshold)

# Each year gets its own selector instance and is transformed right after its
# own fit, so the two fits can never share state regardless of how
# handle_low_var treats the selector it is given.
print("Low Var Features 2016:")
sel16, support_list16 = handle_low_var(
    VarianceThreshold(threshold=low_var_threshold), X_train16, feature16)
new_X_train16 = sel16.transform(X_train16)

print("\nLow Var Features 2017:")
sel17, support_list17 = handle_low_var(
    VarianceThreshold(threshold=low_var_threshold), X_train17, feature17)
new_X_train17 = sel17.transform(X_train17)

# the retained-index lists must line up with the transformed matrices
assert(len(support_list16) == new_X_train16.shape[1]), "Error: Invalid Size"
assert(len(support_list17) == new_X_train17.shape[1]), "Error: Invalid Size"

# names of the retained features, in transformed-column order
new_feature16 = feature16[support_list16]
new_feature17 = feature17[support_list17]

print("\n[2016] num of features:", new_X_train16.shape[1])
print("[2017] num of features:", new_X_train17.shape[1])

Low Var Features 2016:
architecturalstyletypeid
buildingclasstypeid
decktypeid
fireplacecnt
hashottuborspa
poolcnt
pooltypeid10
pooltypeid2
pooltypeid7
storytypeid
threequarterbathnbr
typeconstructiontypeid
numberofstories
fireplaceflag
assessmentyear
taxdelinquencyflag

Low Var Features 2017:
architecturalstyletypeid
buildingclasstypeid
decktypeid
fireplacecnt
hashottuborspa
poolcnt
pooltypeid10
pooltypeid2
pooltypeid7
storytypeid
threequarterbathnbr
typeconstructiontypeid
numberofstories
fireplaceflag
assessmentyear
taxdelinquencyflag

[2016] num of features: 42
[2017] num of features: 42


In [143]:
def handle_XGBoost_RFECV(X_train, y_train, alphalist=None, max_depth=3, learning_rate=0.1):
    """Pick an L1 strength and a feature subset for an XGBoost regressor.

    For every candidate ``reg_alpha`` an RFECV run (step=10, cv=5) selects a
    feature subset, and that subset is then scored with 5-fold cross-validated
    mean absolute error. The alpha with the lowest MAE wins; a final model is
    refitted on all rows using the winning alpha and its feature subset, and
    its training MAE is printed.

    The final model uses the same ``max_depth`` and ``learning_rate`` as the
    candidates evaluated during cross-validation. Without passing them it
    would be trained with whatever defaults the installed xgboost version
    provides, which need not match the configuration that was validated.

    NOTE(review): RFECV is run with its default scoring while the alpha
    comparison uses MAE -- confirm that selecting the feature count on a
    different metric is intended.

    Parameters
    ----------
    X_train : array-like, 2-D
        Feature matrix; converted with ``np.asarray`` so columns can be
        selected by integer position.
    y_train : array-like
        Target values.
    alphalist : iterable of float, optional
        Candidate ``reg_alpha`` values. Defaults to 5 log-spaced values from
        1e-2 to 1e4.
    max_depth : int, optional
        Tree depth used for every candidate and for the final model.
    learning_rate : float, optional
        Learning rate used for every candidate and for the final model.

    Returns
    -------
    (bst, opt_feature)
        The refitted regressor and the list of selected column positions
        (relative to X_train).

    Raises
    ------
    ValueError
        If ``alphalist`` is empty.
    """
    X_train = np.asarray(X_train)

    if alphalist is None:
        # 5 log-spaced values: 1e-2, ~0.316, 10, ~316, 1e4
        alphalist = np.logspace(-2, 4, 5, endpoint=True)

    error_dict = {}
    feature_dict = {}

    # cross-validate on alpha (regularization strength) from alphalist
    for alpha in alphalist:
        print("alpha:", alpha)
        bst = xgb.XGBRegressor(max_depth=max_depth, learning_rate=learning_rate,
                               reg_alpha=alpha)

        # cross-validate on num of features selected using RFECV
        # step parameter is number of features to remove at each step
        selector = RFECV(bst, step=10, cv=5).fit(X_train, y_train)

        # positions of the selected features
        feature = np.flatnonzero(selector.support_).tolist()
        feature_dict[alpha] = feature

        # mean CV MAE of the underlying estimator on the selected features
        score = cross_val_score(selector.estimator_, X_train[:, feature], y_train,
                                cv=5, scoring='neg_mean_absolute_error').mean()
        print("cv score:", -score)
        error_dict[alpha] = -score

    if not error_dict:
        raise ValueError("alphalist must contain at least one value")

    opt_a = min(error_dict, key=error_dict.get)
    opt_feature = feature_dict[opt_a]

    print("alpha (lambda) from CV: {}".format(opt_a))
    print("size of opt_feature from CV:", len(opt_feature))

    # refit with the CV'ed alpha and the same tree settings used during CV
    bst = xgb.XGBRegressor(max_depth=max_depth, learning_rate=learning_rate,
                           reg_alpha=opt_a)
    bst.fit(X_train[:, opt_feature], y_train)
    y_pred = bst.predict(X_train[:, opt_feature])
    print("Train MAE: {:.6}".format(mean_absolute_error(y_train, y_pred)))

    return bst, opt_feature

In [144]:
# 2016 model: alpha search + recursive feature elimination, timed end to end
print("XGBoost Model 2016:")

start_time = time.time()
new_bst16, new_opt_feature = handle_XGBoost_RFECV(new_X_train16, y_train16)
elapsed_time = time.time() - start_time

# wall-clock duration of the search, formatted as hours:minutes:seconds
print(time.strftime("Elasped time: %H:%M:%S", time.gmtime(elapsed_time)))


XGBoost Model 2016:
alpha: 0.01
cv score: 0.068606078336347
alpha: 0.31622776601683794
cv score: 0.0684698396432951
alpha: 10.0
cv score: 0.06830471008195879
alpha: 316.22776601683796
cv score: 0.06890066387241485
alpha: 10000.0
cv score: 0.16259024496151384
alpha (lambda) from CV: 10.0
size of opt_feature from CV: 22
Train MAE: 0.0678517
Elasped time: 00:24:48


In [151]:
# positions (within the variance-filtered columns) and names of the 2016 selection
opt_feature16_index = new_opt_feature
opt_feature16 = new_feature16[opt_feature16_index]

print("Selected Features from RFE (2016):")
for feature_name in opt_feature16:
    print(feature_name)

Selected Features from RFE (2016):
transactiondate
bedroomcnt
calculatedfinishedsquarefeet
finishedsquarefeet12
heatingorsystemtypeid
latitude
longitude
lotsizesquarefeet
propertycountylandusecode
propertylandusetypeid
propertyzoningdesc
rawcensustractandblock
regionidcity
regionidneighborhood
regionidzip
yearbuilt
structuretaxvaluedollarcnt
taxvaluedollarcnt
landtaxvaluedollarcnt
taxamount
taxdelinquencyyear
censustractandblock


In [152]:
# 2017 model: alpha search + recursive feature elimination, timed end to end
print("XGBoost Model 2017:")

start_time = time.time()
new_bst17, new_opt_feature2 = handle_XGBoost_RFECV(new_X_train17, y_train17)
elapsed_time = time.time() - start_time

# wall-clock duration of the search, formatted as hours:minutes:seconds
print(time.strftime("Elasped time: %H:%M:%S", time.gmtime(elapsed_time)))

XGBoost Model 2017:
alpha: 0.01
cv score: 0.07081324816015105
alpha: 0.31622776601683794
cv score: 0.07201245552315816
alpha: 10.0
cv score: 0.07043204263804788
alpha: 316.22776601683796
cv score: 0.0715665909445019
alpha: 10000.0
cv score: 0.18765495150177647
alpha (lambda) from CV: 10.0
size of opt_feature from CV: 22
Train MAE: 0.0702628
Elasped time: 00:19:48


In [153]:
# positions (within the variance-filtered columns) and names of the 2017 selection
opt_feature17_index = new_opt_feature2
opt_feature17 = new_feature17[opt_feature17_index]

# List the 2017 selection. The loop must walk the 2017 result
# (new_opt_feature2); iterating new_opt_feature would print the features
# chosen for the 2016 model under the 2017 heading.
print("Selected Features from RFE (2017):")
for i in opt_feature17_index:
    print(new_feature17[i])

Selected Features from RFE (2017):
transactiondate
bedroomcnt
calculatedfinishedsquarefeet
finishedsquarefeet12
heatingorsystemtypeid
latitude
longitude
lotsizesquarefeet
propertycountylandusecode
propertylandusetypeid
propertyzoningdesc
rawcensustractandblock
regionidcity
regionidneighborhood
regionidzip
yearbuilt
structuretaxvaluedollarcnt
taxvaluedollarcnt
landtaxvaluedollarcnt
taxamount
taxdelinquencyyear
censustractandblock


In [155]:
# read in the property tables used for prediction
test_data2016 = pd.read_csv(test_datapath_2016, index_col=0)
test_data2017 = pd.read_csv(test_datapath_2017, index_col=0)

print("[2016] num of instances: ", len(test_data2016))
print("[2017] num of instances: ", len(test_data2017))

# keep parcelid aside for the final merge, then remove it from the features
test_parcelid16 = test_data2016['parcelid']
test_parcelid17 = test_data2017['parcelid']

test_data2016 = test_data2016.drop(columns='parcelid')
test_data2017 = test_data2017.drop(columns='parcelid')

# constant month columns (October, November, December), one value per row
n_rows16 = len(test_data2016)
test10_16 = np.full(n_rows16, 10)
test11_16 = np.full(n_rows16, 11)
test12_16 = np.full(n_rows16, 12)

n_rows17 = len(test_data2017)
test10_17 = np.full(n_rows17, 10)
test11_17 = np.full(n_rows17, 11)
test12_17 = np.full(n_rows17, 12)

# one copy of each test frame per month, with the month as 'transactiondate'
X_test10_16 = gen_testdata(test_data2016, test10_16)
X_test11_16 = gen_testdata(test_data2016, test11_16)
X_test12_16 = gen_testdata(test_data2016, test12_16)

X_test10_17 = gen_testdata(test_data2017, test10_17)
X_test11_17 = gen_testdata(test_data2017, test11_17)
X_test12_17 = gen_testdata(test_data2017, test12_17)

  mask |= (ar1 == a)


[2016] num of instances:  2985217
[2017] num of instances:  2985217


In [159]:
# Predict for each month with the model of the matching year.
# Columns are picked by name, in the order of the selected-feature index,
# and handed to the model as a plain array.

y_pred10_16 = new_bst16.predict(X_test10_16[opt_feature16].to_numpy())
y_pred11_16 = new_bst16.predict(X_test11_16[opt_feature16].to_numpy())
y_pred12_16 = new_bst16.predict(X_test12_16[opt_feature16].to_numpy())

y_pred10_17 = new_bst17.predict(X_test10_17[opt_feature17].to_numpy())
y_pred11_17 = new_bst17.predict(X_test11_17[opt_feature17].to_numpy())
y_pred12_17 = new_bst17.predict(X_test12_17[opt_feature17].to_numpy())

In [160]:
# Assemble one frame per year (parcel id + three monthly predictions) and
# outer-join the two years on Parcelid.

test_dict_16 = {
    'Parcelid': test_parcelid16,
    '201610': y_pred10_16,
    '201611': y_pred11_16,
    '201612': y_pred12_16,
}
test_dict_17 = {
    'Parcelid': test_parcelid17,
    '201710': y_pred10_17,
    '201711': y_pred11_17,
    '201712': y_pred12_17,
}

df_test_16 = pd.DataFrame(test_dict_16)
df_test_17 = pd.DataFrame(test_dict_17)

df_merged = pd.merge(df_test_16, df_test_17, on='Parcelid', how='outer')
print(df_merged)

          Parcelid    201610    201611    201612    201710    201711    201712
0         10754147  0.025181  0.025181  0.025181  0.061004  0.061004  0.061004
1         10759547 -0.032385 -0.032385 -0.032385  0.021161  0.021161  0.021161
2         10843547  0.049178  0.049178  0.049178  0.018709  0.018709  0.018709
3         10859147  0.009108  0.009108  0.009108  0.022996  0.022996  0.022996
4         10879947 -0.001519 -0.001519 -0.001519  0.006760  0.006760  0.006760
5         10898347  0.009677  0.009677  0.009677  0.015958  0.015958  0.015958
6         10933547  0.001214  0.001214  0.001214  0.009431  0.009431  0.009431
7         10940747  0.004977  0.004977  0.004977  0.010451  0.010451  0.010451
8         10954547  0.022996  0.022996  0.022996  0.064069  0.064069  0.064069
9         10976347  0.004818  0.004818  0.004818  0.011785  0.011785  0.011785
10        11073947  0.000491  0.000491  0.000491  0.009674  0.009674  0.009674
11        11114347  0.021408  0.021408  0.021408  0.

In [161]:
# Compare the merged predictions against the sample submission: the row
# counts must match before anything is written out.
# NOTE(review): only the row count is checked here -- confirm the column
# headers (e.g. the id column name) also match the expected format.
submitfile = "./zillow-data/sample_submission.csv"

submit_df = pd.read_csv(submitfile)
print(*submit_df.shape)

assert len(submit_df) == len(df_merged), "Error: invalid row size for submit!"

2985217 7


In [162]:
# write the merged predictions as the submission CSV (row index omitted)

df_merged.to_csv(path_or_buf="./zillow-data/draft_submission3.csv", index=False)