In [1]:
# Imports
import numpy as np
import pandas as pd
import time
import datetime
import tensorflow as tf
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import KFold
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

  from ._conv import register_converters as _register_converters


In [66]:
# Defines -- notebook-wide settings, tune these before running the cells below
YEAR = 2017                      # selects which pair of csv files is read (2016 or 2017)
THRESHOLD = 0.60                 # a column is kept only if its null count <= THRESHOLD * number of rows
FILL_NA = 0                      # not referenced by the visible cells; presumably an imputation value -- TODO confirm
DATE_CONVERSION = 'timestamps'   # strategy used to turn transactiondate strings into numbers
PREPROCESSING = 'MinMax'         # not referenced by the visible cells; presumably selects a scaler -- TODO confirm
KFOLD_SPLITS = 10                # n_splits passed to KFold

In [67]:
# Functions
def check_na(train):
    """Report the proportion of missing values in every column of ``train``.

    Parameters
    ----------
    train : pandas.DataFrame
        Frame to inspect. It is not modified.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``train`` holding ``prop_na`` (the fraction of
        rows that are null in that column) and ``column`` (the column label).
        Rows are sorted by ascending ``prop_na``; each row's index is the
        position of the column in ``train``. A frame with zero rows yields
        NaN proportions.
    """
    # isnull().mean() is the per-column null count divided by the row count,
    # computed in one vectorized pass. Unlike a label-by-label
    # ``train.loc[:, col]`` lookup it also works when column labels repeat.
    prop_na = train.isnull().mean().values
    na_df = pd.DataFrame({'prop_na' : prop_na, 'column' : train.columns})
    # mergesort is stable: columns with equal proportions keep their original
    # left-to-right order instead of coming out in an arbitrary order.
    return na_df.sort_values('prop_na', kind='mergesort')

In [68]:
# Read csvs
# Each supported YEAR maps to its (properties file, training file) pair.
CSV_FILES = {
    2016: ('properties_2016.csv', 'train_2016_v2.csv'),
    2017: ('properties_2017.csv', 'train_2017.csv'),
}
if YEAR not in CSV_FILES:
    # Fail here with a clear message: without this guard an unsupported YEAR
    # would leave `properties`/`train` undefined (or stale from an earlier
    # run) and the merge below would fail or silently use old data.
    raise ValueError('Unsupported YEAR {!r}; expected one of {}'.format(YEAR, sorted(CSV_FILES)))
properties_path, train_path = CSV_FILES[YEAR]
properties = pd.read_csv(properties_path, low_memory = False)
train = pd.read_csv(train_path, low_memory = False)
# train has Y and properties has features
# Left join on parcelid: every row of train is kept and the matching property
# features are attached; rows without a match get NaN in the feature columns.
train = train.merge(properties, on = 'parcelid', how = 'left')

In [69]:
# Missing-value report before any column is removed
print('BEFORE')
print(check_na(train))
# Keep a column only when its number of null rows does not exceed
# THRESHOLD times the total number of rows
max_null_rows = train.shape[0] * THRESHOLD
null_rows_per_column = train.isnull().sum(axis=0)
train = train.loc[:, null_rows_per_column <= max_null_rows]
# Same report on the reduced frame
print('AFTER')
print(check_na(train))

BEFORE
     prop_na                        column
0   0.000000                      parcelid
1   0.000000                      logerror
2   0.000000               transactiondate
27  0.000438                     longitude
34  0.000438     propertycountylandusecode
35  0.000438         propertylandusetypeid
37  0.000438        rawcensustractandblock
39  0.000438                regionidcounty
42  0.000438                       roomcnt
19  0.000438                          fips
7   0.000438                    bedroomcnt
6   0.000438                   bathroomcnt
54  0.000438                assessmentyear
26  0.000438                      latitude
53  0.000451             taxvaluedollarcnt
55  0.000464         landtaxvaluedollarcnt
56  0.000502                     taxamount
41  0.001082                   regionidzip
52  0.001920    structuretaxvaluedollarcnt
13  0.003028  calculatedfinishedsquarefeet
59  0.003621           censustractandblock
49  0.003917                     yearbuilt
21  

In [70]:
# Convert transactiondate strings into floats
if DATE_CONVERSION == 'timestamps':
    # Look the column up by name rather than by position and parse it in a
    # single vectorized call instead of a Python loop over every row.
    transaction_dates = pd.to_datetime(train['transactiondate'], format='%Y-%m-%d')
    # Seconds elapsed since 1970-01-01 00:00. Unlike time.mktime, this does not
    # depend on the timezone of the machine that runs the notebook, so the
    # feature values are reproducible everywhere.
    seconds_since_epoch = (transaction_dates - pd.Timestamp('1970-01-01')).dt.total_seconds()
    # assign() returns a new frame and keeps the column in its original position.
    train = train.assign(transactiondate=seconds_since_epoch)
else:
    # Only one strategy is implemented; stop with a clear message instead of
    # failing later on a length mismatch.
    raise ValueError('Unsupported DATE_CONVERSION: {!r}'.format(DATE_CONVERSION))
# Drop the columns with string and int values. A listed column may already have
# been removed by the THRESHOLD filter, so only drop the ones still present.
string_columns = ['propertycountylandusecode', 'propertyzoningdesc']
train = train.drop(columns=[col for col in string_columns if col in train.columns])

In [71]:
# Preprocessing
# Pick the target and the features by column name instead of by position, so
# the split does not silently change if the column order ever does.
# y is the logerror column as an (n_rows, 1) array; x is every remaining
# column except the parcelid identifier.
y = train['logerror'].values.reshape(-1, 1)
x = train.drop(columns=['parcelid', 'logerror']).values

# KFolds
# A fixed seed makes the shuffled folds, and every score computed from them,
# repeatable across runs.
KFOLD_SEED = 0
kf = KFold(n_splits = KFOLD_SPLITS, shuffle = True, random_state = KFOLD_SEED)
folds = list(kf.split(x))
for train_index, test_index in folds:
    print("TRAIN:", train_index, "TEST:", test_index)

# Only one split is carried forward: the last fold is the hold-out set and the
# other folds form the training set. Scores computed from these variables are
# therefore a single hold-out estimate, not a cross-validated average.
train_index, test_index = folds[-1]
x_train, x_test = x[train_index], x[test_index]
y_train, y_test = y[train_index], y[test_index]

TRAIN: [    0     1     2 ... 77609 77611 77612] TEST: [   14    17    29 ... 77590 77605 77610]
TRAIN: [    0     1     2 ... 77610 77611 77612] TEST: [    5    12    19 ... 77596 77602 77606]
TRAIN: [    0     2     3 ... 77610 77611 77612] TEST: [    1     8    10 ... 77587 77604 77607]
TRAIN: [    0     1     2 ... 77610 77611 77612] TEST: [    7     9    18 ... 77536 77597 77608]
TRAIN: [    0     1     3 ... 77610 77611 77612] TEST: [    2     4    13 ... 77585 77594 77601]
TRAIN: [    0     1     2 ... 77607 77608 77610] TEST: [   11    20    31 ... 77609 77611 77612]
TRAIN: [    1     2     3 ... 77610 77611 77612] TEST: [    0    35    43 ... 77579 77589 77599]
TRAIN: [    0     1     2 ... 77610 77611 77612] TEST: [   15    46    57 ... 77575 77576 77603]
TRAIN: [    0     1     2 ... 77610 77611 77612] TEST: [   16    24    32 ... 77569 77572 77581]
TRAIN: [    0     1     2 ... 77610 77611 77612] TEST: [    3     6    23 ... 77592 77598 77600]


In [79]:
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV


# Gradient-boosted regressor built from a fixed set of hyperparameters
xgb_params = {'max_depth': 6, 'n_estimators': 500, 'learning_rate': 0.01}
model = XGBRegressor(**xgb_params)
# Optional hyperparameter search, currently disabled:
# reg_cv = GridSearchCV(model, {'max_depth': [4,6,8], 'n_estimators': [500,1000],
#     'learning_rate': [0.1, 0.01]}, verbose=1, cv=2)
# reg_cv.fit(x_train, y_train)


In [80]:
# To train with the grid-search result instead, rebuild the model first:
# model = XGBRegressor(**reg_cv.best_params_)
# Fit on the training part of the split; the fitted estimator is echoed as the cell output
model.fit(X=x_train, y=y_train)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.01, max_delta_step=0,
       max_depth=6, min_child_weight=1, missing=None, n_estimators=500,
       n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [81]:
# Score the held-out rows (x_test) with the fitted model
prediction = model.predict(x_test)   # predicted values for x_test
# Bare expression so the notebook echoes the array as the cell output
prediction

array([0.05079138, 0.01918674, 0.02837124, ..., 0.07011229, 0.02022144,
       0.02401921], dtype=float32)

In [82]:
from sklearn.metrics import mean_squared_error as mse
# Mean squared error of the predictions against the held-out targets
mse(y_true=y_test, y_pred=prediction)

0.03156123321371255