In [9]:
# Imports
import numpy as np
import pandas as pd
import time
import datetime
import tensorflow as tf
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import KFold
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

In [10]:
# Defines
YEAR = 2017
THRESHOLD = .60
FILL_NA = 0
DATE_CONVERSION = 'timestamps'
PREPROCESSING = 'MinMax'
KFOLD_SPLITS = 10

In [11]:
# Functions
def check_na(train):
    # Finds the number of missing values in each column
    num_of_na = [train.loc[:,col].isnull().sum() for col in train]
    # Divide by rows for proportion 
    prop_na = [num / train.shape[0] for num in num_of_na]
    # Put the proporitons and column names into a df and sort
    na_df = pd.DataFrame({'prop_na' : prop_na, 'column' : train.columns}).sort_values('prop_na')
    return na_df

In [12]:
# Read csvs
if YEAR == 2016:
    properties = pd.read_csv('properties_2016.csv', low_memory = False)
    train = pd.read_csv('train_2016_v2.csv', low_memory = False)
elif YEAR == 2017:
    properties = pd.read_csv('properties_2017.csv', low_memory = False)
    train = pd.read_csv('train_2017.csv', low_memory = False)
# train has Y and properties has features
# Find row intersection of train and properties
train = train.merge(properties, on = 'parcelid', how = 'left')

In [13]:
print('BEFORE')
print(check_na(train))
# Remove all columns above the THRESHOLD
train = train.loc[:, (train.isnull().sum(axis=0) <= (train.shape[0]*THRESHOLD))]
print('AFTER')
print(check_na(train))

BEFORE
     prop_na                        column
0   0.000000                      parcelid
1   0.000000                      logerror
2   0.000000               transactiondate
27  0.000438                     longitude
34  0.000438     propertycountylandusecode
35  0.000438         propertylandusetypeid
37  0.000438        rawcensustractandblock
39  0.000438                regionidcounty
42  0.000438                       roomcnt
19  0.000438                          fips
7   0.000438                    bedroomcnt
6   0.000438                   bathroomcnt
54  0.000438                assessmentyear
26  0.000438                      latitude
53  0.000451             taxvaluedollarcnt
55  0.000464         landtaxvaluedollarcnt
56  0.000502                     taxamount
41  0.001082                   regionidzip
52  0.001920    structuretaxvaluedollarcnt
13  0.003028  calculatedfinishedsquarefeet
59  0.003621           censustractandblock
49  0.003917                     yearbuilt
21  

TODO: fill the missing values via prediction

Types of variables:  
- Continuous  
- Discrete  
- Categorical

Predict variables according to their type:  
- Linear: continuous
- Logistic: discrete, categorical


In [14]:
continuous = ['bathroomcnt',
              'buildingqualitytypeid',
              'calculatedbathnbr',
              'finishedfloor1squarefeet',
              'calculatedfinishedsquarefeet',
              'finishedsquarefeet6',
              'finishedsquarefeet12',
              'finishedsquarefeet13',
              'finishedsquarefeet15',
              'finishedsquarefeet50',
              'garagetotalsqft',
              'latitude',
              'longitude',
              'lotsizesquarefeet',
              'poolsizesum',
              'taxvaluedollarcnt',
              'structuretaxvaluedollarcnt',
              'landtaxvaluedollarcnt',
              'taxamount'
]

discrete = ['bedroomcnt',
            'threequarterbathnbr',
            'fireplacecnt',
            'fullbathcnt',
            'garagecarcnt',
            'numberofstories',
            'poolcnt',
            'roomcnt',
            'unitcnt'
]
categorical = ['airconditioningtypeid', 
               'architecturalstyletypeid',
               'basementsqft',
               'buildingclasstypeid',
               'decktypeid',
               'fips',
               'fireplaceflag',
               'hashottuborspa',
               'heatingorsystemtypeid',
               'pooltypeid10',
               'pooltypeid2',
               'pooltypeid7',
               'propertycountylandusecode',
               'propertylandusetypeid',
               'propertyzoningdesc',
               'rawcensustractandblock',
               'censustractandblock',
               'regionidcounty',
               'regionidcity',
               'regionidzip',
               'regionidneighborhood',
               'storytypeid',
               'typeconstructiontypeid',
               'yardbuildingsqft17',
               'yardbuildingsqft26',
               'yearbuilt',
               'assessmentyear',
               'taxdelinquencyflag',
               'taxdelinquencyyear'
]

In [15]:
# test linear reg
from sklearn.linear_model import LogisticRegression
# Use saga solver due to large dataset
solver = 'saga' 
lr = LogisticRegression(solver=solver,
#                         multi_class=model,
#                         C=1,
#                         penalty='l1',
#                         fit_intercept=True,
#                         max_iter=this_max_iter,
                        random_state=42,
)

In [16]:
properties = properties.loc[1:100000,]
prop2 = pd.get_dummies(properties)
prop2.dtypes

parcelid                           int64
airconditioningtypeid            float64
architecturalstyletypeid         float64
basementsqft                     float64
bathroomcnt                      float64
bedroomcnt                       float64
buildingclasstypeid              float64
buildingqualitytypeid            float64
calculatedbathnbr                float64
decktypeid                       float64
finishedfloor1squarefeet         float64
calculatedfinishedsquarefeet     float64
finishedsquarefeet12             float64
finishedsquarefeet13             float64
finishedsquarefeet15             float64
finishedsquarefeet50             float64
finishedsquarefeet6              float64
fips                             float64
fireplacecnt                     float64
fullbathcnt                      float64
garagecarcnt                     float64
garagetotalsqft                  float64
heatingorsystemtypeid            float64
latitude                         float64
longitude       

In [17]:
# xtrain is the dataset without rows which are missing a value for 
# the variable in question, 
# and ytrain is that value
variable = 'bedroomcnt'
xtrain_isna = prop2.loc[:, variable].isnull()
xtrain = prop2.drop(variable, axis=1).loc[np.logical_not(xtrain_isna),]
ytrain = prop2.loc[np.logical_not(xtrain_isna), variable]

In [18]:
xtrain.head()

Unnamed: 0,parcelid,airconditioningtypeid,architecturalstyletypeid,basementsqft,bathroomcnt,buildingclasstypeid,buildingqualitytypeid,calculatedbathnbr,decktypeid,finishedfloor1squarefeet,...,propertyzoningdesc_WVRPD12U*,propertyzoningdesc_WVRPD12U-R,propertyzoningdesc_WVRPD17U*,propertyzoningdesc_WVRPD18U*,propertyzoningdesc_WVRPD40000,propertyzoningdesc_WVRPD4OOOO,propertyzoningdesc_WVRPD56*,propertyzoningdesc_WVRR1-RPD1,fireplaceflag_True,taxdelinquencyflag_Y
1,10759547,,,,0.0,,,,,,...,0,0,0,0,0,0,0,0,0,0
2,10843547,,,,0.0,5.0,,,,,...,0,0,0,0,0,0,0,0,0,0
3,10859147,,,,0.0,3.0,6.0,,,,...,0,0,0,0,0,0,0,0,0,0
4,10879947,,,,0.0,4.0,,,,,...,0,0,0,0,0,0,0,0,0,0
5,10898347,,,,0.0,4.0,4.0,,,,...,0,0,0,0,0,0,0,0,0,0


In [19]:
ytrain.head()

1    0.0
2    0.0
3    0.0
4    0.0
5    0.0
Name: bedroomcnt, dtype: float64

In [20]:
# [var for var in discrete + categorical]

In [21]:
lr.fit(xtrain, ytrain)

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [None]:
# # Replace all NAs with number defined in FILL_NA
# train = train.fillna(FILL_NA)
# # Convert transactiondate strings into floats
# date_strings = (train.values[:,2])
# date_converted = []
# if DATE_CONVERSION == 'timestamps':
#     for string in date_strings:
#         date_converted.append(time.mktime(datetime.datetime.strptime(string, "%Y-%m-%d").timetuple()))
# train['transactiondate'] = np.asarray(date_converted)
# # Drop the columns with string and int
# train = train.drop(columns=['propertycountylandusecode', 'propertyzoningdesc'])

In [None]:
# # Preprocessing
# y = train.values[:,1]
# y = y.reshape(y.shape[0],1)
# x = train.values[:,2:]
# if PREPROCESSING == 'MinMax':
#     scaler = MinMaxScaler()
#     scaler.fit(x)
#     x = scaler.transform(x)
# # KFolds
# kf = KFold(n_splits = KFOLD_SPLITS, shuffle = True)
# for train_index, test_index in kf.split(x):
#     print("TRAIN:", train_index, "TEST:", test_index)
#     x_train, x_test = x[train_index], x[test_index]
#     y_train, y_test = y[train_index], y[test_index]

In [None]:
# model = tf.keras.models.Sequential([
#         tf.keras.layers.Dense(6, input_dim = 27, activation = 'relu'),
#         tf.keras.layers.Dense(6, activation = 'relu'),
#         tf.keras.layers.Dense(6, activation = 'relu'),
#         tf.keras.layers.Dense(1, activation = 'linear')
#     ])
# sgd = tf.keras.optimizers.SGD(lr=0.1)
# model.compile(loss = 'mse', optimizer = sgd)
# model.fit(x_train, y_train, epochs = 500, batch_size = 32, verbose = 1)

In [None]:
# print(model.evaluate(x_test, y_test, verbose = 1))