# Load Packages

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Effectively disable truncation when displaying wide or long frames
max_display = 999
pd.options.display.max_rows = max_display
pd.options.display.max_columns = max_display

from sklearn.linear_model import LinearRegression 
from sklearn.model_selection import train_test_split
import joblib

In [2]:
# Ensemble Methods
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error as mse

# Load Data

In [3]:
# Read the rolling-sales CSV (path is relative to the notebook's working directory)
data = pd.read_csv(filepath_or_buffer='data/nyc-rolling-sales.csv')

# Clean Data

In [4]:
### Clean Data
# Convert price to numeric: unparseable entries become NaN and are then set to 0,
# so the price filter below discards them.
data['SALE PRICE'] = pd.to_numeric(data['SALE PRICE'], errors='coerce').fillna(0)

# Convert the area columns to numeric (unparseable entries become NaN and are
# removed by the dropna at the end of this cell)
for area_col in ('GROSS SQUARE FEET', 'LAND SQUARE FEET'):
    data[area_col] = pd.to_numeric(data[area_col], errors='coerce')

# Convert to date (unparseable entries become NaT)
data['SALE DATE'] = pd.to_datetime(data['SALE DATE'], errors='coerce')

# Drop non-positive prices and trim the upper tail above the 95th percentile.
# The lower bound is a fixed 0, not a low percentile. The 95th percentile is
# taken over all rows, including the zero-filled ones.
zero = 0
ninetyfifth = data['SALE PRICE'].quantile(0.95)
data = data[(data['SALE PRICE'] > zero) &
            (data['SALE PRICE'] <= ninetyfifth)].copy()

# Handle Missing Values by Dropping (for now)
data = data.dropna()


In [5]:
# Rows and columns remaining after cleaning
data.shape

(36128, 22)

In [6]:
# Target column
dep_var = 'SALE PRICE'

# Columns that get one-hot encoded
cat_names = [
    'BOROUGH',
    'NEIGHBORHOOD',
    'BUILDING CLASS CATEGORY',
    'TAX CLASS AT PRESENT',
    'BUILDING CLASS AT PRESENT',
    'BUILDING CLASS AT TIME OF SALE',
]

# Columns that get min-max scaled
cont_names = [
    'LAND SQUARE FEET',
    'GROSS SQUARE FEET',
    'RESIDENTIAL UNITS',
    'COMMERCIAL UNITS',
]

# Feature Engineering

In [10]:
# One-hot encode the categorical columns. Casting to str first makes
# get_dummies treat every selected column as a label, whatever its dtype.
df_cat = pd.get_dummies(data.loc[:, cat_names].astype(str))

# Continuous predictors live in their own frame
df_cont = data.loc[:, cont_names]

# Min-max scale each continuous column to [0, 1].
# NOTE(review): min/max are taken over the full dataset, i.e. before the
# train/test split further down — confirm this is acceptable for the models used.
col_min = df_cont.min()
col_range = df_cont.max() - col_min
df_cont_norm = (df_cont - col_min) / col_range

# Design matrix: encoded categoricals followed by the scaled continuous columns
X = pd.concat([df_cat, df_cont_norm], axis='columns')

# Target is kept on the raw price scale (no log transform applied)
y = data[dep_var]

In [11]:
# Pairwise correlations of the continuous features, coloured on one shared scale
corr_matrix = df_cont.corr()
corr_matrix.style.background_gradient(cmap='coolwarm', axis=None)

Unnamed: 0,LAND SQUARE FEET,GROSS SQUARE FEET,RESIDENTIAL UNITS,COMMERCIAL UNITS
LAND SQUARE FEET,1.0,0.791965,0.654991,0.0291856
GROSS SQUARE FEET,0.791965,1.0,0.870247,0.0237757
RESIDENTIAL UNITS,0.654991,0.870247,1.0,0.00917148
COMMERCIAL UNITS,0.0291856,0.0237757,0.00917148,1.0


In [12]:
# Sanity check: feature matrix and target must have the same number of rows
X.shape, y.shape

((36128, 560), (36128,))

# Modelling

In [13]:
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.4, random_state = 42)

# model = RandomForestRegressor(n_estimators=300, random_state=42)
model = GradientBoostingRegressor(n_estimators=300, random_state=42, )

# Fit Model
%time model.fit(X_train, y_train)

CPU times: user 38.5 s, sys: 155 ms, total: 38.6 s
Wall time: 39 s


GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
                          learning_rate=0.1, loss='ls', max_depth=3,
                          max_features=None, max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=1, min_samples_split=2,
                          min_weight_fraction_leaf=0.0, n_estimators=300,
                          n_iter_no_change=None, presort='auto',
                          random_state=42, subsample=1.0, tol=0.0001,
                          validation_fraction=0.1, verbose=0, warm_start=False)

In [14]:
# Predict on both splits so train and test error can be compared
y_pred_train = model.predict(X_train)
y_pred = model.predict(X_test)

In [15]:
# Test-set RMSE. Arguments follow the (y_true, y_pred) convention; MSE is
# symmetric, so the value is the same in either order.
test_mse = mse(y_test, y_pred)
rmse_score = np.sqrt(test_mse)

In [16]:
# Test-set RMSE, in the same units as SALE PRICE
rmse_score

361354.98524531676

In [17]:
# Training-set RMSE, for comparison against the test-set figure
train_mse = mse(y_train, y_pred_train)
rmse_score_train = np.sqrt(train_mse)
rmse_score_train

338733.0641055103

# AutoML

In [None]:
# !pip install tpot;
from tpot import TPOTRegressor

In [None]:
# Search budget for TPOT
gen_num = 5          # generations to evolve
pop_size = 10        # pipelines per generation

# Time limits, in minutes
max_tot_time = 30    # whole search
max_eval_time = 3    # single pipeline evaluation

# Progress output level passed to TPOT's `verbosity`
verbose = 2

In [None]:
import os
import shutil

# Folder TPOT uses for its periodic checkpoints. exist_ok avoids the
# check-then-create race and makes the cell safe to re-run.
datadir = './pipelines'
os.makedirs(datadir, exist_ok=True)

# Evolutionary pipeline search. The seed makes the otherwise stochastic search
# repeatable, matching the random_state=42 used for the other models.
tpot = TPOTRegressor(
    generations=gen_num,
    population_size=pop_size,
    max_time_mins=max_tot_time,
    max_eval_time_mins=max_eval_time,
    verbosity=verbose,
    memory='auto',
    periodic_checkpoint_folder=datadir,  # single source of truth for the path
    warm_start=True,
    random_state=42,
)
tpot.fit(X_train, y_train)

# Report the best pipeline's score on the held-out split
print(tpot.score(X_test, y_test))

In [None]:
# Top five evaluated pipelines, best internal CV score first.
# Rows with infinite or missing values are dropped before ranking.
evaluated = pd.DataFrame(dict(tpot.evaluated_individuals_.items())).T
(
    evaluated
    .replace([np.inf, -np.inf], np.nan)
    .dropna()
    .drop(columns='generation')
    .sort_values('internal_cv_score', ascending=False)
    .head()
)

In [None]:
# NOTE(review): bare literal with no visible use — looks like leftover scratch; confirm and consider removing
1.7