In [1]:
# Load dependencies/packages
import numpy as np # for array operations
import pandas as pd # for neat tables, database-like grouping/summary tables & csv_read
import matplotlib.pyplot as plt # for the occasional plot
import seaborn as sns # for the occasional plot
sns.set_theme(style="whitegrid")
import random # for random selection of starting variables/features/predictors

### Regression Machine Learning Model Architectures ###
from sklearn.model_selection import train_test_split, GridSearchCV # train / test split method & Grid Search cross validation
from multiprocessing import cpu_count

# baseline methods
from sklearn.linear_model import ElasticNet 
# (1) ElasticNet, parameter to tune: 'L1 ratio', where L1=1 Lasso, L1=0 i.e. L2=1 Ridge and 0<L1<1 is ElasticNet
from sklearn.neighbors import KNeighborsRegressor # (2) KNN, parameter to tune: 'k neighbors'
from sklearn.svm import SVR # (3) SVM Regression, parameters to tune: 'C regularization par' AND 'epsilon'

# ensemble methods
from sklearn.ensemble import RandomForestRegressor 
# (4) Random Forest (regression) split by 'squared error', parameter to tune: ... 
# ... 'min_samples_leaf' i.e. extent of how much the are individual trees grown AND 'n_estimators'
from sklearn.ensemble import AdaBoostRegressor  
from sklearn.ensemble import GradientBoostingRegressor 
# (5, 6) Boosting (regression), parameter to tune: 'learning rate' AND 'n_estimators'
 
# neural networks
from sklearn.neural_network import MLPRegressor
# (7) Neural Network, parameter to tune: 'alpha' AND 'hidden layer architecture'

# regression model metrics
from sklearn.metrics import mean_absolute_error, r2_score

# Ignore convergence warnings
from warnings import simplefilter
from sklearn.exceptions import ConvergenceWarning
simplefilter("ignore", category=ConvergenceWarning)

In [2]:
def normalize_data(data):
    return (data - np.min(data)) / (np.max(data) - np.min(data))

In [3]:
# Load Data - 36 month Melb house sale data set
model_data = pd.read_csv('data/melb_house_36mth.csv') # load data

In [4]:
model_data

Unnamed: 0,Rooms,Price,Date,Distance,Bedroom2,Bedroom2_nan,Bathroom,Bathroom_nan,Car,Car_nan,...,CouncilArea_Yarra City Council,CouncilArea_Yarra Ranges Shire Council,RegionName_Eastern Metropolitan,RegionName_Eastern Victoria,RegionName_Northern Metropolitan,RegionName_Northern Victoria,RegionName_South-Eastern Metropolitan,RegionName_Southern Metropolitan,RegionName_Western Metropolitan,RegionName_Western Victoria
0,2,1035000.0,2016-04-02,2.5,2.0,False,1.0,False,0.0,False,...,1,0,0,0,1,0,0,0,0,0
1,3,1465000.0,2017-04-03,2.5,3.0,False,2.0,False,0.0,False,...,1,0,0,0,1,0,0,0,0,0
2,4,1600000.0,2016-04-06,2.5,3.0,False,1.0,False,2.0,False,...,1,0,0,0,1,0,0,0,0,0
3,3,1876000.0,2016-07-05,2.5,4.0,False,2.0,False,0.0,False,...,1,0,0,0,1,0,0,0,0,0
4,2,1636000.0,2016-08-10,2.5,2.0,False,1.0,False,2.0,False,...,1,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9353,3,500000.0,2018-02-24,25.5,3.0,False,2.0,False,2.0,False,...,0,0,0,0,1,0,0,0,0,0
9354,3,570000.0,2018-02-24,25.5,3.0,False,2.0,False,2.0,False,...,0,0,0,0,1,0,0,0,0,0
9355,2,888000.0,2018-02-24,6.3,2.0,False,2.0,False,1.0,False,...,0,0,0,0,0,0,0,0,1,0
9356,2,705000.0,2018-02-24,6.3,2.0,False,1.0,False,2.0,False,...,0,0,0,0,0,0,0,0,1,0


In [5]:
# 0-1 Scaling for float/int data
scaling_var_list = ['Rooms', 'Price', 'Distance', 'Bedroom2', 'Bathroom', 'Car', 'Landsize', 'BuildingArea',
             'YearBuilt', 'PropertyCount']
for name in scaling_var_list:
    model_data[name] = normalize_data(model_data[name]) # cast to 0-1 scale

In [6]:
# drop dates
model_data = model_data.drop(columns='Date')

# drop remaining NaNs
model_data = model_data.dropna()

In [7]:
model_data.shape # (8876, 377)

(8876, 377)

In [8]:
# Split into X and y train and test 80% / 20%
X_train, X_test, y_train, y_test = train_test_split(
    model_data.drop(columns=['Price']), model_data['Price'], test_size=0.2, random_state=12)

In [9]:
# Results DataFrame
col_names = ["Reg Model Type", "MSE best params", "MSE", "r2 best params", "r2"]
results_df = pd.DataFrame(columns=col_names)
results_df["Reg Model Type"] = ["ElasticNet", "KNN", "SVM", "RandomForest",
                               "AdaBoost", "GradientBoost", "NeuralNet"]
MSE_best_params_list = []
MSE_list = []
r2_best_params_list = []
r2_list = []

In [10]:
### Machine Learning Algorithm Applications ###
# (1) Linear Regression, ElasticNet MSE
param_grid = {
    "l1_ratio": [0, 0.1, 0.2, 0.35, 0.5, 0.65, 0.8, 0.9, 1],
}
elasticnet_MSE_model = GridSearchCV(estimator=ElasticNet(),
                             param_grid=param_grid, 
                             scoring='neg_mean_squared_error', cv=5, 
                             n_jobs=cpu_count(), refit=True)
elasticnet_MSE_model.fit(X_train, y_train)
elasticnet_MSE_best_params = elasticnet_MSE_model.best_params_ 
print(elasticnet_MSE_best_params) # i.e l1=0, i.e. Ridge
elasticnet_MSE_score = abs(elasticnet_MSE_model.score(X_test, y_test))
print(elasticnet_MSE_score)

{'l1_ratio': 0}
0.009258048304767733


In [11]:
# (1) Linear Regression, ElasticNet r^2
param_grid = {
    "l1_ratio": [0, 0.1, 0.2, 0.35, 0.5, 0.65, 0.8, 0.9, 1],
}
elasticnet_r2_model = GridSearchCV(estimator=ElasticNet(),
                             param_grid=param_grid, 
                             scoring='r2', cv=5, 
                             n_jobs=cpu_count(), refit=True)
elasticnet_r2_model.fit(X_train, y_train)
elasticnet_r2_best_params = elasticnet_r2_model.best_params_ 
print(elasticnet_r2_best_params) # i.e l1=0, i.e. Ridge
elasticnet_r2_score = elasticnet_r2_model.score(X_test, y_test)
print(elasticnet_r2_score)

{'l1_ratio': 0}
0.1613310150824928


In [12]:
MSE_best_params_list.append(str(elasticnet_MSE_best_params))
MSE_list.append(elasticnet_MSE_score)
r2_best_params_list.append(str(elasticnet_r2_best_params))
r2_list.append(elasticnet_r2_score)

In [13]:
# (2) KNN Regression MSE
param_grid = {
    "n_neighbors": np.arange(start=3, stop=15, step=2),
}

knn_MSE_model = GridSearchCV(estimator=KNeighborsRegressor(),
                      param_grid=param_grid,
                      scoring='neg_mean_squared_error', cv = 5,
                      n_jobs=cpu_count(), refit = True)
knn_MSE_model.fit(X_train, y_train)
knn_MSE_best_params = knn_MSE_model.best_params_ 
print(knn_MSE_best_params)
knn_MSE_score = abs(knn_MSE_model.score(X_test, y_test))
print(knn_MSE_score)

{'n_neighbors': 7}
0.004776185672878425


In [14]:
# (2) KNN Regression r^2
param_grid = {
    "n_neighbors": np.arange(start=3, stop=15, step=2),
}

knn_r2_model = GridSearchCV(estimator=KNeighborsRegressor(),
                      param_grid=param_grid,
                      scoring='r2', cv = 5,
                      n_jobs=cpu_count(), refit = True)
knn_r2_model.fit(X_train, y_train)
knn_r2_best_params = knn_r2_model.best_params_ 
print(knn_r2_best_params)
knn_r2_score = knn_r2_model.score(X_test, y_test)
print(knn_r2_score)

{'n_neighbors': 7}
0.5673344253358825


In [15]:
MSE_best_params_list.append(str(knn_MSE_best_params))
MSE_list.append(knn_MSE_score)
r2_best_params_list.append(str(knn_r2_best_params))
r2_list.append(knn_r2_score)

In [16]:
# (3) SVM Regression MSE
param_grid = {
    "C": [0.5, 1, 5, 10],
    "epsilon": [0.1, 0.2, 0.35, 0.5, 0.8, 1]
}

svm_MSE_model = GridSearchCV(estimator=SVR(),
                      param_grid=param_grid,
                      scoring='neg_mean_squared_error', cv = 5,
                      n_jobs=cpu_count(), refit = True)
svm_MSE_model.fit(X_train, y_train)
svm_MSE_best_params = svm_MSE_model.best_params_ 
print(svm_MSE_best_params)
svm_MSE_score = abs(svm_MSE_model.score(X_test, y_test))
print(svm_MSE_score)

{'C': 10, 'epsilon': 0.1}
0.003581665801323034


In [17]:
# (3) SVM Regression r^2
param_grid = {
    "C": [0.5, 1, 5, 10],
    "epsilon": [0.1, 0.2, 0.35, 0.5, 0.8, 1]
}

svm_r2_model = GridSearchCV(estimator=SVR(),
                      param_grid=param_grid,
                      scoring='r2', cv = 5,
                      n_jobs=cpu_count(), refit = True)
svm_r2_model.fit(X_train, y_train)
svm_r2_best_params = svm_r2_model.best_params_ 
print(svm_r2_best_params)
svm_r2_score = svm_r2_model.score(X_test, y_test)
print(svm_r2_score)

{'C': 10, 'epsilon': 0.1}
0.6755437082389797


In [18]:
MSE_best_params_list.append(str(svm_MSE_best_params))
MSE_list.append(svm_MSE_score)
r2_best_params_list.append(str(svm_r2_best_params))
r2_list.append(svm_r2_score)

In [19]:
# (4) Random Forest MSE
param_grid = {
    "n_estimators": [50, 100, 150, 200, 250],
    "min_samples_leaf": [1, 5, 10, 20, 50],
    "max_features": ['sqrt'],
}

randfor_MSE_model = GridSearchCV(estimator=RandomForestRegressor(),
                        param_grid=param_grid,
                        scoring='neg_mean_squared_error', cv=5,
                        n_jobs=cpu_count(), refit=True)
randfor_MSE_model.fit(X_train, y_train)
randfor_MSE_best_params = randfor_MSE_model.best_params_ 
print(randfor_MSE_best_params)
randfor_MSE_score = abs(randfor_MSE_model.score(X_test, y_test))
print(randfor_MSE_score)

{'max_features': 'sqrt', 'min_samples_leaf': 1, 'n_estimators': 200}
0.0017876184184862452


In [20]:
# (4) Random Forest r^2
param_grid = {
    "n_estimators": [50, 100, 150, 200, 250],
    "min_samples_leaf": [1, 5, 10, 20, 50],
    "max_features": ['sqrt'],
}

randfor_r2_model = GridSearchCV(estimator=RandomForestRegressor(),
                        param_grid=param_grid,
                        scoring='r2', cv=5,
                        n_jobs=cpu_count(), refit=True)
randfor_r2_model.fit(X_train, y_train)
randfor_r2_best_params = randfor_r2_model.best_params_ 
print(randfor_r2_best_params)
randfor_r2_score = randfor_r2_model.score(X_test, y_test)
print(randfor_r2_score)

{'max_features': 'sqrt', 'min_samples_leaf': 1, 'n_estimators': 200}
0.8354027035308003


In [21]:
MSE_best_params_list.append(str(randfor_MSE_best_params))
MSE_list.append(randfor_MSE_score)
r2_best_params_list.append(str(randfor_r2_best_params))
r2_list.append(randfor_r2_score)

In [22]:
# (5) AdaBoost MSE
param_grid = {
    "n_estimators": [50, 100, 200, 400],
    "learning_rate": [0.1, 0.25, 0.5],
}

adaboost_MSE_model = GridSearchCV(estimator=AdaBoostRegressor(),
                        param_grid=param_grid,
                        scoring='neg_mean_squared_error', cv=5,
                        n_jobs=cpu_count(), refit=True)
adaboost_MSE_model.fit(X_train, y_train)
adaboost_MSE_best_params = adaboost_MSE_model.best_params_ 
print(adaboost_MSE_best_params)
adaboost_MSE_score = abs(adaboost_MSE_model.score(X_test, y_test))
print(adaboost_MSE_score)

{'learning_rate': 0.1, 'n_estimators': 100}
0.0038273902398584074


In [23]:
# (5) AdaBoost r^2
param_grid = {
    "n_estimators": [50, 100, 200], # narrowed search
    "learning_rate": [0.1, 0.25, 0.5],
}

adaboost_r2_model = GridSearchCV(estimator=AdaBoostRegressor(),
                        param_grid=param_grid,
                        scoring='r2', cv=5,
                        n_jobs=cpu_count(), refit=True)
adaboost_r2_model.fit(X_train, y_train)
adaboost_r2_best_params = adaboost_r2_model.best_params_ 
print(adaboost_r2_best_params)
adaboost_r2_score = adaboost_r2_model.score(X_test, y_test)
print(adaboost_r2_score)

{'learning_rate': 0.1, 'n_estimators': 100}
0.6499037943414444


In [24]:
MSE_best_params_list.append(str(adaboost_MSE_best_params))
MSE_list.append(adaboost_MSE_score)
r2_best_params_list.append(str(adaboost_r2_best_params))
r2_list.append(adaboost_r2_score)

In [25]:
# (6) GradientBoost MSE
param_grid = {
    "n_estimators": [100, 200, 400],
    "learning_rate": [0.1, 0.25, 0.5],
}

gradboost_MSE_model = GridSearchCV(estimator=GradientBoostingRegressor(),
                        param_grid=param_grid,
                        scoring='neg_mean_squared_error', cv=5,
                        n_jobs=cpu_count(), refit=True)
gradboost_MSE_model.fit(X_train, y_train)
gradboost_MSE_best_params = gradboost_MSE_model.best_params_ 
print(gradboost_MSE_best_params)
gradboost_MSE_score = abs(gradboost_MSE_model.score(X_test, y_test))
print(gradboost_MSE_score)

{'learning_rate': 0.25, 'n_estimators': 400}
0.0017398653619833903


In [26]:
# (6) GradientBoost r^2
param_grid = {
    "n_estimators": [100, 200, 400],
    "learning_rate": [0.1, 0.25, 0.5],
}

gradboost_r2_model = GridSearchCV(estimator=GradientBoostingRegressor(),
                        param_grid=param_grid,
                        scoring='r2', cv=5,
                        n_jobs=cpu_count(), refit=True)
gradboost_r2_model.fit(X_train, y_train)
gradboost_r2_best_params = gradboost_r2_model.best_params_ 
print(gradboost_r2_best_params)
gradboost_r2_score = gradboost_r2_model.score(X_test, y_test)
print(gradboost_r2_score)

{'learning_rate': 0.25, 'n_estimators': 400}
0.8373138713910879


In [27]:
MSE_best_params_list.append(str(gradboost_MSE_best_params))
MSE_list.append(gradboost_MSE_score)
r2_best_params_list.append(str(gradboost_r2_best_params))
r2_list.append(gradboost_r2_score)

In [28]:
# (7) Neural Network Regression MSE
param_grid = {
    'alpha': [1e-05, 0.001, 0.1, 0.35, 0.5, 0.8, 1],
    'hidden_layer_sizes': [(10,), (25,), (50,), (100,), (200,), (377,), (10, 2), (15, 2), (20,2), (5, 3)],
}

cvnn_MSE_model = GridSearchCV(estimator=MLPRegressor(activation='relu', solver='adam'),
                        param_grid=param_grid,
                        scoring='neg_mean_squared_error', cv=5,
                        n_jobs=cpu_count(), refit=True)
cvnn_MSE_model.fit(X_train, y_train)
cvnn_MSE_best_params = cvnn_MSE_model.best_params_ 
print(cvnn_MSE_best_params)
cvnn_MSE_score = abs(cvnn_MSE_model.score(X_test, y_test))
print(cvnn_MSE_score)

{'alpha': 0.001, 'hidden_layer_sizes': (377,)}
0.0021478264556019965


In [29]:
# (7) Neural Network Regression r^2
param_grid = {
    'alpha': [1e-05, 0.001, 0.1], # narrowed search
    'hidden_layer_sizes': [(200,), (377,), (20,2), (5, 5)], # narrowed search
}

cvnn_r2_model = GridSearchCV(estimator=MLPRegressor(activation='relu', solver='adam'),
                        param_grid=param_grid,
                        scoring='r2', cv=5,
                        n_jobs=cpu_count(), refit=True)
cvnn_r2_model.fit(X_train, y_train)
cvnn_r2_best_params = cvnn_r2_model.best_params_ 
print(cvnn_r2_best_params)
cvnn_r2_score = cvnn_r2_model.score(X_test, y_test)
print(cvnn_r2_score)

{'alpha': 0.001, 'hidden_layer_sizes': (377,)}
0.802873885932256


In [30]:
MSE_best_params_list.append(str(cvnn_MSE_best_params))
MSE_list.append(cvnn_MSE_score)
r2_best_params_list.append(str(cvnn_r2_best_params))
r2_list.append(cvnn_r2_score)

In [31]:
results_df["MSE best params"] = MSE_best_params_list
results_df["MSE"] = MSE_list
results_df["r2 best params"] = r2_best_params_list
results_df["r2"] = r2_list

In [32]:
print('###### 36 Month Data, Machine Learning Model Performance #####')
results_df

###### 36 Month Data, Machine Learning Model Performance #####


Unnamed: 0,Reg Model Type,MSE best params,MSE,r2 best params,r2
0,ElasticNet,{'l1_ratio': 0},0.009258,{'l1_ratio': 0},0.161331
1,KNN,{'n_neighbors': 7},0.004776,{'n_neighbors': 7},0.567334
2,SVM,"{'C': 10, 'epsilon': 0.1}",0.003582,"{'C': 10, 'epsilon': 0.1}",0.675544
3,RandomForest,"{'max_features': 'sqrt', 'min_samples_leaf': 1...",0.001788,"{'max_features': 'sqrt', 'min_samples_leaf': 1...",0.835403
4,AdaBoost,"{'learning_rate': 0.1, 'n_estimators': 100}",0.003827,"{'learning_rate': 0.1, 'n_estimators': 100}",0.649904
5,GradientBoost,"{'learning_rate': 0.25, 'n_estimators': 400}",0.00174,"{'learning_rate': 0.25, 'n_estimators': 400}",0.837314
6,NeuralNet,"{'alpha': 0.001, 'hidden_layer_sizes': (377,)}",0.002148,"{'alpha': 0.001, 'hidden_layer_sizes': (377,)}",0.802874
