In [None]:
from sklearn.ensemble import GradientBoostingRegressor
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
import numpy as np
import seaborn as sns
from sklearn import preprocessing
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression
from xgboost import XGBRegressor
from sklearn.neural_network import MLPRegressor
import Common_Functions as cmn

# Figuring out which Regressions to apply to our Datasets using an MSE Test on our Training Data

Following Notebooks 1.0 and 1.1, we decided to continue the evaluation without the columns that contain strings (for example, MSZoning or Street), because label-encoding them did not give significantly different results in our model compared to leaving them out. We therefore exported the "new" training and test data, without string columns, into new CSVs called stringless_train.csv and stringless_test.csv, which are read by helper functions in the Common_Functions.py file for further use. This way, we no longer need to reread the original CSVs and remove all NaN and string values each time.

### Hypothesis 2
We think that the two best regressors will be the random forest and the neural-network model MLPRegressor. (TODO: state the reasoning behind this hypothesis.)

In [None]:
# Load the pre-exported training and test frames through the shared helper module.
training_data = cmn.get_stringless_training_data()
testing_data = cmn.get_stringless_testing_data()

# Preview the first rows of each frame as a quick sanity check.
for frame in (training_data, testing_data):
    display(frame.head())

Since the training data has one additional column, containing the sale price of the house with its given parameters, we split the dataframe into the X_train and y_train dataframes to continue with the calculations.

In [None]:
# Positional split: every column except the last is a feature; the last column
# is kept as a one-column DataFrame holding the target.
feature_slice = slice(None, -1)
target_slice = slice(-1, None)

X_train = training_data.iloc[:, feature_slice]
y_train = training_data.iloc[:, target_slice]

# Show shape and a preview of both pieces to confirm the split.
display(X_train.shape)
display(X_train.head())
display(y_train.shape)
display(y_train.head())

In [None]:
# The testing data has no target column, so the whole frame is the feature matrix.
X_test = testing_data
# Placeholder only: there are no ground-truth values for the test set.
y_test = list()

display(X_test.shape)
display(X_test.head())
display(y_test)

# Scoring various Regression Models to verify which we will be using going forward

In [None]:
# Number of folds passed as `cv` to every cross_val_score call in the cells below.
cross_validation = 10

# Collects the per-fold scores of each model, keyed by a display name.
scores_map = dict()

# Rescale every feature with a MinMaxScaler (default settings).
# NOTE(review): the scaler is fitted on all of X_train before cross-validation,
# so each validation fold contributes to the scaling statistics — confirm this
# is acceptable for the model comparison.
min_max_scaler = preprocessing.MinMaxScaler().fit(X_train)
x_scaled = min_max_scaler.transform(X_train)

# Train on log(1 + target) instead of the raw target values.
y = np.log1p(y_train)

##### Random Forest Regressor

In [None]:
# Random forest with default hyper-parameters. The seed is pinned because the
# forest is stochastic; without it a "Restart & Run All" yields different scores.
forest = RandomForestRegressor(random_state=1)

# The 'neg_mean_squared_error' scorer returns the *negated* MSE, one value per fold.
scores = cross_val_score(forest, x_scaled, np.ravel(y), cv=cross_validation, scoring='neg_mean_squared_error')

# Flip the sign for reporting so the value printed as "MSE" is the real, non-negative error.
print(f"MSE: {-scores.mean()} (+/- {scores.std()})")

# Store the raw (negated) fold scores; the comparison plot works on these values.
scores_map['Random Forest'] = scores

##### Gradient Boosting Regressor

In [None]:
# Gradient boosting with many shallow trees and a small learning rate.
# NOTE(review): per the scikit-learn docs `alpha` is only used by the 'huber'
# and 'quantile' losses; with the default loss it should have no effect — confirm.
gbr = GradientBoostingRegressor(
    alpha=0.9,
    learning_rate=0.05,
    max_depth=2,
    min_samples_leaf=5,
    min_samples_split=2,
    n_estimators=1000,
    random_state=1,
)

# The 'neg_mean_squared_error' scorer returns the *negated* MSE, one value per fold.
scores = cross_val_score(gbr, x_scaled, np.ravel(y), cv=cross_validation, scoring='neg_mean_squared_error')

# Flip the sign for reporting so the value printed as "MSE" is the real, non-negative error.
print(f"MSE: {-scores.mean()} (+/- {scores.std()})")

# Store the raw (negated) fold scores; the comparison plot works on these values.
scores_map['Gradient Boosting Regressor'] = scores

##### Decision Tree Regressor

In [None]:
# Single depth-limited tree. The seed is pinned so tie-breaking between equally
# good splits is reproducible across runs.
decision_tree = DecisionTreeRegressor(max_depth=5, random_state=1)

# The 'neg_mean_squared_error' scorer returns the *negated* MSE, one value per fold.
scores = cross_val_score(decision_tree, x_scaled, np.ravel(y), cv=cross_validation, scoring='neg_mean_squared_error')

# Flip the sign for reporting so the value printed as "MSE" is the real, non-negative error.
print(f"MSE: {-scores.mean()} (+/- {scores.std()})")

# Store the raw (negated) fold scores; the comparison plot works on these values.
scores_map['Decision Tree Regressor'] = scores

##### Support Vector Regression

In [None]:
# Support vector regression with an RBF kernel and fixed C / gamma.
svr = SVR(kernel='rbf', C=1e3, gamma=0.1)

# The 'neg_mean_squared_error' scorer returns the *negated* MSE, one value per fold.
scores = cross_val_score(svr, x_scaled, np.ravel(y), cv=cross_validation, scoring='neg_mean_squared_error')

# Flip the sign for reporting so the value printed as "MSE" is the real, non-negative error.
print(f"MSE: {-scores.mean()} (+/- {scores.std()})")

# Store the raw (negated) fold scores; the comparison plot works on these values.
scores_map['Support Vector Regression'] = scores

##### K Nearest Neighbors Regressor

In [None]:
# k-nearest-neighbours regression using the 7 closest training samples.
knn = KNeighborsRegressor(n_neighbors=7)

# The 'neg_mean_squared_error' scorer returns the *negated* MSE, one value per fold.
scores = cross_val_score(knn, x_scaled, np.ravel(y), cv=cross_validation, scoring='neg_mean_squared_error')

# Flip the sign for reporting so the value printed as "MSE" is the real, non-negative error.
print(f"MSE: {-scores.mean()} (+/- {scores.std()})")

# Store the raw (negated) fold scores; the comparison plot works on these values.
scores_map['K Nearest Neighbors Regressor'] = scores

##### Linear Regression

In [None]:
# Ordinary least-squares baseline with default settings.
linear_regression = LinearRegression()

# The 'neg_mean_squared_error' scorer returns the *negated* MSE, one value per fold.
scores = cross_val_score(linear_regression, x_scaled, np.ravel(y), cv=cross_validation, scoring='neg_mean_squared_error')

# Flip the sign for reporting so the value printed as "MSE" is the real, non-negative error.
print(f"MSE: {-scores.mean()} (+/- {scores.std()})")

# Store the raw (negated) fold scores; the comparison plot works on these values.
scores_map['Linear Regression'] = scores

##### XGBoost Regression

In [None]:
# XGBoost with 1000 boosting rounds and otherwise default hyper-parameters.
xgboost = XGBRegressor(n_estimators=1000)

# The 'neg_mean_squared_error' scorer returns the *negated* MSE, one value per fold.
scores = cross_val_score(xgboost, x_scaled, np.ravel(y), cv=cross_validation, scoring='neg_mean_squared_error')

# Flip the sign for reporting so the value printed as "MSE" is the real, non-negative error.
print(f"MSE: {-scores.mean()} (+/- {scores.std()})")

# Store the raw (negated) fold scores; the comparison plot works on these values.
scores_map['XGBoost Regressor'] = scores

##### Multilayer Perceptron Regressor

In [None]:
# Multilayer perceptron with a generous iteration budget. Weight initialisation
# is random, so the seed is pinned to make the fold scores reproducible.
mlp_regression = MLPRegressor(max_iter=10000, random_state=1)

# The 'neg_mean_squared_error' scorer returns the *negated* MSE, one value per fold.
# The target is explicitly cast to float before flattening, as in the original cell.
scores = cross_val_score(mlp_regression, x_scaled, np.ravel(y.astype(float)), cv=cross_validation, scoring='neg_mean_squared_error')

# Flip the sign for reporting so the value printed as "MSE" is the real, non-negative error.
print(f"MSE: {-scores.mean()} (+/- {scores.std()})")

# Store the raw (negated) fold scores; the comparison plot works on these values.
scores_map['MLP Regressor'] = scores

In [None]:
# NOTE(review): disabled experiment, kept for reference only. MLPClassifier is a
# classifier, whereas `y` is a continuous (log-transformed) target, so scoring it
# with an MSE metric is presumably not meaningful — confirm, then delete this cell.
# from sklearn.neural_network import MLPClassifier
#
# clf = MLPClassifier(max_iter=10000)
# #print(np.ravel(y))
# #clf.fit(x_scaled, np.ravel(y_train))
# scores = cross_val_score(clf, x_scaled, np.ravel(y.astype(float)), cv=cross_validation, scoring='neg_mean_squared_error')
# print(f"MSE: {scores.mean()} (+/- {scores.std()})")
#
# scores_map['MLPClassifier'] = scores

In [None]:
# Gather the per-fold scores into a frame: one column per model, one row per fold.
# This rebinds `scores_map` from a dict to a DataFrame, which the final cell displays.
scores_map = pd.DataFrame(scores_map)

# Explicit figure/axes instead of the pyplot state machine.
fig, ax = plt.subplots(figsize=(20, 5.2))
sns.boxplot(data=scores_map, ax=ax)

# Zoom in on the best-scoring models; boxes outside this window are clipped from view.
ax.set_ylim(-.05, -.01)

# The stored values come from the 'neg_mean_squared_error' scorer, i.e. they are
# negated MSE, so the axis is labelled accordingly rather than as a plain "error".
ax.set_ylabel("Negative mean squared error (closer to 0 is better)")
ax.set_title("Cross-validated negative MSE per regression model")

# Render the figure without echoing the repr of the last matplotlib call.
plt.show()

As we can see above, our hypothesis was not correct. In fact, the neural-network model MLP Regressor did not even make it into the top half of all our models, and we will therefore not continue with that model going forward.

Instead, the gradient boosting regressor performed best on this dataset, closely followed by the random forest, the linear regression model, and finally XGBoost.

In [None]:
# Show the raw cross-validation scores behind the box plot: one column per model,
# one row per fold. Values are negated MSE (scoring='neg_mean_squared_error').
scores_map