In [1]:
#Import Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Load the preprocessed dataset from disk.
data = pd.read_csv('preprocess.csv')

# Build an old-name -> new-name map that rewrites every column label:
#   "]" -> ")",  "[" -> "(",  ", " -> "-",  and any remaining spaces are removed.
# The ", " substitution must run before the space removal, otherwise the
# comma-space pairs would already have lost their space.
# NOTE(review): presumably done because some estimators (e.g. XGBoost) reject
# feature names containing square brackets — confirm.
column_mapping = {
    col: col.replace(']', ')').replace('[', '(').replace(', ', '-').replace(' ', '')
    for col in data.columns
}

data = data.rename(columns=column_mapping)

In [3]:
# Preview the first rows to sanity-check the load and the renamed column labels.
data.head()

Unnamed: 0,PROPERTYZIP,LOTAREA,SALEPRICE,COUNTYBUILDING,COUNTYLAND,COUNTYTOTAL,LOCALBUILDING,LOCALLAND,LOCALTOTAL,FAIRMARKETBUILDING,...,HEATINGCOOLINGDESC_HeatPump,HEATINGCOOLINGDESC_HeatPumpwithAC,HEATINGCOOLINGDESC_NoHeatbutwithAC,HEATINGCOOLINGDESC_None,HEATINGCOOLINGDESC_Other,HEATINGCOOLINGDESC_UnitHeat,HEATINGCOOLINGDESC_UnitHeatwithAC,HEATINGCOOLINGDESC_WallFurnace,HEATINGCOOLINGDESC_WallFurnacewithAC,HEATINGCOOLINGDESC_nan
0,15222.0,0,525000.0,325400,0,325400,343400,0,343400,343400,...,0,0,0,0,0,0,0,0,0,0
1,15222.0,0,350000.0,325400,0,325400,343400,0,343400,343400,...,0,0,0,0,0,0,0,0,0,0
2,15222.0,0,535000.0,468400,0,468400,468400,0,468400,468400,...,0,0,0,0,0,0,0,0,0,0
3,15222.0,0,394000.0,303200,0,303200,303200,0,303200,303200,...,0,0,0,0,0,0,0,0,0,0
4,15222.0,2024,255000.0,177500,155900,333400,177500,155900,333400,177500,...,0,0,0,0,0,0,0,0,0,1


In [4]:
from sklearn.model_selection import train_test_split

# Features are every column except the ZIP code and the target itself;
# the target is the sale price.
X = data.drop(columns=['PROPERTYZIP', 'SALEPRICE'])
Y = data['SALEPRICE']

# Hold out 33% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.33,
    random_state=42,
)

In [5]:
from sklearn.model_selection import RandomizedSearchCV

from sklearn.ensemble import GradientBoostingRegressor

def test_model(model_choice, params, model_name, X_train, y_train):
    """Trains model, and evaluates it.
        PARAMS:
            model_choice - SKLearn Model: Model to be trained
            params - dictionary: Dictionary of parameters to feed the model
            X_train - DataFrame: Training Data, Features
            y_train - DataFrame: Training Data, Targets
            X_test - DataFrame: Testing Data, Features
            y_test - DataFrame: Testing Data, Targets
        
        RETURNS:
            rmse - float: Root Mean Squared Error for specified model and parameters. 
    """
    # Run RandomizedSearch instead of GridSearch to conserve time. 
    # If Time, we can run GridSearch on the final models?
    # Really small search space. To increase, update n_iter to a higher number. 
    # To get more reliable results, increase cv to a higher number.
    clf = RandomizedSearchCV(model_choice(), params, cv=3, n_iter=5, scoring='neg_root_mean_squared_error', n_jobs=4, verbose=10)
    clf.fit(X_train, y_train)
    
    best_parameters = clf.best_params_
    best_score = clf.best_score_ * -1
    
    print(f"Root Mean Squared Error for {model_name}: ", best_score)
    
    return (best_score, model_name, best_parameters)

In [None]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn import svm
import xgboost as xgb

# Hyper-parameter search spaces, one per candidate model. Each value is the list
# of options the randomized search may sample for that parameter.
xgb_param_space = {
    'objective':['reg:squarederror'],
    'max_depth': [6, 10, 15, 20],
    'learning_rate': [0.001, 0.01, 0.1, 0.2, 0.3],
    'subsample': [0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
    'colsample_bytree': [0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
    'colsample_bylevel': [0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
    'min_child_weight': [0.5, 1.0, 3.0, 5.0, 7.0, 10.0],
    'gamma': [0, 0.25, 0.5, 1.0],
    'reg_lambda': [0.1, 1.0, 5.0, 10.0, 50.0, 100.0],
    'n_estimators': [100]
}

# NOTE(review): per the scikit-learn docs 'alpha' only matters for the huber and
# quantile losses; with the default loss it likely has no effect — confirm.
gbr_param_space = {
    'learning_rate':[0.01,0.1],
    'n_estimators':[50,100,150],
    'subsample':[0.5,0.7,1.0],
    'max_depth':[3,6],
    'alpha':[0.7,0.9]
}

# NOTE(review): 'degree' is only used by the 'poly' kernel — confirm it is
# intentional to sample it for the other kernels too.
svm_param_space = {
    'kernel':['linear','poly','rbf','sigmoid'],
    'degree':[3,4,5,6],
    'tol': [0.0001, 0.001, 0.01],
    'C':[0.8, 0.9, 1.0]
}

# (estimator class, search space, display name) for every model to compare.
# SALEPRICE is a continuous target scored by RMSE with a squared-error objective,
# so XGBoost must be used through its regressor, not its classifier.
models_to_test = [
    (xgb.XGBRegressor, xgb_param_space, "XGBoost"),
    (svm.SVR, svm_param_space, "Support Vector Machine"),
    (GradientBoostingRegressor, gbr_param_space, "GradientBoostingRegressor")
]

results = []

# A plain loop (rather than a comprehension) keeps the results of the models that
# already finished if a later, slower search is interrupted.
for model_class, param_space, model_name in models_to_test:
    results.append(test_model(model_class, param_space, model_name, X_train, y_train))

# Sort results by the best rmse.
results.sort(key=lambda result: result[0])

Fitting 3 folds for each of 5 candidates, totalling 15 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.


In [None]:
# Show every (rmse, model name, best parameters) tuple, lowest RMSE first.
print(results)

In [23]:
from sklearn.metrics import mean_squared_error

# Fit a gradient-boosting baseline with fixed hyper-parameters on the training split.
# NOTE(review): presumably these values came from the randomized search — confirm.
clf = GradientBoostingRegressor(
    subsample=1,
    n_estimators=150,
    max_depth=3,
    learning_rate=0.01,
    alpha=0.7,
)
clf.fit(X_train, y_train)

# Score the held-out split: RMSE is the square root of the mean squared error.
predictions = clf.predict(X_test)
rmse = mean_squared_error(y_test, predictions) ** 0.5

print("Baseline Root Mean Squared Error: ", rmse)

Baseline Root Mean Squared Error:  229160.64577435007


In [39]:
# Replace the index carried over from the original frame with 0..n-1 so that
# y_test lines up row-for-row with a positionally indexed Series of predictions.
y_test = y_test.reset_index(drop=True)

In [46]:
# Side-by-side preview of predicted vs. actual sale prices for the first 30 test rows.
# The frame is built from raw values, so rows are paired by position regardless of
# y_test's index, and the prediction column gets a real name instead of the label 0.
pd.DataFrame({'PREDICTED': predictions, 'SALEPRICE': y_test.to_numpy()}).head(30)

Unnamed: 0,0,SALEPRICE
0,107845.480141,73000.0
1,107845.480141,35000.0
2,107845.480141,90000.0
3,146555.676616,55000.0
4,107845.480141,75000.0
5,137365.455912,115000.0
6,113247.106357,136000.0
7,107845.480141,26000.0
8,108482.72087,87500.0
9,107845.480141,84000.0
