![img.png](CampQMIND_banner.png)

<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Importing-libraries" data-toc-modified-id="Importing-libraries-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Importing libraries</a></span></li><li><span><a href="#Reading-Data" data-toc-modified-id="Reading-Data-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Reading Data</a></span></li><li><span><a href="#Preparing-Data" data-toc-modified-id="Preparing-Data-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Preparing Data</a></span><ul class="toc-item"><li><span><a href="#Feature-Engineering" data-toc-modified-id="Feature-Engineering-3.1"><span class="toc-item-num">3.1&nbsp;&nbsp;</span>Feature Engineering</a></span><ul class="toc-item"><li><ul class="toc-item"><li><span><a href="#Standardization-example" data-toc-modified-id="Standardization-example-3.1.0.1"><span class="toc-item-num">3.1.0.1&nbsp;&nbsp;</span>Standardization example</a></span></li><li><span><a href="#Dummy-variable-example" data-toc-modified-id="Dummy-variable-example-3.1.0.2"><span class="toc-item-num">3.1.0.2&nbsp;&nbsp;</span>Dummy variable example</a></span></li></ul></li></ul></li><li><span><a href="#Spliting-into-train,-val,-test" data-toc-modified-id="Spliting-into-train,-val,-test-3.2"><span class="toc-item-num">3.2&nbsp;&nbsp;</span>Spliting into train, val, test</a></span></li></ul></li><li><span><a href="#Models" data-toc-modified-id="Models-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Models</a></span><ul class="toc-item"><li><span><a href="#Linear-Regression" data-toc-modified-id="Linear-Regression-4.1"><span class="toc-item-num">4.1&nbsp;&nbsp;</span>Linear Regression</a></span></li><li><span><a href="#Decision-Tree" data-toc-modified-id="Decision-Tree-4.2"><span class="toc-item-num">4.2&nbsp;&nbsp;</span>Decision Tree</a></span></li><li><span><a href="#Random-Forests" data-toc-modified-id="Random-Forests-4.3"><span class="toc-item-num">4.3&nbsp;&nbsp;</span>Random Forests</a></span></li><li><span><a href="#Support-Vector-Regression" data-toc-modified-id="Support-Vector-Regression-4.4"><span class="toc-item-num">4.4&nbsp;&nbsp;</span>Support Vector Regression</a></span></li></ul></li><li><span><a href="#Hyperparameter-Tuning" data-toc-modified-id="Hyperparameter-Tuning-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>Hyperparameter Tuning</a></span></li></ul></div>

# Importing libraries

In [27]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR

from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score

# Reading Data

The dataset is from a kaggle competition.

https://www.kaggle.com/c/house-prices-advanced-regression-techniques

The original dataset has 81 columns but we are going to use 13 features.

In [29]:
selected_cols = [
    "LotFrontage", "LotArea", "OverallQual", "OverallCond", "YearBuilt",
    "YearRemodAdd", "MasVnrArea", "BsmtFinSF1", "BsmtUnfSF", "HouseStyle",
    "SaleCondition", "Neighborhood", "OverallQual", "OverallCond", "SalePrice"
]
df = pd.read_csv("housePriceData.csv", usecols=selected_cols)

In [32]:
df.head()

Unnamed: 0,LotFrontage,LotArea,Neighborhood,HouseStyle,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtUnfSF,SaleCondition,SalePrice
0,65.0,8450,CollgCr,2Story,7,5,2003,2003,196.0,706,150,Normal,208500
1,80.0,9600,Veenker,1Story,6,8,1976,1976,0.0,978,284,Normal,181500
2,68.0,11250,CollgCr,2Story,7,5,2001,2002,162.0,486,434,Normal,223500
3,60.0,9550,Crawfor,2Story,7,5,1915,1970,0.0,216,540,Abnorml,140000
4,84.0,14260,NoRidge,2Story,8,5,2000,2000,350.0,655,490,Normal,250000


In [35]:
len(df.columns)

13

# Preparing Data

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 13 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   LotFrontage    1201 non-null   float64
 1   LotArea        1460 non-null   int64  
 2   Neighborhood   1460 non-null   object 
 3   HouseStyle     1460 non-null   object 
 4   OverallQual    1460 non-null   int64  
 5   OverallCond    1460 non-null   int64  
 6   YearBuilt      1460 non-null   int64  
 7   YearRemodAdd   1460 non-null   int64  
 8   MasVnrArea     1452 non-null   float64
 9   BsmtFinSF1     1460 non-null   int64  
 10  BsmtUnfSF      1460 non-null   int64  
 11  SaleCondition  1460 non-null   object 
 12  SalePrice      1460 non-null   int64  
dtypes: float64(2), int64(8), object(3)
memory usage: 148.4+ KB


In [5]:
# Looks like LotFrontage has 259 missing values and MasVnrArea has 8 missing values
# We will assume the reason that they are missing is because they don't exist in the house
# We will fill them with zeroes.

df.fillna(0, inplace=True)

## Feature Engineering

In [28]:
df.head()

Unnamed: 0,LotFrontage,LotArea,OverallQual,OverallCond,MasVnrArea,BsmtFinSF1,BsmtUnfSF,SalePrice,RelAge,Neighborhood_Blmngtn,...,HouseStyle_2.5Unf,HouseStyle_2Story,HouseStyle_SFoyer,HouseStyle_SLvl,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,0.212877,-0.207142,0.651479,-0.5172,0.514104,0.575425,-0.944591,12.247699,-0.555742,0,...,0,1,0,0,0,0,0,0,1,0
1,0.645747,-0.091886,-0.071836,2.179628,-0.57075,1.171992,-0.641228,12.109016,-0.555742,0,...,0,0,0,0,0,0,0,0,1,0
2,0.299451,0.07348,0.651479,-0.5172,0.325915,0.092907,-0.301643,12.317171,-0.514873,0,...,0,1,0,0,0,0,0,0,1,0
3,0.068587,-0.096897,0.651479,-0.5172,-0.57075,-0.499274,-0.06167,11.849405,1.692084,0,...,0,1,0,0,1,0,0,0,0,0
4,0.761179,0.375148,1.374795,-0.5172,1.366489,0.463568,-0.174865,12.42922,-0.555742,0,...,0,1,0,0,0,0,0,0,1,0


In [26]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 48 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   LotFrontage            1460 non-null   float64
 1   LotArea                1460 non-null   float64
 2   OverallQual            1460 non-null   float64
 3   OverallCond            1460 non-null   float64
 4   MasVnrArea             1460 non-null   float64
 5   BsmtFinSF1             1460 non-null   float64
 6   BsmtUnfSF              1460 non-null   float64
 7   SalePrice              1460 non-null   float64
 8   RelAge                 1460 non-null   float64
 9   Neighborhood_Blmngtn   1460 non-null   uint8  
 10  Neighborhood_Blueste   1460 non-null   uint8  
 11  Neighborhood_BrDale    1460 non-null   uint8  
 12  Neighborhood_BrkSide   1460 non-null   uint8  
 13  Neighborhood_ClearCr   1460 non-null   uint8  
 14  Neighborhood_CollgCr   1460 non-null   uint8  
 15  Neig

In [8]:
def standardize(data):
    scaler = StandardScaler()
    return scaler.fit_transform(data)

In [12]:
# Creating a age when sold feature
df["RelAge"] = df["YearRemodAdd"] - df["YearBuilt"]
df.drop(["YearRemodAdd", "YearBuilt"], axis=1, inplace=True)

In [13]:
for column in df.select_dtypes(['int', "float"]).columns:
    if column != "SalePrice":
        df.loc[:, column] = standardize(df.loc[:,
                                               column].values.reshape(-1, 1))

In [14]:
df = pd.get_dummies(df, columns=df.select_dtypes("object").columns)

In [15]:
# Competition asks for RMSE with logarithm of SalePrice and predicted price

df.loc[:, "SalePrice"] = df.loc[:, "SalePrice"].apply(np.log1p)

In [16]:
df.head()

Unnamed: 0,LotFrontage,LotArea,OverallQual,OverallCond,MasVnrArea,BsmtFinSF1,BsmtUnfSF,SalePrice,RelAge,Neighborhood_Blmngtn,...,HouseStyle_2.5Unf,HouseStyle_2Story,HouseStyle_SFoyer,HouseStyle_SLvl,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,0.212877,-0.207142,0.651479,-0.5172,0.514104,0.575425,-0.944591,12.247699,-0.555742,0,...,0,1,0,0,0,0,0,0,1,0
1,0.645747,-0.091886,-0.071836,2.179628,-0.57075,1.171992,-0.641228,12.109016,-0.555742,0,...,0,0,0,0,0,0,0,0,1,0
2,0.299451,0.07348,0.651479,-0.5172,0.325915,0.092907,-0.301643,12.317171,-0.514873,0,...,0,1,0,0,0,0,0,0,1,0
3,0.068587,-0.096897,0.651479,-0.5172,-0.57075,-0.499274,-0.06167,11.849405,1.692084,0,...,0,1,0,0,1,0,0,0,0,0
4,0.761179,0.375148,1.374795,-0.5172,1.366489,0.463568,-0.174865,12.42922,-0.555742,0,...,0,1,0,0,0,0,0,0,1,0


####  Standardization example

$$Z = \frac{x-\mu}{\sigma}$$

In [9]:
# Lot Area

print(
    f"Before transformation mean: {df.LotArea.mean().round(2)}, std: {df.LotFrontage.std().round(2)}",
)

standard = standardize(df.LotArea.values.reshape(-1, 1))
print(
    f"After transformation mean: {-standard.mean().round(2)}, std: {standard.std().round(2)}",
)

Before transformation mean: 10516.83, std: 34.66
After transformation mean: 0.0, std: 1.0


#### Dummy variable example

In [10]:
dummy_example = pd.DataFrame({"Column": list(range(3)) * 10})
dummy_example.head()

Unnamed: 0,Column
0,0
1,1
2,2
3,0
4,1


In [11]:
pd.get_dummies(dummy_example, columns=["Column"]).head()

Unnamed: 0,Column_0,Column_1,Column_2
0,1,0,0
1,0,1,0
2,0,0,1
3,1,0,0
4,0,1,0


## Spliting into train, val, test

In [17]:
# Before we move on to the next step we will seperate the dataset into train, validation, test split


# Function from https://datascience.stackexchange.com/questions/15135/train-test-validation-set-splitting-in-sklearn
def data_split(examples, labels, train_frac, random_state=5):
    ''' https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html
    param data:       Data to be split
    param train_frac: Ratio of train set to whole dataset

    Randomly split dataset, based on these ratios:
        'train': train_frac
        'valid': (1-train_frac) / 2
        'test':  (1-train_frac) / 2

    Eg: passing train_frac=0.8 gives a 80% / 10% / 10% split
    '''

    X_train, X_tmp, Y_train, Y_tmp = train_test_split(
        examples, labels, train_size=train_frac, random_state=random_state)

    X_val, X_test, Y_val, Y_test = train_test_split(X_tmp,
                                                    Y_tmp,
                                                    train_size=0.5,
                                                    random_state=random_state)

    return X_train, X_val, X_test, Y_train, Y_val, Y_test

In [18]:
X_train, X_val, X_val, Y_train, Y_val, Y_test = data_split(example=df.drop("SalePrice", axis=1),
                                                            labels=df.SalePrice,train_frac=0.7)

In [19]:
print(f"Training size is {X_train.shape[0]}")
print(f"Validation size is {X_val.shape[0]}")
print(f"Test size is {X_val.shape[0]}")

Training size is 1021
Validation size is 219
Test size is 219


# Models

In [20]:
def scores(actual, prediction):
    print("RMSE score is {:3f}".format(
        mean_squared_error(actual, prediction, squared=False)))
    print("R^2 score is {:3f}".format(r2_score(actual, prediction)))

## Linear Regression

In [21]:
linear = LinearRegression()
linear.fit(X_train, Y_train)
linear_predictions = linear.predict(X_val)
print("Validation")
scores(Y_val, linear_predictions)

# print("\nTest")
# linear_predictions_test = linear.predict(X_test)
# scores(Y_test, linear_predictions_test)

Validation
RMSE score is 0.171539
R^2 score is 0.821081


## Decision Tree

In [22]:
tree = DecisionTreeRegressor()
tree.fit(X_train, Y_train)
tree_predictios = tree.predict(X_val)
print("Validation")
scores(Y_val, tree_predictios)

# print("\nTest")
# tree_predictions_test = tree.predict(X_test)
# scores(Y_test, tree_predictions_test)

Validation
RMSE score is 0.274358
R^2 score is 0.542317


## Random Forests

In [23]:
forest = RandomForestRegressor()
forest.fit(X_train, Y_train)
forest_predictios = forest.predict(X_val)
print("Validation")

scores(Y_val, forest_predictios)

# print("\nTest")
# forest_predictios_test = forest.predict(X_test)
# scores(Y_test, forest_predictios_test)

Validation
RMSE score is 0.189281
R^2 score is 0.782157


## Support Vector Regression

In [24]:
svr = SVR()
svr.fit(X_train, Y_train)
svr_predictions = svr.predict(X_val)
print("Validation")
scores(Y_val, svr_predictions)


# print("\nTest")
# svr_predictios_test = svr.predict(X_test)
# scores(Y_test, svr_predictios_test)

Validation
RMSE score is 0.176318
R^2 score is 0.810974


# Hyperparameter Tuning

In [25]:
%%time
# We will tune hyperparameters for Support Vector Machines

params = {
    "kernel": ["rbf", "poly", "linear"],
    "C": [0.01, 0.05, 0.1, *range(1, 100, 20)],
    "degree": range(1, 6, 2)
}

tuner = GridSearchCV(SVR(),
                     param_grid=params,
                     scoring='neg_root_mean_squared_error',
                     cv=3,
                     n_jobs=-1)

tuner.fit(X_train, Y_train)
tuner_predictions = tuner.predict(X_val)
print("Validation")
scores(Y_val, tuner_predictions)

print("\nTest")
tuner_predictions_test = tuner.predict(X_test)
scores(Y_test, tuner_predictions_test)

best_parameters = tuner.best_estimator_.get_params()
print("\n Best parameters:")
for param, value in best_parameters.items():
    print("\t {} : {}".format(param, value))

Validation
RMSE score is 0.176318
R^2 score is 0.810974

Test
RMSE score is 0.174278
R^2 score is 0.785877

 Best parameters:
	 C : 1
	 cache_size : 200
	 coef0 : 0.0
	 degree : 1
	 epsilon : 0.1
	 gamma : scale
	 kernel : rbf
	 max_iter : -1
	 shrinking : True
	 tol : 0.001
	 verbose : False
CPU times: user 593 ms, sys: 90.7 ms, total: 684 ms
Wall time: 40.1 s
