In [18]:
### Importing Packages
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.preprocessing import StandardScaler

import statsmodels.api as sm
import statsmodels.tools

In [19]:
### Reading in the file saved from the EDA notebook (after cleaning too)
# Deleting the 'Unnamed' column again
df = pd.read_csv('eda_cleaned_data.csv')
df.drop(columns = ['Unnamed: 0'], inplace = True)

In [20]:
### Reducing the price to 150,000-250,000 for this model
# Reducing the number of bedrooms to just 2,3 and 4 for this model
df = df[df['price'] < 250000]
df = df[df['price'] > 150000]
del_beds = [1,5, 6, 7, 8, 9]
df = df[~df['bedroom'].isin(del_beds)]

In [21]:
df['bedroom'].value_counts()

3    575
4    117
2    108
Name: bedroom, dtype: int64

<font size="6"><font color='orange'>Prepping categorical data for modelling (Feature engineering)</font>

In [22]:
### Feature Engineering Function
# Non-numeric columns ('house_type' and 'postcode') are One-Hot Encoded


def feature_eng(df):
    ''' Feature Engineer Function - One Hot encoding of house_type and postcode'''
    df = pd.get_dummies(df, columns = ['house_type'], drop_first = True, prefix = 'type')
    df.drop(columns = ['property_no'], inplace = True)
    df.drop(columns = ['postcode'], inplace = True)
    df.drop(columns = ['longitude'], inplace = True)
    df.drop(columns = ['latitude'], inplace = True)

    df = sm.add_constant(df)

    return df

<font size="6"><font color='orange'>Model</font>

In [23]:
### Train / Test Splitting
feature_cols = list(df.columns)        ## Columns from Dataframe set to Feature Columns for Model Testing
feature_cols.remove('price')            ## Removing the dependent variable, price

X = df[feature_cols]
y = df['price']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)  ## Splitting Dataframe data into separate train and test dataframes



In [24]:
## Apply feature engineering function X_Train Dataframe
X_train = feature_eng(X_train)

In [25]:
## Scale mileage and age_log using MinMax Scaler
scaler = StandardScaler()
X_train[['bedroom']] = scaler.fit_transform(X_train[['bedroom']])          ## Fitting the model to X-train data
scaler.scale_       ## Scaling factors to be implemented in the function

array([0.54611331])

In [26]:
## Scale mileage and age_log using MinMax Scaler
scaler = StandardScaler()
X_train[['distance']] = scaler.fit_transform(X_train[['distance']])         ## Fitting the model to X-train data
scaler.scale_       ## Scaling factors to be implemented in the function

array([4.88918986])

In [27]:
X_train.columns

Index(['const', 'bedroom', 'distance', 'type_Semi-Detached', 'type_Terraced'], dtype='object')

In [28]:
# Craeting a constant column (this is a failsafe as sometimes the const column wasn't auto-created)
X_train['const'] = 1

In [29]:
### Linear Regression (with postcode)
feature_cols = ['const','bedroom', 'distance', 'type_Semi-Detached', 'type_Terraced']


                ## Final Features to be used in the model


lin_reg = sm.OLS(y_train, X_train[feature_cols].astype(float))          ## Stats Model Linear Regression
results = lin_reg.fit()
print(results.summary())


X_train['y_pred'] = results.predict(X_train[feature_cols])
X_train['y_pred'] = X_train['y_pred'].astype('float')
rmse = statsmodels.tools.eval_measures.rmse(y_train, X_train['y_pred']) ## RMSE of the Model

print('\n', 'Rmse: ', rmse)

                            OLS Regression Results                            
Dep. Variable:                  price   R-squared:                       0.145
Model:                            OLS   Adj. R-squared:                  0.140
Method:                 Least Squares   F-statistic:                     26.96
Date:                Fri, 23 Dec 2022   Prob (F-statistic):           1.10e-20
Time:                        13:19:44   Log-Likelihood:                -7434.6
No. Observations:                 640   AIC:                         1.488e+04
Df Residuals:                     635   BIC:                         1.490e+04
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                         coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------
const               2.136e+05   2741

In [30]:
### VIF
def checkVIF(X):
    '''Function to calculate the VIF of a dataframe '''
    vif = pd.DataFrame()                ## Initialising an empty dataframe
    vif['Features'] = feature_cols      ## The features used in training
    vif['VIF'] = [variance_inflation_factor(X[feature_cols].values, i) for i in range(X[feature_cols].shape[1])]
    vif['VIF'] = round(vif['VIF'], 2)   ## Round VIF to 2 decimal places for readability
    vif = vif.sort_values(by = "VIF", ascending = False)    ## Sorting VIF values in descending order to observe feature VIF scores
    return vif
checkVIF(X_train)   ## If VIF >= 5, test to see if the removal of said feature reduces the multi-co-linearity

Unnamed: 0,Features,VIF
0,const,6.63
4,type_Terraced,1.79
3,type_Semi-Detached,1.77
1,bedroom,1.04
2,distance,1.01


<font size="6"><font color='orange'>Testing the model</font>

In [31]:
## Apply feature engineering function X_Train Dataframe
X_test = feature_eng(X_test)

In [32]:
## Scale mileage and age_log using MinMax Scaler
scaler = StandardScaler()
X_test[['bedroom']] = scaler.fit_transform(X_test[['bedroom']])  ## Fitting the model to X-train data
scaler.scale_  ## Scaling factors to be implemented in the function


array([0.46097722])

In [33]:
## Scale mileage and age_log using MinMax Scaler
scaler = StandardScaler()
X_test[['distance']] = scaler.fit_transform(X_test[['distance']])  ## Fitting the model to X-train data
scaler.scale_  ## Scaling factors to be implemented in the function

array([4.80136788])

In [34]:
X_train['const'] = 1
### Linear Regression (with postcode)
feature_cols = ['const', 'bedroom', 'distance', 'type_Semi-Detached', 'type_Terraced']

## Final Features to be used in the model


lin_reg = sm.OLS(y_test, X_test[feature_cols].astype(float))  ## Stats Model Linear Regression
results = lin_reg.fit()
print(results.summary())

X_test['y_pred'] = results.predict(X_test[feature_cols])
X_test['y_pred'] = X_test['y_pred'].astype('float')
rmse = statsmodels.tools.eval_measures.rmse(y_test, X_test['y_pred'])  ## RMSE of the Model

print('\n', 'Rmse: ', rmse)

                            OLS Regression Results                            
Dep. Variable:                  price   R-squared:                       0.061
Model:                            OLS   Adj. R-squared:                  0.037
Method:                 Least Squares   F-statistic:                     2.528
Date:                Fri, 23 Dec 2022   Prob (F-statistic):             0.0429
Time:                        13:19:44   Log-Likelihood:                -1863.7
No. Observations:                 160   AIC:                             3737.
Df Residuals:                     155   BIC:                             3753.
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                         coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------
const               2.018e+05   6343