In [34]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/house-prices-advanced-regression-techniques/sample_submission.csv
/kaggle/input/house-prices-advanced-regression-techniques/data_description.txt
/kaggle/input/house-prices-advanced-regression-techniques/train.csv
/kaggle/input/house-prices-advanced-regression-techniques/test.csv


In [35]:
pd.set_option('display.max_rows',None)
pd.set_option('display.max_columns',None)
pd.set_option('display.width',None)
pd.set_option('display.max_colwidth',None)

# Importing Essentials

In [36]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

In [37]:
housing = pd.read_csv(r"../input/house-prices-advanced-regression-techniques/train.csv")
housing.drop(['Id'],axis=1,inplace=True)

# 1. Dealing with missing values

In [38]:
housing.isnull().sum().sort_values(ascending=False)

PoolQC           1453
MiscFeature      1406
Alley            1369
Fence            1179
FireplaceQu       690
LotFrontage       259
GarageYrBlt        81
GarageCond         81
GarageType         81
GarageFinish       81
GarageQual         81
BsmtExposure       38
BsmtFinType2       38
BsmtCond           37
BsmtQual           37
BsmtFinType1       37
MasVnrArea          8
MasVnrType          8
Electrical          1
MSSubClass          0
Fireplaces          0
Functional          0
KitchenQual         0
KitchenAbvGr        0
BedroomAbvGr        0
HalfBath            0
FullBath            0
BsmtHalfBath        0
TotRmsAbvGrd        0
GarageCars          0
GrLivArea           0
GarageArea          0
PavedDrive          0
WoodDeckSF          0
OpenPorchSF         0
EnclosedPorch       0
3SsnPorch           0
ScreenPorch         0
PoolArea            0
MiscVal             0
MoSold              0
YrSold              0
SaleType            0
SaleCondition       0
BsmtFullBath        0
CentralAir

# 2. Fixing missing values explicitly


In [39]:
# Replacing categorical columns with None
'''
cat_columns = ['MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities',
       'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2',
       'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st',
       'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation',
       'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
       'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual',
       'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual',
       'GarageCond', 'PavedDrive', 'PoolQC', 'Fence', 'MiscFeature',
       'SaleType', 'SaleCondition']
'''
df = housing
cat_columns = df.select_dtypes(include=['object']).columns

for col in cat_columns:
    df[col] = df[col].fillna("None")

#Changing LotFrontage to mean LotFrontage in the same Neighborhood
df['LotFrontage'] = df.groupby('Neighborhood')['LotFrontage'].transform(lambda x: x.fillna(x.median()))

#Replacing numerical column null values with 0
num_columns = df.select_dtypes(exclude=['object']).columns
for col in num_columns:
    if col is not 'Electrical':
        df[col] = df[col].fillna(int(0))

#Replacing 'Electrical' with mode
df['Electrical'] = df['Electrical'].fillna(df['Electrical'].mode()[0])

#Dropping Utilities
df = df.drop(['Utilities'],axis=1)

In [40]:
#Checking the count of null values again
df.isnull().apply(sum).max()

0

In [41]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 79 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   MSSubClass     1460 non-null   int64  
 1   MSZoning       1460 non-null   object 
 2   LotFrontage    1460 non-null   float64
 3   LotArea        1460 non-null   int64  
 4   Street         1460 non-null   object 
 5   Alley          1460 non-null   object 
 6   LotShape       1460 non-null   object 
 7   LandContour    1460 non-null   object 
 8   LotConfig      1460 non-null   object 
 9   LandSlope      1460 non-null   object 
 10  Neighborhood   1460 non-null   object 
 11  Condition1     1460 non-null   object 
 12  Condition2     1460 non-null   object 
 13  BldgType       1460 non-null   object 
 14  HouseStyle     1460 non-null   object 
 15  OverallQual    1460 non-null   int64  
 16  OverallCond    1460 non-null   int64  
 17  YearBuilt      1460 non-null   int64  
 18  YearRemo

# 3. Dealing with Outliers

In [42]:
# Removing noisy data which is above 0.999 quantile
num_attributes = df[num_columns]

high_quant = df.quantile(.999)

for col in num_columns:
    df = df.drop(df[col][df[col]>high_quant[col]].index)

df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1422 entries, 0 to 1458
Data columns (total 79 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   MSSubClass     1422 non-null   int64  
 1   MSZoning       1422 non-null   object 
 2   LotFrontage    1422 non-null   float64
 3   LotArea        1422 non-null   int64  
 4   Street         1422 non-null   object 
 5   Alley          1422 non-null   object 
 6   LotShape       1422 non-null   object 
 7   LandContour    1422 non-null   object 
 8   LotConfig      1422 non-null   object 
 9   LandSlope      1422 non-null   object 
 10  Neighborhood   1422 non-null   object 
 11  Condition1     1422 non-null   object 
 12  Condition2     1422 non-null   object 
 13  BldgType       1422 non-null   object 
 14  HouseStyle     1422 non-null   object 
 15  OverallQual    1422 non-null   int64  
 16  OverallCond    1422 non-null   int64  
 17  YearBuilt      1422 non-null   int64  
 18  YearRemo

# 4. Dealing with correlated attributes

In [43]:
# Removing highly correlated features calculated in the EDA Notebook while viewing scatter plot and corr values

attributes_drop = ['MiscVal', 'MoSold', 'YrSold', 'BsmtFinSF2', 'BsmtHalfBath', 'MSSubClass', 'GarageArea',
                  'GarageYrBlt', '3SsnPorch']
df.drop(attributes_drop, axis=1, inplace=True)

# Removing columns with lots of missing values - PoolQC: 1453, MiscFeature: 1406, Alley: 1369, Fence: 1179
attributes_drop = ['PoolQC', 'MiscFeature', 'Alley', 'Fence']
df.drop(attributes_drop, axis=1, inplace=True)

# 5. Handling Text and Categorical Values

In [44]:
df.select_dtypes(include=['object']).columns

Index(['MSZoning', 'Street', 'LotShape', 'LandContour', 'LotConfig',
       'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
       'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd',
       'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating',
       'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual', 'Functional',
       'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond',
       'PavedDrive', 'SaleType', 'SaleCondition'],
      dtype='object')

# 5. Handling Text and Categorical Values

In [45]:
# Transforming Categorial variables using OneHotEncoder
cat_encoder = OneHotEncoder()
df_cat_processed = cat_encoder.fit_transform(df)
df_cat_processed

<1422x7325 sparse matrix of type '<class 'numpy.float64'>'
	with 93852 stored elements in Compressed Sparse Row format>

# Data Transformation

In [46]:
#Separate features and target variables
housing_X = df.drop('SalePrice', axis=1)
housing_y = df['SalePrice']

# Getting list of numerical and categorical values separately
num_attributes = housing_X.select_dtypes(exclude=['object'])
cat_attributes = housing_X.select_dtypes(include=['object'])

num_attribs = list(num_attributes)
cat_attribs = list(cat_attributes)

# Numerical pipeline to impute any missing values with the median and scale attributes
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy="median")),
    ('std_scaler', StandardScaler())
])

In [47]:
#Full pipeline that handles both numerical and categorical column's transformation
full_pipeline = ColumnTransformer([
    ("num", num_pipeline, num_attribs),
    ("cat", OneHotEncoder(), cat_attribs)
])

# Description before applying transforms
print("housing_y:\n",housing_y.describe())

# Applying log transformation to sales price - remember right-skewed data
housing_y_prepared = np.log(housing_y)

# Running transformation pipeline on all other attributes
housing_X_prepared = full_pipeline.fit_transform(housing_X)

# Description before applying transform
print("\nhousing_y_prepared:\n",housing_y_prepared)

housing_X_prepared

housing_y:
 count      1422.000000
mean     178405.042897
std       74506.926127
min       35311.000000
25%      129600.000000
50%      161500.000000
75%      211750.000000
max      611657.000000
Name: SalePrice, dtype: float64

housing_y_prepared:
 0       12.247694
1       12.109011
2       12.317167
3       11.849398
4       12.429216
5       11.870600
6       12.634603
7       12.206073
8       11.774520
9       11.678440
10      11.771436
11      12.751300
12      11.877569
13      12.540758
14      11.964001
15      11.790557
16      11.911702
17      11.407565
18      11.976659
19      11.842229
20      12.692503
21      11.845103
22      12.345835
23      11.774520
24      11.944708
25      12.454104
26      11.811547
27      12.631340
28      12.242887
29      11.134589
30      10.596635
31      11.914048
32      12.100156
33      12.016726
34      12.533576
35      12.641097
36      11.884489
37      11.938193
38      11.599103
39      11.314475
40      11.982929
41      12.0

<1422x273 sparse matrix of type '<class 'numpy.float64'>'
	with 92430 stored elements in Compressed Sparse Row format>

# 6. Creating and Assessing ML Models

# a) Trial 1 with Linear Regression

In [48]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

In [49]:
# Splitting train and test set
X_train, X_test, y_train, y_test = train_test_split(housing_X_prepared, housing_y_prepared, test_size=0.2, random_state=7)

In [50]:
# Training the model on training data

#Training the model on Linear Regression
model = LinearRegression()
model.fit(X_train, y_train)

# Evaluating the model

print("Accuracy%:", model.score(X_test, y_test)*100)

Accuracy%: 89.48307279228523


# b) Training on multiple ML models to see which fits best

RMSE (Root mean sqaure error) will be used and since we took a log of the target variable, we need to inverse it before calculating error

In [51]:
from sklearn.metrics import mean_squared_error

from sklearn.linear_model import Lasso
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import ElasticNet
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor
import xgboost

# Function to invert target variable array from log scale
def inv_y(transformed_y):
    return np.exp(transformed_y)

# Series to collect RMSE for the different algorithms: "algortihm name + RMSE"
rmse_compare = pd.Series()
rmse_compare.index.name = "Model"

# Series to collect the accuracy for the different algorithms: "algorithms name + score"
scores_compare = pd.Series()
scores_compare.index.name = "Model"

# Model 1: Linear Regression =======================
linear_model = LinearRegression()
linear_model.fit(X_train, y_train)

linear_val_predictions = linear_model.predict(X_test)
linear_val_rmse = mean_squared_error(inv_y(linear_val_predictions), inv_y(y_test))
linear_val_rmse = np.sqrt(linear_val_rmse)
rmse_compare['LinearRegression'] = linear_val_rmse

lr_score = linear_model.score(X_test, y_test)*100
scores_compare['LinearRegression'] = lr_score

#Model 2: Decision Tress ===========================
dtree_model = DecisionTreeRegressor(random_state=5)
dtree_model.fit(X_train, y_train)

dtree_val_predictions = dtree_model.predict(X_test)
dtree_val_rmse = mean_squared_error(inv_y(dtree_val_predictions), inv_y(y_test))
dtree_val_rmse = np.sqrt(dtree_val_rmse)
rmse_compare['DecisionTreeRegressor'] = dtree_val_rmse

dtree_score = dtree_model.score(X_test, y_test)*100
scores_compare['DecisionTreeRegressor'] = dtree_score

# Model 3: Random Forest ==========================
rf_model = RandomForestRegressor(random_state=5)
rf_model.fit(X_train, y_train)

rf_val_predictions = rf_model.predict(X_test)
rf_val_rmse = mean_squared_error(inv_y(rf_val_predictions), inv_y(y_test))
rf_val_rmse = np.sqrt(rf_val_rmse)
rmse_compare['RandomForest'] = rf_val_rmse

rf_score = rf_model.score(X_test, y_test)*100
scores_compare['RandomForest'] = rf_score


# Model 4: Gradient Boostinf Regression ===========
gbr_model = GradientBoostingRegressor(n_estimators=300, learning_rate=0.05, max_depth=4, random_state=5)
gbr_model.fit(X_train, y_train)

gbr_val_predictions = gbr_model.predict(X_test)
gbr_val_rmse = mean_squared_error(inv_y(gbr_val_predictions), inv_y(y_test))
gbr_val_rmse = np.sqrt(gbr_val_rmse)
rmse_compare['GradientBoostingRegression'] = gbr_val_rmse

gbr_score = gbr_model.score(X_test, y_test)*100
scores_compare['GradientBoostingRegression'] = gbr_score



In [52]:
print("RMSE values for different algorithms:")
rmse_compare.sort_values(ascending=True).round()

RMSE values for different algorithms:


Model
LinearRegression              24688.0
GradientBoostingRegression    27434.0
RandomForest                  30168.0
DecisionTreeRegressor         40007.0
dtype: float64

In [53]:
print("Accuracy scores for different algorithms")
scores_compare.sort_values(ascending=False).round(3)

Accuracy scores for different algorithms


Model
GradientBoostingRegression    89.523
LinearRegression              89.483
RandomForest                  86.442
DecisionTreeRegressor         71.964
dtype: float64

# 6.1 Conclusion 1

Conclusion from above 4 models:
* LinearRegression and Random Forest have better accuracy than the rest but still have high RMSE. This means that either we need to improve the features or the model is underfitting.
* Decision Tree should be able to form complex non-linear relationships but it seems that this model is overfitting the training set.
* Random Forest works by training many decision trees on random subsets of features and then averaging the predictions. This is why the accuracy of Random Forest is higher than Decision Tree.

# c) Evaluation using Cross-validation

In [54]:
from sklearn.model_selection import cross_val_score

# Performing K fold cross-validation, with K=10 on Linear model
scores = cross_val_score(linear_model, X_train, y_train,
                        scoring="neg_mean_squared_error", cv=10)
linear_rmse_scores = np.sqrt(-scores)

# Printing results
def display_scores(scores):
    print("Scores:", scores)
    print("Mean:", scores.mean())
    print("Standard Deviation", scores.std())
    
display_scores(linear_rmse_scores)

Scores: [0.11218334 0.13903415 0.1054053  0.12550088 0.10796002 0.11701404
 0.10935645 0.11604085 0.13700937 0.11381849]
Mean: 0.1183322904295776
Standard Deviation 0.01117684194737972


In [55]:
from sklearn.model_selection import cross_val_score

# Performing K fold cross-validation, with K=10 on Randon Forest
scores = cross_val_score(rf_model, X_train, y_train,
                        scoring="neg_mean_squared_error", cv=10)
rf_rmse_scores = np.sqrt(-scores)

# Printing results
def display_scores(scores):
    print("Scores:", scores)
    print("Mean:", scores.mean())
    print("Standard Deviation", scores.std())
    
display_scores(rf_rmse_scores)

Scores: [0.12149917 0.16546388 0.10947893 0.1555309  0.12217862 0.13337342
 0.13154133 0.10900089 0.14349569 0.13339312]
Mean: 0.13249559342708678
Standard Deviation 0.017464681624998774
