In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [8]:
data = pd.read_excel("HousePricePrediction.xlsx")

In [9]:
data.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotArea,LotConfig,BldgType,OverallCond,YearBuilt,YearRemodAdd,Exterior1st,BsmtFinSF2,TotalBsmtSF,SalePrice
0,0,60,RL,8450,Inside,1Fam,5,2003,2003,VinylSd,0.0,856.0,208500.0
1,1,20,RL,9600,FR2,1Fam,8,1976,1976,MetalSd,0.0,1262.0,181500.0
2,2,60,RL,11250,Inside,1Fam,5,2001,2002,VinylSd,0.0,920.0,223500.0
3,3,70,RL,9550,Corner,1Fam,5,1915,1970,Wd Sdng,0.0,756.0,140000.0
4,4,60,RL,14260,FR2,1Fam,5,2000,2000,VinylSd,0.0,1145.0,250000.0


In [10]:
data.shape

(2919, 13)

# Data Processing

In [11]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2919 entries, 0 to 2918
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Id            2919 non-null   int64  
 1   MSSubClass    2919 non-null   int64  
 2   MSZoning      2915 non-null   object 
 3   LotArea       2919 non-null   int64  
 4   LotConfig     2919 non-null   object 
 5   BldgType      2919 non-null   object 
 6   OverallCond   2919 non-null   int64  
 7   YearBuilt     2919 non-null   int64  
 8   YearRemodAdd  2919 non-null   int64  
 9   Exterior1st   2918 non-null   object 
 10  BsmtFinSF2    2918 non-null   float64
 11  TotalBsmtSF   2918 non-null   float64
 12  SalePrice     1460 non-null   float64
dtypes: float64(3), int64(6), object(4)
memory usage: 296.6+ KB


In [13]:
obj = (data.dtypes == 'object')
object_cols = list(obj[obj].index)
print("Categorical variables:",len(object_cols))
 
int_ = (data.dtypes == 'int')
num_cols = list(int_[int_].index)
print("Integer variables:",len(num_cols))
 
fl = (data.dtypes == 'float')
fl_cols = list(fl[fl].index)
print("Float variables:",len(fl_cols))

Categorical variables: 4
Integer variables: 0
Float variables: 3


# Exploratory Data Analysis

# Data Cleaning

In [14]:
data.isnull().sum()

Id                 0
MSSubClass         0
MSZoning           4
LotArea            0
LotConfig          0
BldgType           0
OverallCond        0
YearBuilt          0
YearRemodAdd       0
Exterior1st        1
BsmtFinSF2         1
TotalBsmtSF        1
SalePrice       1459
dtype: int64

In [15]:
data.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotArea,LotConfig,BldgType,OverallCond,YearBuilt,YearRemodAdd,Exterior1st,BsmtFinSF2,TotalBsmtSF,SalePrice
0,0,60,RL,8450,Inside,1Fam,5,2003,2003,VinylSd,0.0,856.0,208500.0
1,1,20,RL,9600,FR2,1Fam,8,1976,1976,MetalSd,0.0,1262.0,181500.0
2,2,60,RL,11250,Inside,1Fam,5,2001,2002,VinylSd,0.0,920.0,223500.0
3,3,70,RL,9550,Corner,1Fam,5,1915,1970,Wd Sdng,0.0,756.0,140000.0
4,4,60,RL,14260,FR2,1Fam,5,2000,2000,VinylSd,0.0,1145.0,250000.0


In [19]:
data.drop(['Id'],axis = 1,inplace = True)

In [20]:
data.head()

Unnamed: 0,MSSubClass,MSZoning,LotArea,LotConfig,BldgType,OverallCond,YearBuilt,YearRemodAdd,Exterior1st,BsmtFinSF2,TotalBsmtSF,SalePrice
0,60,RL,8450,Inside,1Fam,5,2003,2003,VinylSd,0.0,856.0,208500.0
1,20,RL,9600,FR2,1Fam,8,1976,1976,MetalSd,0.0,1262.0,181500.0
2,60,RL,11250,Inside,1Fam,5,2001,2002,VinylSd,0.0,920.0,223500.0
3,70,RL,9550,Corner,1Fam,5,1915,1970,Wd Sdng,0.0,756.0,140000.0
4,60,RL,14260,FR2,1Fam,5,2000,2000,VinylSd,0.0,1145.0,250000.0


In [18]:
data.isnull().sum()

Id                 0
MSSubClass         0
MSZoning           4
LotArea            0
LotConfig          0
BldgType           0
OverallCond        0
YearBuilt          0
YearRemodAdd       0
Exterior1st        1
BsmtFinSF2         1
TotalBsmtSF        1
SalePrice       1459
dtype: int64

In [22]:
#replacing saleprice columns null values wth their mean

data['SalePrice'] = data['SalePrice'].fillna(data['SalePrice'].mean())

In [24]:
data.isnull().sum()

MSSubClass      0
MSZoning        4
LotArea         0
LotConfig       0
BldgType        0
OverallCond     0
YearBuilt       0
YearRemodAdd    0
Exterior1st     1
BsmtFinSF2      1
TotalBsmtSF     1
SalePrice       0
dtype: int64

In [25]:
#drop records with null values (as empty records are very less)

df = data.dropna()

In [26]:
df.isnull().sum()

MSSubClass      0
MSZoning        0
LotArea         0
LotConfig       0
BldgType        0
OverallCond     0
YearBuilt       0
YearRemodAdd    0
Exterior1st     0
BsmtFinSF2      0
TotalBsmtSF     0
SalePrice       0
dtype: int64

In [27]:
df.head()

Unnamed: 0,MSSubClass,MSZoning,LotArea,LotConfig,BldgType,OverallCond,YearBuilt,YearRemodAdd,Exterior1st,BsmtFinSF2,TotalBsmtSF,SalePrice
0,60,RL,8450,Inside,1Fam,5,2003,2003,VinylSd,0.0,856.0,208500.0
1,20,RL,9600,FR2,1Fam,8,1976,1976,MetalSd,0.0,1262.0,181500.0
2,60,RL,11250,Inside,1Fam,5,2001,2002,VinylSd,0.0,920.0,223500.0
3,70,RL,9550,Corner,1Fam,5,1915,1970,Wd Sdng,0.0,756.0,140000.0
4,60,RL,14260,FR2,1Fam,5,2000,2000,VinylSd,0.0,1145.0,250000.0


In [None]:
# MSZoning, LotConfig,LotConfig, Exterior1st, 

# OneHotEncoder – For Label categorical features
One hot Encoding is the best way to convert categorical data into binary vectors. 

In [28]:
from sklearn.preprocessing import OneHotEncoder

s = (df.dtypes == 'object')
object_columns = list(s[s].index)
object_columns

['MSZoning', 'LotConfig', 'BldgType', 'Exterior1st']

In [30]:
df['MSZoning'].unique()

array(['RL', 'RM', 'C (all)', 'FV', 'RH'], dtype=object)

In [31]:
df['LotConfig'].unique()

array(['Inside', 'FR2', 'Corner', 'CulDSac', 'FR3'], dtype=object)

In [32]:
df['BldgType'].unique()

array(['1Fam', '2fmCon', 'Duplex', 'TwnhsE', 'Twnhs'], dtype=object)

In [33]:
df['Exterior1st'].unique()

array(['VinylSd', 'MetalSd', 'Wd Sdng', 'HdBoard', 'BrkFace', 'WdShing',
       'CemntBd', 'Plywood', 'AsbShng', 'Stucco', 'BrkComm', 'AsphShn',
       'Stone', 'ImStucc', 'CBlock'], dtype=object)

In [37]:
total_columns = df.columns
total_columns

Index(['MSSubClass', 'MSZoning', 'LotArea', 'LotConfig', 'BldgType',
       'OverallCond', 'YearBuilt', 'YearRemodAdd', 'Exterior1st', 'BsmtFinSF2',
       'TotalBsmtSF', 'SalePrice'],
      dtype='object')

In [38]:
from sklearn.preprocessing import LabelEncoder

to_encoder = ['MSZoning', 'LotConfig', 'BldgType', 'Exterior1st']
encoded_columns = LabelEncoder()

for columns in to_encoder:
    if columns in total_columns:
        df[columns] = encoded_columns.fit_transform(df[columns])
df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[columns] = encoded_columns.fit_transform(df[columns])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[columns] = encoded_columns.fit_transform(df[columns])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[columns] = encoded_columns.fit_transform(df[columns])
A value is trying to be set on a c

Unnamed: 0,MSSubClass,MSZoning,LotArea,LotConfig,BldgType,OverallCond,YearBuilt,YearRemodAdd,Exterior1st,BsmtFinSF2,TotalBsmtSF,SalePrice
0,60,3,8450,4,0,5,2003,2003,12,0.0,856.0,208500.00000
1,20,3,9600,2,0,8,1976,1976,8,0.0,1262.0,181500.00000
2,60,3,11250,4,0,5,2001,2002,12,0.0,920.0,223500.00000
3,70,3,9550,0,0,5,1915,1970,13,0.0,756.0,140000.00000
4,60,3,14260,2,0,5,2000,2000,12,0.0,1145.0,250000.00000
...,...,...,...,...,...,...,...,...,...,...,...,...
2914,160,4,1936,4,3,7,1970,1970,5,0.0,546.0,180921.19589
2915,160,4,1894,4,4,5,1970,1970,5,0.0,546.0,180921.19589
2916,20,3,20000,4,0,7,1960,1996,12,0.0,1224.0,180921.19589
2917,85,3,10441,4,0,5,1992,1992,6,0.0,912.0,180921.19589


# Spliting the Dataset into training & testing


In [39]:
x = df.drop(['SalePrice'],axis = 1)
y = df['SalePrice']

In [40]:
x

Unnamed: 0,MSSubClass,MSZoning,LotArea,LotConfig,BldgType,OverallCond,YearBuilt,YearRemodAdd,Exterior1st,BsmtFinSF2,TotalBsmtSF
0,60,3,8450,4,0,5,2003,2003,12,0.0,856.0
1,20,3,9600,2,0,8,1976,1976,8,0.0,1262.0
2,60,3,11250,4,0,5,2001,2002,12,0.0,920.0
3,70,3,9550,0,0,5,1915,1970,13,0.0,756.0
4,60,3,14260,2,0,5,2000,2000,12,0.0,1145.0
...,...,...,...,...,...,...,...,...,...,...,...
2914,160,4,1936,4,3,7,1970,1970,5,0.0,546.0
2915,160,4,1894,4,4,5,1970,1970,5,0.0,546.0
2916,20,3,20000,4,0,7,1960,1996,12,0.0,1224.0
2917,85,3,10441,4,0,5,1992,1992,6,0.0,912.0


In [41]:
y

0       208500.00000
1       181500.00000
2       223500.00000
3       140000.00000
4       250000.00000
            ...     
2914    180921.19589
2915    180921.19589
2916    180921.19589
2917    180921.19589
2918    180921.19589
Name: SalePrice, Length: 2913, dtype: float64

In [42]:
from sklearn.model_selection import train_test_split

x_train,x_test,y_train,y_test = train_test_split(x,y,test_size = 0.3,random_state = 1)

In [44]:
print('shape of x train: ',x_train.shape)
print('shape of x test: ',x_test.shape)
print('shape of y train: ',y_train.shape)
print('shape of y test: ',y_test.shape)

shape of x train:  (2039, 11)
shape of x test:  (874, 11)
shape of y train:  (2039,)
shape of y test:  (874,)


# Linear Regression

In [45]:
from sklearn.linear_model import LinearRegression

model_lr = LinearRegression()
model_lr.fit(x_train,y_train)

In [46]:
y_pred = model_lr.predict(x_test)

In [47]:
y_pred

array([126316.82511304, 190145.67173735, 219838.28769444, 174279.31100363,
       206062.03227797, 188103.93822356, 155031.69123039, 143493.64540895,
       182276.31090526, 132464.83047039, 195892.903958  , 185974.47825442,
       185389.29142653, 132007.91215724, 201995.71624768, 167888.36081313,
       165266.59311478, 159588.32513207, 165623.18229116, 124738.54002866,
       167238.71450098, 125514.99295148, 164807.07929157, 162582.85561242,
       228026.70932752, 135492.68758196, 192013.62626016, 192371.58575509,
       145986.12406222, 193257.9722133 , 172287.44685807, 219675.10803845,
       179292.85798811, 195212.65192528, 217985.08353502, 212196.98980713,
       145171.29719035, 206418.37182558, 114704.87084519, 188381.62516625,
       145838.97004701, 202502.47770545, 205679.41737589, 156546.65835622,
       194068.29646219, 177792.12064299, 216624.23517211, 117845.66158459,
       129551.53437843, 172680.07783551, 162329.01503891, 158021.40157525,
       187020.94324441, 1

In [48]:
y_pred.shape

(874,)

In [50]:
from sklearn.metrics import mean_absolute_percentage_error

print(mean_absolute_percentage_error(y_test, y_pred))

0.1936328171656353


# Support Vector Regressor - SVR

In [52]:
from sklearn import svm
from sklearn.svm import SVR

In [54]:
model_svr = svm.SVR()
model_svr.fit(x_train,y_train)

In [57]:
y_pred = model_svr.predict(x_test)

In [58]:
y_pred

array([180921.01441948, 180921.08079752, 180921.28233359, 180921.17244622,
       180921.16766489, 180921.12265556, 180921.13901949, 180921.04535215,
       180921.1813108 , 180920.96535603, 180921.13939829, 180921.12305903,
       180921.15055965, 180921.07316039, 180921.15252171, 180921.04040706,
       180921.12761535, 180921.10061567, 180921.16120246, 180920.96511596,
       180921.17961985, 180921.05478123, 180921.07738972, 180921.15663373,
       180921.28645844, 180920.97418222, 180921.08801378, 180921.22532509,
       180921.08634814, 180921.11134557, 180921.16796057, 180921.29754063,
       180921.07069642, 180921.13958165, 180921.27070821, 180921.2500327 ,
       180921.0716652 , 180921.23854719, 180920.98557254, 180921.19318765,
       180921.07670813, 180921.29467047, 180921.23537034, 180921.09259267,
       180921.17219626, 180923.4604633 , 180921.158861  , 180920.99367275,
       180921.09025953, 180921.11626342, 180921.05351884, 180921.06219857,
       180921.13170342, 1

In [59]:
y_pred.shape

(874,)

In [60]:
print(mean_absolute_percentage_error(y_test, y_pred))

0.19158311711540615


# Random Forest Regression

In [66]:
from sklearn.ensemble import RandomForestRegressor

model_rfr = RandomForestRegressor(n_estimators = 500)
model_rfr.fit(x_train,y_train)

In [67]:
y_pred = model_rfr.predict(x_test)
y_pred

array([135963.59794521, 185300.31753425, 242319.80208767, 166542.46421096,
       218218.09600822, 158497.44689041, 167645.19366027, 156098.83189772,
       192527.15769589, 129076.60750973, 194962.54068425, 173979.87334795,
       183129.09553699, 134667.54196438, 238617.24489041, 160118.19773332,
       150203.79250137, 154435.11361644, 172514.18260548, 149572.09903461,
       164795.25555342, 153123.13083288, 147249.44595448, 175685.13926575,
       252342.6661726 , 133060.15165479, 180883.56353904, 200250.60996438,
       142076.1434274 , 194226.88710137, 169887.86536986, 222461.44198356,
       178373.70994795, 170852.71204658, 215249.14481096, 254308.00370137,
       143095.10878904, 211490.4128137 , 150081.28790479, 245878.9772411 ,
       160609.01399867, 200542.53196438, 252264.35579726, 146978.4935726 ,
       172095.40416986, 177327.32759178, 202961.76953151, 117037.93817671,
       150201.62159452, 152081.42326301, 156307.82934795, 144448.81210959,
       186003.22204658, 1

In [68]:
y_pred.shape

(874,)

In [69]:
mean_absolute_percentage_error(y_test, y_pred)

0.19093317328646436

- Linear Regression-----------0.1936328171656353
- Support Vector Regressor ---0.19158311711540615
- Random Forest Regression----0.19093317328646436

# Conclusion
Clearly, Random Forest Regression model is giving better accuracy as the mean absolute error is the least among all the other regressor models i.e. 0.19093317328646436 .