In [102]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [103]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error,r2_score

In [104]:
df =  pd.read_csv('Housing.csv')
df.head()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,7420,4,2,3,yes,no,no,no,yes,2,yes,furnished
1,12250000,8960,4,4,4,yes,no,no,no,yes,3,no,furnished
2,12250000,9960,3,2,2,yes,no,yes,no,no,2,yes,semi-furnished
3,12215000,7500,4,2,2,yes,no,yes,no,yes,3,yes,furnished
4,11410000,7420,4,1,2,yes,yes,yes,no,yes,2,no,furnished


In [105]:
df.shape

(545, 13)

In [106]:
df.isnull().sum()

price               0
area                0
bedrooms            0
bathrooms           0
stories             0
mainroad            0
guestroom           0
basement            0
hotwaterheating     0
airconditioning     0
parking             0
prefarea            0
furnishingstatus    0
dtype: int64

In [107]:
df.dtypes

price                int64
area                 int64
bedrooms             int64
bathrooms            int64
stories              int64
mainroad            object
guestroom           object
basement            object
hotwaterheating     object
airconditioning     object
parking              int64
prefarea            object
furnishingstatus    object
dtype: object

In [108]:
df['mainroad'].value_counts()

yes    468
no      77
Name: mainroad, dtype: int64

In [109]:
df['guestroom'].value_counts()

no     448
yes     97
Name: guestroom, dtype: int64

In [110]:
df['hotwaterheating'].value_counts()

no     520
yes     25
Name: hotwaterheating, dtype: int64

In [111]:
df['airconditioning'].value_counts()

no     373
yes    172
Name: airconditioning, dtype: int64

In [112]:
df['prefarea'].value_counts()

no     417
yes    128
Name: prefarea, dtype: int64

In [113]:
df['furnishingstatus'].value_counts()

semi-furnished    227
unfurnished       178
furnished         140
Name: furnishingstatus, dtype: int64

In [114]:
from sklearn.preprocessing import LabelEncoder

In [115]:
lb = LabelEncoder()

In [116]:
df['mainroad'] = lb.fit_transform(df['mainroad'])
df['mainroad'].value_counts()

1    468
0     77
Name: mainroad, dtype: int64

In [117]:
df['guestroom'] = lb.fit_transform(df['guestroom'])
df['guestroom'].value_counts()

0    448
1     97
Name: guestroom, dtype: int64

In [118]:
df['hotwaterheating'] = lb.fit_transform(df['hotwaterheating'])
df['hotwaterheating'].value_counts()

0    520
1     25
Name: hotwaterheating, dtype: int64

In [119]:
df['airconditioning'] = lb.fit_transform(df['airconditioning'])
df['airconditioning'].value_counts()

0    373
1    172
Name: airconditioning, dtype: int64

In [120]:
df['prefarea'] = lb.fit_transform(df['prefarea'])
df['prefarea'].value_counts()

0    417
1    128
Name: prefarea, dtype: int64

In [121]:
df['furnishingstatus'] = lb.fit_transform(df['furnishingstatus'])
df['furnishingstatus'].value_counts()

1    227
2    178
0    140
Name: furnishingstatus, dtype: int64

In [122]:
df.dtypes

price                int64
area                 int64
bedrooms             int64
bathrooms            int64
stories              int64
mainroad             int32
guestroom            int32
basement            object
hotwaterheating      int32
airconditioning      int32
parking              int64
prefarea             int32
furnishingstatus     int32
dtype: object

In [123]:
df['basement'] = lb.fit_transform(df['basement'])
df['basement'].value_counts()

0    354
1    191
Name: basement, dtype: int64

In [124]:
df.dtypes

price               int64
area                int64
bedrooms            int64
bathrooms           int64
stories             int64
mainroad            int32
guestroom           int32
basement            int32
hotwaterheating     int32
airconditioning     int32
parking             int64
prefarea            int32
furnishingstatus    int32
dtype: object

### Model-1

In [125]:
df.columns

Index(['price', 'area', 'bedrooms', 'bathrooms', 'stories', 'mainroad',
       'guestroom', 'basement', 'hotwaterheating', 'airconditioning',
       'parking', 'prefarea', 'furnishingstatus'],
      dtype='object')

In [126]:
y = df['price'] # df.iloc[:,0]
x = df.iloc[:,1:]
print(type(x),type(y))
print(x.shape,y.shape)

<class 'pandas.core.frame.DataFrame'> <class 'pandas.core.series.Series'>
(545, 12) (545,)


In [127]:
x.head(3)

Unnamed: 0,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,7420,4,2,3,1,0,0,0,1,2,1,0
1,8960,4,4,4,1,0,0,0,1,3,0,0
2,9960,3,2,2,1,0,1,0,0,2,1,1


In [128]:
y.head()

0    13300000
1    12250000
2    12250000
3    12215000
4    11410000
Name: price, dtype: int64

In [129]:
df.shape

(545, 13)

In [130]:
x_tr,x_te,y_tr,y_te = train_test_split(x,y,test_size=0.25)
print(x_tr.shape)
print(x_te.shape)
print(y_tr.shape)
print(y_te.shape)

(408, 12)
(137, 12)
(408,)
(137,)


In [131]:
from sklearn.linear_model import LinearRegression

In [132]:
m1 = LinearRegression()
m1.fit(x_tr,y_tr)

LinearRegression()

In [133]:
# R2 score
print('Training score',m1.score(x_tr,y_tr))
print('Testing score',m1.score(x_te,y_te))

Training score 0.6802087140094784
Testing score 0.6606682067534366


In [134]:
ypred_m1 = m1.predict(x_te)
print(ypred_m1)

[7043966.71286761 2261203.83982427 6572752.40222169 4573180.83110203
 4161355.93900107 5039430.2912641  5389152.90064038 3891434.1742551
 3858301.01484614 3633366.3162807  8045754.72987723 3271329.17663901
 4050463.81765357 4300233.61819365 6401231.12776858 4506736.50335207
 2832423.33808086 7415661.27412043 5071174.66155744 4977640.61800959
 4761931.5184982  7061443.35330562 4863178.18951884 2028487.61134095
 7621729.4274581  3739795.23403222 3881876.74060747 5577788.14542713
 3254620.55863628 4553817.29197798 9428347.24835242 5820775.15538036
 4667904.08881419 7282029.20319559 2770607.76885422 3578319.46986724
 4181565.5368847  5172008.10827798 2744785.76525911 2738169.32346889
 7344167.21367305 6830755.81124747 5686146.05496585 7800589.30520253
 3015071.23864365 2637781.93078982 4962126.09299184 5685533.88906405
 6683551.04528956 5428083.86394348 4539662.61984895 5964611.05525924
 5113301.35629626 4114122.95053948 7038951.82684533 5497234.22505024
 2916128.79230908 3164891.91841891 

In [135]:
mse_m1 = mean_squared_error(y_te,ypred_m1)
rmse_m1 = np.sqrt(mean_squared_error(y_te,ypred_m1))
mae_m1 = mean_absolute_error(y_te,ypred_m1)
print('MSE m1',mse_m1)
print('RMSE m1',rmse_m1)
print('MAE m1',mae_m1)

MSE m1 1433200672093.6616
RMSE m1 1197163.5945407217
MAE m1 872344.7170771698


### Model-2

In [136]:
df.corr()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
price,1.0,0.535997,0.366494,0.517545,0.420712,0.296898,0.255517,0.187057,0.093073,0.452954,0.384394,0.329777,-0.304721
area,0.535997,1.0,0.151858,0.19382,0.083996,0.288874,0.140297,0.047417,-0.009229,0.222393,0.35298,0.234779,-0.171445
bedrooms,0.366494,0.151858,1.0,0.37393,0.408564,-0.012033,0.080549,0.097312,0.046049,0.160603,0.13927,0.079023,-0.123244
bathrooms,0.517545,0.19382,0.37393,1.0,0.326165,0.042398,0.126469,0.102106,0.067159,0.186915,0.177496,0.063472,-0.143559
stories,0.420712,0.083996,0.408564,0.326165,1.0,0.121706,0.043538,-0.172394,0.018847,0.293602,0.045547,0.044425,-0.104672
mainroad,0.296898,0.288874,-0.012033,0.042398,0.121706,1.0,0.092337,0.044002,-0.011781,0.105423,0.204433,0.199876,-0.156726
guestroom,0.255517,0.140297,0.080549,0.126469,0.043538,0.092337,1.0,0.372066,-0.010308,0.138179,0.037466,0.160897,-0.118328
basement,0.187057,0.047417,0.097312,0.102106,-0.172394,0.044002,0.372066,1.0,0.004385,0.047341,0.051497,0.228083,-0.112831
hotwaterheating,0.093073,-0.009229,0.046049,0.067159,0.018847,-0.011781,-0.010308,0.004385,1.0,-0.130023,0.067864,-0.059411,-0.031628
airconditioning,0.452954,0.222393,0.160603,0.186915,0.293602,0.105423,0.138179,0.047341,-0.130023,1.0,0.159173,0.117382,-0.150477


In [137]:
df.corr()['price']

price               1.000000
area                0.535997
bedrooms            0.366494
bathrooms           0.517545
stories             0.420712
mainroad            0.296898
guestroom           0.255517
basement            0.187057
hotwaterheating     0.093073
airconditioning     0.452954
parking             0.384394
prefarea            0.329777
furnishingstatus   -0.304721
Name: price, dtype: float64

In [138]:
df.corr()['price'] >0.35

price                True
area                 True
bedrooms             True
bathrooms            True
stories              True
mainroad            False
guestroom           False
basement            False
hotwaterheating     False
airconditioning      True
parking              True
prefarea            False
furnishingstatus    False
Name: price, dtype: bool

In [139]:
y = df['price']
x = df.iloc[:,[1,2,3,4,9,10,11]]
x.head()

Unnamed: 0,area,bedrooms,bathrooms,stories,airconditioning,parking,prefarea
0,7420,4,2,3,1,2,1
1,8960,4,4,4,1,3,0
2,9960,3,2,2,0,2,1
3,7500,4,2,2,1,3,1
4,7420,4,1,2,1,2,0


In [140]:
x_tr,x_te,y_tr,y_te = train_test_split(x,y,test_size=0.25)
print(x_tr.shape)
print(x_te.shape)
print(y_tr.shape)
print(y_te.shape)

(408, 7)
(137, 7)
(408,)
(137,)


In [141]:
m2 = LinearRegression()
m2.fit(x_tr,y_tr)

LinearRegression()

In [142]:
# R2 score
print('Training score',m2.score(x_tr,y_tr))
print('Testing score',m2.score(x_te,y_te))

Training score 0.6297594795010557
Testing score 0.6604055285948669


In [143]:
from sklearn.metrics import confusion_matrix,classification_report

In [144]:
ypred_m2 = m2.predict(x_te)
print(ypred_m2)

[3125055.54514033 6075936.49474194 3451230.99986177 7821353.59382406
 3632379.63007162 4339490.62421742 6753531.76372861 3520984.15158488
 7389332.28031017 7028419.62033754 3331078.48957497 7071204.52534607
 6442585.35188309 6358882.86176688 4617767.44587996 6101918.48519953
 2904764.97720884 7389332.28031017 3955537.80218495 3677972.83217974
 3513912.93769487 5685352.95699471 2885895.48666065 3995840.20876345
 5306686.59506882 4769673.61264754 4171370.50189697 7407018.04235965
 8509867.58832243 6814338.1654869  7960286.13058762 4966059.71254294
 5877672.1384599  2935149.88313042 3837796.29173881 4185956.67209031
 5174824.05444237 8199123.08203213 5757095.84904955 5222688.24678383
 3574157.73694765 3369059.62197695 5110348.03151279 5110358.84217145
 3966110.02525407 2881976.29776765 4804389.37526334 3098589.82811175
 3899650.51351077 3767845.66897203 3577961.21494616 6460049.50195696
 7225892.33532884 6345182.7546327  5338156.25404417 3546304.90651953
 3008580.07244092 3513912.93769487

In [145]:
mse_m2 = mean_squared_error(y_te,ypred_m2)
rmse_m2 = np.sqrt(mean_squared_error(y_te,ypred_m2))
mae_m2 = mean_absolute_error(y_te,ypred_m2)
print('MSE m2',mse_m2)
print('RMSE m2',rmse_m2)
print('MAE m2',mae_m2)

MSE m2 1264343237370.1475
RMSE m2 1124430.1834129798
MAE m2 847964.4816048648
