In [102]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [103]:
# Read housing.csv (path relative to the notebook's working directory) into a DataFrame.
dataset = pd.read_csv('housing.csv')
# Bare last expression: render the frame as this cell's output.
dataset

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41,880,129.0,322,126,8.3252,452600,NEAR BAY
1,-122.22,37.86,21,7099,1106.0,2401,1138,8.3014,358500,NEAR BAY
2,-122.24,37.85,52,1467,190.0,496,177,7.2574,352100,NEAR BAY
3,-122.25,37.85,52,1274,235.0,558,219,5.6431,341300,NEAR BAY
4,-122.25,37.85,52,1627,280.0,565,259,3.8462,342200,NEAR BAY
...,...,...,...,...,...,...,...,...,...,...
20635,-121.09,39.48,25,1665,374.0,845,330,1.5603,78100,INLAND
20636,-121.21,39.49,18,697,150.0,356,114,2.5568,77100,INLAND
20637,-121.22,39.43,17,2254,485.0,1007,433,1.7000,92300,INLAND
20638,-121.32,39.43,18,1860,409.0,741,349,1.8672,84700,INLAND


In [104]:
# Count missing entries per column (axis=0 sums down each column).
print(dataset.isnull().sum(axis=0))

longitude               0
latitude                0
housing_median_age      0
total_rooms             0
total_bedrooms        207
population              0
households              0
median_income           0
median_house_value      0
ocean_proximity         0
dtype: int64


In [105]:
# Preprocessing: replace the missing total_bedrooms entries with the column mean.
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy='mean', missing_values=np.nan)
# Double brackets keep the selection 2-D, which is the shape the imputer works on.
dataset[['total_bedrooms']] = imputer.fit_transform(
    dataset[['total_bedrooms']]
)
# Re-check the per-column missing counts after imputation.
print(dataset.isnull().sum(axis=0))


longitude             0
latitude              0
housing_median_age    0
total_rooms           0
total_bedrooms        0
population            0
households            0
median_income         0
median_house_value    0
ocean_proximity       0
dtype: int64


In [106]:
# Confirm that total_bedrooms no longer contains missing values.
print(dataset.loc[:, ['total_bedrooms']].isnull().sum())

total_bedrooms    0
dtype: int64


In [107]:
# Scaling: standardise the first eight columns of the frame
# (longitude through median_income in the column order shown above).
# NOTE(review): A is not referenced by any later cell visible here - confirm it is needed.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
A = scaler.fit_transform(dataset.iloc[:, :8].values)

In [108]:
# Display the scaled array produced by the previous cell.
A

array([[-1.32783522,  1.05254828,  0.98214266, ..., -0.9744286 ,
        -0.97703285,  2.34476576],
       [-1.32284391,  1.04318455, -0.60701891, ...,  0.86143887,
         1.66996103,  2.33223796],
       [-1.33282653,  1.03850269,  1.85618152, ..., -0.82077735,
        -0.84363692,  1.7826994 ],
       ...,
       [-0.8237132 ,  1.77823747, -0.92485123, ..., -0.3695372 ,
        -0.17404163, -1.14259331],
       [-0.87362627,  1.77823747, -0.84539315, ..., -0.60442933,
        -0.39375258, -1.05458292],
       [-0.83369581,  1.75014627, -1.00430931, ..., -0.03397701,
         0.07967221, -0.78012947]])

In [109]:
# Encode the categorical ocean_proximity column as integer codes.
# This column is an input feature, not the regression target
# (median_house_value is what the models below predict).
# NOTE(review): scikit-learn documents LabelEncoder for target labels; for an input
# feature OrdinalEncoder / OneHotEncoder is the usual choice - the integer codes
# produced here carry an arbitrary ordering.
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
dataset['ocean_proximity'] = label_encoder.fit_transform(dataset['ocean_proximity'])

In [110]:
# Show the encoded ocean_proximity column.
print(dataset.loc[:, 'ocean_proximity'])

0        3
1        3
2        3
3        3
4        3
        ..
20635    1
20636    1
20637    1
20638    1
20639    1
Name: ocean_proximity, Length: 20640, dtype: int64


**Preprocessing, i.e. filling the missing values in the column "total_bedrooms" with the column mean**

In [111]:
from sklearn.impute import SimpleImputer
# Mean-impute total_bedrooms; fitting and transforming happen in one call.
# The earlier imputation cell already reported 0 missing values for this column,
# so in a top-to-bottom run this leaves the data unchanged.
imputer = SimpleImputer(strategy='mean', missing_values=np.nan)
dataset[['total_bedrooms']] = imputer.fit_transform(dataset[['total_bedrooms']])

**Scaling the data**

In [112]:
# Take the first eight columns (selected by their labels) as a NumPy array.
A = dataset[dataset.columns[:8]].values

In [113]:
from sklearn.preprocessing import StandardScaler
# Fit the scaler on A and replace A with its standardised version.
sc = StandardScaler()
A = sc.fit_transform(X=A)

In [114]:
# Display the standardised array.
A

array([[-1.32783522,  1.05254828,  0.98214266, ..., -0.9744286 ,
        -0.97703285,  2.34476576],
       [-1.32284391,  1.04318455, -0.60701891, ...,  0.86143887,
         1.66996103,  2.33223796],
       [-1.33282653,  1.03850269,  1.85618152, ..., -0.82077735,
        -0.84363692,  1.7826994 ],
       ...,
       [-0.8237132 ,  1.77823747, -0.92485123, ..., -0.3695372 ,
        -0.17404163, -1.14259331],
       [-0.87362627,  1.77823747, -0.84539315, ..., -0.60442933,
        -0.39375258, -1.05458292],
       [-0.83369581,  1.75014627, -1.00430931, ..., -0.03397701,
         0.07967221, -0.78012947]])

**Encode the ocean_proximity column**

In [115]:
# Show the current contents of the ocean_proximity column.
print(dataset.loc[:, 'ocean_proximity'])

0        3
1        3
2        3
3        3
4        3
        ..
20635    1
20636    1
20637    1
20638    1
20639    1
Name: ocean_proximity, Length: 20640, dtype: int64


In [116]:
# Distinct values present in ocean_proximity, in order of first appearance.
distinct_values = dataset.loc[:, 'ocean_proximity'].unique()
print(distinct_values)

[3 0 1 4 2]


In [117]:
# Encode ocean_proximity as integer codes, but only if it has not been encoded yet.
# Re-fitting a fresh LabelEncoder on already-encoded integers leaves the column
# unchanged while replacing `label_encoder` with one whose classes_ are the integer
# codes, which loses the mapping back to the original category names.
from sklearn.preprocessing import LabelEncoder
if not pd.api.types.is_numeric_dtype(dataset['ocean_proximity']):
    label_encoder = LabelEncoder()
    dataset['ocean_proximity'] = label_encoder.fit_transform(dataset['ocean_proximity'])

In [118]:
# Values of the last column of the frame (ocean_proximity) as a NumPy array.
p = dataset[dataset.columns[-1]].values
p

array([3, 3, 3, ..., 1, 1, 1])

In [119]:
# Distinct values present in median_house_value, in order of first appearance.
distinct_values = dataset.loc[:, 'median_house_value'].unique()
print(distinct_values)

[452600 358500 352100 ... 425800 200700  47000]


**Compute the correlation of every column with median_house_value to see which columns are needed**

In [120]:
# Strength of each column's linear correlation with median_house_value.
# abs() discards the sign, so the ranking is by magnitude only.
correlation_cal = (
    dataset.corr()['median_house_value']
    .abs()
    .sort_values(ascending=False)
)

# Print out the correlations
print(correlation_cal)

median_house_value    1.000000
median_income         0.688075
latitude              0.144160
total_rooms           0.134153
housing_median_age    0.105623
ocean_proximity       0.081750
households            0.065843
total_bedrooms        0.049454
longitude             0.045967
population            0.024650
Name: median_house_value, dtype: float64


**Negative correlation values imply an inverse relationship; positive correlation values imply a direct relationship. (The values printed above are absolute values, so their signs are not shown.)**

In [121]:
# Single-feature baseline: predict median_house_value from median_income.
# Columns are selected by name rather than by position counted from the end of
# the frame, so the selection cannot silently change if columns are added or reordered.
X = dataset['median_income'].values        # 1-D array; reshaped to 2-D before fitting
y = dataset['median_house_value'].values

In [122]:
# Display the feature array (median_income).
X

array([8.3252, 8.3014, 7.2574, ..., 1.7   , 1.8672, 2.3886])

In [123]:
# Display the target array (median_house_value).
y

array([452600, 358500, 352100, ...,  92300,  84700,  89400])

In [124]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; random_state is fixed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)

In [125]:
# Shape of the training features (still 1-D at this point).
X_train.shape

(16512,)

In [126]:
# Shape of the test features (still 1-D at this point).
X_test.shape

(4128,)

In [127]:
# Turn the 1-D feature vector into a single-column 2-D array for the regressor.
X_train = X_train.reshape((-1, 1))

In [128]:
from sklearn.linear_model import LinearRegression
# Fit a linear regression on the training portion (single feature: median_income).
regressor = LinearRegression()
regressor.fit(X=X_train, y=y_train)

In [129]:
# Same single-column 2-D layout for the test features as for the training features.
X_test = X_test.reshape((-1, 1))

In [130]:
# Predict the target for the held-out test features with the fitted regressor.
y_pred = regressor.predict(X_test)
# Display the predictions.
y_pred

array([181402.07011843, 127924.35050918, 213498.79519375, ...,
       257017.7824945 , 249742.18836711, 158498.66802721])

In [131]:
# Fitted coefficient(s) of the regressor; one entry because the model was fit on one feature.
m=regressor.coef_
m

array([42055.4573838])

In [132]:
# Fitted intercept of the regressor.
b=regressor.intercept_
b

44721.833621067955

In [133]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
# Mean absolute error of the test-set predictions.
mae = mean_absolute_error(y_test, y_pred)
print("MAE", mae)

MAE 62184.11511576648


In [134]:
# Mean squared error of the test-set predictions.
mse = mean_squared_error(y_test, y_pred)
print("MSE", mse)

MSE 6926929696.091081


In [135]:
# Root mean squared error: square root of the MSE.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print("RMSE", rmse)

RMSE 83228.17849797675


In [136]:
# Coefficient of determination on the test set; computed once, then reported.
r2 = r2_score(y_test, y_pred)
print("R2_Score", r2)

R2_Score 0.4719083593446771


**Improving the model by adding a few other important features, i.e. latitude, housing_median_age and total_rooms, alongside median_income**

In [137]:
# Multi-feature model: four predictors, same target as the single-feature model.
X = dataset[
    [
        'latitude',
        'housing_median_age',
        'total_rooms',
        'median_income',
    ]
]
y = dataset['median_house_value']

In [138]:
# Shape of the feature frame: (rows, selected feature columns).
X.shape

(20640, 4)

In [139]:
# Same 80/20 split and seed as used for the single-feature model.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)

In [140]:
# Shape of the training features for the multi-feature model.
X_train.shape

(16512, 4)

In [141]:
# Shape of the test features for the multi-feature model.
X_test.shape

(4128, 4)

In [142]:
from sklearn.linear_model import LinearRegression
# Fit a fresh linear regression on the four-feature training data.
regressor = LinearRegression()
regressor.fit(X=X_train, y=y_train)

In [143]:
# Predict the target for the held-out test rows with the multi-feature regressor.
y_pred = regressor.predict(X_test)
# Display the predictions.
y_pred

array([210003.02333562, 137006.84531556, 235541.34070982, ...,
       232985.18851148, 279622.51582355, 209425.84835973])

In [144]:
# One row per feature, holding the coefficient the regressor learned for it.
coefficient = pd.DataFrame(
    data=regressor.coef_,
    index=X.columns,
    columns=['Coefficient'],
)
print(coefficient)

                     Coefficient
latitude            -4651.910211
housing_median_age   2006.340048
total_rooms             3.837418
median_income       42412.642325


In [145]:
# Side-by-side table of true test targets and the model's predictions.
df = pd.DataFrame(
    data={
        'Actual': y_test,
        'Predicted': y_pred,
    }
)
print(df)

       Actual      Predicted
4712   355000  210003.023336
2151    70700  137006.845316
15927  229400  235541.340710
82     112500  135822.417416
8161   225400  284030.100179
...       ...            ...
2319    68200  127862.791882
5341   225000  174435.682699
16888  350000  232985.188511
6823   227300  279622.515824
11878  141700  209425.848360

[4128 rows x 2 columns]


In [146]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Mean absolute error of the multi-feature model on the test set.
mae = mean_absolute_error(y_test, y_pred)
print("MAE", mae)

MAE 59398.099471212714


In [147]:
# Mean squared error of the multi-feature model on the test set.
mse = mean_squared_error(y_test, y_pred)
print("MSE", mse)

MSE 6343605353.93375


In [148]:
# Root mean squared error: square root of the MSE.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print("RMSE", rmse)

RMSE 79646.75356807552


In [149]:
# Coefficient of determination for the multi-feature model; computed once, then reported.
r2 = r2_score(y_test, y_pred)
print("R2_Score", r2)

R2_Score 0.516379535233453
