In [414]:
% pylab inline 
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder #sklearn means skikitlearn which contain many libraries for ml
from sklearn.model_selection import cross_val_score
import matplotlib.pyplot as plt #for visualization or for presenting data to client
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
%matplotlib inline 

Populating the interactive namespace from numpy and matplotlib


# LOAD THE DATA

In [415]:
train = pd.read_csv('housing.csv')

In [416]:
train.shape

(20640, 10)

In [417]:
train.dtypes

longitude             float64
latitude              float64
housing_median_age      int64
total_rooms             int64
total_bedrooms        float64
population              int64
households              int64
median_income         float64
ocean_proximity        object
median_house_value      int64
dtype: object

# HANDLING MISSING VALUES

In [418]:
train.isnull().sum() 

longitude               0
latitude                0
housing_median_age      0
total_rooms             0
total_bedrooms        207
population              0
households              0
median_income           0
ocean_proximity         0
median_house_value      0
dtype: int64

In [419]:
train['total_bedrooms'].value_counts()

280.0     55
331.0     51
345.0     50
393.0     49
343.0     49
348.0     48
328.0     48
394.0     48
272.0     47
309.0     47
295.0     46
317.0     46
322.0     46
399.0     46
314.0     46
284.0     45
388.0     45
291.0     45
346.0     45
287.0     45
340.0     45
313.0     45
290.0     45
312.0     44
361.0     44
460.0     44
390.0     44
300.0     44
294.0     44
269.0     44
          ..
2574.0     1
2141.0     1
2460.0     1
3479.0     1
1679.0     1
1758.0     1
3224.0     1
1127.0     1
2010.0     1
1571.0     1
1494.0     1
1052.0     1
1570.0     1
2814.0     1
1437.0     1
1887.0     1
3336.0     1
1215.0     1
1736.0     1
940.0      1
980.0      1
2009.0     1
3864.0     1
2289.0     1
1288.0     1
2205.0     1
1448.0     1
1691.0     1
2537.0     1
2546.0     1
Name: total_bedrooms, Length: 1923, dtype: int64

In [420]:
#train['total_bedrooms'] = train['total_bedrooms'].fillna(train['total_bedrooms'].mean())
train['total_bedrooms'] = train['total_bedrooms'].fillna(train['total_bedrooms'].mode()[0])

In [421]:
train.isnull().sum() 

longitude             0
latitude              0
housing_median_age    0
total_rooms           0
total_bedrooms        0
population            0
households            0
median_income         0
ocean_proximity       0
median_house_value    0
dtype: int64

In [422]:
train['total_bedrooms'].value_counts().head(10)

280.0    262
331.0     51
345.0     50
393.0     49
343.0     49
348.0     48
328.0     48
394.0     48
272.0     47
309.0     47
Name: total_bedrooms, dtype: int64

In [423]:
train1 = train.drop(['longitude', 'latitude','ocean_proximity','median_house_value'], axis=1)

In [424]:
scaler = StandardScaler()
x = scaler.fit_transform(train1)

In [425]:
train1.dtypes

housing_median_age      int64
total_rooms             int64
total_bedrooms        float64
population              int64
households              int64
median_income         float64
dtype: object

In [426]:
train.dtypes

longitude             float64
latitude              float64
housing_median_age      int64
total_rooms             int64
total_bedrooms        float64
population              int64
households              int64
median_income         float64
ocean_proximity        object
median_house_value      int64
dtype: object

In [427]:
train1['ocean_proximity'] = train['ocean_proximity']

In [428]:
train1['median_house_value'] = train['median_house_value']

# ENCODE CATEGORICAL DATA 

In [429]:
train1['ocean_proximity'].value_counts()

<1H OCEAN     9136
INLAND        6551
NEAR OCEAN    2658
NEAR BAY      2290
ISLAND           5
Name: ocean_proximity, dtype: int64

In [430]:
dummy = pd.get_dummies(train1['ocean_proximity']) #this is one hot encoding
dummy.head()

Unnamed: 0,<1H OCEAN,INLAND,ISLAND,NEAR BAY,NEAR OCEAN
0,0,0,0,1,0
1,0,0,0,1,0
2,0,0,0,1,0
3,0,0,0,1,0
4,0,0,0,1,0


In [431]:
train1 = pd.concat([train1, dummy], axis=1)

In [432]:
train1.head()

Unnamed: 0,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity,median_house_value,<1H OCEAN,INLAND,ISLAND,NEAR BAY,NEAR OCEAN
0,41,880,129.0,322,126,8.3252,NEAR BAY,452600,0,0,0,1,0
1,21,7099,1106.0,2401,1138,8.3014,NEAR BAY,358500,0,0,0,1,0
2,52,1467,190.0,496,177,7.2574,NEAR BAY,352100,0,0,0,1,0
3,52,1274,235.0,558,219,5.6431,NEAR BAY,341300,0,0,0,1,0
4,52,1627,280.0,565,259,3.8462,NEAR BAY,342200,0,0,0,1,0


# SPLIT THE DATASET

In [433]:
train1.columns

Index(['housing_median_age', 'total_rooms', 'total_bedrooms', 'population',
       'households', 'median_income', 'ocean_proximity', 'median_house_value',
       '<1H OCEAN', 'INLAND', 'ISLAND', 'NEAR BAY', 'NEAR OCEAN'],
      dtype='object')

In [434]:
train1.dtypes

housing_median_age      int64
total_rooms             int64
total_bedrooms        float64
population              int64
households              int64
median_income         float64
ocean_proximity        object
median_house_value      int64
<1H OCEAN               uint8
INLAND                  uint8
ISLAND                  uint8
NEAR BAY                uint8
NEAR OCEAN              uint8
dtype: object

In [435]:
train1 = train1.drop(['ocean_proximity'], axis=1)

In [436]:
train1.head()

Unnamed: 0,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,<1H OCEAN,INLAND,ISLAND,NEAR BAY,NEAR OCEAN
0,41,880,129.0,322,126,8.3252,452600,0,0,0,1,0
1,21,7099,1106.0,2401,1138,8.3014,358500,0,0,0,1,0
2,52,1467,190.0,496,177,7.2574,352100,0,0,0,1,0
3,52,1274,235.0,558,219,5.6431,341300,0,0,0,1,0
4,52,1627,280.0,565,259,3.8462,342200,0,0,0,1,0


In [437]:
train1.shape

(20640, 12)

In [438]:
x = train1.drop('median_house_value', axis=1)
y = train1.median_house_value

In [439]:
#scaler = StandardScaler()
#x = scaler.fit_transform(x)

In [440]:
from sklearn.model_selection import train_test_split
xTrain, xTest, yTrain, yTest = train_test_split(x, y, test_size = 0.2, random_state = 10)

In [441]:
print(train.shape)
print (x.shape)
print (y.shape)
print (xTrain.shape)
print (xTest.shape)
print (yTrain.shape)
print (yTest.shape)

(20640, 10)
(20640, 11)
(20640,)
(16512, 11)
(4128, 11)
(16512,)
(4128,)


# PERFORM LINEAR REGRESSION

In [442]:
lin = LinearRegression()

In [443]:
lin.fit(xTrain, yTrain)
predictions = lin.predict(xTest)

In [444]:
print(sqrt(mean_squared_error(yTest, predictions)))

70398.39215698164


In [445]:
errors = abs(predictions - yTest)

In [446]:
mape = 100 * (errors / yTest)

In [447]:
accuracy = 100 - np.mean(mape)
print('Accuracy:', round(accuracy, 2), '%.')

Accuracy: 71.9 %.


# PERFORM DECISION TREE REGRESSION

In [448]:
dtree_reg = DecisionTreeRegressor(max_depth=10)

In [449]:
dtree_reg.fit(xTrain, yTrain)

DecisionTreeRegressor(criterion='mse', max_depth=10, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=None, splitter='best')

In [450]:
pred = dtree_reg.predict(xTest)
print(sqrt(mean_squared_error(yTest, pred)))

71272.13823644951


In [451]:
errors = abs(predictions - yTest)

In [452]:
mape = 100 * (errors / yTest)

In [453]:
accuracy = 100 - np.mean(mape)
print('Accuracy:', round(accuracy, 2), '%.')

Accuracy: 71.9 %.


# RANDOM FOREST 

In [454]:
rf = RandomForestRegressor(n_estimators = 1000, random_state = 42)
rf.fit(xTrain,yTrain);

In [455]:
predictions = rf.predict(xTest)

In [456]:
print(sqrt(mean_squared_error(yTest, predictions)))

60940.268034806795


In [457]:
errors = abs(predictions - yTest)

In [458]:
mape = 100 * (errors / yTest)

In [459]:
accuracy = 100 - np.mean(mape)
print('Accuracy:', round(accuracy, 2), '%.')

Accuracy: 75.86 %.
