In [1]:
# importing libraries
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# load the housing dataset from a CSV path relative to the working directory
housing_data = pd.read_csv('datasets/housing.csv')
# preview five randomly sampled rows (sample() is random, so this is not the
# first five rows and the rows shown change on every run)
housing_data.sample(5)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
19317,-123.04,38.49,30.0,3977.0,930.0,1387.0,582.0,2.6161,132500.0,NEAR OCEAN
1155,-121.44,39.5,26.0,1652.0,325.0,790.0,292.0,3.0446,90800.0,INLAND
18927,-122.2,38.1,5.0,9567.0,1729.0,4620.0,1580.0,4.4821,210000.0,NEAR BAY
18640,-121.98,36.98,29.0,2681.0,632.0,1652.0,620.0,3.075,215800.0,NEAR OCEAN
11792,-121.21,38.83,21.0,3691.0,640.0,1758.0,603.0,3.5607,151900.0,INLAND


In [3]:
# (rows, columns) of the frame as loaded, before any cleaning
housing_data.shape

(20640, 10)

In [4]:
# Discard every row that has at least one missing value
housing_data = housing_data.dropna(axis=0, how='any')
# (rows, columns) left once the incomplete rows are gone
housing_data.shape

(20433, 10)

In [5]:
# Per-column count of the rows whose median_house_value is exactly 500001,
# the value where records pile up at the cap of the dataset
is_capped = housing_data['median_house_value'] == 500001
housing_data[is_capped].count()

longitude             958
latitude              958
housing_median_age    958
total_rooms           958
total_bedrooms        958
population            958
households            958
median_income         958
median_house_value    958
ocean_proximity       958
dtype: int64

In [6]:
# Remove the rows accumulated at the 500001 cap of median_house_value:
# collect their index labels first, then drop those labels
capped_index = housing_data.index[housing_data['median_house_value'] == 500001]
housing_data = housing_data.drop(index=capped_index)

In [7]:
# (rows, columns) after dropping the rows at the 500001 cap
housing_data.shape

(19475, 10)

In [8]:
# first rows of the cleaned frame (head() with no argument shows five rows)
housing_data.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [9]:
# distinct values of the categorical column ocean_proximity
housing_data['ocean_proximity'].unique()

array(['NEAR BAY', '<1H OCEAN', 'INLAND', 'NEAR OCEAN', 'ISLAND'],
      dtype=object)

In [10]:
# One-hot encode the categorical column(s): each listed column is replaced by
# one indicator column per distinct value
categorical_columns = ['ocean_proximity']
housing_data = pd.get_dummies(housing_data, columns=categorical_columns)

In [11]:
# (rows, columns) after one-hot encoding
housing_data.shape
# columns grew from 10 to 14: ocean_proximity was replaced by five indicator columns

(19475, 14)

In [12]:
# five randomly sampled rows of the encoded frame
housing_data.sample(5)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity_<1H OCEAN,ocean_proximity_INLAND,ocean_proximity_ISLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN
9429,-122.68,38.07,26.0,1445.0,244.0,510.0,207.0,5.6305,430000.0,0,0,0,0,1
3297,-122.65,38.97,32.0,1856.0,472.0,703.0,292.0,1.1912,60000.0,0,1,0,0,0
1625,-122.16,37.83,16.0,4596.0,705.0,1480.0,650.0,7.52,370200.0,0,0,0,1,0
19072,-122.49,38.32,30.0,1631.0,284.0,788.0,284.0,3.3098,195500.0,1,0,0,0,0
13337,-117.68,34.04,27.0,574.0,103.0,321.0,103.0,3.9107,186500.0,0,1,0,0,0


In [14]:
# Median of the median_house_value column; used as the threshold for the
# binary label
house_values = housing_data['median_house_value']
median = house_values.median()
print('Median Value :', median)

Median Value : 173800.0


In [16]:
# Binary label: True where a row's median_house_value is strictly greater
# than the overall median, False otherwise
is_above_median = housing_data['median_house_value'].gt(median)
housing_data['above_median'] = is_above_median
# spot-check five random rows with the new column
housing_data.sample(5)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity_<1H OCEAN,ocean_proximity_INLAND,ocean_proximity_ISLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN,above_median
13542,-117.31,34.14,38.0,2011.0,448.0,1190.0,403.0,1.8654,89400.0,0,1,0,0,0,False
7963,-118.19,33.86,42.0,1999.0,431.0,1060.0,399.0,3.7031,167100.0,1,0,0,0,0,False
3301,-122.61,38.93,14.0,231.0,36.0,108.0,31.0,4.3897,71300.0,0,1,0,0,0,False
13117,-121.22,38.4,14.0,2655.0,441.0,1277.0,422.0,4.6989,213800.0,0,1,0,0,0,True
631,-122.17,37.72,5.0,1692.0,398.0,814.0,328.0,3.663,158300.0,0,0,0,1,0,False


In [19]:
# Label: the boolean above_median flag
Y = housing_data['above_median']
# Features: every other column except median_house_value, which is the
# column the label was derived from
X = housing_data.drop(columns=['median_house_value', 'above_median'])

In [20]:
# feature column names (median_house_value and above_median were dropped from X)
X.columns

Index(['longitude', 'latitude', 'housing_median_age', 'total_rooms',
       'total_bedrooms', 'population', 'households', 'median_income',
       'ocean_proximity_<1H OCEAN', 'ocean_proximity_INLAND',
       'ocean_proximity_ISLAND', 'ocean_proximity_NEAR BAY',
       'ocean_proximity_NEAR OCEAN'],
      dtype='object')

In [21]:
# splitting the data into training (80%) and test (20%) sets
from sklearn.model_selection import train_test_split
# A fixed random_state makes the shuffle repeatable; without it every run
# produces a different split, so the scores reported further down could not
# be reproduced.
x_train, x_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42
)

In [22]:
# (rows, columns) of the training and test feature sets
x_train.shape, x_test.shape

((15580, 13), (3895, 13))

In [23]:
# shapes of the training and test label vectors (one label per feature row)
y_train.shape, y_test.shape

((15580,), (3895,))

In [24]:
# Logistic regression classifier for the above_median label
from sklearn.linear_model import LogisticRegression
# Build the estimator with the liblinear solver, then train it on the
# training split; fit() returns the fitted estimator itself
classifier = LogisticRegression(solver='liblinear')
logistic_model = classifier.fit(x_train, y_train)

In [25]:
# LogisticRegression.score returns the mean accuracy (fraction of correctly
# classified samples) on the given data. It is not an R-squared value, so
# the label says "Accuracy".
print('Training Accuracy :', logistic_model.score(x_train, y_train))

Training R-Score : 0.8206033376123235


In [26]:
# predicted above_median labels for the held-out test features
y_pred = logistic_model.predict(x_test)

In [27]:
# Put predictions next to the true test labels; the frame takes its index
# from y_test, so each row keeps its original row label
comparison_columns = {'predicted': y_pred, 'actual': y_test}
df_predit_actual = pd.DataFrame(comparison_columns)
# first ten prediction/actual pairs
df_predit_actual.head(10)

Unnamed: 0,predicted,actual
231,True,True
14688,True,True
19774,False,False
13288,False,False
14808,True,True
7673,True,False
18702,False,False
9870,False,False
15504,True,True
11842,False,False


In [31]:
# Accuracy on the test split: fraction of test rows whose predicted label
# equals the true label
from sklearn.metrics import accuracy_score
test_accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print('Accuracy Score :', test_accuracy)

Accuracy Score : 0.8225930680359436
