In [1]:
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Read the housing CSV from a path relative to the notebook's working directory.
housing_data = pd.read_csv('datasets/housing.csv')
# Preview five randomly chosen rows (unseeded, so the rows differ between runs).
housing_data.sample(n=5)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
5377,-118.39,34.04,49.0,1230.0,279.0,669.0,269.0,3.9038,308300.0,<1H OCEAN
9910,-122.29,38.31,25.0,4927.0,1005.0,2756.0,998.0,2.7325,162900.0,NEAR BAY
4246,-118.33,34.12,23.0,1894.0,416.0,769.0,392.0,6.0352,500001.0,<1H OCEAN
6219,-117.91,34.06,29.0,3250.0,521.0,1382.0,513.0,5.112,218300.0,<1H OCEAN
12162,-117.11,33.75,17.0,4174.0,851.0,1845.0,780.0,2.2618,96100.0,INLAND


# Data Preparation


In [3]:
# Discard every row that has a missing value in at least one column
# (axis=0 / how='any' are the dropna defaults, spelled out for clarity).
housing_data = housing_data.dropna(axis=0, how='any')

In [4]:
# Display (row count, column count) of the frame after the missing-value rows were dropped.
housing_data.shape

(20433, 10)

In [5]:
# Per-column non-null counts for the rows whose target equals exactly 500001.
# NOTE(review): 500001 looks like a cap/censoring value in the source data — confirm.
is_capped = housing_data['median_house_value'] == 500001
housing_data.loc[is_capped].count()

longitude             958
latitude              958
housing_median_age    958
total_rooms           958
total_bedrooms        958
population            958
households            958
median_income         958
median_house_value    958
ocean_proximity       958
dtype: int64

In [6]:
# Collect the index labels of the rows whose target equals exactly 500001,
# then remove those rows from the frame.
capped_labels = housing_data.index[housing_data['median_house_value'] == 500001]
housing_data = housing_data.drop(index=capped_labels)

In [7]:
# List the distinct labels that occur in the ocean_proximity column.
housing_data['ocean_proximity'].unique()

array(['NEAR BAY', '<1H OCEAN', 'INLAND', 'NEAR OCEAN', 'ISLAND'],
      dtype=object)

In [8]:
# Display the column labels of the frame.
housing_data.columns

Index(['longitude', 'latitude', 'housing_median_age', 'total_rooms',
       'total_bedrooms', 'population', 'households', 'median_income',
       'median_house_value', 'ocean_proximity'],
      dtype='object')

In [9]:
# One-hot encode the non-numeric columns. The indicator columns are named
# '<column>_<value>'.
# dtype=int pins the indicators to 0/1 integers: from pandas 2.0 onward the
# default dummy dtype is bool, which would change the encoded columns
# depending on the installed pandas version.
housing_data = pd.get_dummies(housing_data, dtype=int)

In [10]:
# Preview five randomly chosen rows of the encoded frame (unseeded sample).
housing_data.sample(n=5)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity_<1H OCEAN,ocean_proximity_INLAND,ocean_proximity_ISLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN
11207,-117.92,33.83,52.0,1514.0,301.0,855.0,293.0,3.6042,166400.0,1,0,0,0,0
18459,-121.8,37.27,10.0,3301.0,593.0,2190.0,575.0,6.223,260700.0,1,0,0,0,0
1079,-121.82,39.73,44.0,2923.0,659.0,1371.0,626.0,2.2925,85800.0,0,1,0,0,0
17666,-121.87,37.32,36.0,1471.0,360.0,1182.0,326.0,2.7031,175800.0,1,0,0,0,0
15228,-117.2,33.07,5.0,10394.0,1617.0,4496.0,1553.0,5.9289,411300.0,0,0,0,0,1


In [11]:
# Median of the target column; used afterwards as the classification threshold.
house_values = housing_data['median_house_value']
median = house_values.median()
median

173800.0

In [13]:
# Boolean label: True where the house value is strictly greater than the median,
# False where it is equal to or below it.
housing_data['above_median'] = housing_data['median_house_value'] > median
# Preview five randomly chosen rows including the new label (unseeded sample).
housing_data.sample(n=5)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity_<1H OCEAN,ocean_proximity_INLAND,ocean_proximity_ISLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN,above_median
5808,-118.26,34.14,6.0,1727.0,506.0,1200.0,439.0,4.1083,210700.0,1,0,0,0,0,True
19062,-122.47,38.3,15.0,4885.0,988.0,2175.0,924.0,3.4031,209500.0,1,0,0,0,0,True
7156,-118.15,34.04,44.0,647.0,142.0,457.0,143.0,3.6875,162500.0,1,0,0,0,0,False
31,-122.28,37.84,52.0,2153.0,481.0,1168.0,441.0,1.9615,115200.0,0,0,0,1,0,False
18263,-122.09,37.37,34.0,2165.0,355.0,776.0,339.0,5.2971,442100.0,0,0,0,1,0,True


In [16]:
# Features: everything except the raw house value and the label derived from it
# (keeping either one would leak the answer into the inputs).
non_feature_columns = ['median_house_value', 'above_median']
X = housing_data.drop(columns=non_feature_columns)
# Target: the boolean above/below-median label.
Y = housing_data['above_median']
X.columns

Index(['longitude', 'latitude', 'housing_median_age', 'total_rooms',
       'total_bedrooms', 'population', 'households', 'median_income',
       'ocean_proximity_<1H OCEAN', 'ocean_proximity_INLAND',
       'ocean_proximity_ISLAND', 'ocean_proximity_NEAR BAY',
       'ocean_proximity_NEAR OCEAN'],
      dtype='object')

In [19]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation. A fixed random_state makes the
# shuffle deterministic, so the same rows land in train/test on every run
# and the results of a fresh "Restart & Run All" are reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42
)
# Display the (rows, columns) of the two feature partitions.
x_train.shape, x_test.shape

((15580, 13), (3895, 13))

# Model Training

In [20]:
from sklearn.linear_model import LogisticRegression

# Construct the classifier with the liblinear solver, then fit it on the
# training partition. fit() returns the estimator itself, so rebinding the
# name keeps the cell free of displayed output.
logistic_model = LogisticRegression(solver='liblinear')
logistic_model = logistic_model.fit(x_train, y_train)

In [21]:
# Score the fitted model on the same data it was trained on and print it.
training_score = logistic_model.score(x_train, y_train)
print("Training Score: ", training_score)

Training Score:  0.8223363286264441


In [22]:
# Predict a label for every row of the held-out test features.
y_pred = logistic_model.predict(x_test)

In [23]:
# Put the model's predictions next to the true labels for a side-by-side look.
side_by_side = {'predicted': y_pred, 'actual': y_test}
df_pred_actual = pd.DataFrame(side_by_side)
# Show the first ten comparisons.
df_pred_actual.head(n=10)

Unnamed: 0,predicted,actual
2866,False,False
19423,False,True
18741,False,False
18664,True,True
20349,True,False
2832,False,False
6578,True,True
1633,False,True
11794,False,False
13421,False,False


In [25]:
from sklearn.metrics import accuracy_score

# Fraction of held-out rows whose predicted label matches the true label.
testing_score = accuracy_score(y_test, y_pred)
print('Testing score: ', testing_score)

Testing score:  0.8120667522464698
