# Exploratory Data Analysis on the California Housing Prices
<img></img>

### Importing Libraries

In [1]:
import os
import tarfile
import urllib

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pickle

import seaborn as sns
sns.set_style('darkgrid')
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
%matplotlib inline 

### Importing the Dataset on California Housing Prices

In [2]:

dataset_path = os.path.join("dataset")
download_url="https://raw.githubusercontent.com/ageron/handson-ml2/master/datasets/housing/housing.tgz"

In [3]:
def fetch_data(download_url= download_url, dataset_path= dataset_path):
    os.makedirs(dataset_path, exist_ok= True)
    tgz_path = os.path.join(dataset_path,"housing.tgz")
    urllib.request.urlretrieve(download_url, tgz_path)
    housing_tgz = tarfile.open(tgz_path)
    housing_tgz.extractall(path= dataset_path)
    housing_tgz.close()

In [4]:
fetch_data()

In [5]:
def load_data(dataset_path= dataset_path):
    csv_path = os.path.join(dataset_path, "housing.csv")
    return pd.read_csv(csv_path)

In [6]:
df = load_data()

### Glance of the DataSet

In [7]:
df

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY
...,...,...,...,...,...,...,...,...,...,...
20635,-121.09,39.48,25.0,1665.0,374.0,845.0,330.0,1.5603,78100.0,INLAND
20636,-121.21,39.49,18.0,697.0,150.0,356.0,114.0,2.5568,77100.0,INLAND
20637,-121.22,39.43,17.0,2254.0,485.0,1007.0,433.0,1.7000,92300.0,INLAND
20638,-121.32,39.43,18.0,1860.0,409.0,741.0,349.0,1.8672,84700.0,INLAND


### Inference:
We can observe that the dataset consists of 20,639 households across 10 different attributes

### Cleaning and Filtering the data

In [8]:
df = df.drop_duplicates() 
df.duplicated().values.any()  #Finding any duplicates


False

In [9]:
df = df.fillna(method="ffill")
pd.isnull(df).any()  # Checking for Null Values

longitude             False
latitude              False
housing_median_age    False
total_rooms           False
total_bedrooms        False
population            False
households            False
median_income         False
median_house_value    False
ocean_proximity       False
dtype: bool

### Feature Engineering

In [10]:
df['avgRooms'] = df['total_rooms'] / df['households']
df['avgBedrooms'] = df['total_bedrooms'] / df['households']
df['pop_per_household'] = df['population'] / df['households']

I have added more number of features which can help with the proper distribution and predicting better values in our model
<li>Average Rooms per House</li>
<li>Average Bedrooms per House</li>
<li>Number of people per household</li>

### Feature Engineering: Dummy Variable

I have created a dummy variable inorder to categorize the ocean proximity attribute in the dataset.

In [11]:
dum = pd.get_dummies(df.ocean_proximity)

In [12]:
merged_df = pd.concat([df, dum], axis = 'columns')

In [13]:
merged_df = merged_df.drop(['ocean_proximity', 'ISLAND'], axis= 'columns')
merged_df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,avgRooms,avgBedrooms,pop_per_household,<1H OCEAN,INLAND,NEAR BAY,NEAR OCEAN
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,6.984127,1.02381,2.555556,0,0,1,0
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,6.238137,0.97188,2.109842,0,0,1,0
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,8.288136,1.073446,2.80226,0,0,1,0
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,5.817352,1.073059,2.547945,0,0,1,0
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,6.281853,1.081081,2.181467,0,0,1,0


In [14]:
X = merged_df.drop('median_house_value', axis= 'columns')
y = merged_df['median_house_value']

## Training and Test Dataset Split

In [15]:
prices = merged_df['median_house_value']
features = merged_df.drop('median_house_value', axis = 1)

X_train, X_test, y_train, y_test = train_test_split(features, prices, test_size = 0.2)

#len(X_train)/len(features)

In [16]:
regr = LinearRegression()
regr.fit(X_train, y_train)

print('Intercept', regr.intercept_)
pd.DataFrame(data = regr.coef_, index=X_train.columns, columns = ['Coef'])

Intercept -2248891.016222005


Unnamed: 0,Coef
longitude,-28509.315578
latitude,-27555.910579
housing_median_age,1004.781233
total_rooms,-2.165877
total_bedrooms,38.834678
population,-47.513245
households,117.615799
median_income,38314.862941
avgRooms,-683.33246
avgBedrooms,10791.962057


In [17]:
regr.score(X_train, y_train)

0.6475739471000801

In [18]:
pickle.dump(regr, open('model.pkl','wb'))
pickle.load(open('model.pkl','rb'))

LinearRegression()

In [19]:
X_train.shape

(16512, 15)

In [20]:
X_train.columns

Index(['longitude', 'latitude', 'housing_median_age', 'total_rooms',
       'total_bedrooms', 'population', 'households', 'median_income',
       'avgRooms', 'avgBedrooms', 'pop_per_household', '<1H OCEAN', 'INLAND',
       'NEAR BAY', 'NEAR OCEAN'],
      dtype='object')

In [24]:
X_train

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,avgRooms,avgBedrooms,pop_per_household,<1H OCEAN,INLAND,NEAR BAY,NEAR OCEAN
4645,-118.31,34.06,47.0,3038.0,1533.0,4225.0,1472.0,1.6725,2.063859,1.041440,2.870245,1,0,0,0
7726,-118.12,33.92,27.0,6336.0,1628.0,4673.0,1505.0,2.5893,4.209967,1.081728,3.104983,1,0,0,0
9259,-120.07,36.96,42.0,963.0,216.0,471.0,211.0,2.2898,4.563981,1.023697,2.232227,0,1,0,0
3434,-118.43,34.25,35.0,1447.0,335.0,1630.0,306.0,2.9205,4.728758,1.094771,5.326797,1,0,0,0
560,-122.24,37.76,52.0,1846.0,471.0,827.0,446.0,2.6833,4.139013,1.056054,1.854260,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4083,-118.38,34.15,36.0,2933.0,619.0,1115.0,579.0,4.3036,5.065630,1.069085,1.925734,1,0,0,0
9674,-118.74,37.58,20.0,3301.0,779.0,1085.0,448.0,3.7315,7.368304,1.738839,2.421875,0,1,0,0
9950,-122.33,38.38,28.0,1020.0,169.0,504.0,164.0,4.5694,6.219512,1.030488,3.073171,0,1,0,0
12149,-117.20,33.70,23.0,6323.0,1196.0,1984.0,1124.0,2.3276,5.625445,1.064057,1.765125,1,0,0,0


In [25]:
X_train.info

<bound method DataFrame.info of        longitude  latitude  housing_median_age  total_rooms  total_bedrooms  \
4645     -118.31     34.06                47.0       3038.0          1533.0   
7726     -118.12     33.92                27.0       6336.0          1628.0   
9259     -120.07     36.96                42.0        963.0           216.0   
3434     -118.43     34.25                35.0       1447.0           335.0   
560      -122.24     37.76                52.0       1846.0           471.0   
...          ...       ...                 ...          ...             ...   
4083     -118.38     34.15                36.0       2933.0           619.0   
9674     -118.74     37.58                20.0       3301.0           779.0   
9950     -122.33     38.38                28.0       1020.0           169.0   
12149    -117.20     33.70                23.0       6323.0          1196.0   
2334     -119.69     36.83                32.0       1098.0           189.0   

       population  

In [21]:
regr.score(X_test, y_test)

0.6330221530324702

## We find that our model has an accuracy of around "64.72%"
### We can use our model to predict the value of any house by providing the given attributes with 65% accuracy