# California Housing Dataset

## Import Libraries

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt


In [2]:
# Record the pandas version this notebook was run with (useful when reproducing outputs).
pd.__version__

'1.5.3'

## Get Data

In [3]:
!wget https://raw.githubusercontent.com/alexeygrigorev/datasets/master/housing.csv

--2023-09-16 16:28:54--  https://raw.githubusercontent.com/alexeygrigorev/datasets/master/housing.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1423529 (1.4M) [text/plain]
Saving to: ‘housing.csv’


2023-09-16 16:28:54 (20.5 MB/s) - ‘housing.csv’ saved [1423529/1423529]



In [4]:
# List the working directory to check that housing.csv is present.
!ls

housing.csv  sample_data


## Data Exploration

In [5]:
# Load the downloaded CSV into a DataFrame and preview the first five rows.
df = pd.read_csv("housing.csv")
df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [6]:
# Preview the last five rows.
df.tail()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
20635,-121.09,39.48,25.0,1665.0,374.0,845.0,330.0,1.5603,78100.0,INLAND
20636,-121.21,39.49,18.0,697.0,150.0,356.0,114.0,2.5568,77100.0,INLAND
20637,-121.22,39.43,17.0,2254.0,485.0,1007.0,433.0,1.7,92300.0,INLAND
20638,-121.32,39.43,18.0,1860.0,409.0,741.0,349.0,1.8672,84700.0,INLAND
20639,-121.24,39.37,16.0,2785.0,616.0,1387.0,530.0,2.3886,89400.0,INLAND


In [7]:
# (number of rows, number of columns) of the loaded frame.
df.shape

(20640, 10)

The dataset has 20640 rows and 10 columns.

In [8]:
# Column dtypes and non-null counts; a count below the total row count signals missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20433 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  float64
 9   ocean_proximity     20640 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.6+ MB


In [9]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns.
df.describe()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
count,20640.0,20640.0,20640.0,20640.0,20433.0,20640.0,20640.0,20640.0,20640.0
mean,-119.569704,35.631861,28.639486,2635.763081,537.870553,1425.476744,499.53968,3.870671,206855.816909
std,2.003532,2.135952,12.585558,2181.615252,421.38507,1132.462122,382.329753,1.899822,115395.615874
min,-124.35,32.54,1.0,2.0,1.0,3.0,1.0,0.4999,14999.0
25%,-121.8,33.93,18.0,1447.75,296.0,787.0,280.0,2.5634,119600.0
50%,-118.49,34.26,29.0,2127.0,435.0,1166.0,409.0,3.5348,179700.0
75%,-118.01,37.71,37.0,3148.0,647.0,1725.0,605.0,4.74325,264725.0
max,-114.31,41.95,52.0,39320.0,6445.0,35682.0,6082.0,15.0001,500001.0


### Column with missing values

In [10]:
# Index of the column labels that contain at least one missing value
# (the trailing dtype='object' in the repr is the dtype of the labels themselves).
df.columns[df.isna().any(axis=0)]

Index(['total_bedrooms'], dtype='object')

In [11]:
# Names of the columns that contain at least one missing value, as a plain Python list.
df.columns[df.isna().any(axis=0)].to_list()

['total_bedrooms']

In [12]:
# NOTE(review): this repeats an earlier missing-value check verbatim; candidate for removal.
df.columns[df.isnull().any()]

Index(['total_bedrooms'], dtype='object')

### Unique Values in a Column

Series.nunique() returns the number of unique elements in the column.


In [13]:
# Count the distinct categories in ocean_proximity.
df['ocean_proximity'].nunique()

5

In [14]:
# Series.unique() returns a NumPy array (not a list) holding each distinct value once.
df['ocean_proximity'].unique()

array(['NEAR BAY', '<1H OCEAN', 'INLAND', 'NEAR OCEAN', 'ISLAND'],
      dtype=object)

In [15]:
# Top-level function form; gives the same array as Series.unique() on this column.
pd.unique(df['ocean_proximity'])

array(['NEAR BAY', '<1H OCEAN', 'INLAND', 'NEAR OCEAN', 'ISLAND'],
      dtype=object)

### Average `median_house_value` for houses located near the bay

In [16]:
# Mean house value over the rows whose ocean_proximity is 'NEAR BAY',
# selecting rows and column in a single .loc call.
bay_area_mean = df.loc[df['ocean_proximity'].eq('NEAR BAY'), 'median_house_value'].mean()
bay_area_mean


259212.31179039303

### Calculate the average of total_bedrooms column in the dataset.

In [17]:
# Mean of total_bedrooms. Series.mean() skips NaN by default, so only non-null rows contribute.
average_total_bedrooms = df['total_bedrooms'].mean()
average_total_bedrooms

537.8705525375618

### Fill null values with the mean
The `fillna()` method replaces NaN values in the DataFrame with the value you provide (here, the column mean).

In [18]:
# Impute only the column the mean was computed from. A frame-wide df.fillna(mean) would
# also write the bedroom mean into any other column that happened to contain NaN, and
# avoiding inplace=True keeps the assignment explicit and safe to re-run.
df['total_bedrooms'] = df['total_bedrooms'].fillna(average_total_bedrooms)

# Confirm that no column contains missing values any more (expected: an empty Index).
df.columns[df.isnull().any()]

Index([], dtype='object')

In [19]:
# Recompute the mean after imputation. Filling gaps with the mean should leave it
# unchanged apart from floating-point rounding in the last digit.
adjusted_average_total_bedrooms = df['total_bedrooms'].mean()
adjusted_average_total_bedrooms

537.8705525375617

### Island Houses

In [20]:
# Number of rows per ocean_proximity category, most frequent first.
value_count = df['ocean_proximity'].value_counts()
value_count

<1H OCEAN     9136
INLAND        6551
NEAR OCEAN    2658
NEAR BAY      2290
ISLAND           5
Name: ocean_proximity, dtype: int64

In [21]:
# Keep only the rows whose ocean_proximity is 'ISLAND' (boolean-mask selection).
island_df = df[df['ocean_proximity'].eq('ISLAND')]
island_df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
8314,-118.32,33.35,27.0,1675.0,521.0,744.0,331.0,2.1579,450000.0,ISLAND
8315,-118.33,33.34,52.0,2359.0,591.0,1100.0,431.0,2.8333,414700.0,ISLAND
8316,-118.32,33.33,52.0,2127.0,512.0,733.0,288.0,3.3906,300000.0,ISLAND
8317,-118.32,33.34,52.0,996.0,264.0,341.0,160.0,2.7361,450000.0,ISLAND
8318,-118.48,33.43,29.0,716.0,214.0,422.0,173.0,2.6042,287500.0,ISLAND


In [22]:
# Select the three feature columns by label. Unlike pd.DataFrame(island_df, columns=...),
# label selection raises KeyError on a misspelled name instead of silently creating an
# all-NaN column; .copy() makes the result independent of island_df.
columns = ['housing_median_age', 'total_rooms', 'total_bedrooms']
new_island_df = island_df[columns].copy()
new_island_df.head()

Unnamed: 0,housing_median_age,total_rooms,total_bedrooms
8314,27.0,1675.0,521.0
8315,52.0,2359.0,591.0
8316,52.0,2127.0,512.0
8317,52.0,996.0,264.0
8318,29.0,716.0,214.0


In [23]:
# Feature matrix for the ISLAND rows as a plain NumPy array.
X = new_island_df.to_numpy()

# Gram matrix X^T X and its inverse: the first half of the normal equation.
XTX = np.dot(X.T, X)
XTX_inv = np.linalg.inv(XTX)

In [24]:
# Target vector: one value for each of the five ISLAND rows in X.
y = np.array([950, 1300, 800, 1000, 1300])

# Normal equation: w = (X^T X)^-1 X^T y
XTXiXT = XTX_inv.dot(X.T)
w = XTXiXT.dot(y)
print(f"The w vector is: {w}")
# Fixed: the message previously began with a stray '[' that was never closed.
print(f"The last element of w is: {w[-1]}")

The w vector is: [23.12330961 -1.48124183  5.69922946]
[The last element of w is: 5.699229455065586
