In [29]:
import pandas as pd
import numpy as np
import seaborn as sns

In [30]:
# Load the housing dataset from the CSV file in the working directory.
df = pd.read_csv(filepath_or_buffer='housing.csv')

In [31]:
# Preview the first five rows of the frame.
df.head(n=5)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [32]:
# Print column dtypes and non-null counts to spot columns with missing data.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20433 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  float64
 9   ocean_proximity     20640 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.6+ MB


In [33]:
# Count the missing values in every column.
df.isna().sum()

longitude               0
latitude                0
housing_median_age      0
total_rooms             0
total_bedrooms        207
population              0
households              0
median_income           0
median_house_value      0
ocean_proximity         0
dtype: int64

In [34]:
# Number of distinct values per column (missing values are not counted).
df.nunique(dropna=True)

longitude               844
latitude                862
housing_median_age       52
total_rooms            5926
total_bedrooms         1923
population             3888
households             1815
median_income         12928
median_house_value     3842
ocean_proximity           5
dtype: int64

In [35]:
# List the distinct categories of the ocean_proximity column.
df['ocean_proximity'].unique()

array(['NEAR BAY', '<1H OCEAN', 'INLAND', 'NEAR OCEAN', 'ISLAND'],
      dtype=object)

In [36]:
# Average median_house_value within each ocean_proximity category.
df[['ocean_proximity', 'median_house_value']].groupby('ocean_proximity').mean()

Unnamed: 0_level_0,median_house_value
ocean_proximity,Unnamed: 1_level_1
<1H OCEAN,240084.285464
INLAND,124805.392001
ISLAND,380440.0
NEAR BAY,259212.31179
NEAR OCEAN,249433.977427


In [37]:
# Mean of the total_bedrooms column; missing entries are skipped,
# so the result is the average of the observed values only.
mean_tot_beds = df['total_bedrooms'].mean(skipna=True)
mean_tot_beds

537.8705525375618

In [40]:
# Impute the missing total_bedrooms entries with the column mean
# computed in the previous step (mean_tot_beds).
bedrooms_filled = df['total_bedrooms'].fillna(value=mean_tot_beds)
df['total_bedrooms'] = bedrooms_filled

In [41]:
# Re-count missing values per column to confirm the imputation left none.
df.isna().sum()

longitude             0
latitude              0
housing_median_age    0
total_rooms           0
total_bedrooms        0
population            0
households            0
median_income         0
median_house_value    0
ocean_proximity       0
dtype: int64

In [44]:
# Show again the mean that was used as the fill value for total_bedrooms.
mean_tot_beds

537.8705525375618

In [45]:
# Show every row whose ocean_proximity is 'ISLAND'.
df.loc[df['ocean_proximity'] == 'ISLAND']

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
8314,-118.32,33.35,27.0,1675.0,521.0,744.0,331.0,2.1579,450000.0,ISLAND
8315,-118.33,33.34,52.0,2359.0,591.0,1100.0,431.0,2.8333,414700.0,ISLAND
8316,-118.32,33.33,52.0,2127.0,512.0,733.0,288.0,3.3906,300000.0,ISLAND
8317,-118.32,33.34,52.0,996.0,264.0,341.0,160.0,2.7361,450000.0,ISLAND
8318,-118.48,33.43,29.0,716.0,214.0,422.0,173.0,2.6042,287500.0,ISLAND


In [47]:
# Keep only the island rows and the three feature columns
# housing_median_age, total_rooms and total_bedrooms.
# A single .loc call picks rows and columns in one step instead of
# chaining two [] lookups (which builds an intermediate full-width frame).
island_mask = df['ocean_proximity'] == 'ISLAND'
feature_cols = ['housing_median_age', 'total_rooms', 'total_bedrooms']
X = df.loc[island_mask, feature_cols]

In [48]:
# Preview the selected island rows; at this point X is still a DataFrame
# (the conversion to a NumPy array happens in a later cell).
X

Unnamed: 0,housing_median_age,total_rooms,total_bedrooms
8314,27.0,1675.0,521.0
8315,52.0,2359.0,591.0
8316,52.0,2127.0,512.0
8317,52.0,996.0,264.0
8318,29.0,716.0,214.0


In [49]:
# Get the underlying NumPy array of the selection and keep calling it X.
X = np.array(X)

In [50]:
# Display X after the np.array conversion.
X

array([[  27., 1675.,  521.],
       [  52., 2359.,  591.],
       [  52., 2127.,  512.],
       [  52.,  996.,  264.],
       [  29.,  716.,  214.]])

In [51]:
# Gram matrix of the features: the transpose of X (X.T) multiplied by X.
XTX = np.dot(X.T, X)
XTX

array([[9.6820000e+03, 3.5105300e+05, 9.1357000e+04],
       [3.5105300e+05, 1.4399307e+07, 3.7720360e+06],
       [9.1357000e+04, 3.7720360e+06, 9.9835800e+05]])

In [52]:
# Compute the inverse of XTX; np.linalg.inv raises LinAlgError if XTX is singular.
XTX_inv = np.linalg.inv(XTX)
XTX_inv

array([[ 9.19403586e-04, -3.66412216e-05,  5.43072261e-05],
       [-3.66412216e-05,  8.23303633e-06, -2.77534485e-05],
       [ 5.43072261e-05, -2.77534485e-05,  1.00891325e-04]])

In [54]:
# Create an array y with values [950, 1300, 800, 1000, 1300].
# Built as a NumPy array (not a plain list) so it matches the description
# and has the same type as X in the matrix products that follow.
y = np.array([950, 1300, 800, 1000, 1300])
y

[950, 1300, 800, 1000, 1300]

In [55]:
# Weights w = (XTX_inv . X^T) . y: first multiply the inverse of XTX by
# the transpose of X, then multiply that product by y.
w = np.dot(np.dot(XTX_inv, X.T), y)
w

array([23.12330961, -1.48124183,  5.69922946])