In [0]:
import os
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
import pprint

In [0]:
!wget https://raw.githubusercontent.com/alexeygrigorev/datasets/master/housing.csv

--2022-09-19 13:43:34--  https://raw.githubusercontent.com/alexeygrigorev/datasets/master/housing.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.111.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1423529 (1.4M) [text/plain]
Saving to: ‘housing.csv.1’


2022-09-19 13:43:34 (21.6 MB/s) - ‘housing.csv.1’ saved [1423529/1423529]



In [0]:
# Reading Data with Pandas 
house_df = pd.read_csv('housing.csv')

In [0]:
house_df.dtypes

Out[27]: longitude             float64
latitude              float64
housing_median_age    float64
total_rooms           float64
total_bedrooms        float64
population            float64
households            float64
median_income         float64
median_house_value    float64
ocean_proximity        object
dtype: object

In [0]:
print(house_df.shape, len(house_df))

(20640, 10) 20640


In [0]:
features = ['latitude',
'longitude',
'housing_median_age',
'total_rooms',
'total_bedrooms',
'population',
'households',
'median_income',
'median_house_value']

house_df = house_df[features]

In [0]:
house_df.head

Out[30]: <bound method NDFrame.head of        latitude  longitude  housing_median_age  total_rooms  total_bedrooms  \
0         37.88    -122.23                41.0        880.0           129.0   
1         37.86    -122.22                21.0       7099.0          1106.0   
2         37.85    -122.24                52.0       1467.0           190.0   
3         37.85    -122.25                52.0       1274.0           235.0   
4         37.85    -122.25                52.0       1627.0           280.0   
...         ...        ...                 ...          ...             ...   
20635     39.48    -121.09                25.0       1665.0           374.0   
20636     39.49    -121.21                18.0        697.0           150.0   
20637     39.43    -121.22                17.0       2254.0           485.0   
20638     39.43    -121.32                18.0       1860.0           409.0   
20639     39.37    -121.24                16.0       2785.0           616.0   

       popul

In [0]:
house_df.isnull().sum()

Out[31]: latitude                0
longitude               0
housing_median_age      0
total_rooms             0
total_bedrooms        207
population              0
households              0
median_income           0
median_house_value      0
dtype: int64

In [0]:
house_df.population.describe()

Out[32]: count    20640.000000
mean      1425.476744
std       1132.462122
min          3.000000
25%        787.000000
50%       1166.000000
75%       1725.000000
max      35682.000000
Name: population, dtype: float64

In [0]:
house_df.population.median()

Out[33]: 1166.0

In [0]:
n = len(house_df)
idx = np.arange(n)
np.random.seed(42)
np.random.shuffle(idx)

n_val = int(n * 0.2)
n_test = int(n * 0.2)
n_train = n - n_val - n_test

In [0]:
house_df_train = house_df.iloc[idx[:n_train]]
house_df_val = house_df.iloc[idx[n_train:n_train+n_val]]
house_df_test = house_df.iloc[idx[n_train+n_val:]]

house_df_train = house_df_train.reset_index(drop=True)
house_df_val = house_df_val.reset_index(drop=True)
house_df_test = house_df_test.reset_index(drop=True)

y_train = np.log1p(house_df_train.median_house_value.values)
y_val = np.log1p(house_df_val.median_house_value.values)
y_test = np.log1p(house_df_test.median_house_value.values)

del house_df_train['median_house_value']
del house_df_val['median_house_value']
del house_df_test['median_house_value']

In [0]:
total_bedrooms_train_mean_value = house_df_train.total_bedrooms.mean()
total_bedrooms_train_mean_value

Out[36]: 533.4803317730147

In [0]:
def prepare_data(house_df, replace_value=None):
    house_df_temp = house_df.copy()
    if replace_value is None:
        X = house_df_temp.values
    else:
        house_df_temp['total_bedrooms'] = house_df_temp['total_bedrooms'].fillna(replace_value)
        X = house_df_temp.values
    return X
    
prepare_data(house_df_train, total_bedrooms_train_mean_value)

Out[37]: array([[ 3.6060e+01, -1.1901e+02,  2.5000e+01, ...,  1.3920e+03,
         3.5900e+02,  1.6812e+00],
       [ 3.5140e+01, -1.1946e+02,  3.0000e+01, ...,  1.5650e+03,
         5.8400e+02,  2.5313e+00],
       [ 3.7800e+01, -1.2244e+02,  5.2000e+01, ...,  1.3100e+03,
         9.6300e+02,  3.4801e+00],
       ...,
       [ 3.8980e+01, -1.2091e+02,  1.3000e+01, ...,  3.2640e+03,
         1.1980e+03,  3.6530e+00],
       [ 3.4090e+01, -1.1772e+02,  3.6000e+01, ...,  7.8500e+02,
         2.9900e+02,  3.2566e+00],
       [ 3.7760e+01, -1.2247e+02,  3.4000e+01, ...,  1.1520e+03,
         4.4500e+02,  5.1893e+00]])

In [0]:
def train_linear_regression(X, y):
    ones = np.ones(X.shape[0])
    X = np.column_stack([ones, X])

    XTX = X.T.dot(X)
    XTX_inv = np.linalg.inv(XTX)
    w_full = XTX_inv.dot(X.T).dot(y)
    
    return w_full[0], w_full[1:]

In [0]:
def rmse(y, y_pred):
    se = (y - y_pred) ** 2
    mse = se.mean()
    return np.sqrt(mse)

In [0]:
X_train = prepare_data(house_df_train, 0)
w0, w = train_linear_regression(X_train, y_train)

X_val = prepare_data(house_df_val, 0)
y_pred = w0 + X_val.dot(w)
rmse_zero_fill = round(rmse(y_val, y_pred), 2)
rmse_zero_fill

Out[40]: 0.33

In [0]:
X_train = prepare_data(house_df_train, total_bedrooms_train_mean_value)
w0, w = train_linear_regression(X_train, y_train)

X_val = prepare_data(house_df_val, total_bedrooms_train_mean_value)
y_pred = w0 + X_val.dot(w)
rmse_mean_fill = round(rmse(y_val, y_pred), 2)
rmse_mean_fill

Out[41]: 0.33

In [0]:
def train_linear_regression_reg(X, y, r=0.001):
    ones = np.ones(X.shape[0])
    X = np.column_stack([ones, X])

    XTX = X.T.dot(X)
    XTX = XTX + r * np.eye(XTX.shape[0])

    XTX_inv = np.linalg.inv(XTX)
    w_full = XTX_inv.dot(X.T).dot(y)
    
    return w_full[0], w_full[1:]

In [0]:
results = dict()
X_train = prepare_data(house_df_train, 0)
X_val = prepare_data(house_df_val, 0)
best_rmse = None
best_rmse_arg = None

for r in [0, 0.000001, 0.0001, 0.001, 0.01, 0.1, 1, 5, 10]:
    w0, w = train_linear_regression_reg(X_train, y_train, r=r)
    y_pred = w0 + X_val.dot(w)
    raw_rmse = rmse(y_val, y_pred)
    if best_rmse is None:
        best_rmse = raw_rmse
        best_rmse_arg = r
    elif raw_rmse < best_rmse:
        best_rmse = raw_rmse
        best_rmse_arg = r
    results[r] = round(raw_rmse, 2)

pprint.pprint(results)
print(f" Best RMSE is {best_rmse} for r value: {best_rmse_arg}")

{0: 0.33,
 1e-06: 0.33,
 0.0001: 0.33,
 0.001: 0.33,
 0.01: 0.33,
 0.1: 0.33,
 1: 0.33,
 5: 0.34,
 10: 0.34}
 Best RMSE is 0.3295319365992662 for r value: 0.01


In [0]:
scores = list()
n = len(house_df)
n_val = int(n * 0.2)
n_test = int(n * 0.2)
n_train = n - n_val - n_test

for seed in [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]:
    idx = np.arange(n)
    np.random.seed(seed)
    np.random.shuffle(idx)
    
    house_df_train = house_df.iloc[idx[:n_train]]
    house_df_val = house_df.iloc[idx[n_train:n_train+n_val]]
    house_df_test = house_df.iloc[idx[n_train+n_val:]]

    house_df_train = house_df_train.reset_index(drop=True)
    house_df_val = house_df_val.reset_index(drop=True)
    house_df_test = house_df_test.reset_index(drop=True)

    y_train = np.log1p(house_df_train.median_house_value.values)
    y_val = np.log1p(house_df_val.median_house_value.values)
    y_test = np.log1p(house_df_test.median_house_value.values)

    del house_df_train['median_house_value']
    del house_df_val['median_house_value']
    del house_df_test['median_house_value']
    
    X_train = prepare_data(house_df_train, 0)
    w0, w = train_linear_regression(X_train, y_train)

    X_val = prepare_data(house_df_val, 0)
    y_pred = w0 + X_val.dot(w)
    scores.append(rmse(y_val, y_pred))
    
print(f" The standard deviation of all the scores is: {round(np.std(scores), 3)}")

 The standard deviation of all the scores is: 0.004


In [0]:
n = len(house_df)
n_val = int(n * 0.2)
n_test = int(n * 0.2)
n_train = n - n_val - n_test

idx = np.arange(n)
np.random.seed(9)
np.random.shuffle(idx)

house_df_train = house_df.iloc[idx[:n_train+n_val]]
house_df_test = house_df.iloc[idx[n_train+n_val:]]

house_df_train = house_df_train.reset_index(drop=True)
house_df_test = house_df_test.reset_index(drop=True)

y_train = np.log1p(house_df_train.median_house_value.values)
y_test = np.log1p(house_df_test.median_house_value.values)

del house_df_train['median_house_value']
del house_df_test['median_house_value']

X_train = prepare_data(house_df_train, 0)
w0, w = train_linear_regression_reg(X_train, y_train, r=0.001)

X_test = prepare_data(house_df_test, 0)
y_pred = w0 + X_test.dot(w)


print(f" The RMSE score in test dataset is: {round(rmse(y_test, y_pred), 2)}")

 The RMSE score in test dataset is: 0.35
