In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

## Data

### Training Data

In [None]:
# Load the training split into a DataFrame.
# NOTE(review): absolute path looks like Google Colab's bundled sample_data
# directory — adjust (or make configurable) when running elsewhere.
train = pd.read_csv("/content/sample_data/california_housing_train.csv")

In [None]:
# Show the first two rows to sanity-check the loaded columns.
train.head(2)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
0,-114.31,34.19,15.0,5612.0,1283.0,1015.0,472.0,1.4936,66900.0
1,-114.47,34.4,19.0,7650.0,1901.0,1129.0,463.0,1.82,80100.0


In [None]:
# (rows, columns) of the training frame.
train.shape

(17000, 9)

In [None]:
# Build the feature matrix by dropping the target column by name, so the
# split does not silently depend on the target being the last CSV column.
x_train = train.drop(columns=["median_house_value"]).to_numpy()
x_train

array([[-114.31  ,   34.19  ,   15.    , ..., 1015.    ,  472.    ,
           1.4936],
       [-114.47  ,   34.4   ,   19.    , ..., 1129.    ,  463.    ,
           1.82  ],
       [-114.56  ,   33.69  ,   17.    , ...,  333.    ,  117.    ,
           1.6509],
       ...,
       [-124.3   ,   41.84  ,   17.    , ..., 1244.    ,  456.    ,
           3.0313],
       [-124.3   ,   41.8   ,   19.    , ..., 1298.    ,  478.    ,
           1.9797],
       [-124.35  ,   40.54  ,   52.    , ...,  806.    ,  270.    ,
           3.0147]])

In [None]:
# (samples, features) of the training feature matrix.
x_train.shape

(17000, 8)

In [None]:
# Target vector: the median house value column as a standalone NumPy array.
target_train = train["median_house_value"]
y_train = target_train.to_numpy(copy=True)
y_train

array([ 66900.,  80100.,  85700., ..., 103600.,  85800.,  94600.])

In [None]:
# Shape of the training target vector.
y_train.shape

(17000,)

### Test Data

In [None]:
# Load the held-out test split into a DataFrame.
# NOTE(review): absolute path looks like Google Colab's bundled sample_data
# directory — adjust (or make configurable) when running elsewhere.
test = pd.read_csv("/content/sample_data/california_housing_test.csv")

In [None]:
# Show the first two rows to sanity-check the loaded columns.
test.head(2)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
0,-122.05,37.37,27.0,3885.0,661.0,1537.0,606.0,6.6085,344700.0
1,-118.3,34.26,43.0,1510.0,310.0,809.0,277.0,3.599,176500.0


In [None]:
# Build the feature matrix by dropping the target column by name, so the
# split does not silently depend on the target being the last CSV column.
x_test = test.drop(columns=["median_house_value"]).to_numpy()
x_test

array([[-122.05  ,   37.37  ,   27.    , ..., 1537.    ,  606.    ,
           6.6085],
       [-118.3   ,   34.26  ,   43.    , ...,  809.    ,  277.    ,
           3.599 ],
       [-117.81  ,   33.78  ,   27.    , ..., 1484.    ,  495.    ,
           5.7934],
       ...,
       [-119.7   ,   36.3   ,   10.    , ...,  693.    ,  220.    ,
           2.2895],
       [-117.12  ,   34.1   ,   40.    , ...,   46.    ,   14.    ,
           3.2708],
       [-119.63  ,   34.42  ,   42.    , ...,  753.    ,  260.    ,
           8.5608]])

In [None]:
# (samples, features) of the test feature matrix.
x_test.shape

(3000, 8)

In [None]:
# Target vector: the median house value column as a standalone NumPy array.
target_test = test["median_house_value"]
y_test = target_test.to_numpy(copy=True)
y_test

array([344700., 176500., 270500., ...,  62000., 162500., 500001.])

# Using Normal Equation

### Adding Bias Term

In [None]:
def add_bias(x):
  """Prepend a column of ones (the bias/intercept feature) to a feature matrix.

  Parameters
  ----------
  x : array-like of shape (m, n) or (m,)
      Feature matrix with one sample per row. A 1-D input is treated as
      m samples of a single feature.

  Returns
  -------
  numpy.ndarray of shape (m, n + 1)
      A new array whose column 0 is all ones, followed by the columns of x.
      The input is not modified and the result keeps x's dtype.
  """
  x = np.asarray(x)
  if x.ndim == 1:
    # A flat vector has no column axis to insert into; treat it as one column.
    x = x.reshape(-1, 1)
  # Inserting the scalar 1 at column index 0 broadcasts it down every row.
  return np.insert(x, 0, 1, axis=1)

In [None]:
# Preview of the bias-augmented matrix; the result is only displayed here,
# not assigned to a variable.
add_bias(x_train)

array([[ 1.0000e+00, -1.1431e+02,  3.4190e+01, ...,  1.0150e+03,
         4.7200e+02,  1.4936e+00],
       [ 1.0000e+00, -1.1447e+02,  3.4400e+01, ...,  1.1290e+03,
         4.6300e+02,  1.8200e+00],
       [ 1.0000e+00, -1.1456e+02,  3.3690e+01, ...,  3.3300e+02,
         1.1700e+02,  1.6509e+00],
       ...,
       [ 1.0000e+00, -1.2430e+02,  4.1840e+01, ...,  1.2440e+03,
         4.5600e+02,  3.0313e+00],
       [ 1.0000e+00, -1.2430e+02,  4.1800e+01, ...,  1.2980e+03,
         4.7800e+02,  1.9797e+00],
       [ 1.0000e+00, -1.2435e+02,  4.0540e+01, ...,  8.0600e+02,
         2.7000e+02,  3.0147e+00]])

### Normal Equation

In [None]:
def ne_theta_calc(x, y):
  """Return the least-squares parameters theta for the linear model x @ theta ≈ y.

  The result satisfies the normal equation (x^T x) theta = x^T y. It is
  obtained with an SVD-based least-squares solver instead of explicitly
  forming and pseudo-inverting x^T x, because forming x^T x squares the
  condition number of the problem and loses precision on poorly scaled
  features.

  Parameters
  ----------
  x : array-like of shape (m, n)
      Design matrix, one sample per row.
  y : array-like of shape (m,) or (m, k)
      Target values.

  Returns
  -------
  numpy.ndarray of shape (n,) or (n, k)
      Least-squares solution; for a rank-deficient x this is the
      minimum-norm solution.
  """
  # lstsq returns (solution, residuals, rank, singular_values); only the
  # solution is needed. rcond=None selects the machine-precision cutoff.
  theta = np.linalg.lstsq(x, y, rcond=None)[0]
  return theta

In [None]:
theta_ne = ne_theta_calc(x_train, y_train)
theta_ne

array([-2.24235468e+03, -8.47422590e+03,  1.78614847e+03, -1.53597034e+01,
        7.99622914e+01, -3.99046218e+01,  1.33170287e+02,  4.58387489e+04])

In [None]:
y_train_pred = np.matmul(x_train, theta_ne)
y_train_pred

array([100592.91081649, 133644.42278633,  82574.3325727 , ...,
       105904.95030925,  64143.09131916, 166188.36243097])

### Cost Function

In [None]:
def cost_function(y, y_pred):
  """Half mean squared error: J = sum((y - y_pred)^2) / (2 * m).

  Parameters
  ----------
  y : array-like of length m
      True target values.
  y_pred : array-like matching y, or a scalar
      Predicted values.

  Returns
  -------
  float
      The scalar cost J, where m = len(y).

  Raises
  ------
  ValueError
      If y is empty, or if y and y_pred broadcast to a shape different
      from y's.
  """
  m = len(y)
  if m == 0:
    raise ValueError("cost_function requires at least one sample")
  residuals = np.subtract(y, y_pred)
  # Guard against silent broadcasting, e.g. (m,) minus (m, 1) yields (m, m).
  if residuals.shape != np.shape(y):
    raise ValueError("y and y_pred have incompatible shapes")
  return np.sum(np.square(residuals)) / (2 * m)

In [None]:
# Cost of the normal-equation fit on the training set, as computed by
# cost_function from the true and predicted targets.
j_train = cost_function(y_train, y_train_pred)
j_train

2800779594.5539427