In [221]:
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go

# Print arrays with 4 decimal places and in fixed-point (not scientific) notation.
np.set_printoptions(precision=4, suppress=True)

In [222]:
# Toy dataset: five ages and the matching heights (cm) used throughout section 1.
age = np.arange(5, 10)
height = np.array((100, 105, 108, 112, 115))

In [223]:
# Sample means of the feature (age) and of the target (height).
age_mean = age.mean()
age_mean

height_mean = height.mean()
height_mean

np.float64(7.0)

np.float64(108.0)

In [224]:
# Building blocks of the least-squares slope:
#   denominator = sum of squared deviations of age from its mean
#   numerator   = sum of products of the age and height deviations
denominator = np.square(age - age_mean).sum()
denominator

numerator = ((age - age_mean) * (height - height_mean)).sum()
numerator

np.float64(10.0)

np.float64(37.0)

In [225]:
# Slope of the fitted line.
m = numerator / denominator
m

# Intercept: makes the line pass through the point (age_mean, height_mean).
b = height_mean - m * age_mean
b

np.float64(3.7)

np.float64(82.1)

Final Equation:

$$Height = 3.7(Age) + 82.1$$

Prediction

For Age 10: $y = 3.7(10) + 82.1 = 119.1 \text{ cm}$

In [226]:
# Evaluate the fitted line at age 10.
x = 10
y = b + m * x
y

np.float64(119.1)

In [227]:
# Predict one height per observed age, one value at a time.
# Dedicated loop names are used so the notebook-level `x` and `y` set in the
# previous cell are not silently overwritten by the loop.
for age_i in age:
  height_i = m * age_i + b
  print(height_i)

100.6
104.3
108.0
111.69999999999999
115.4


In [228]:
# Vectorised prediction: apply the line equation to every age at once.
height_pred = b + m * age
height_pred

array([100.6, 104.3, 108. , 111.7, 115.4])

In [229]:
# Sum of squared residuals (SSR): total squared gap between truth and prediction.
ssr = ((height - height_pred) ** 2).sum()
ssr

np.float64(1.1000000000000085)

In [230]:
# Mean Squared Error: average of the squared residuals.
# Stored in `mse` (as the matrix-form sections do) instead of being discarded.
mse = np.mean((height - height_pred) ** 2)
mse

np.float64(0.2200000000000017)

In [231]:
# Root Mean Squared Error: square root of the mean squared residual.
# Stored in `rmse` (as the matrix-form sections do) instead of being discarded.
rmse = np.sqrt(np.mean((height - height_pred) ** 2))
rmse

np.float64(0.46904157598234475)

Formula:

$$R^2 = 1 - \frac{SS_{res}}{SS_{tot}}$$

Where:

$SS_{res}$ (Residual Sum of Squares): $\sum (y_{true} - \hat{y})^2$ — The error our model makes.

$SS_{tot}$ (Total Sum of Squares): $\sum (y_{true} - \bar{y})^2$ — The variation in the data itself.

In [232]:
# Residual sum of squares: squared error left after fitting the line.
ss_res = ((height - height_pred) ** 2).sum()
ss_res

# Total sum of squares: squared spread of the heights around their mean.
ss_tot = ((height - height_mean) ** 2).sum()
ss_tot

# R^2 = 1 - SS_res / SS_tot
r2 = 1 - ss_res / ss_tot
r2

np.float64(1.1000000000000085)

np.float64(138.0)

np.float64(0.9920289855072463)

# 2. Multiple linear regression (with just 1 feature)

`X` should be a matrix (two dimensions)

In [233]:
# Re-declares the same age / height arrays that section 1 used.
age = np.array([5, 6, 7, 8, 9])
height = np.array([100, 105, 108, 112, 115])

In [234]:
# expand_dims inserts a new axis at position 1: shape (5,) -> (5, 1).
np.expand_dims(age, 1)
np.expand_dims(age, 1).shape

array([[5],
       [6],
       [7],
       [8],
       [9]])

(5, 1)

In [235]:
# reshape(-1, 1): a single column; the row count (-1) is inferred from the data.
X = age.reshape(-1, 1)
X
X.shape

array([[5],
       [6],
       [7],
       [8],
       [9]])

(5, 1)

In [236]:
# One 1.0 per sample: the column that will carry the intercept term.
bias_col = np.ones(age.shape[0])
bias_col

array([1., 1., 1., 1., 1.])

In [237]:
# Design matrix: bias column followed by the age column, shape (5, 2).
# Built from `age` rather than from `X` itself, so re-running this cell cannot
# stack an extra column of ones onto an already-augmented X.
X = np.c_[bias_col, age.reshape(-1, 1)]
X
X.shape

array([[1., 5.],
       [1., 6.],
       [1., 7.],
       [1., 8.],
       [1., 9.]])

(5, 2)

In [238]:
# Target as a (5, 1) column vector so it can be matrix-multiplied with X.T.
y = height.reshape(-1, 1)
y
y.shape

array([[100],
       [105],
       [108],
       [112],
       [115]])

(5, 1)

The Transpose ($X^T$)

Shape: $(2 \times 5)$

In [239]:
# Transpose of the design matrix: rows become columns, (5, 2) -> (2, 5).
X_tr = np.transpose(X)
X_tr.shape

(2, 5)

The Gram Matrix ($X^T X$)

In [240]:
# Inner dimensions must agree for X_tr @ X: (2, 5) @ (5, 2) -> (2, 2).
X_tr.shape, X.shape

((2, 5), (5, 2))

Matrix Multiplication

For matrices $A \in \mathbb{R}^{m \times n}$ and $B \in \mathbb{R}^{n \times p}$, the elements of the product $C = AB$ are given by:$$C_{ij} = \sum_{k=1}^{n} A_{ik} B_{kj}$$

Explanation:

$A$ is an $m \times n$ matrix (rows $\times$ columns).

$B$ is an $n \times p$ matrix.

The resulting matrix $C$ is $m \times p$.

To find the value at row $i$, column $j$ of the result, you perform a dot product of the $i$-th row of $A$ and the $j$-th column of $B$.

In [241]:
# Hand-rolled matrix product C = X_tr @ X.
# The result has one row per row of X_tr and one column per column of X; the
# sizes are read from the operands instead of being hard-coded to 2 x 2.
C = np.zeros(shape=(X_tr.shape[0], X.shape[1]))

for i in range(C.shape[0]):
  for j in range(C.shape[1]):
    # Entry (i, j) is the dot product of row i of X_tr with column j of X.
    C[i, j] = np.dot(X_tr[i], X[:, j])

C

array([[  5.,  35.],
       [ 35., 255.]])

The Gram Matrix ($X^T X$)

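- This is the "Sum of Squares and Cross-Products" matrix.
- Its diagonal holds the raw (uncentred) sums of squares of each column, and its off-diagonal entries hold the sums of products between columns. These become variances and covariances only after the features are centred and divided by $n$.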

Gram matrix will look like this:

$$\begin{bmatrix} \text{Count}(n) & \sum x \\ \sum x & \sum x^2 \end{bmatrix}$$

In [242]:
# Gram matrix X^T X via the built-in matrix product.
gram_matrix = np.matmul(X_tr, X)
gram_matrix
gram_matrix.shape

array([[  5.,  35.],
       [ 35., 255.]])

(2, 2)

The Moment Vector ($X^T y$)

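This holds the raw sums of products between each column of $X$ and the target — a measure of how the features align with the target (not a normalised correlation).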

The moment vector will look like this:

$$\begin{bmatrix} \sum y \\ \sum (x \cdot y) \end{bmatrix}$$

In [243]:
# Moment vector X^T y: one entry per column of the design matrix.
moment_vector = np.matmul(X_tr, y)
moment_vector
moment_vector.shape

array([[ 540.],
       [3817.]])

(2, 1)

The Inverse ($(X^T X)^{-1}$)

In [244]:
# Invert the Gram matrix computed above rather than rebuilding X_tr @ X,
# matching how the 2-feature section inverts `gram_matrix`.
gram_matrix_inv = np.linalg.inv(gram_matrix)
gram_matrix_inv
gram_matrix_inv.shape

array([[ 5.1, -0.7],
       [-0.7,  0.1]])

(2, 2)

Finally: Solve for $\beta$

Multiply `The Inverse` by `The Moment Vector`.

$$\beta = (X^T X)^{-1} X^T y$$

In [245]:
# Shapes line up for the final product: (2, 2) @ (2, 1) -> (2, 1).
gram_matrix_inv.shape, moment_vector.shape

((2, 2), (2, 1))

In [246]:
# Normal equation: beta = (X^T X)^{-1} (X^T y).
beta = np.matmul(gram_matrix_inv, moment_vector)
beta
beta.shape

array([[82.1],
       [ 3.7]])

(2, 1)

Once you have calculated the $\beta$ vector, you have "trained" your model.

- $\beta_0$ (Intercept): $\mathbf{82.1}$

- $\beta_1$ (Slope): $\mathbf{3.7}$

Final Equation:

$$Height = 3.7(Age) + 82.1$$

Let's make the predictions (here on the training data itself, so $X_{test} = X$)

$$\hat{y} = X_{test} \cdot \beta$$

In [247]:
# (5, 2) @ (2, 1) -> one prediction per row of X.
X.shape, beta.shape

((5, 2), (2, 1))

In [248]:
# Predictions: each row of X dotted with the coefficient vector.
y_pred = np.matmul(X, beta)
y_pred
y_pred.shape

array([[100.6],
       [104.3],
       [108. ],
       [111.7],
       [115.4]])

(5, 1)

Evaluation metrics

In [249]:
# Show the true targets and the predictions; both are (5, 1) column vectors here.
y
y.shape

y_pred
y_pred.shape

array([[100],
       [105],
       [108],
       [112],
       [115]])

(5, 1)

array([[100.6],
       [104.3],
       [108. ],
       [111.7],
       [115.4]])

(5, 1)

In [250]:
# Collapse both (5, 1) column vectors to shape (5,).
y = y.flatten()
y_pred = y_pred.flatten()

y
y.shape

y_pred
y_pred.shape

array([100, 105, 108, 112, 115])

(5,)

array([100.6, 104.3, 108. , 111.7, 115.4])

(5,)

In [251]:
# Sum of squared residuals.
ssr = ((y - y_pred) ** 2).sum()
ssr

np.float64(1.0999999999999972)

In [252]:
# Mean squared error: average of the squared residuals.
mse = ((y - y_pred) ** 2).mean()
mse

np.float64(0.21999999999999945)

In [253]:
# Root mean squared error: square root of the MSE computed in the previous cell.
rmse = np.sqrt(mse)
rmse

np.float64(0.46904157598234236)

In [254]:
# R^2 = 1 - SS_res / SS_tot
ss_res = np.square(y - y_pred).sum()      # error left after the fit
ss_tot = np.square(y - np.mean(y)).sum()  # spread of y around its own mean

r2 = 1 - ss_res / ss_tot
r2

np.float64(0.9920289855072464)

# 3. Multiple linear regression (with 2 features)

We are predicting Height ($y$) based on Age ($x_1$) and Weight ($x_2$).

In [255]:
# One row per child; columns are age (x1), weight (x2), height (y).
data = np.array(
    [
        (5, 20, 100),
        (6, 30, 110),
        (8, 25, 115),
        (7, 40, 120),
        (4, 50, 105),
        (5, 70, 140),
    ]
)

data.shape

(6, 3)

### 3D Visualization

* **Concept**: Features map to the "ground", Target maps to "elevation".
* **Axes**:
    * **$x, y$ (Age, Weight)**: The floor plane.
    * **$z$ (Height)**: The vertical axis.
* **Camera Control**:
    * **`eye`**: Sets initial camera position $(x, y, z)$.
    * **Values**: Relative to the center $(0,0,0)$.

In [256]:
# Wrap the raw array in a DataFrame with one named column per variable.
df = pd.DataFrame(data=data, columns=["Age", "Weight", "Height"])
df

Unnamed: 0,Age,Weight,Height
0,5,20,100
1,6,30,110
2,8,25,115
3,7,40,120
4,4,50,105
5,5,70,140


In [257]:
# Interactive 3-D scatter: Age and Weight on the floor plane, Height as elevation.
# (`px` is imported in the notebook's top import cell.)
fig = px.scatter_3d(df, x='Age', y='Weight', z='Height')

# update_layout returns the figure, so this last expression also renders it.
fig.update_layout(
    width=800,
    height=800
)

Step 1 ($X$): Create the Design Matrix. Remember the Bias Trick (Column of 1s first).

In [258]:
# One 1.0 per row of `data`: the intercept column of the design matrix.
bias_col = np.ones(data.shape[0])
bias_col

array([1., 1., 1., 1., 1., 1.])

In [259]:
# Feature columns only: every column of `data` except the last one (the target).
X_ = data[:, :-1]
X_
X_.shape

array([[ 5, 20],
       [ 6, 30],
       [ 8, 25],
       [ 7, 40],
       [ 4, 50],
       [ 5, 70]])

(6, 2)

In [260]:
# Design matrix: the intercept column followed by the two feature columns.
X = np.column_stack((bias_col, X_))
X
X.shape

array([[ 1.,  5., 20.],
       [ 1.,  6., 30.],
       [ 1.,  8., 25.],
       [ 1.,  7., 40.],
       [ 1.,  4., 50.],
       [ 1.,  5., 70.]])

(6, 3)

Step 2 ($y$): Create the Target Vector.

In [261]:
# Indexing with the list [-1] (not the scalar -1) keeps the result 2-D: shape (6, 1).
y = data[:, [-1]]
y
y.shape

array([[100],
       [110],
       [115],
       [120],
       [105],
       [140]])

(6, 1)

The Gram Matrix ($X^T X$)

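Holds the raw (uncentred) sums of squares of each column on the diagonal and the sums of products between columns off the diagonal. These become variances and covariances only after centring and dividing by $n$.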

Gram matrix will look like this:

$$\begin{bmatrix} \text{Count}(n) & \sum x_1 & \sum x_2 \\ \sum x_1 & \sum x_1^2 & \sum x_1 x_2 \\ \sum x_2 & \sum x_1 x_2 & \sum x_2^2 \end{bmatrix}$$

In [262]:
# (3, 6) @ (6, 3) -> a (3, 3) Gram matrix.
X.T.shape, X.shape

((3, 6), (6, 3))

In [263]:
# Gram matrix X^T X for the 2-feature design matrix.
gram_matrix = np.matmul(X.T, X)
gram_matrix
gram_matrix.shape

array([[    6.,    35.,   235.],
       [   35.,   215.,  1310.],
       [  235.,  1310., 10925.]])

(3, 3)

The Moment Vector ($X^T y$)

Captures the "alignment" between each column of $X$ and the target as raw sums of products (not a normalised correlation).

Moment vector will look like:

$$\begin{bmatrix} \sum y \\ \sum (x_1 \cdot y) \\ \sum (x_2 \cdot y) \end{bmatrix}$$

In [264]:
# (3, 6) @ (6, 1) -> a (3, 1) moment vector.
X.T.shape, y.shape

((3, 6), (6, 1))

In [265]:
# Moment vector X^T y: one entry per column of the design matrix.
moment_vector = np.matmul(X.T, y)
moment_vector
moment_vector.shape

array([[  690.],
       [ 4040.],
       [28025.]])

(3, 1)

The Inverse ($(X^T X)^{-1}$)

In [266]:
# Explicit inverse of the (3, 3) Gram matrix computed above.
gram_matrix_inv = np.linalg.inv(gram_matrix)
gram_matrix_inv
gram_matrix_inv.shape

array([[ 7.0583, -0.8313, -0.0521],
       [-0.8313,  0.1152,  0.0041],
       [-0.0521,  0.0041,  0.0007]])

(3, 3)

Finally: Solve for $\beta$

Multiply `The Inverse` by `The Moment Vector`.

$$\beta = (X^T X)^{-1} X^T y$$

In [267]:
# (3, 3) @ (3, 1) -> a (3, 1) coefficient vector.
gram_matrix_inv.shape, moment_vector.shape

((3, 3), (3, 1))

In [268]:
# Normal equation: beta = (X^T X)^{-1} (X^T y).
beta = np.matmul(gram_matrix_inv, moment_vector)
beta
beta.shape

array([[50.3834],
       [ 5.7989],
       [ 0.7861]])

(3, 1)

Once you have calculated the $\beta$ vector, you have "trained" your model.

- Intercept ($\beta_0$): $\mathbf{50.38}$

- Age Slope ($\beta_1$): $\mathbf{5.80}$

- Weight Slope ($\beta_2$): $\mathbf{0.79}$

The Final Equation:

$$Height = 50.38 + 5.80(\text{Age}) + 0.79(\text{Weight})$$

Interpretation:

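- Base Height: The model's prediction at Age = 0 and Weight = 0 is 50.38 cm. This is an extrapolation far outside the data, so it is not physically meaningful.

- Age Factor: Holding weight fixed, each additional year adds about 5.80 cm to the predicted height.

- Weight Factor: Holding age fixed, each additional kg adds about 0.79 cm to the predicted height.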

Let's make the predictions (here on the training data itself, so $X_{test} = X$)

$$\hat{y} = X_{test} \cdot \beta$$

In [269]:
# (6, 3) @ (3, 1) -> one prediction per row of X.
X.shape, beta.shape

((6, 3), (3, 1))

In [270]:
# Predictions: each row of X dotted with the coefficient vector.
y_pred = np.matmul(X, beta)
y_pred
y_pred.shape

array([[ 95.1004],
       [108.7605],
       [116.4278],
       [122.4205],
       [112.8848],
       [134.406 ]])

(6, 1)

In [271]:
# Show the true targets and the predictions; both are (6, 1) column vectors here.
y
y.shape

y_pred
y_pred.shape

array([[100],
       [110],
       [115],
       [120],
       [105],
       [140]])

(6, 1)

array([[ 95.1004],
       [108.7605],
       [116.4278],
       [122.4205],
       [112.8848],
       [134.406 ]])

(6, 1)

In [272]:
# Collapse the (6, 1) target column to shape (6,).
y = y.flatten()
y
y.shape

# Same for the predictions.
y_pred = y_pred.flatten()
y_pred
y_pred.shape

array([100, 110, 115, 120, 105, 140])

(6,)

array([ 95.1004, 108.7605, 116.4278, 122.4205, 112.8848, 134.406 ])

(6,)

### Visualizing the Regression Plane

* **Concept**:
    * **1 Feature**: We fit a **Line** ($y = mx + b$).
    * **2 Features**: We fit a **Plane** ($z = b + w_1x + w_2y$).
* **Meshgrid (`np.meshgrid`)**:
    * Creates a grid of coordinates (like floor tiles) spanning the min/max of the data.
    * Necessary to plot a continuous surface rather than just a line.
* **The Prediction ($z$)**:
    * We calculate the predicted height ($z$) for every intersection on the grid.
    * Equation: $z = \beta_{intercept} + \beta_{age} \cdot x + \beta_{weight} \cdot y$.

In [273]:
# Ten evenly spaced values spanning the observed range of column 0 of `data` (age).
x_grid = np.linspace(np.min(data[:, 0]), np.max(data[:, 0]), num=10)
x_grid
x_grid.shape

# Ten evenly spaced values spanning the observed range of column 1 (weight).
y_grid = np.linspace(np.min(data[:, 1]), np.max(data[:, 1]), num=10)
y_grid
y_grid.shape

array([4.    , 4.4444, 4.8889, 5.3333, 5.7778, 6.2222, 6.6667, 7.1111,
       7.5556, 8.    ])

(10,)

array([20.    , 25.5556, 31.1111, 36.6667, 42.2222, 47.7778, 53.3333,
       58.8889, 64.4444, 70.    ])

(10,)

In [274]:
# Expand the two 1-D axes into 2-D coordinate grids
# ("xy" Cartesian indexing is meshgrid's default, spelled out here).
xx, yy = np.meshgrid(x_grid, y_grid, indexing="xy")
xx.shape
yy.shape

(10, 10)

(10, 10)

In [275]:
# Evaluate the fitted plane at every grid point:
#   z = intercept + w_age * x + w_weight * y
# beta is a (3, 1) column, so each coefficient is read out as a scalar.
zz = beta[0, 0] + beta[1, 0] * xx + beta[2, 0] * yy
zz.shape

(10, 10)

In [276]:
# 3-D figure overlaying the observed points and the fitted regression plane.
# (`go` is imported in the notebook's top import cell.)
fig = go.Figure()

# Scatter Points: one red marker per row of `data`.
# The result is bound to `_` so the returned figure is not echoed by this line.
_ = fig.add_trace(go.Scatter3d(
    x=data[:, 0],  # age
    y=data[:, 1],  # weight
    z=data[:, 2],  # height
    mode='markers',
    marker=dict(size=5, color='red'),
    name='Actual Data'
))

# Regression Plane: semi-transparent surface over the meshgrid, colour bar hidden.
_ = fig.add_trace(go.Surface(
    x=xx, y=yy, z=zz, opacity=0.5,
    colorscale='blues', showscale=False,
    name='Regression Plane'
))

# Layout: axis titles, initial camera position and figure size.
# This is the cell's last expression, so the returned figure is rendered.
fig.update_layout(
    title='3D Linear Regression (Plane)',
    scene=dict(
        xaxis_title='Age',
        yaxis_title='Weight',
        zaxis_title='Height',
        camera=dict(
            eye=dict(x=1.5, y=0.5, z=1)  # Position of the "eye" relative to center (0,0,0)
        )
    ),
    width=800, height=600,
    margin=dict(l=0, r=0, b=0, t=50)
)

Let's calculate the errors

In [277]:
# Sum of squared residuals for the 2-feature model.
ssr = ((y - y_pred) ** 2).sum()
ssr

np.float64(126.90323480200807)

In [278]:
# Mean squared error: average of the squared residuals.
mse = ((y - y_pred) ** 2).mean()
mse

np.float64(21.150539133668012)

In [279]:
# Root mean squared error: square root of the MSE computed in the previous cell.
rmse = np.sqrt(mse)
rmse

np.float64(4.598971529991028)

In [280]:
# R^2 = 1 - SS_res / SS_tot
ss_res = ((y - y_pred) ** 2).sum()      # error left after the fit
ss_tot = ((y - np.mean(y)) ** 2).sum()  # spread of y around its own mean

r2 = 1 - ss_res / ss_tot
r2

np.float64(0.873096765197992)