In [25]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

In [None]:
""" 
The equation is:
 y = β₀ + β₁x₁ + β₂x₂ + ... + βₙxₙ + ε

EXAMPLE: Predicting house prices
    Price = β₀ + β₁(Size) + β₂(Bedrooms) + β₃(Age) + ε

MATRIX FORM

We can write multiple regression in matrix form:
    Y = Xβ + ε

Where:
- Y is an (n×1) vector of outcomes
- X is an (n×p) matrix of predictors (includes column of 1s for intercept)
- β is a (p×1) vector of coefficients
- ε is an (n×1) vector of errors

NORMAL EQUATION (Least Squares Solution)

To find the best β that minimizes the sum of squared errors, we use:

    β = (XᵀX)⁻¹XᵀY

DERIVATION:
1. We want to minimize: RSS = ||Y - Xβ||²
2. RSS = (Y - Xβ)ᵀ(Y - Xβ)
3. Expand: RSS = YᵀY - 2βᵀXᵀY + βᵀXᵀXβ
4. Take derivative with respect to β: ∂RSS/∂β = -2XᵀY + 2XᵀXβ
5. Set to zero: -2XᵀY + 2XᵀXβ = 0
6. Solve for β: XᵀXβ = XᵀY
7. Final solution: β = (XᵀX)⁻¹XᵀY

"""

In [3]:
# Synthetic housing data
#
# Ground-truth relationship the regression is meant to recover:
#   price = 50000 + 150 * size + 20000 * bedrooms - 2000 * age + noise
true_intercept = 50000
true_coef_size = 150
true_coef_bedrooms = 20000
true_coef_age = -2000

# Seed the global NumPy RNG so every run draws the same samples.
np.random.seed(42)
n_sample = 200

# NOTE: the order of the draws below (size, bedrooms, age, noise) determines
# which random numbers each variable receives; reordering changes the dataset.

# Feature 1: house size in sqft, uniform on [1000, 3000) -- every value in
# the range is equally likely.
size = np.random.uniform(low=1000, high=3000, size=n_sample)

# Feature 2: number of bedrooms, integers 2..5 (randint's `high` is exclusive).
bedrooms = np.random.randint(low=2, high=6, size=n_sample)

# Feature 3: house age in years, uniform on [0, 30).
age = np.random.uniform(low=0, high=30, size=n_sample)

# Gaussian noise centred on 0 with standard deviation 30 (normal draws
# cluster around the mean).
noise = np.random.normal(loc=0, scale=30, size=n_sample)

# Target variable: deterministic part first, then the noise on top.
price_without_noise = (
    true_intercept
    + true_coef_size * size
    + true_coef_bedrooms * bedrooms
    + true_coef_age * age
)
price = price_without_noise + noise

In [29]:
# Report the sample count and the ground-truth parameters used to build the data.
print(f"Generated {n_sample} samples\n")
print("True coefficients")

# (label, value) pairs, printed in the same order as the model equation.
true_params = [
    ("Intercept", true_intercept),
    ("Size coefficient", true_coef_size),
    ("Bedrooms coefficient", true_coef_bedrooms),
    ("Age coefficient", true_coef_age),
]
for label, value in true_params:
    print(f"  {label}: ${value:,.2f}")
print()  # blank line separating the summary from the table below

# Collect the features and the target into one frame, one column per variable.
columns = {
    "Size": size,
    "Bedrooms": bedrooms,
    "Age": age,
    "Price": price,
}
df = pd.DataFrame(columns)

# Last expression of the cell: shows the first five rows via the rich repr.
df.head()

Generated 200 samples

True coefficients
  Intercept: $50,000.00
  Size coefficient: $150.00
  Bedrooms coefficient: $20,000.00
  Age coefficient: $-2,000.00



Unnamed: 0,Size,Bedrooms,Age,Price
0,1749.080238,5,1.550452,409271.385063
1,2901.428613,5,15.940639,553389.299154
2,2463.987884,4,16.219054,467188.587962
3,2197.316968,2,19.122897,381334.44406
4,1312.037281,5,21.78274,303213.159669


In [None]:
""" 
Fit the model using the Normal Equation.

Parameters:
X : array-like, shape (n_samples, n_features)
    Training data
y : array-like, shape (n_samples,)
    Target values

normalize : bool 
    Whether to normalize features (recommended)
"""

In [None]:
""" Normalization:
- Normalization (or standardization) means scaling features to a similar range,
typically with one of the following:

Standardization (z-score):
X_normalized = (X - mean) / std  --> mean = 0, sd = 1

Min-Max Scaling:
X_normalized = (X - min) / (max - min) --> values between 0 and 1 


Why do we need it ?
1. Gradient Descent converges faster 
  - When features have different scales, the loss surface becomes elongated/elliptical
   eg: Small change in β₁ (size) → huge change in loss (Feature 1: House size (1000 - 3000 sqft))
       Large change in β₂ (bedrooms) → tiny change in loss (Feature 2: Number of bedrooms (2 - 5))
       Gradient descent zigzags and takes far longer to converge

       After standardization, the loss contours become roughly circular.

       Benefits of standardization:
       - Steps are more direct toward minimum 
       - Converges much faster 
       - More stable training 
-----------------------------------------------------------------------------------
2. Prevent feature Dominance
eg: without normalization 

#Features 
size = 2000  #large numbers
bedrooms = 3  #small numbers

#Initial random weights
 β₁  = 0.001 #for size
 β₂  = 0.001 #for bedrooms

 # Contribution to prediction 
 contribution_size = 0.001 * 2000 = 2.0
 contribution_bedrooms = 0.001 * 3 = 0.003

 Problem: size dominates the prediction! The model can't properly learn the importance 
 of bedrooms because the feature scales are so different.


 eg: with normalization 
 # Normalized features (mean = 0, sd = 1)
 size_norm = 0.5
 bedrooms_norm = 0.3

 # Same weights 
  β₁  = 0.001 #for size
  β₂  = 0.001 #for bedrooms

 # Contribution to prediction 
 contribution_size = 0.001 * 0.5 = 0.0005
 contribution_bedrooms = 0.001 * 0.3 = 0.0003

 --> Now they are comparable! The model can learn proper weights.

----------------------------------------------------------------------------------
3. Regularization works fairly 

With L2 regularization (Ridge Regression), the penalty is applied fairly across features, because all coefficients are on a comparable scale.

----------------------------------------------------------------------------------

4. Interpretability of Coefficients

after normalization :
β₁ = 0.8  # for size
β₂ = 0.6  # for bedrooms

Interpretation: a 1-standard-deviation increase in size has a bigger effect (0.8) than 
a 1-standard-deviation increase in bedrooms (0.6).
"""

In [None]:
class MultipleRegression:
    def __init__(self):
        self.coefficients = None
        self.intercept = None
        self.X_mean = None
        self.X_std = None

    def fit(self, X, y, normalize = True):
        # Convert to numpy arrays
        X = np.array(X)
        y = np.array(y).reshape(-1, 1)

        # Store original dimensions 
        n_samples, n_features = X.shape

        # Optional : normalize features for better numerical stability 
        