# Q6 – Housing Price Prediction (Manual OLS & Gradient Descent)

This notebook parses the provided `housing_data.csv` **(SquareFootage,Price)** and implements:

1. **Ordinary Least Squares (OLS)** using the closed-form formulas (no sklearn/scipy)
2. **Gradient Descent (GD)** for a linear model $y = mx + b$

**Outputs**
- Predicted price for a **2,500 sq ft** house (two decimals)
- Best-fit line plot for OLS and GD

**Allowed libraries:** `numpy` and `matplotlib` only.

In [None]:
import csv
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path

# Input file location. The file is expected to be a CSV with the header
# row "SquareFootage,Price".
CSV_PATH = Path("housing_data.csv")

def load_data(csv_path: Path):
    """Load (square footage, price) pairs as two float arrays.

    When ``csv_path`` exists it is read as a CSV whose first two columns are
    square footage and price. The first row is treated as a header (and
    skipped) only if it does not parse as two numbers, so a header-less file
    keeps its first data row. Rows that are blank, have fewer than two
    columns, or have an empty value in either of the first two columns are
    skipped.

    When ``csv_path`` does not exist, the built-in copy of the assignment
    data is returned instead.

    Parameters
    ----------
    csv_path : Path
        Location of the CSV file.

    Returns
    -------
    tuple of (np.ndarray, np.ndarray)
        ``(xs, ys)``: float arrays of square footage and price.

    Raises
    ------
    ValueError
        If a row after the first contains a non-empty, non-numeric value in
        one of the first two columns.
    """
    def _parse_row(row):
        # Return (sqft, price) as floats, or None for a blank/incomplete row.
        if len(row) < 2:
            return None
        sqft, price = row[0].strip(), row[1].strip()
        if not sqft or not price:
            return None
        return float(sqft), float(price)

    if csv_path.exists():
        xs, ys = [], []
        # utf-8-sig drops a leading byte-order mark, which would otherwise make
        # a numeric first row fail to parse and be mistaken for a header.
        with open(csv_path, 'r', newline='', encoding='utf-8-sig') as f:
            for row_idx, row in enumerate(csv.reader(f)):
                try:
                    pair = _parse_row(row)
                except ValueError:
                    if row_idx == 0:
                        continue  # non-numeric first row: this is the header
                    raise
                if pair is not None:
                    xs.append(pair[0])
                    ys.append(pair[1])
        return np.array(xs, dtype=float), np.array(ys, dtype=float)

    # Fallback data from the assignment screenshot (same format)
    data = [
        (1100,199000),(1400,245000),(1425,230000),(1550,215000),(1600,280000),
        (1700,295000),(1750,345000),(1800,315000),(1875,325000),(2000,360000),
        (2100,350000),(2250,385000),(2300,390000),(2400,425000),(2450,415000),
        (2600,455000),(2800,465000),(2900,495000),(3000,510000),(3150,545000),
        (3300,570000),
    ]
    xs = np.array([x for x,_ in data], dtype=float)
    ys = np.array([y for _,y in data], dtype=float)
    return xs, ys

# Read the dataset (the CSV when it exists, otherwise the built-in fallback),
# then preview the sample count and the first three values of each column.
x, y = load_data(CSV_PATH)
(len(x), x[:3], y[:3])

## 1) Ordinary Least Squares (closed-form)

In [None]:
def ols_fit(x, y):
    """Fit ``y = m*x + b`` by ordinary least squares (closed form).

    Uses the textbook formulas::

        m = sum((x - mean(x)) * (y - mean(y))) / sum((x - mean(x))**2)
        b = mean(y) - m * mean(x)

    Parameters
    ----------
    x, y : array-like of numbers
        Predictor and target values; must have the same shape.

    Returns
    -------
    tuple
        ``(m, b)``: slope and intercept.

    Raises
    ------
    ValueError
        If the inputs are empty, differ in shape, or every ``x`` value is
        identical (the slope is undefined in that case).
    """
    x = np.asarray(x, dtype=float)
    y = np.asarray(y, dtype=float)
    if x.shape != y.shape:
        raise ValueError(f"x and y must have the same shape, got {x.shape} and {y.shape}")
    if x.size == 0:
        raise ValueError("ols_fit requires at least two data points, got none")
    # Compare min/max rather than testing sum((x - mean)**2) == 0: rounding in
    # the mean can leave a tiny non-zero spread for identical values.
    if x.min() == x.max():
        raise ValueError("ols_fit requires at least two distinct x values")

    x_mean = x.mean()
    y_mean = y.mean()
    dx = x - x_mean
    m = np.sum(dx * (y - y_mean)) / np.sum(dx ** 2)
    b = y_mean - m * x_mean
    return m, b

def predict(m, b, x):
    """Evaluate the line ``y = m*x + b`` at ``x`` (a scalar or a NumPy array)."""
    scaled = m * x
    return scaled + b

# Fit the closed-form model, then report its parameters and the estimate
# for a 2,500 sq ft house.
m_ols, b_ols = ols_fit(x, y)
pred_2500_ols = predict(m_ols, b_ols, 2500.0)
print(
    f"OLS slope m: {m_ols:.6f}",
    f"OLS intercept b: {b_ols:.6f}",
    f"Predicted price (2,500 sq ft): {pred_2500_ols:.2f}",
    sep="\n",
)

In [None]:
# Scatter the data, overlay the OLS line across the observed square-footage
# range, and write the figure to disk.
fig, ax = plt.subplots()
ax.scatter(x, y, label='Data')
x_line = np.linspace(x.min(), x.max(), 200)
y_line = predict(m_ols, b_ols, x_line)
ax.plot(x_line, y_line, label='Best-fit (OLS)')
ax.set_xlabel('Square Footage')
ax.set_ylabel('Price')
ax.legend()
fig.tight_layout()
fig.savefig('best_fit_ols.png', dpi=160)
plt.close(fig)  # the figure is saved to a file rather than shown inline
print('Saved best_fit_ols.png')

## 2) Gradient Descent (manual)

In [None]:
def grad_step(x, y, m, b, lr):
    """Take one gradient-descent step on the MSE loss of ``y ≈ m*x + b``.

    The gradients used are::

        dL/dm = (2/N) * sum((m*x + b - y) * x)
        dL/db = (2/N) * sum(m*x + b - y)

    Parameters
    ----------
    x, y : np.ndarray
        Predictor and target values.
    m, b : float
        Current slope and intercept.
    lr : float
        Learning rate (step size).

    Returns
    -------
    tuple
        The updated ``(m, b)``.
    """
    n_samples = x.size
    residual = (m * x + b) - y
    scale = 2.0 / n_samples
    grad_m = scale * np.sum(residual * x)
    grad_b = scale * np.sum(residual)
    new_m = m - lr * grad_m
    new_b = b - lr * grad_b
    return new_m, new_b

def mse(x, y, m, b):
    """Return the mean squared error of the line ``m*x + b`` against ``y``."""
    residual = m * x + b - y
    return np.mean(residual ** 2)

# Gradient descent is run on standardized copies of x and y and the fitted
# parameters are mapped back to original units afterwards.
#
# Why: with square footage in the thousands, the raw loss surface is extremely
# ill-conditioned (mean(x**2) is in the millions, while the intercept column is
# all ones). Any learning rate small enough to keep the slope update stable
# moves the intercept by a negligible amount per step, so even 150,000 raw
# steps leave b close to its starting value of 0 and the line never reaches
# the least-squares solution. After standardization both directions have the
# same curvature and GD converges in a few hundred steps.
if x.min() == x.max():
    raise ValueError("gradient descent needs at least two distinct x values")
x_mean, x_std = x.mean(), x.std()
y_mean, y_std = y.mean(), y.std()
if y_std == 0:
    y_std = 1.0  # constant target: centering alone is enough
x_scaled = (x - x_mean) / x_std
y_scaled = (y - y_mean) / y_std

m_scaled, b_scaled = 0.0, 0.0
lr = 0.1           # safe on standardized data
steps = 150000     # upper bound; the early stop below normally ends the loop sooner
check_every = 100  # how often the loss is evaluated for the stopping test
tol = 1e-12        # absolute tolerance on the standardized MSE
last = float('inf')
for t in range(steps):
    m_scaled, b_scaled = grad_step(x_scaled, y_scaled, m_scaled, b_scaled, lr)
    if t % check_every == 0 or t == steps - 1:
        cur = mse(x_scaled, y_scaled, m_scaled, b_scaled)
        if abs(last - cur) < tol:
            break
        last = cur

# Undo the scaling:
#   (y - y_mean) / y_std = m_scaled * (x - x_mean) / x_std + b_scaled
m = m_scaled * y_std / x_std
b = y_mean + y_std * b_scaled - m * x_mean

pred_2500_gd = m * 2500.0 + b
print(f"GD slope m: {m:.6f}")
print(f"GD intercept b: {b:.6f}")
print(f"Predicted price (2,500 sq ft): {pred_2500_gd:.2f}")

# Scatter the data, overlay the gradient-descent line across the observed
# square-footage range, and write the figure to disk.
fig, ax = plt.subplots()
ax.scatter(x, y, label='Data')
x_line = np.linspace(x.min(), x.max(), 200)
y_line = m * x_line + b
ax.plot(x_line, y_line, label='Best-fit (GD)')
ax.set_xlabel('Square Footage')
ax.set_ylabel('Price')
ax.legend()
fig.tight_layout()
fig.savefig('best_fit_gd.png', dpi=160)
plt.close(fig)  # the figure is saved to a file rather than shown inline
print('Saved best_fit_gd.png')