## Theory

Chapter 7 ("Moving Beyond Linearity") of *An Introduction to Statistical Learning* (James, Witten, Hastie, Tibshirani).

## Setup

In [10]:
import plotly.express as px
import numpy as np
from scipy.optimize import least_squares
import plotly.io as pio
import pandas as pd

# Make Plotly's "plotly_dark" template the default for figures created after this line.
pio.templates.default = "plotly_dark"

## Data

In [13]:
# Read the wage data and pull out the predictor (age) and the response (wage).
df = pd.read_csv('data/wage.csv')
x, y = df["age"], df["wage"]

# Scatter of the raw observations; as the last expression it is rendered as the cell output.
px.scatter(x=x, y=y)

## Linear Regression

In [33]:
def model(x, coef):
    """Straight-line model: ``coef[0] + x * coef[1]``.

    Parameters
    ----------
    x : scalar or array-like
        Predictor value(s).
    coef : sequence
        ``coef[0]`` is the intercept and ``coef[1]`` the slope.

    Returns
    -------
    The prediction(s), following the usual broadcasting of ``x``.
    """
    intercept = coef[0]
    slope = coef[1]
    return intercept + x * slope

def residual(coef, x, y):
    """Return the per-observation error ``y - model(x, coef)``.

    ``coef`` comes first because this function is handed to ``least_squares``,
    which passes the parameter vector first and the ``args`` tuple after it.
    """
    predicted = model(x, coef)
    return y - predicted

# Start the optimiser from an all-zero float vector: [intercept, slope].
initial_coef = np.zeros(2)

res = least_squares(residual, initial_coef, args=(x, y))
fitted_coef = res.x

# Evaluate the fitted line on an evenly spaced grid spanning the observed x range.
x_range = np.linspace(x.min(), x.max(), 100)
fitted_line = model(x_range, coef=fitted_coef)

residuals = residual(fitted_coef, x, y)

# Residual degrees of freedom (n - p).
# Deliberately NOT named `df`: that name holds the wage DataFrame loaded in the
# Data section, and rebinding it to an integer would break any cell that later
# expects the DataFrame (including re-running cells out of order).
dof = len(y) - len(fitted_coef)

# Residual variance estimate: RSS / (n - p)
residual_var = np.sum(residuals**2) / dof


# Raw observations with the fitted line overlaid as a second trace.
fig = px.scatter(x=x, y=y)
fig.add_scatter(x=x_range, y=fitted_line)

## Polynomial Regression

In [18]:
def model(x, coef):
    """Quadratic model: ``coef[0] + coef[1]*x + coef[2]*x**2``.

    Parameters
    ----------
    x : scalar or array-like
        Predictor value(s).
    coef : sequence
        Constant, linear and quadratic coefficients, in that order.

    Returns
    -------
    The prediction(s), following the usual broadcasting of ``x``.
    """
    constant, linear, quadratic = coef[0], coef[1], coef[2]
    return constant + linear * x + quadratic * x**2

def residual(coef, x, y):
    """Observed minus fitted values for the current ``model`` at coefficients ``coef``."""
    fitted = model(x, coef)
    return y - fitted

# One starting value per quadratic term: constant, x, x**2.
initial_coef = np.zeros(3)
res = least_squares(fun=residual, x0=initial_coef, args=(x, y))

# Dense, evenly spaced grid over the observed range for drawing a smooth curve.
x_range = np.linspace(start=x.min(), stop=x.max(), num=100)
fitted_line = model(x_range, coef=res.x)

# Raw observations with the fitted curve overlaid as a second trace.
fig = px.scatter(x=x, y=y)
fig.add_scatter(x=x_range, y=fitted_line)

In [32]:
degree = 10

def model(x, coef, degree=degree):
    """Evaluate the polynomial ``sum(coef[i] * x**i for i in 0..degree)``.

    Parameters
    ----------
    x : scalar or array-like
        Predictor value(s).
    coef : sequence
        At least ``degree + 1`` coefficients, lowest power first.
    degree : int
        Highest power of ``x`` to include. The default is bound to the
        module-level ``degree`` at definition time.

    Returns
    -------
    numpy.ndarray (or numpy scalar for scalar ``x``) of predictions.
    """
    # Cast to float first: integer inputs (e.g. int64 ages) silently wrap
    # around on high powers such as 80**10, which exceeds the int64 range.
    x = np.asarray(x, dtype=float)
    # range(degree + 1) so the x**degree term is included; a degree-d
    # polynomial has d + 1 coefficients.
    return np.array([coef[i] * x**i for i in range(degree + 1)]).sum(axis=0)

def residual(coef, x, y):
    """Difference between the targets ``y`` and ``model(x, coef)``, element by element."""
    estimate = model(x, coef)
    return y - estimate

# One starting value for each power of x from 0 through `degree`.
initial_coef = np.zeros(degree + 1)
res = least_squares(fun=residual, x0=initial_coef, args=(x, y))

# Dense, evenly spaced grid over the observed range for drawing a smooth curve.
x_range = np.linspace(start=x.min(), stop=x.max(), num=100)
fitted_line = model(x_range, coef=res.x)

# Raw observations with the fitted polynomial overlaid as a second trace.
fig = px.scatter(x=x, y=y)
fig.add_scatter(x=x_range, y=fitted_line)

## Cubic Spline

In [20]:
def truncated_power_basis(x, knot):
    """Cubic truncated power basis: ``(x - knot)**3`` where ``x > knot``, else 0."""
    return np.where(x > knot, (x - knot)**3, 0)

# Interior knots, in the same units as x (age, read straight from the CSV).
# They must lie inside the observed range of x: a knot below every observation
# makes its basis term an ordinary cubic on the data, which is collinear with
# the polynomial terms and turns the "spline" into a plain cubic fit.
# NOTE(review): 25/40/60 assumes ages span roughly 18-80 as in the ISLR Wage
# data; confirm with x.min() / x.max().
knots = [25, 40, 60]

def model(x, coef, knots=knots):
    """Cubic regression spline in the truncated power basis.

    Computes ``coef[0] + coef[1]*x + coef[2]*x**2 + coef[3]*x**3`` plus one
    ``truncated_power_basis(x, knot) * coef[4 + k]`` term per knot.

    Parameters
    ----------
    x : scalar or array-like
        Predictor value(s).
    coef : sequence
        ``4 + len(knots)`` coefficients: the four cubic-polynomial
        coefficients followed by one coefficient per knot, in knot order.
    knots : sequence
        Knot locations. The default is bound to the module-level ``knots`` at
        definition time. Any number of knots is accepted.

    Returns
    -------
    The prediction(s), following the usual broadcasting of ``x``.
    """
    fitted = (
        coef[0]
        + x * coef[1]
        + x**2 * coef[2]
        + x**3 * coef[3]
    )
    # Knot terms start at coef[4]; adding them in knot order keeps the
    # coefficient layout of the original three-knot version.
    for offset, knot in enumerate(knots):
        fitted = fitted + truncated_power_basis(x, knot=knot) * coef[4 + offset]
    return fitted

In [21]:
def residual(coef, x, y):
    """Residual vector ``y - model(x, coef)`` in the form ``least_squares`` expects."""
    spline_values = model(x, coef)
    return y - spline_values

# Four cubic-polynomial coefficients plus one coefficient per knot, all starting at zero.
initial_coef = np.zeros(4 + len(knots))
res = least_squares(fun=residual, x0=initial_coef, args=(x, y))

# Dense, evenly spaced grid over the observed range for drawing a smooth curve.
x_range = np.linspace(start=x.min(), stop=x.max(), num=100)
fitted_line = model(x_range, coef=res.x)

# Raw observations with the fitted spline overlaid as a second trace.
fig = px.scatter(x=x, y=y)
fig.add_scatter(x=x_range, y=fitted_line)