<a href="https://colab.research.google.com/github/v-y-l/Machine-Learning-Notebooks/blob/main/Victor's_biased_estimation_using_krocker_products.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Biased estimation: GELU Taylor Series decomposition using Kronecker Products
## Section author: Victor Lin (vl2580)

## Implementation


In [1]:
import math
import numpy as np

# Approximates GELU with an n-term Taylor expansion using a Kronecker-feature decomposition
class TaylorSeriesGeluEstimator:
    """Linearize a truncated Taylor series of GELU(w . x) as Phi(w) . Psi(x).

    With GELU(z) = z * NormalCDF(z) and the Maclaurin series of the CDF:

        GELU(z) ~= 0.5 * z
                   + 1/sqrt(2*pi)
                     * sum_{i=0}^{n-1} (-1)^i * z^(2i+2) / (i! * 2^i * (2i+1))

    Each monomial (w . x)^p equals kron^p(w) . kron^p(x), where kron^p is the
    p-fold Kronecker power. The series therefore splits into a weight-only
    feature map Phi (which carries the Taylor coefficients) and an input-only
    feature map Psi (unit coefficients).

    Note: a length-d vector is lifted to sum(d**p for p in self.powers)
    features, which grows very quickly with both ``n_terms`` and ``d``.
    """

    def __init__(self, n_terms):
        """Precompute coefficients and monomial degrees for ``n_terms`` terms.

        Raises:
            ValueError: if ``n_terms`` is negative.
        """
        if n_terms < 0:
            raise ValueError(f"n_terms must be non-negative, got {n_terms}")
        # Number of series terms beyond the linear 0.5 * z term.
        self.n = n_terms
        # One coefficient per entry of self.powers, in the same order.
        self.coeffs = self._compute_coeffs()
        # Degrees of the monomials in z = w . x: 1, then 2, 4, 6, ...
        self.powers = [1] + [2 * i + 2 for i in range(self.n)]

    def _compute_coeffs(self):
        """Return [0.5] followed by the ``self.n`` scaled series coefficients."""
        coeffs = [0.5]
        base = 1 / math.sqrt(2 * math.pi)
        for i in range(self.n):
            c = (-1)**i / (math.factorial(i) * (2**i) * (2 * i + 1))
            coeffs.append(base * c)
        return coeffs

    def _kron_n(self, vec, times):
        """Return the ``times``-fold Kronecker power of ``vec``.

        ``times == 1`` returns ``vec`` itself; ``times == 0`` returns the
        empty product ``[1.0]`` so that kron^0(w) . kron^0(x) == 1.

        Raises:
            ValueError: if ``times`` is negative.
        """
        if times < 0:
            raise ValueError(f"times must be non-negative, got {times}")
        if times == 0:
            return np.ones(1)
        out = vec
        for _ in range(times - 1):
            out = np.kron(out, vec)
        return out

    def transform(self, vec, coeffs=None):
        """Concatenate the scaled Kronecker powers of ``vec``.

        Args:
            vec: 1-D array-like to lift.
            coeffs: one multiplier per entry of ``self.powers``; defaults to
                all ones.

        Returns:
            1-D array holding ``coeffs[k] * kron^{powers[k]}(vec)`` for each k,
            laid out back to back.

        Raises:
            ValueError: if ``coeffs`` does not have one entry per power.
        """
        vec = np.asarray(vec)
        if coeffs is None:
            coeffs = [1.0] * len(self.powers)
        else:
            coeffs = list(coeffs)
            # zip() would silently drop blocks and shrink the feature vector.
            if len(coeffs) != len(self.powers):
                raise ValueError(
                    f"expected {len(self.powers)} coefficients, got {len(coeffs)}"
                )

        pieces = []
        lifted, reached = None, 0
        for c, p in zip(coeffs, self.powers):
            if lifted is None or p < reached:
                # First block, or powers not ascending: build from scratch.
                lifted = self._kron_n(vec, p)
            else:
                # Extend the previous power instead of recomputing it; this is
                # the same left-to-right sequence of np.kron calls.
                for _ in range(p - reached):
                    lifted = np.kron(lifted, vec)
            reached = p
            pieces.append(c * lifted)
        return np.concatenate(pieces)

    def get_phi(self, W):
        """Return the coefficient-weighted lift of each row of ``W``, stacked row-wise."""
        return np.vstack([self.transform(w, self.coeffs) for w in W])

    def get_psi(self, x):
        """Return the unweighted lift of the input vector ``x``."""
        return self.transform(x, coeffs=None)


# Toy dataset whose product Wx stays inside [-3, 3], the range the demo's error table singles out for 5 terms
class ToyDataGenerator:
    """Fixed example: a length-3 input vector and a 3x3 weight matrix."""

    def __init__(self):
        # Input vector.
        self.x = np.array([1.0, 0.1, 2.0])
        # Weight matrix, one row per output unit.
        self.W = np.array([
            [-0.8, -0.8, -0.8],
            [0.5, 0.5, 0.5],
            [0.2, 0.3, 1.0],
        ])

    def generate(self):
        """Print the error-range table and return ``(W, x, W @ x)``."""
        pre_activation = self.W @ self.x

        # Static markdown report; the table values are hard-coded, not computed here.
        report_lines = (
            "### 5-term Taylor Series Approximation Range",
            "This shows us that Wx should be within [−3, 3] in our demonstration.\n",
            "| Range of Wx | Max Error       |",
            "|-------------|------------------|",
            "| [-1, 1]     | 9 × 10⁻⁶         |",
            "| [-3, 3]     | 3.0              |",
            "| [-5, 5]     | 7.8 × 10²        |",
            "| [-7, 7]     | 2.7 × 10⁴        |",
        )
        for line in report_lines:
            print(line)

        print("\nGenerated Wx = ", pre_activation)
        print("→ This dataset satisfies the ideal range for ≤ 3.0 error.\n")
        return self.W, self.x, pre_activation

## Evaluation

In [2]:
# Compares the truncated-Taylor GELU, the tanh GELU approximation, and the φ·ψ linearized result
class GELUComparator:
    """Evaluate two GELU approximations and report the RMSE between results."""

    def __init__(self, n):
        # Number of series terms used by gelu_taylor and shown in the report.
        self.n = n

    def gelu_tanh(self, x):
        """Return the tanh-based GELU approximation evaluated at ``x``."""
        cubic = x + 0.044715 * x**3
        return 0.5 * x * (1 + np.tanh(np.sqrt(2/np.pi) * cubic))

    def gelu_taylor(self, x):
        """Return ``x * (0.5 + series / sqrt(2*pi))`` using ``self.n`` series terms."""
        inv_sqrt_2pi = 1 / math.sqrt(2 * math.pi)
        # Terms are accumulated left to right starting from 0.
        series = sum(
            (-1)**k / (math.factorial(k) * (2**k) * (2*k+1)) * x**(2*k+1)
            for k in range(self.n)
        )
        return x * (0.5 + inv_sqrt_2pi * series)

    def get_error(self, actual, expected):
        """Return the root-mean-square difference between the two inputs."""
        residual = actual - expected
        return np.sqrt(np.mean(residual**2))

    def compare(self, x_prime, phi_dot_psi):
        """Print both approximations at ``x_prime`` next to ``phi_dot_psi`` with RMSE figures."""
        print(
            "==== Problem 3: Linearization of GELU Feedforward ====\n",
            "We want to approximate:",
            "  y = GELU(Wx)    with    y' = Φ(W) · Ψ(x)",
            f"Using a Taylor expansion with n = {self.n} terms.\n",
            sep="\n",
        )

        taylor_vals = self.gelu_taylor(x_prime)
        tanh_vals = self.gelu_tanh(x_prime)

        print(
            f"x' = Wx = {x_prime}",
            f"GELU_Taylor_{self.n}(x') = {taylor_vals}",
            f"Linearized y' = Φ(W) · Ψ(x) = {phi_dot_psi}",
            f"GELU_Tanh(x') = {tanh_vals}",
            sep="\n",
        )

        rmse_taylor_tanh = self.get_error(taylor_vals, tanh_vals)
        rmse_phi_taylor = self.get_error(phi_dot_psi, taylor_vals)

        print(
            "\n--- Error Metrics ---",
            f"RMSE: Taylor (n={self.n}) vs Tanh GELU:         {rmse_taylor_tanh:.5f}",
            f"RMSE: Linearized φ · ψ vs Taylor GELU:         {rmse_phi_taylor:.5f}",
            sep="\n",
        )

        print("\nConclusion: The linearized form approximates y = GELU(Wx) by constructing φ(W) and ψ(x) using Kronecker-lifted Taylor monomials.\n")

# End-to-end demo: generate the toy data, lift it with the estimator, and report the comparison
class GELUDemo:
    """Wire together the estimator, the toy data generator, and the comparator."""

    def __init__(self, n_terms=5):
        # The same term count is handed to both the estimator and the comparator.
        self.n_terms = n_terms
        self.estimator = TaylorSeriesGeluEstimator(n_terms)
        self.generator = ToyDataGenerator()
        self.comparator = GELUComparator(n_terms)

    def run(self):
        """Generate the data, compute Φ(W) · Ψ(x), print shapes, and run the comparison."""
        weights, inputs, pre_activation = self.generator.generate()

        lifted_weights = self.estimator.get_phi(weights)
        lifted_inputs = self.estimator.get_psi(inputs)
        # One matrix-vector product yields the linearized output for every row of W.
        linearized = lifted_weights @ lifted_inputs

        print("Φ(W) shape:", lifted_weights.shape)
        print("Ψ(x) shape:", lifted_inputs.shape)
        print("Linearized output y' = Φ(W) · Ψ(x) =", linearized, "\n")

        self.comparator.compare(pre_activation, linearized)

# Execute the full GELU decomposition and comparison demo with a 5-term expansion.
# This runs at module level, so executing the cell prints the whole report immediately.
GELUDemo(n_terms=5).run()

### 5-term Taylor Series Approximation Range
This shows us that Wx should be within [−3, 3] in our demonstration.

| Range of Wx | Max Error       |
|-------------|------------------|
| [-1, 1]     | 9 × 10⁻⁶         |
| [-3, 3]     | 3.0              |
| [-5, 5]     | 7.8 × 10²        |
| [-7, 7]     | 2.7 × 10⁴        |

Generated Wx =  [-2.48  1.55  2.23]
→ This dataset satisfies the ideal range for ≤ 3.0 error.

Φ(W) shape: (3, 66432)
Ψ(x) shape: (66432,)
Linearized output y' = Φ(W) · Ψ(x) = [0.33582152 1.45766421 2.30608443] 

==== Problem 3: Linearization of GELU Feedforward ====

We want to approximate:
  y = GELU(Wx)    with    y' = Φ(W) · Ψ(x)
Using a Taylor expansion with n = 5 terms.

x' = Wx = [-2.48  1.55  2.23]
GELU_Taylor_5(x') = [0.33582152 1.45766421 2.30608443]
Linearized y' = Φ(W) · Ψ(x) = [0.33582152 1.45766421 2.30608443]
GELU_Tanh(x') = [-0.01585868  1.45591211  2.20158061]

--- Error Metrics ---
RMSE: Taylor (n=5) vs Tanh GELU:         0.21182
RMSE: Linearized φ 