<a href="https://colab.research.google.com/github/v-y-l/Machine-Learning-Notebooks/blob/main/Victor's_unbiased_estimation_using_2_neural_networks.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Unbiased estimation using two neural networks
## Section author: Victor Lin (vl2580)

## Implementation

In [3]:
import torch
import torch.nn as nn
import math
from torch.utils.data import Dataset, DataLoader

class GeluDataset(Dataset):
    """Synthetic regression set of (W, x, GELU(Wx)) triples.

    Each of the ``N`` samples holds a random ``d x d`` matrix ``W`` and a
    random ``d``-vector ``x``, both drawn from a standard normal.  The target
    is the exact (erf-based) GELU applied element-wise to the product ``W @ x``.
    """

    def __init__(self, N, d):
        # Draw W before x so the random stream is consumed in a fixed order.
        self.W = torch.randn(N, d, d)
        self.x = torch.randn(N, d)
        # Batched mat-vec product: one d-vector of pre-activations per sample.
        pre_act = torch.einsum('bij,bj->bi', self.W, self.x)
        # GELU(z) = 0.5 * z * (1 + erf(z / sqrt(2)))
        one_plus_erf = 1 + torch.erf(pre_act / math.sqrt(2))
        self.y = 0.5 * pre_act * one_plus_erf

    def __len__(self):
        """Return the number of samples."""
        return self.x.shape[0]

    def __getitem__(self, idx):
        """Return the ``(W, x, y)`` triple stored at position ``idx``."""
        return tuple(field[idx] for field in (self.W, self.x, self.y))

class PsiNet(nn.Module):
    """Feature map ψ: a one-hidden-layer ReLU MLP from R^d to R^m.

    Args:
        d: size of the input vector.
        m: size of the output feature vector.
        hidden: width of the single hidden layer.
    """

    def __init__(self, d, m, hidden=64):
        super().__init__()
        # Layers are built in forward order so parameter initialisation
        # consumes the random stream in the same sequence.
        stages = [nn.Linear(d, hidden), nn.ReLU(), nn.Linear(hidden, m)]
        self.net = nn.Sequential(*stages)

    def forward(self, x):
        """Apply the MLP to ``x`` and return the resulting features."""
        features = self.net(x)
        return features

class PhiNet(nn.Module):
    """Row-wise feature map φ: embeds every row of each matrix into R^m.

    The same one-hidden-layer ReLU MLP is applied independently to each row
    of each matrix in the batch.

    Args:
        d: length of one matrix row (input width of the MLP).
        m: size of the feature vector produced for each row.
        hidden: width of the single hidden layer.
    """

    def __init__(self, d, m, hidden=64):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(d, hidden),
            nn.ReLU(),
            nn.Linear(hidden, m)
        )

    def forward(self, W):
        """Embed each row of ``W``.

        Args:
            W: tensor of shape ``(batch, rows, d)``.  It does not have to be
                square or contiguous in memory.

        Returns:
            Tensor of shape ``(batch, rows, m)``.
        """
        batch, rows, cols = W.shape
        # reshape() instead of view(): view() raises on non-contiguous input
        # (e.g. a transposed W), while reshape() copies only when it must.
        # Using the real column count keeps non-square matrices working.
        flat_rows = W.reshape(batch * rows, cols)
        flat_feats = self.net(flat_rows)
        # Spell out the feature width rather than using -1, which is
        # ambiguous when the batch is empty.
        return flat_feats.reshape(batch, rows, flat_feats.shape[-1])

class GeluTrainer:
    """Jointly fits PhiNet and PsiNet so that φ(W) · ψ(x) approximates GELU(Wx).

    Args:
        d: dimension of the vectors ``x`` (matrices ``W`` are ``d x d``).
        m: size of the shared feature space of both networks.
        N: number of synthetic training samples.
        batch_size: mini-batch size used by the data loader.
        hidden: hidden-layer width of both networks.
        lr: Adam learning rate.
    """

    def __init__(self, d=2, m=16, N=1024, batch_size=64, hidden=64, lr=1e-3):
        self.d, self.m, self.N, self.batch_size = d, m, N, batch_size
        # Construction order (data, ψ, φ) fixes how the random stream is used.
        self.dataset = GeluDataset(N, d)
        self.loader = DataLoader(self.dataset, batch_size=batch_size, shuffle=True)
        self.psi = PsiNet(d, m, hidden)
        self.phi = PhiNet(d, m, hidden)
        # One optimizer updates both networks together.
        trainable = [*self.psi.parameters(), *self.phi.parameters()]
        self.optimizer = torch.optim.Adam(trainable, lr=lr)
        self.criterion = nn.MSELoss()

    def train(self, num_epochs):
        """Run ``num_epochs`` passes over the loader, printing the MSE of each.

        Returns:
            The ``(phi, psi)`` pair of trained modules.
        """
        for epoch_idx in range(num_epochs):
            running = 0.0
            for W_batch, x_batch, y_batch in self.loader:
                self.optimizer.zero_grad()
                vec_feats = self.psi(x_batch)
                row_feats = self.phi(W_batch)
                # Per-row dot product of φ(W) with ψ(x) gives one value per output.
                y_pred = torch.bmm(row_feats, vec_feats.unsqueeze(-1)).squeeze(-1)
                loss = self.criterion(y_pred, y_batch)
                loss.backward()
                self.optimizer.step()
                # Weight by batch size so the epoch figure is a per-sample mean.
                running += loss.item() * W_batch.size(0)
            mse = running / self.N
            print(f"Epoch {epoch_idx + 1}: MSE = {mse:.4f}")
        return self.phi, self.psi

## Evaluation

In [5]:
import numpy as np

# --- Evaluation ---
class NeuralGELUComparator:
    """Compares the learned φ(W) · ψ(x) output with the tanh-form GELU of Wx."""

    def __init__(self, phi_model, psi_model):
        self.phi, self.psi = phi_model, psi_model

    def gelu_tanh(self, x):
        """Return the tanh approximation of GELU for a NumPy array ``x``."""
        scale = np.sqrt(2/np.pi)
        inner = scale * (x + 0.044715 * x**3)
        return 0.5 * x * (1 + np.tanh(inner))

    def compare(self, W_test, x_test):
        """Evaluate both approximations on one test batch.

        Returns:
            ``(rmse, y_approx, y_true)``: the root-mean-square difference, the
            network prediction and the tanh-GELU reference, the latter two as
            NumPy arrays with all singleton dimensions squeezed away.
        """
        with torch.no_grad():
            vec_feats = self.psi(x_test)
            row_feats = self.phi(W_test)
            prediction = torch.bmm(row_feats, vec_feats.unsqueeze(-1))
            y_approx = prediction.squeeze().numpy()
            projection = torch.bmm(W_test, x_test.unsqueeze(-1))
            x_proj = projection.squeeze().numpy()
            y_true = self.gelu_tanh(x_proj)
        residual = y_approx - y_true
        rmse = np.sqrt(np.mean(residual ** 2))
        return rmse, y_approx, y_true

# --- Runner ---
def run_demo(epoch_counts, *, d=2, m=16, N=1024):
    """Train a fresh model for each epoch budget and report the best one.

    For every entry of ``epoch_counts`` a new ``GeluTrainer`` is built and
    trained, then scored on a single random ``(W, x)`` pair with
    ``NeuralGELUComparator``.  The budget with the lowest RMSE on its own
    test pair is printed as the best model.

    Args:
        epoch_counts: iterable of epoch budgets to try.
        d: problem dimension (``W`` is ``d x d``, ``x`` has length ``d``).
        m: size of the shared feature space of both networks.
        N: number of training samples per run.

    Returns:
        None.  All results are printed.
    """
    results = []
    for count in epoch_counts:
        print(f"\n=== Training for {count} epochs ===")
        trainer = GeluTrainer(d=d, m=m, N=N)
        phi, psi = trainer.train(num_epochs=count)

        # Score on one freshly drawn sample.  Each run gets its own sample,
        # so RMSE values across runs are only roughly comparable.
        W_test = torch.randn(1, d, d)
        x_test = torch.randn(1, d)
        comparator = NeuralGELUComparator(phi, psi)
        rmse, y_hat, y_true = comparator.compare(W_test, x_test)

        print("\n--- Prediction Breakdown ---")
        print(f"Wx = {torch.matmul(W_test, x_test.unsqueeze(-1)).squeeze().numpy()}")
        print(f"φ(W) · ψ(x) = {y_hat}")
        print(f"GELU_tanh  = {y_true}")
        print(f"RMSE = {rmse:.5f}")
        results.append((count, rmse))

    # min() raises ValueError on an empty sequence, so bail out early when
    # no epoch budgets were supplied.
    if not results:
        print("\nNo epoch counts given; nothing to train.")
        return

    best = min(results, key=lambda x: x[1])
    print(f"\n Best model: {best[0]} epochs (RMSE = {best[1]:.5f})")

# Train three separate models with budgets of 3, 10 and 30 epochs; each is
# scored on its own single random test sample and the lowest RMSE is reported.
run_demo(epoch_counts=[3, 10, 30])


=== Training for 3 epochs ===
Epoch 1: MSE = 0.6499
Epoch 2: MSE = 0.2438
Epoch 3: MSE = 0.1443

--- Prediction Breakdown ---
Wx = [0.70764357 1.4289331 ]
φ(W) · ψ(x) = [0.65369165 1.1865686 ]
GELU_tanh  = [0.53804669 1.31937287]
RMSE = 0.12452

=== Training for 10 epochs ===
Epoch 1: MSE = 0.6455
Epoch 2: MSE = 0.2836
Epoch 3: MSE = 0.1449
Epoch 4: MSE = 0.1196
Epoch 5: MSE = 0.0992
Epoch 6: MSE = 0.0877
Epoch 7: MSE = 0.0792
Epoch 8: MSE = 0.0742
Epoch 9: MSE = 0.0700
Epoch 10: MSE = 0.0629

--- Prediction Breakdown ---
Wx = [-0.81988746  0.5952919 ]
φ(W) · ψ(x) = [-0.0795441  0.3570227]
GELU_tanh  = [-0.16910204  0.43106397]
RMSE = 0.08217

=== Training for 30 epochs ===
Epoch 1: MSE = 0.6745
Epoch 2: MSE = 0.2674
Epoch 3: MSE = 0.1620
Epoch 4: MSE = 0.1238
Epoch 5: MSE = 0.0981
Epoch 6: MSE = 0.0842
Epoch 7: MSE = 0.0713
Epoch 8: MSE = 0.0644
Epoch 9: MSE = 0.0572
Epoch 10: MSE = 0.0529
Epoch 11: MSE = 0.0474
Epoch 12: MSE = 0.0439
Epoch 13: MSE = 0.0400
Epoch 14: MSE = 0.0358
Epo