# Import

In [None]:
%load_ext autoreload
%autoreload 2

import random

import matplotlib.pyplot as plt
import numpy as np
import torch
from sklearn.datasets import load_digits, make_moons
from sklearn.decomposition import PCA

from hw6_q1 import autoencode
from hw6_q3 import hw6_q3_autograd as ad
from hw6_q3 import hw6_q3_nn as nn
from hw6_q3 import hw6_q3_optim as optim
from hw6_q3_utils import compute_num_params, train

# Ask PyTorch to use deterministic algorithms so repeated runs give the same numbers.
torch.use_deterministic_algorithms(True)
# Make float64 the default floating-point dtype for newly created tensors.
torch.set_default_dtype(torch.float64)


def set_seed(seed=0):
    """Seed the Python, NumPy and PyTorch random number generators.

    Args:
        seed: Value handed to each generator; defaults to 0.
    """
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)


# Seed once up front with the default seed.
set_seed()

# Linear AE recovers PCA

We will first visualize PCA components on a simple dataset.

In [None]:
# Load scikit-learn's digits dataset and report how many samples and features it has.
digits_dataset = load_digits()
print(f"There are {digits_dataset.data.shape[0]} samples with {digits_dataset.data.shape[1]} features.")

def plot(data: np.ndarray, labels: np.ndarray, title: str):
    """Show a 2-D scatter plot of ``data`` with points colored by ``labels``.

    Args:
        data: Array whose first two columns are used as the x and y coordinates.
        labels: Per-point values mapped to colors through the "tab10" colormap.
        title: Title displayed above the axes.
    """
    fig, ax = plt.subplots(figsize=(8, 6))
    points = ax.scatter(data[:, 0], data[:, 1], c=labels, cmap="tab10", alpha=0.7)
    fig.colorbar(points, ax=ax, label="Digit Label")

    ax.set_title(title)
    ax.set_xlabel("Dimension 1")
    ax.set_ylabel("Dimension 2")

    # Fixed axis limits: every figure drawn with this helper shares the same scale.
    ax.set_xlim(-30, 30)
    ax.set_ylim(-30, 30)
    ax.grid(True)

    plt.show()

# Fit a two-component PCA on the digits and project every sample onto those components.
pca = PCA(n_components=2, random_state=0)
pca_components = pca.fit_transform(digits_dataset.data)

plot(pca_components, digits_dataset.target, "PCA of Digits Dataset")

We will then run your implementation of the linear autoencoder. Since the AE learns the same subspace as PCA, its components will look similar to the PCA components but will not be exactly the same.

In [None]:
# Re-seed right before running the autoencoder so this cell starts from a known RNG state.
set_seed(0)
# `autoencode` is imported from hw6_q1; presumably it returns a tensor of 2-D codes,
# one row per sample — TODO confirm against its implementation.
ae_components = autoencode(torch.tensor(digits_dataset.data)).numpy()
plot(ae_components, digits_dataset.target, "Autoencoder of Digits Dataset")

To find the exact transformation from our learned components to the PCA components, we can solve a simple least-squares problem. After applying this transformation, the autoencoder visualization should match the PCA one.

In [None]:
# Solve the least-squares problem  ae_components @ T ~= pca_components  for the linear map T.
# rcond=None is passed explicitly: on NumPy < 2.0 omitting it emits a FutureWarning and
# selects the legacy cutoff, so being explicit gives the same behavior on every version.
transformation = np.linalg.lstsq(ae_components, pca_components, rcond=None)[0]
print(f"Learned linear transformation from AE space to PCA space:\n{transformation}")
# Map the autoencoder codes into PCA coordinates and plot them for comparison.
transformed_ae_components = ae_components @ transformation
plot(transformed_ae_components, digits_dataset.target, "Transformed Autoencoder of Digits Dataset")

# Neural Network

We test the implementation of our automatic differentiation on the same written problem. The output below should match your manually derived results (with very small numerical differences).

In [None]:
from pprint import pprint

# Two training examples, each a feature list with its target value.
x0 = [0, 1]
y0 = 1
x1 = [1, 0]
y1 = 0

# First-layer parameters: w[0] multiplies the constant 1 (bias), w[1:] multiply the inputs.
w = [ad.Scalar(1), ad.Scalar(1), ad.Scalar(1)]
# Output-layer parameters: h[0] multiplies the constant 1 (bias), h[1] the hidden unit.
h = [ad.Scalar(1), ad.Scalar(1)]


def forward_pass(x, y):
    """Run the toy two-layer network on one example and return its squared error.

    Args:
        x: List of input features; a constant 1 is prepended as the bias input.
        y: Target value for this example.

    Returns:
        The squared difference between the network output and ``y``.
    """
    # Hidden unit: ReLU of the weighted sum over [1, *x] using the first-layer weights.
    bias_and_inputs = [1] + x
    pre_activation = sum(weight * feature for weight, feature in zip(w, bias_and_inputs))
    hidden = ad.ReLUFn().forward(pre_activation)

    # Output: weighted sum over [1, hidden] using the output-layer weights.
    bias_and_hidden = [1, hidden]
    prediction = sum(weight * unit for weight, unit in zip(h, bias_and_hidden))

    return (prediction - y) ** 2


def backward_pass(loss, lr=0.05):
    """Take one gradient-descent step on the parameters in ``w`` and ``h``.

    Args:
        loss: Object exposing ``backward()``; presumably that call writes the
            gradients into each parameter's ``grad`` (confirm in hw6_q3_autograd).
        lr: Step size applied to every parameter; defaults to 0.05.
    """
    # Same visiting order as handling w first and h second.
    parameters = w + h

    # Reset the stored gradients before back-propagating this loss.
    for param in parameters:
        param.grad = 0

    loss.backward()

    # Plain gradient-descent update, in place.
    for param in parameters:
        param.data -= param.grad * lr


# Step 1: forward and backward on the first example (x0, y0), then show the updated parameters.
loss0 = forward_pass(x0, y0)
backward_pass(loss0)

print("First training step")
print("Layer 1: w")
pprint(w)
print("Layer 2: h")
pprint(h)

# Step 2: repeat on the second example (x1, y1), starting from the parameters updated above.
loss1 = forward_pass(x1, y1)
backward_pass(loss1)

print("Second training step")
print("Layer 1: w")
pprint(w)
print("Layer 2: h")
pprint(h)

We provide a simple test for your binary cross-entropy implementation. The cell below should run successfully without error.

In [None]:
def bce_test(i):
    """Check the homework BCE-with-logits loss against PyTorch for logit ``i``.

    The target is fixed to 1. Both the loss value and the gradient with respect
    to the logit are compared with ``torch.isclose``.

    Args:
        i: Logit value fed to both implementations.

    Raises:
        AssertionError: If the loss or the gradient differs from PyTorch's.
    """
    a1 = ad.Scalar(i)
    a2 = torch.tensor([i * 1.0], requires_grad=True)

    # Same loss computed by the homework implementation and by PyTorch.
    b1 = ad.BCEWithLogitsLossFn().forward(a1, 1)
    b2 = torch.nn.functional.binary_cross_entropy_with_logits(a2, torch.Tensor([1]))
    print(f"For i = {i}")

    b1.backward()
    b2.backward()
    # Report the loss and the gradient separately (ours first, PyTorch second).
    print("- Loss", b1.item(), b2.item())
    print("- Gradient", a1.grad, a2.grad.item())

    assert torch.isclose(torch.tensor(b1.item()), torch.tensor(b2.item()))
    assert torch.isclose(torch.tensor(a1.grad), torch.tensor(a2.grad.item()))


# Run the comparison on negative and positive logits, from small to large magnitude.
for i in [-100, -50, -5, -1, 1, 5, 50, 100]:
    bce_test(i)

We will now build a simple MLP to test our neural network framework. First, we load a simple dataset.

In [None]:
# Generate a 100-sample "moons" dataset (fixed random_state) and plot it, colored by label.
X, y = make_moons(n_samples=100, noise=0.1, random_state=0)
plt.figure(figsize=(5, 5))
plt.scatter(X[:, 0], X[:, 1], c=y, s=20, cmap="jet")
plt.show()

We will then define our version of the MLP and a PyTorch-based MLP. Our parameters are already initialized to $0.1$. Therefore, we also need to initialize the PyTorch MLP's parameters to $0.1$ for comparison.

In [None]:
class OurMLP(nn.Module):
    """Three-layer perceptron built from the homework ``nn`` module.

    ReLU is applied after the first two linear layers; the third layer's
    output is returned as is.
    """

    def __init__(self, input_size: int, hidden_size: int, output_size: int):
        """Create the three linear layers and the shared ReLU."""
        # Attributes are created in the order linear1, linear2, linear3, relu.
        self.linear1 = nn.Linear(input_size, hidden_size)
        self.linear2 = nn.Linear(hidden_size, hidden_size)
        self.linear3 = nn.Linear(hidden_size, output_size)
        self.relu = nn.ReLU()

    def __call__(self, inputs):
        """Return the network output for ``inputs``."""
        activations = inputs
        for hidden_layer in (self.linear1, self.linear2):
            activations = self.relu(hidden_layer(activations))
        return self.linear3(activations)


class TorchMLP(torch.nn.Module):
    """PyTorch reference MLP: three linear layers with ReLU after the first two.

    Every weight and bias is set to the constant 0.1.
    """

    def __init__(self, input_size: int, hidden_size: int, output_size: int):
        """Create the layers and overwrite their parameters with 0.1.

        Args:
            input_size: Number of input features.
            hidden_size: Width of both hidden layers.
            output_size: Number of output units.
        """
        super().__init__()
        self.linear1 = torch.nn.Linear(input_size, hidden_size)
        self.linear2 = torch.nn.Linear(hidden_size, hidden_size)
        self.linear3 = torch.nn.Linear(hidden_size, output_size)
        self.relu = torch.nn.ReLU()

        for linear in (self.linear1, self.linear2, self.linear3):
            torch.nn.init.constant_(linear.weight, 0.1)
            torch.nn.init.constant_(linear.bias, 0.1)

    def forward(self, inputs):
        """Return the network output for ``inputs``.

        Defined as ``forward`` rather than ``__call__`` so that
        ``torch.nn.Module.__call__`` keeps dispatching here and its hook
        machinery stays intact; calling the instance works as before.
        """
        outputs = self.relu(self.linear1(inputs))
        outputs = self.relu(self.linear2(outputs))
        return self.linear3(outputs)

We will run our implementation. The final result should be about $0.89$. The training should take a few minutes.

In [None]:
# Build the homework MLP with its loss and SGD optimizer.
our_mlp = OurMLP(2, 16, 1)
print("OurMLP trainable parameters:", compute_num_params(our_mlp.parameters()))
our_loss_fn = nn.BCEWithLogitsLoss()
our_optimizer = optim.SGD(our_mlp.parameters(), lr=0.5)
our_X = X
our_y = y

# NOTE(review): `out_accs` looks like a typo for `our_accs`; the name is kept unchanged.
our_losses, out_accs = train(
    mlp=our_mlp,
    loss_fn=our_loss_fn,
    optimizer=our_optimizer,
    X=our_X,
    y=our_y,
    batch_size=8,
    num_steps=50,
)

# Left panel: loss curve. Right panel: accuracy curve.
fig, (loss_ax, acc_ax) = plt.subplots(1, 2, figsize=(12, 5))

loss_ax.plot(our_losses)
loss_ax.set_title("Our MLP Training Loss")
loss_ax.set_xlabel("Training Step")
loss_ax.set_ylabel("BCE Loss")

acc_ax.plot(out_accs)
acc_ax.set_title("Our MLP Training Accuracy")
acc_ax.set_xlabel("Training Step")
acc_ax.set_ylabel("Accuracy")

plt.show()

We will now run the PyTorch implementation. Your implementation should match the PyTorch implementation.

In [None]:
# Build the PyTorch reference MLP with the same sizes, loss type and learning rate.
torch_mlp = TorchMLP(2, 16, 1)
print("TorchMLP trainable parameters:", compute_num_params(torch_mlp.parameters()))
torch_loss_fn = torch.nn.BCEWithLogitsLoss()
torch_optimizer = torch.optim.SGD(torch_mlp.parameters(), lr=0.5)
# Convert the NumPy data to tensors for the PyTorch model.
torch_X = torch.Tensor(X)
torch_y = torch.Tensor(y)

torch_losses, torch_accs = train(
    mlp=torch_mlp,
    loss_fn=torch_loss_fn,
    optimizer=torch_optimizer,
    X=torch_X,
    y=torch_y,
    batch_size=8,
    num_steps=50,
)

# Left panel: loss curve. Right panel: accuracy curve.
fig, (loss_ax, acc_ax) = plt.subplots(1, 2, figsize=(12, 5))

loss_ax.plot(torch_losses)
loss_ax.set_title("Torch MLP Training Loss")
loss_ax.set_xlabel("Training Step")
loss_ax.set_ylabel("BCE Loss")

acc_ax.plot(torch_accs)
acc_ax.set_title("Torch MLP Training Accuracy")
acc_ax.set_xlabel("Training Step")
acc_ax.set_ylabel("Accuracy")

plt.show()