# Zipperhead Experiment Notebook
This notebook implements the experiments described in the README to compare **Supervised Fine-Tuning (SFT)** and **Guided Reward Policy Optimization (GRPO)**.

In [1]:
# Import necessary libraries
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np

# Pin the NumPy and PyTorch global RNGs to the same fixed seed so that data
# sampling and weight initialisation repeat from run to run.
np.random.seed(42)
torch.manual_seed(42)

## Model Definition

In [2]:
# Define a simple autoencoder-like model
class SimpleSeq2Seq(nn.Module):
    """Tiny 1 -> 8 -> 1 feed-forward network.

    A single input feature is projected to eight hidden units, passed
    through a ReLU, and projected back down to a single output value.
    """

    def __init__(self):
        super().__init__()
        # Layers are created in encoder, decoder order; the order determines
        # how the seeded RNG is consumed during weight initialisation.
        self.encoder = nn.Linear(1, 8)
        self.decoder = nn.Linear(8, 1)
        self.activation = nn.ReLU()

    def forward(self, x):
        """Return the network output for ``x`` (last dimension must be 1)."""
        hidden = self.encoder(x)
        return self.decoder(self.activation(hidden))

## Data Generation

In [None]:
def generate_biased_data(n=50, a=2, b=3):
    """Sample points on the line ``y = a * x + b`` and keep only odd targets.

    Args:
        n: Number of integer ``x`` values drawn uniformly from ``[1, 50)``
            before filtering.
        a: Slope of the line.
        b: Intercept of the line.

    Returns:
        A tuple ``(x, y)`` of equal-length ``float32`` arrays containing only
        the samples whose target ``y`` is odd; ``x[i]`` is always the input
        that produced ``y[i]``.

    Note:
        With the default ``a=2, b=3`` every target is odd for integer ``x``,
        so the filter keeps all ``n`` samples and the result has the same
        distribution as unfiltered data.
    """
    x_values = np.random.randint(1, 50, size=n).astype(np.float32)
    y_values = (a * x_values + b).astype(np.float32)

    # Index x and y with the same mask. Taking the first len(odd) inputs
    # instead would pair targets with the wrong inputs as soon as the filter
    # drops a sample.
    odd_mask = y_values % 2 == 1
    return x_values[odd_mask], y_values[odd_mask]

In [None]:
def generate_full_data(n=50):
    """Sample ``n`` unfiltered points on the line ``y = 2x + 3``.

    Inputs are integers drawn uniformly from ``[1, 50)``; both returned
    arrays are ``float32`` and have length ``n``.
    """
    slope, intercept = 2, 3
    inputs = np.random.randint(1, 50, size=n).astype(np.float32)
    targets = slope * inputs + intercept
    return inputs, targets.astype(np.float32)

## Supervised Fine-Tuning (SFT)

In [None]:
def train_supervised(model, x_train, y_train, epochs=100, lr=0.01):
    """Fit ``model`` to the data with full-batch MSE regression using Adam.

    Args:
        model: ``nn.Module`` mapping an ``(N, 1)`` tensor to an ``(N, 1)``
            tensor. It is updated in place.
        x_train: 1-D array-like of inputs (NumPy array, list, or tensor).
        y_train: 1-D array-like of targets, same length as ``x_train``.
        epochs: Number of optimisation steps; each step uses the whole set.
        lr: Adam learning rate.

    Returns:
        None. The loss is printed every 50 epochs.
    """
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)

    # The batch never changes, so convert it once instead of every epoch.
    # Casting to the model's own dtype/device lets float64 arrays and plain
    # Python lists train without a dtype-mismatch error.
    reference = next(model.parameters())
    x_tensor = torch.as_tensor(
        x_train, dtype=reference.dtype, device=reference.device
    ).unsqueeze(1)
    y_tensor = torch.as_tensor(
        y_train, dtype=reference.dtype, device=reference.device
    ).unsqueeze(1)

    for epoch in range(epochs):
        optimizer.zero_grad()

        y_pred = model(x_tensor)
        loss = criterion(y_pred, y_tensor)
        loss.backward()
        optimizer.step()

        if epoch % 50 == 0:
            print(f"Epoch {epoch}: Loss = {loss.item():.4f}")

## Reward Function for GRPO

In [None]:
def reward_function(y_pred, y_true):
    """Score predictions element-wise: accuracy term plus an even-value bonus.

    The accuracy term is ``exp(-|y_pred - y_true|)``, which equals 1 for an
    exact match and decays towards 0 as the error grows. A flat 0.2 is added
    wherever ``y_pred`` is exactly divisible by 2. The bonus comes from a
    boolean comparison, so it contributes no gradient with respect to
    ``y_pred``.

    Returns a tensor with the broadcast shape of the two inputs.
    """
    closeness = torch.exp(-(y_pred - y_true).abs())
    is_even = torch.remainder(y_pred, 2) == 0
    return closeness + is_even.float() * 0.2

## GRPO Fine-Tuning

In [None]:
def train_grpo(model, x_train, y_train, steps=500, lr=0.01):
    """Train ``model`` by gradient ascent on the mean of ``reward_function``.

    Every step runs the whole data set through the model, scores the
    predictions with ``reward_function`` and takes an Adam step on the
    negated mean reward.

    Args:
        model: ``nn.Module`` mapping an ``(N, 1)`` tensor to an ``(N, 1)``
            tensor. It is updated in place.
        x_train: 1-D array-like of inputs (NumPy array, list, or tensor).
        y_train: 1-D array-like of targets, same length as ``x_train``.
        steps: Number of optimisation steps.
        lr: Adam learning rate.

    Returns:
        None. The mean reward is printed every 100 steps.
    """
    optimizer = optim.Adam(model.parameters(), lr=lr)

    # The batch never changes, so convert it once instead of every step.
    # Casting to the model's own dtype/device lets float64 arrays and plain
    # Python lists train without a dtype-mismatch error.
    reference = next(model.parameters())
    x_tensor = torch.as_tensor(
        x_train, dtype=reference.dtype, device=reference.device
    ).unsqueeze(1)
    y_tensor = torch.as_tensor(
        y_train, dtype=reference.dtype, device=reference.device
    ).unsqueeze(1)

    for step in range(steps):
        optimizer.zero_grad()

        y_pred = model(x_tensor)
        rewards = reward_function(y_pred, y_tensor)

        # Adam minimises, so negate the mean reward to maximise it.
        loss = -torch.mean(rewards)
        loss.backward()
        optimizer.step()

        if step % 100 == 0:
            print(f"Step {step}: Reward = {rewards.mean().item():.4f}")