# Denial Prompting RL - Google Colab Pilot Test
- Model: GPT-2 (124M parameters)
- Data: NeoCoder dataset (subset of 10 problems)
- Training: 50 GRPO steps with gradient updates
- GPU: Google Colab T4 (free)

**Runtime:** around 5 minutes. Use this pilot to check the pipeline end-to-end before deploying to NSCC.

## 1. Setup Runtime
Enable the GPU in Colab: **Runtime → Change runtime type → T4 GPU**.

In [None]:
# Report the PyTorch build and whether a CUDA device is visible to this runtime
import torch

print(f"PyTorch version: {torch.__version__}")
has_cuda = torch.cuda.is_available()
print(f"CUDA available: {has_cuda}")
if not has_cuda:
    # Nothing more to report without a device
    print("GPU not available. Enable in settings")
else:
    # Device 0 is the only one inspected; total_memory is in bytes, shown here in GB
    device_props = torch.cuda.get_device_properties(0)
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"GPU Memory: {device_props.total_memory / 1e9:.2f} GB")

## 2. Clone Repository

In [None]:
# Clone your repository (replace with your actual repo URL)
!git clone https://github.com/thongthornpatch/denial_prompting_RL_pilot.git

## 3. Install Dependencies

In [None]:
# Install required packages
!pip install -q transformers>=4.35.0 datasets>=2.14.0 RestrictedPython>=6.0 tqdm pyyaml
print("Dependencies installed")

## 4. Download NeoCoder Dataset (only a subset is used in this pilot)

In [None]:
# Download the NeoCoder dataset using the repository's helper script.
# The script path is relative, so the working directory must be the root of
# the cloned repository when this cell runs.
!python scripts/download_neocoder.py

## 5. Check Setup

In [None]:
# Run the repository's setup test script as a sanity check before training.
# NOTE(review): which components it covers is defined in scripts/test_setup.py,
# which is not visible from this notebook.
!python scripts/test_setup.py

## 6. Pilot Training (50 steps)
- Load GPT-2 model
- Generate code
- Compute rewards
- Update model weights using GRPO
- Save metrics and checkpoints

**Runtime:** around 5 mins

In [None]:
# Run actual training with real model
#   --config      YAML configuration file handed to the training script
#   --output_dir  output location handed to the training script; later cells
#                 read metrics.json and checkpoints/final_model from under it
!python scripts/train.py \
    --config configs/config_colab.yaml \
    --output_dir outputs/colab_pilot

## 7. Analyze Results

In [None]:
import json
import matplotlib.pyplot as plt
import numpy as np

# Load the per-step training metrics from the pilot run's output directory
with open('outputs/colab_pilot/metrics.json', encoding='utf-8') as f:
    metrics = json.load(f)

# Order the step keys numerically (plain string order would put "10" before "2")
# and keep the original key strings for lookups, so no int -> str round trip
# is needed to index back into the mapping.
step_keys = sorted(metrics, key=int)
if not step_keys:
    # Fail with a clear message instead of an IndexError further down
    raise ValueError("outputs/colab_pilot/metrics.json contains no training steps")

# Extract one series per tracked metric; these names are reused by the plotting cell
steps = [int(k) for k in step_keys]
rewards = [metrics[k]['mean_reward'] for k in step_keys]
violations = [metrics[k]['mean_violations'] for k in step_keys]
success_rates = [metrics[k]['success_rate'] for k in step_keys]
losses = [metrics[k]['loss'] for k in step_keys]

print("="*80)
print("TRAINING RESULTS SUMMARY")
print("="*80)
print(f"\nTotal steps: {len(steps)}")
print(f"\nReward Statistics:")
print(f"  Initial reward: {rewards[0]:.3f}")
print(f"  Final reward: {rewards[-1]:.3f}")
print(f"  Change: {rewards[-1] - rewards[0]:+.3f}")
print(f"  Max reward: {max(rewards):.3f}")

# Compare first vs second half of the run
print(f"\nLearning Progress:")
mid = len(rewards) // 2
if mid == 0:
    # A single logged step has no "first half"; avoid dividing by zero
    print("  Not enough steps to compare halves (need at least 2)")
else:
    first_half = sum(rewards[:mid]) / mid
    second_half = sum(rewards[mid:]) / (len(rewards) - mid)
    print(f"  First half average: {first_half:.3f}")
    print(f"  Second half average: {second_half:.3f}")
    if second_half > first_half:
        print(f"Improving! (+{second_half - first_half:.3f})")
    elif second_half < first_half:
        print(f"Declining ({second_half - first_half:.3f})")
    else:
        # Equal averages are neither an improvement nor a decline
        print("No change (+0.000)")

print(f"\nViolations:")
print(f"  Average: {sum(violations)/len(violations):.2f}")
print(f"  Initial: {violations[0]:.2f}")
print(f"  Final: {violations[-1]:.2f}")

print(f"\nSuccess Rate:")
print(f"  Average: {sum(success_rates)/len(success_rates):.1%}")
print(f"  Initial: {success_rates[0]:.1%}")
print(f"  Final: {success_rates[-1]:.1%}")

print("\n" + "="*80)

## 8. Visualise Training Progress

In [None]:
# Draw a 2x2 grid of training curves, one panel per tracked metric
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# One entry per panel: (axis, y-series, extra line kwargs, title, y-axis label).
# An empty kwargs dict leaves the line colour at matplotlib's default.
panel_specs = [
    (axes[0, 0], rewards, {}, 'Mean Reward Over Time', 'Reward'),
    (axes[0, 1], violations, {'color': 'red'},
     'Constraint Violations Over Time', 'Violations per Solution'),
    (axes[1, 0], [s*100 for s in success_rates], {'color': 'green'},
     'Success Rate Over Time', 'Success Rate (%)'),
    (axes[1, 1], losses, {'color': 'purple'}, 'Training Loss Over Time', 'Loss'),
]

for panel, series, line_kwargs, title, y_label in panel_specs:
    panel.plot(steps, series, marker='o', linewidth=2, markersize=4, **line_kwargs)
    panel.set_title(title, fontsize=14, fontweight='bold')
    panel.set_xlabel('Step')
    panel.set_ylabel(y_label)
    panel.grid(True, alpha=0.3)

# Panel-specific extras: a zero reference line for rewards, and a fixed
# percentage range (with a little headroom) for the success rate
axes[0, 0].axhline(y=0, color='gray', linestyle='--', alpha=0.5)
axes[1, 0].set_ylim([0, 105])

plt.tight_layout()
plt.savefig('outputs/colab_pilot/training_curves.png', dpi=150, bbox_inches='tight')
plt.show()

print("Training curves saved to outputs/colab_pilot/training_curves.png")

## 9. Inspect Code Examples

In [None]:
# Sample a few completions from the trained checkpoint for a hand-written prompt
import sys
sys.path.insert(0, 'src')  # make the repository's src/ packages importable

from models.model_wrapper import ModelWrapper

# Prefer the GPU when one is visible, otherwise fall back to CPU
# (relies on torch having been imported by the GPU check cell)
run_device = "cuda" if torch.cuda.is_available() else "cpu"

# Load the trained model
print("Loading trained model from checkpoint")
model = ModelWrapper(
    model_name="outputs/colab_pilot/checkpoints/final_model",
    device=run_device,
)

# Test prompt: a task comment, a denial constraint, and the start of the
# function body (the final piece is the indentation the model continues from)
test_prompt = (
    "# Write a function that returns the sum of two numbers\n"
    "# DO NOT use: while loop\n"
    "def solve(a, b):\n"
    "    "
)

heavy_rule = "="*80
light_rule = "-" * 80

print("\nGenerating code from trained model...")
print(heavy_rule)
print("PROMPT:")
print(test_prompt)
print(heavy_rule)
print("\nGenerated solutions:\n")

# Generate 3 solutions
solutions = model.generate(
    prompt=test_prompt,
    max_new_tokens=100,
    temperature=0.8,
    num_return_sequences=3
)

# Show each completion appended to the prompt, numbered from 1
for number, completion in enumerate(solutions, start=1):
    print(f"Solution {number}:")
    print(light_rule)
    print(test_prompt + completion)
    print(light_rule)
    print()

## 10. Interpretation and Next Steps:

If the results look good, deploy to NSCC for the full training run (5000 steps).

## 11. Download Results

Download the results to your local machine.

In [None]:
# Zip results: bundle the whole pilot output directory into one archive
# (-r recurses into subdirectories)
!zip -r colab_pilot_results.zip outputs/colab_pilot/

# Download in Colab: hand the archive to the browser for download.
# google.colab is imported here, so this cell only runs inside a Colab runtime.
from google.colab import files
files.download('colab_pilot_results.zip')

print("Ready")