# Colab A100 Runbook (Minimal)

Run cells in order.

This notebook does only what is required: pull latest code, run experiments, aggregate results, and export figures/tables.

In [None]:
# Cell 1: Pull latest repo + setup environment
import os
import sys
import subprocess
from pathlib import Path

import shutil  # used below instead of shelling out to `rm -rf`

REPO_URL = "https://github.com/thenileshmishra/AS-RoPE.git"  # change if needed
REPO_BRANCH = 'main'  # branch the working tree is synchronised to
ROOT = Path('/content/Neur')

if (ROOT / '.git').exists():
    # Existing clone: re-point `origin` first so that editing REPO_URL above
    # actually takes effect, then discard local changes to tracked files and
    # move to the tip of the remote branch.
    subprocess.run(['git', '-C', str(ROOT), 'remote', 'set-url', 'origin', REPO_URL], check=True)
    subprocess.run(['git', '-C', str(ROOT), 'fetch', 'origin'], check=True)
    subprocess.run(['git', '-C', str(ROOT), 'reset', '--hard', f'origin/{REPO_BRANCH}'], check=True)
else:
    # Anything at ROOT that is not a git checkout is removed before cloning.
    if ROOT.exists():
        if ROOT.is_dir() and not ROOT.is_symlink():
            shutil.rmtree(ROOT)
        else:
            ROOT.unlink()
    subprocess.run(['git', 'clone', '--branch', REPO_BRANCH, REPO_URL, str(ROOT)], check=True)

# These variables are inherited by the training/eval subprocesses started in later cells.
# PYTHONHASHSEED is only read at interpreter start-up, so it does not change
# hashing inside the already-running kernel.
os.environ['PYTHONHASHSEED'] = '0'
# NOTE(review): presumably set for deterministic cuBLAS kernels — confirm against src.train.
os.environ['CUBLAS_WORKSPACE_CONFIG'] = ':4096:8'
# NOTE(review): PyTorch CUDA allocator setting; confirm it is still wanted on the target runtime.
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'

# Install into the interpreter that runs this kernel (sys.executable), not whatever `pip` is on PATH.
subprocess.run([sys.executable, '-m', 'pip', 'install', '-q', '-r', str(ROOT / 'requirements.txt')], check=True)
subprocess.run([sys.executable, '-m', 'pip', 'install', '-q', 'pandas', 'matplotlib', 'seaborn', 'tabulate'], check=True)

print('ROOT =', ROOT)
print('Python =', sys.executable)

In [9]:
# Cell 2: PHASE 1 — Full baseline suite (5 methods x 3 seeds), then eval at 1024/2048/4096/8192
import json
import shlex

# Debug switch: True runs a tiny end-to-end smoke test; set to False for the full paper-scale run.
DEBUG_QUICK = True

# Positional-encoding variants and random seeds that make up the baseline grid.
METHODS = ['rope', 'scaled_rope', 'as_rope', 'alibi', 'ntk_scaled_rope']
SEEDS = [42, 123, 999]

# Hyper-parameters forwarded to src.train; the three step-related values
# shrink to a handful of steps when DEBUG_QUICK is on.
TRAIN_CFG = dict(
    layers=6,
    n_heads=8,
    d_model=512,
    context_length=1024,
    batch_size=32,
    optimizer='adamw',
    lr=3e-4,
    weight_decay=0.1,
    max_steps=20 if DEBUG_QUICK else 60000,
    warmup_steps=5 if DEBUG_QUICK else 3000,
    eval_interval=20 if DEBUG_QUICK else 5000,
    grad_clip=1.0,
    fp16=True,
    grad_accum_steps=4,
)

# Comma-separated context lengths handed to src.eval_perplexity.
EVAL_LENGTHS = ','.join(str(n) for n in (1024, 2048, 4096, 8192))

def run_cmd(cmd, cwd=None):
    """Run a command, echo its output as it is produced, and fail on a non-zero exit.

    The child's stderr is merged into stdout and every line is printed as soon
    as it arrives, so long training runs show progress instead of staying
    silent until the process ends.

    Args:
        cmd: Sequence of program arguments (strings or path-like objects).
        cwd: Working directory for the child process. When omitted, the
            notebook-level ``ROOT`` checkout is used.

    Returns:
        None.

    Raises:
        RuntimeError: If the command exits with a non-zero status. The message
            starts with ``Command failed with exit code N`` and also carries
            the command line and the last lines of its output.
    """
    import os
    from collections import deque

    workdir = ROOT if cwd is None else cwd
    # str() lets path-like arguments be echoed; shlex.quote only accepts strings.
    cmd_str = ' '.join(shlex.quote(str(c)) for c in cmd)
    print('\n$', cmd_str, flush=True)

    # Ask Python children not to block-buffer stdout when it is a pipe,
    # otherwise their progress lines would still arrive in large bursts.
    child_env = dict(os.environ, PYTHONUNBUFFERED='1')

    # Keep only the tail of the output for the error message; the full log
    # has already been printed line by line.
    tail = deque(maxlen=30)
    with subprocess.Popen(
        cmd,
        cwd=workdir,
        env=child_env,
        text=True,
        errors='replace',
        bufsize=1,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
    ) as proc:
        for line in proc.stdout:
            print(line, end='', flush=True)
            tail.append(line)

    # Leaving the `with` block waits for the child, so returncode is set here.
    if proc.returncode != 0:
        raise RuntimeError(
            f'Command failed with exit code {proc.returncode}: {cmd_str}\n'
            f'--- last output lines ---\n{"".join(tail)}'
        )

# Train and then evaluate every (method, seed) combination, methods outermost.
for method, seed in [(m, s) for m in METHODS for s in SEEDS]:
    seed_tag = f'seed_{seed}'
    # The checkpoint is looked up under <output_dir>/<method>/seed_<seed>.
    run_dir = ROOT / 'results' / 'train_runs' / method / seed_tag
    out_json = ROOT / 'results' / method / f'{seed_tag}.json'
    out_json.parent.mkdir(parents=True, exist_ok=True)

    # Training command: fixed prefix, then one "--<key> <value>" pair per
    # TRAIN_CFG entry in command-line order, then output/device flags.
    train_cmd = [
        sys.executable, '-m', 'src.train',
        '--positional_encoding', method,
        '--seed', str(seed),
    ]
    for key in (
        'layers', 'n_heads', 'd_model', 'context_length', 'batch_size',
        'optimizer', 'lr', 'weight_decay', 'max_steps', 'warmup_steps',
        'eval_interval', 'grad_clip', 'grad_accum_steps',
    ):
        train_cmd.extend([f'--{key}', str(TRAIN_CFG[key])])
    train_cmd.extend(['--output_dir', 'results/train_runs', '--device', 'cuda'])
    if TRAIN_CFG['fp16']:
        train_cmd.append('--fp16')

    run_cmd(train_cmd)

    # Perplexity evaluation of the checkpoint produced by the run above.
    eval_cmd = [sys.executable, '-m', 'src.eval_perplexity']
    eval_cmd += ['--checkpoint_path', str(run_dir / 'checkpoint.pt')]
    eval_cmd += ['--context_lengths', EVAL_LENGTHS]
    eval_cmd += ['--seed', str(seed)]
    eval_cmd += ['--device', 'cuda']
    eval_cmd += ['--results_json', str(out_json)]
    run_cmd(eval_cmd)

print('Phase 1 complete.')


$ /usr/bin/python3 -m src.train --positional_encoding rope --seed 42 --layers 6 --n_heads 8 --d_model 512 --context_length 1024 --batch_size 32 --optimizer adamw --lr 0.0003 --weight_decay 0.1 --max_steps 20 --warmup_steps 5 --eval_interval 20 --grad_clip 1.0 --grad_accum_steps 4 --output_dir results/train_runs --device cuda --fp16
positional_encoding=rope
seeds=[42]

--- Seed run start | positional_encoding=rope | seed=42 ---
Training on cuda
Parameters: 44.63M
d_model=512 n_layers=6 n_heads=8 context_length=1024 batch_size=32 grad_accum_steps=4 fp16=True
step     1 | train_loss 1817.1959 | lr 6.000000e-05 | tokens 131072 | 1.9s
Seed 42 outputs saved to: /content/Neur/results/train_runs/rope/seed_42
All-seed metrics saved: /content/Neur/results/train_runs/rope/all_seeds_metrics.json


$ /usr/bin/python3 -m src.eval_perplexity --checkpoint_path /content/Neur/results/train_runs/rope/seed_42/checkpoint.pt --context_lengths 1024,2048,4096,8192 --seed 42 --device cuda --results_json /cont

In [10]:
# Cell 3: PHASE 2 — Statistical aggregation + LaTeX + plots (in notebook)
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

sns.set_context('talk')
sns.set_style('whitegrid')

# Collect one row per (method, seed, context length) from the Phase 1 JSON files.
rows = []
missing = []  # expected result files that were not found
for method in METHODS:
    for seed in SEEDS:
        p = ROOT / 'results' / method / f'seed_{seed}.json'
        if not p.exists():
            missing.append(p)
            continue
        with p.open(encoding='utf-8') as f:
            obj = json.load(f)
        # Each file maps context length (as a string key) to perplexity.
        for length_str, ppl in obj['perplexity'].items():
            rows.append({'method': method, 'seed': seed, 'length': int(length_str), 'ppl': float(ppl)})

df = pd.DataFrame(rows)
assert len(df) > 0, 'No result JSON files found. Run Phase 1 first.'

# Missing files are tolerated, but say so: the means/stds below would
# otherwise silently be computed over fewer seeds than the grid implies.
if missing:
    print(f'WARNING: {len(missing)} of {len(METHODS) * len(SEEDS)} expected result files are missing; '
          'statistics below use fewer seeds for the affected methods:')
    for p in missing:
        print('  -', p)

# Mean and sample standard deviation of perplexity per (method, length) across seeds.
summary = (
    df.groupby(['method', 'length'], as_index=False)
      .agg(mean_ppl=('ppl', 'mean'), std_ppl=('ppl', 'std'))
)
# The sample std is NaN when a group has a single seed; show that as 0.
# Only this column is filled: zero-filling the whole frame would turn an
# undefined mean perplexity into 0.0, i.e. a spuriously perfect score.
summary['std_ppl'] = summary['std_ppl'].fillna(0.0)

# Attach the RoPE mean at the same length to every row and express each
# method's mean as a relative improvement over it (positive = lower perplexity).
rope_ref = summary[summary['method'] == 'rope'][['length', 'mean_ppl']].rename(columns={'mean_ppl': 'rope_mean'})
summary = summary.merge(rope_ref, on='length', how='left')
summary['rel_improve_vs_rope_pct'] = 100.0 * (summary['rope_mean'] - summary['mean_ppl']) / summary['rope_mean']
summary = summary.sort_values(['length', 'method']).reset_index(drop=True)

# Output locations for tables and figures; both are created on demand.
results_dir = ROOT / 'results'
fig_dir = ROOT / 'figures'
for out_dir in (results_dir, fig_dir):
    out_dir.mkdir(parents=True, exist_ok=True)

# Long-format summary as CSV.
summary_csv = results_dir / 'summary.csv'
summary_tex = results_dir / 'summary.tex'
summary.to_csv(summary_csv, index=False)

# Wide table (methods as rows, context lengths as ascending columns) as LaTeX.
pivot = summary.pivot(index='method', columns='length', values='mean_ppl')
pivot = pivot[sorted(pivot.columns)]
latex_table = pivot.to_latex(
    float_format=lambda x: f'{x:.4f}',
    caption='Perplexity across context lengths',
    label='tab:ppl_summary',
)
summary_tex.write_text(latex_table, encoding='utf-8')

def _plot_ppl_curves(title, pdf_name, with_std):
    """Draw one perplexity-vs-length curve per method and save it as a PDF in fig_dir.

    Args:
        title: Figure title.
        pdf_name: File name of the PDF written under ``fig_dir``.
        with_std: When True, draw error bars from ``std_ppl``; otherwise plain lines.
    """
    plt.figure(figsize=(8, 5))
    for method in METHODS:
        sub = summary[summary['method'] == method].sort_values('length')
        if len(sub) == 0:
            # No results for this method: leave it out of the figure.
            continue
        if with_std:
            plt.errorbar(sub['length'], sub['mean_ppl'], yerr=sub['std_ppl'], marker='o', capsize=4, label=method)
        else:
            plt.plot(sub['length'], sub['mean_ppl'], marker='o', label=method)
    plt.xscale('log', base=2)
    plt.xlabel('Context length')
    plt.ylabel('Perplexity')
    plt.title(title)
    plt.legend()
    plt.tight_layout()
    plt.savefig(fig_dir / pdf_name)
    plt.close()


# Plot 1: means
_plot_ppl_curves('Perplexity vs Context Length', 'ppl_vs_length.pdf', with_std=False)

# Plot 2: means + std error bars
_plot_ppl_curves('Perplexity vs Context Length (Mean ± Std)', 'ppl_vs_length_with_errorbars.pdf', with_std=True)

# Report every artifact written by this phase, then show the first summary rows.
for saved in (
    summary_csv,
    summary_tex,
    fig_dir / 'ppl_vs_length.pdf',
    fig_dir / 'ppl_vs_length_with_errorbars.pdf',
):
    print('Saved:', saved)
summary.head()

Saved: /content/Neur/results/summary.csv
Saved: /content/Neur/results/summary.tex
Saved: /content/Neur/figures/ppl_vs_length.pdf
Saved: /content/Neur/figures/ppl_vs_length_with_errorbars.pdf


Unnamed: 0,method,length,mean_ppl,std_ppl,rope_mean,rel_improve_vs_rope_pct
0,alibi,1024,5.078258e+23,1.414139e+23,8.274925e+23,38.630762
1,as_rope,1024,8.265627e+23,2.594658e+23,8.274925e+23,0.112365
2,ntk_scaled_rope,1024,8.266672e+23,2.637909e+23,8.274925e+23,0.099736
3,rope,1024,8.274925e+23,2.649881e+23,8.274925e+23,0.0
4,scaled_rope,1024,8.274104e+23,2.64969e+23,8.274925e+23,0.009919


In [11]:
# Cell 4: PHASE 3 — Scaling check (single seed=42; rope vs as_rope)
SCALING_METHODS = ['rope', 'as_rope']
SCALING_SEED = 42

# Step budget for the larger model: a few steps in debug mode, full length otherwise.
if DEBUG_QUICK:
    scaling_steps, scaling_warmup, scaling_eval_every = '10', '3', '10'
else:
    scaling_steps, scaling_warmup, scaling_eval_every = '40000', '2000', '5000'

scaling_root = ROOT / 'results' / 'scaling'

for method in SCALING_METHODS:
    # The checkpoint is looked up under <output_dir>/<method>/seed_<seed>.
    run_dir = scaling_root / method / f'seed_{SCALING_SEED}'
    out_json = scaling_root / f'{method}_seed_{SCALING_SEED}.json'
    out_json.parent.mkdir(parents=True, exist_ok=True)

    # 8-layer, d_model=768 model trained with batch 8 x 16 accumulation steps.
    train_cmd = [
        sys.executable, '-m', 'src.train',
        '--positional_encoding', method,
        '--seed', str(SCALING_SEED),
    ]
    train_cmd += [
        '--layers', '8', '--n_heads', '8', '--d_model', '768',
        '--context_length', '1024',
        '--batch_size', '8', '--grad_accum_steps', '16',
        '--optimizer', 'adamw', '--lr', '3e-4', '--weight_decay', '0.1',
    ]
    train_cmd += [
        '--max_steps', scaling_steps,
        '--warmup_steps', scaling_warmup,
        '--eval_interval', scaling_eval_every,
    ]
    train_cmd += ['--grad_clip', '1.0', '--fp16', '--output_dir', 'results/scaling', '--device', 'cuda']
    run_cmd(train_cmd)

    # Evaluate the freshly written checkpoint at three context lengths.
    eval_cmd = [sys.executable, '-m', 'src.eval_perplexity']
    eval_cmd += ['--checkpoint_path', str(run_dir / 'checkpoint.pt')]
    eval_cmd += ['--context_lengths', '1024,4096,8192']
    eval_cmd += ['--seed', str(SCALING_SEED), '--device', 'cuda']
    eval_cmd += ['--results_json', str(out_json)]
    run_cmd(eval_cmd)

print('Phase 3 complete.')


$ /usr/bin/python3 -m src.train --positional_encoding rope --seed 42 --layers 8 --n_heads 8 --d_model 768 --context_length 1024 --batch_size 8 --grad_accum_steps 16 --optimizer adamw --lr 3e-4 --weight_decay 0.1 --max_steps 10 --warmup_steps 3 --eval_interval 10 --grad_clip 1.0 --fp16 --output_dir results/scaling --device cuda
positional_encoding=rope
seeds=[42]

--- Seed run start | positional_encoding=rope | seed=42 ---
Training on cuda
Parameters: 95.28M
d_model=768 n_layers=8 n_heads=8 context_length=1024 batch_size=8 grad_accum_steps=16 fp16=True
step     1 | train_loss 10502.0366 | lr 1.000000e-04 | tokens 131072 | 2.2s
Seed 42 outputs saved to: /content/Neur/results/scaling/rope/seed_42
All-seed metrics saved: /content/Neur/results/scaling/rope/all_seeds_metrics.json


$ /usr/bin/python3 -m src.eval_perplexity --checkpoint_path /content/Neur/results/scaling/rope/seed_42/checkpoint.pt --context_lengths 1024,4096,8192 --seed 42 --device cuda --results_json /content/Neur/results/s

In [15]:
# Cell 5: PHASE 4 — AS-RoPE ablations (20k steps)
# Each ablation is the Phase 1 AS-RoPE setup (seed 42) plus extra training flags.
# NOTE(review): the saved output of this cell ends with
# "RuntimeError: Command failed with exit code 1" right after the first
# ablation's eval command was started; the cause is not visible in the
# notebook — confirm src.eval_perplexity can load the ablation checkpoints.
ABLATIONS = [
    {'name': ablation_name, 'extra_train_args': ablation_flags}
    for ablation_name, ablation_flags in (
        ('as_rope_per_layer_gates', ['--as_rope_per_layer_gates']),
        ('as_rope_without_positivity', ['--allow_negative_gates']),
        ('as_rope_gate_weight_decay', ['--gate_weight_decay', '0.1']),
    )
]

# Step budget: a few steps in debug mode, 20k otherwise.
if DEBUG_QUICK:
    ablation_steps, ablation_warmup, ablation_eval_every = '10', '3', '10'
else:
    ablation_steps, ablation_warmup, ablation_eval_every = '20000', '1000', '5000'

ablation_root = ROOT / 'results' / 'ablations'

for ab in ABLATIONS:
    ab_name = ab['name']
    # The checkpoint is looked up under <output_dir>/as_rope/seed_42.
    run_dir = ablation_root / ab_name / 'as_rope' / 'seed_42'
    out_json = ablation_root / f"{ab_name}.json"
    out_json.parent.mkdir(parents=True, exist_ok=True)

    train_cmd = [
        sys.executable, '-m', 'src.train',
        '--positional_encoding', 'as_rope',
        '--seed', '42',
    ]
    train_cmd += [
        '--layers', '6', '--n_heads', '8', '--d_model', '512',
        '--context_length', '1024', '--batch_size', '32',
        '--optimizer', 'adamw', '--lr', '3e-4', '--weight_decay', '0.1',
    ]
    train_cmd += [
        '--max_steps', ablation_steps,
        '--warmup_steps', ablation_warmup,
        '--eval_interval', ablation_eval_every,
    ]
    train_cmd += ['--grad_clip', '1.0', '--grad_accum_steps', '4', '--fp16']
    train_cmd += ['--output_dir', f"results/ablations/{ab_name}", '--device', 'cuda']
    # Ablation-specific flags go last.
    train_cmd = train_cmd + ab['extra_train_args']

    run_cmd(train_cmd)

    # Ablations are evaluated at two context lengths only.
    eval_cmd = [sys.executable, '-m', 'src.eval_perplexity']
    eval_cmd += ['--checkpoint_path', str(run_dir / 'checkpoint.pt')]
    eval_cmd += ['--context_lengths', '1024,4096']
    eval_cmd += ['--seed', '42', '--device', 'cuda']
    eval_cmd += ['--results_json', str(out_json)]
    run_cmd(eval_cmd)

print('Phase 4 complete.')


$ /usr/bin/python3 -m src.train --positional_encoding as_rope --seed 42 --layers 6 --n_heads 8 --d_model 512 --context_length 1024 --batch_size 32 --optimizer adamw --lr 3e-4 --weight_decay 0.1 --max_steps 10 --warmup_steps 3 --eval_interval 10 --grad_clip 1.0 --grad_accum_steps 4 --fp16 --output_dir results/ablations/as_rope_per_layer_gates --device cuda --as_rope_per_layer_gates
positional_encoding=as_rope
seeds=[42]

--- Seed run start | positional_encoding=as_rope | seed=42 ---
Training on cuda
Parameters: 44.64M
d_model=512 n_layers=6 n_heads=8 context_length=1024 batch_size=32 grad_accum_steps=4 fp16=True
as_rope_per_layer_gates=True allow_negative_gates=False
step     1 | train_loss 1817.2002 | lr 1.000000e-04 | tokens 131072 | 1.6s
Seed 42 outputs saved to: /content/Neur/results/ablations/as_rope_per_layer_gates/as_rope/seed_42
All-seed metrics saved: /content/Neur/results/ablations/as_rope_per_layer_gates/as_rope/all_seeds_metrics.json


$ /usr/bin/python3 -m src.eval_perplex

RuntimeError: Command failed with exit code 1

In [13]:
# Cell 6: PHASE 5 — Spectral analysis plots for final AS-RoPE checkpoint (seed=42 main run)
import math
import torch

# Load the seed-42 AS-RoPE checkpoint written by the Phase 1 training run.
ckpt_path = ROOT.joinpath('results', 'train_runs', 'as_rope', 'seed_42', 'checkpoint.pt')
assert ckpt_path.exists(), f'Checkpoint not found: {ckpt_path}'

ckpt = torch.load(ckpt_path, map_location='cpu')
state = ckpt['model_state_dict']
config = ckpt.get('config', {})

# The learned gates are read from a single state-dict entry.
gate_key = 'freq_gates'
assert gate_key in state, 'freq_gates not found in checkpoint; ensure AS-RoPE run completed.'
gates = state[gate_key].float().cpu()

# A 2-D tensor is treated as per-layer gates and averaged into one profile.
gates_plot = gates.mean(dim=0) if gates.ndim == 2 else gates

# Reference frequencies omega_i = 10000^(-2i/d_model) for i < d_model // 2.
# NOTE(review): computed over d_model (default 512 when the checkpoint has no
# config) rather than a per-head dimension — confirm this matches how the
# model itself defines its rotary frequencies.
d_model = int(config.get('d_model', 512))
half_dim = d_model // 2
idx = torch.arange(half_dim, dtype=torch.float32)
omega = 1.0 / (10000.0 ** (2.0 * idx / d_model))

# Effective frequency g_i * omega_i and its period; the epsilon keeps the
# division finite when a gate is exactly zero.
g = gates_plot[:half_dim]
effective = g * omega
period = (2.0 * math.pi) / (effective.abs() + 1e-12)

fig_dir = ROOT / 'figures'
fig_dir.mkdir(parents=True, exist_ok=True)
spectral_pdf = fig_dir / 'spectral_profile.pdf'
period_pdf = fig_dir / 'effective_period.pdf'

# spectral_profile.pdf (3-panel): gate profile, frequencies, gate histogram.
fig, (ax_gate, ax_freq, ax_hist) = plt.subplots(1, 3, figsize=(16, 4.5))

ax_gate.plot(g.numpy())
ax_gate.set(title='Gate vs Frequency Index', xlabel='Index i', ylabel='g_i')

ax_freq.plot(omega.numpy(), label='original $\\omega_i$')
ax_freq.plot(effective.numpy(), label='effective $g_i\\omega_i$')
ax_freq.set_yscale('log')
ax_freq.set(title='Original vs Effective Frequency', xlabel='Index i')
ax_freq.legend()

ax_hist.hist(g.numpy(), bins=40)
ax_hist.set(title='Histogram of Gates', xlabel='g_i')

fig.tight_layout()
fig.savefig(spectral_pdf)
plt.close(fig)

# effective_period.pdf: distribution of the effective periods.
fig, ax_period = plt.subplots(1, 1, figsize=(7, 4.5))
ax_period.hist(period.numpy(), bins=50)
ax_period.set(title='Histogram of Effective Periods', xlabel='T_i = 2π / |g_iω_i|', ylabel='Count')
fig.tight_layout()
fig.savefig(period_pdf)
plt.close(fig)

for saved in (spectral_pdf, period_pdf):
    print('Saved:', saved)

Saved: /content/Neur/figures/spectral_profile.pdf
Saved: /content/Neur/figures/effective_period.pdf


In [14]:
# Cell 7: PHASE 6 — Artifact check
from pathlib import Path

# Files the earlier phases are expected to have produced: tables first, then figures.
must_exist = [
    ROOT / 'results' / table_name
    for table_name in ('summary.csv', 'summary.tex')
] + [
    ROOT / 'figures' / figure_name
    for figure_name in (
        'ppl_vs_length.pdf',
        'ppl_vs_length_with_errorbars.pdf',
        'spectral_profile.pdf',
        'effective_period.pdf',
    )
]

# One status line per expected artifact.
for p in must_exist:
    status = 'OK  ' if p.exists() else 'MISS'
    print(status, p)

# List up to the first 12 JSON files (sorted by path) found anywhere under results/.
print('\nSample result files:')
json_files = sorted((ROOT / 'results').rglob('*.json'))
for p in json_files[:12]:
    print('-', p)

OK   /content/Neur/results/summary.csv
OK   /content/Neur/results/summary.tex
OK   /content/Neur/figures/ppl_vs_length.pdf
OK   /content/Neur/figures/ppl_vs_length_with_errorbars.pdf
OK   /content/Neur/figures/spectral_profile.pdf
OK   /content/Neur/figures/effective_period.pdf

Sample result files:
- /content/Neur/results/ablations/as_rope_per_layer_gates/as_rope/all_seeds_metrics.json
- /content/Neur/results/ablations/as_rope_per_layer_gates/as_rope/seed_42/metrics.json
- /content/Neur/results/alibi/seed_123.json
- /content/Neur/results/alibi/seed_42.json
- /content/Neur/results/alibi/seed_999.json
- /content/Neur/results/as_rope/seed_123.json
- /content/Neur/results/as_rope/seed_42.json
- /content/Neur/results/as_rope/seed_999.json
- /content/Neur/results/ntk_scaled_rope/seed_123.json
- /content/Neur/results/ntk_scaled_rope/seed_42.json
- /content/Neur/results/ntk_scaled_rope/seed_999.json
- /content/Neur/results/rope/seed_123.json
