In [1]:
!pip install datasets torch transformers matplotlib numpy tqdm



In [2]:
# Mount Google Drive at /content/drive so the project directory is reachable
# from this Colab runtime.
from google.colab import drive
drive.mount("/content/drive")

# Project root on Drive. The commented-out assignment is another Drive
# location for the project, kept for reference.
# PROJECT = "/content/drive/MyDrive/DLFinalProject/rank-collapse"
PROJECT = "/content/drive/MyDrive/rank-collapse"
# Switch the working directory to the project root; the relative "logs/..."
# output paths used by the experiment cells resolve against it.
%cd {PROJECT}

# Star-import the project's helpers from the `utils` module. Per the recorded
# cell output, importing it also selects the device and downloads the dataset
# and tokenizer files.
# NOTE(review): later cells use names they never import themselves
# (create_dataset, run_experiment, TrainingScheduler, device, os, ...), so
# they presumably all come from this import -- confirm against utils.py.
from utils import *

Mounted at /content/drive
/content/drive/.shortcut-targets-by-id/1dwgmS-Fk3Kgg-RWfFuIvr8z2CnGXc00h/rank-collapse
🖥️  Using device: cuda
   GPU: NVIDIA A100-SXM4-40GB
✅ Setup complete!


README.md: 0.00B [00:00, ?B/s]

data/train-00000-of-00004-2d5a1467fff108(‚Ä¶):   0%|          | 0.00/249M [00:00<?, ?B/s]

data/train-00001-of-00004-5852b56a2bd28f(‚Ä¶):   0%|          | 0.00/248M [00:00<?, ?B/s]

data/train-00002-of-00004-a26307300439e9(‚Ä¶):   0%|          | 0.00/246M [00:00<?, ?B/s]

data/train-00003-of-00004-d243063613e5a0(‚Ä¶):   0%|          | 0.00/248M [00:00<?, ?B/s]

data/validation-00000-of-00001-869c898b5(‚Ä¶):   0%|          | 0.00/9.99M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/2119719 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/21990 [00:00<?, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

✅ Datasets defined:
   - Copy (easy, diagonal attention)
   - Reverse (medium, anti-diagonal)
   - Sort (hard, global reasoning)
   - Text (language modeling)
   - Image (classification)
✅ Transformer architectures defined:
   - TINY: 2 layers, 64d, 4 heads (~0K params)
   - SMALL: 4 layers, 128d, 8 heads (~4K params)
   - MEDIUM: 6 layers, 256d, 8 heads (~12K params)
   - LARGE: 8 layers, 512d, 16 heads (~65K params)
✅ Rank metrics defined:
   - Stable rank (Frobenius / Operator norm)
   - Entropy effective rank
   - Top-k energy fraction
   - Attention sparsity
✅ Training system ready!
✅ Visualization tools ready!


In [3]:
def run_collapse_expand_combos(
        task_name="copy", dataset_size_name="medium", arch="small",
        collapse_func=None, expand_func=None, steps=2000,
        batch_size=64, out_dir="logs/weight0.01", seed=42):
    """
    Train one model per collapse/expand schedule and save each run's log.

    Four schedules are run in order. "early" is the step window
    [0, int(0.25*steps)-1] and "late" is [int(0.75*steps), steps-1]:
      - collapseEarlyThenExpandLate: collapse_func early, expand_func late
      - expandEarlyThenCollapseLate: expand_func early, collapse_func late
      - collapseEarlyOnly:           collapse_func early
      - expandEarlyOnly:             expand_func early

    Each schedule writes its log to
    ``{out_dir}/{task_name}_{dataset_size_name}_{arch}_{combo}_steps{steps}_seed{seed}.npy``.
    A schedule whose log file already exists is skipped *before* any dataset
    or model is built, so resuming an interrupted sweep does not pay for
    dataset creation or model allocation on finished runs.

    Args:
        task_name: Task identifier forwarded to ``create_dataset``.
        dataset_size_name: One of "small", "medium", "large"
            (1024 / 4096 / 8192 examples).
        arch: Architecture name forwarded to ``instantiate_architecture``.
        collapse_func: Loss term scheduled in the "collapse" windows.
        expand_func: Loss term scheduled in the "expand" windows.
            NOTE(review): both default to None and are forwarded to
            TrainingScheduler unchanged; callers are expected to supply them.
        steps: Number of training steps per run.
        batch_size: Batch size forwarded to ``run_experiment``.
        out_dir: Directory for the log files (created if missing).
        seed: Seed forwarded to ``run_experiment`` and embedded in the log name.

    Returns:
        None. Results are written to disk by ``run_experiment``.

    Raises:
        KeyError: If ``dataset_size_name`` is not a known size name.
    """
    # Local import so the function does not depend on `from utils import *`
    # happening to expose `os`.
    import os

    sizes = {"small": 1024, "medium": 4096, "large": 8192}
    # Loop-invariant: resolve once (and fail fast on an unknown size name).
    dataset_size = sizes[dataset_size_name]

    # (start_step, end_step) windows, both ends as passed to TrainingScheduler.
    early_window = (0, int(0.25 * steps) - 1)
    late_window = (int(0.75 * steps), steps - 1)

    combos = [
        ("collapseEarlyThenExpandLate",
         [early_window + (collapse_func,), late_window + (expand_func,)]),
        ("expandEarlyThenCollapseLate",
         [early_window + (expand_func,), late_window + (collapse_func,)]),
        ("collapseEarlyOnly", [early_window + (collapse_func,)]),
        ("expandEarlyOnly", [early_window + (expand_func,)]),
    ]

    os.makedirs(out_dir, exist_ok=True)

    for combo_name, funcs in combos:
        print(f"RUN combo {combo_name}")

        log_name = f"{task_name}_{dataset_size_name}_{arch}_{combo_name}_steps{steps}_seed{seed}.npy"
        out_path = os.path.join(out_dir, log_name)

        # Resume support: check the cache first so finished runs cost nothing.
        if os.path.exists(out_path):
            print(f"{out_path} already done!")
            continue

        # Scheduler: cross_entropy as the base loss plus the (start, end, func)
        # entries for this combo, applied to all layers.
        scheduler = TrainingScheduler(which_layers="all", base_func=cross_entropy, functions_at_times=funcs)

        # Fresh datasets and a fresh model for every run.
        ds_train, task_type, vocab_size, _ = create_dataset(
            task_name, seq_len=32, vocab_size=64, dataset_size=dataset_size,
            split="train")
        # Probe set: at most 512 examples, also requested with split="train".
        ds_probe, _, _, _ = create_dataset(
            task_name, seq_len=32, vocab_size=64,
            dataset_size=min(512, dataset_size), split="train")
        # NOTE(review): the model vocabulary is floored at 32; the reason is
        # not visible in this cell.
        model, arch_config = instantiate_architecture(
            arch, vocab_size=max(32, vocab_size), seq_len=32, task_type=task_type)
        model = model.to(device)

        # The scheduler object is handed to run_experiment directly; per the
        # original author's note, run_experiment calls
        # scheduler.call_function with **kwargs.
        model, _ = run_experiment(
            (ds_train, task_type, None, None),
            (ds_probe, task_type, None, None),
            model, arch_config,
            steps=steps, batch_size=batch_size,
            checkpoint_every=max(1, steps // 10), lr=3e-4,
            seed=seed, loss_scheduler=scheduler, metric_func=full_spectrum_metrics,
            log_store_name=out_path, weight_decay=0.01)
        print(f"Saved combo log to {out_path}")

def run_collapse_expand_sweeps(
        collapse_func, expand_func, tasks=None, archs=None,
        dataset_sizes=None, steps=2000, batch_size=64,
        out_dir="logs", seed=42):
    """
    Run the collapse/expand schedule combos for every task x dataset size x arch.

    For each configuration this calls ``run_collapse_expand_combos`` with the
    given collapse/expand loss terms and training settings.

    Args:
        collapse_func: Loss term forwarded as the "collapse" function.
        expand_func: Loss term forwarded as the "expand" function.
        tasks: Task names; defaults to ["copy", "sort", "tiny_stories"].
        archs: Architecture names; defaults to ["small", "medium", "large"].
        dataset_sizes: Dataset size names; defaults to ["medium"].
        steps: Training steps per run.
        batch_size: Batch size per run.
        out_dir: Directory the per-run logs are written to.
        seed: Seed forwarded to every run.

    Returns:
        list[tuple]: One ``(task, dataset_size_name, arch)`` tuple per
        configuration that was processed, in execution order.
    """
    if tasks is None:
        tasks = ["copy", "sort", "tiny_stories"]
    if archs is None:
        archs = ["small", "medium", "large"]
    if dataset_sizes is None:
        dataset_sizes = ["medium"]

    all_runs = []

    for task in tasks:
        for size_name in dataset_sizes:
            for arch in archs:
                print(f"RUN: task={task} size={size_name} arch={arch}")
                run_collapse_expand_combos(
                    task, size_name, arch, collapse_func, expand_func, steps,
                    batch_size, out_dir=out_dir, seed=seed)
                # Record what was run so the return value is actually useful
                # (it was previously always an empty list).
                all_runs.append((task, size_name, arch))
    return all_runs

In [4]:
def run_tests(weight=1.0):
    """
    Run the full sweep for every (collapse, expand) regularizer pairing.

    Collapse regularizers ("logdet", "erank") are paired with expand
    regularizers ("orthonorm", "spectral_norm"); the erank + spectral_norm
    pairing is skipped. Each pairing's logs go to
    ``logs/{weight}/{collapse_name}_{expand_name}``.

    Args:
        weight: Multiplier applied to the collapse regularizer's value.
            NOTE(review): only the collapse term is scaled; the expand term
            is used as-is -- confirm this asymmetry is intended.

    NOTE(review): the "collapse" functions are looked up in
    ``increase_rank_regularizers`` and the "expand" functions in
    ``decrease_rank_regularizers``; the naming looks inverted -- confirm.
    """
    collapse_funcs = ["logdet", "erank"]
    expand_funcs = ["orthonorm", "spectral_norm"]

    def _weighted(regularizer):
        # Bind the regularizer now, so the returned loss does not depend on
        # the loop variable's later value (closure late binding).
        def weighted_loss(logits, yb, **kwargs):
            return weight * regularizer(logits, yb, **kwargs)
        return weighted_loss

    for collapse_name in collapse_funcs:
        for expand_name in expand_funcs:
            # Compare the loop variables, not the lists: the original test
            # compared the lists to strings and therefore never fired.
            if collapse_name == "erank" and expand_name == "spectral_norm":
                continue
            collapse_func = _weighted(increase_rank_regularizers[collapse_name])
            expand_func = decrease_rank_regularizers[expand_name]
            out_dir = f"logs/{weight}/{collapse_name}_{expand_name}"
            run_collapse_expand_sweeps(collapse_func, expand_func, out_dir=out_dir)

In [None]:
# Launch every regularizer pairing with the collapse term at unit weight.
run_tests(weight=1.0)

RUN: task=copy size=medium arch=small
RUN combo collapseEarlyThenExpandLate
logs/1.0/logdet_orthonorm/copy_medium_small_collapseEarlyThenExpandLate_steps2000_seed42.npy already done!
RUN combo expandEarlyThenCollapseLate
logs/1.0/logdet_orthonorm/copy_medium_small_expandEarlyThenCollapseLate_steps2000_seed42.npy already done!
RUN combo collapseEarlyOnly
logs/1.0/logdet_orthonorm/copy_medium_small_collapseEarlyOnly_steps2000_seed42.npy already done!
RUN combo expandEarlyOnly
logs/1.0/logdet_orthonorm/copy_medium_small_expandEarlyOnly_steps2000_seed42.npy already done!
RUN: task=copy size=medium arch=medium
RUN combo collapseEarlyThenExpandLate
logs/1.0/logdet_orthonorm/copy_medium_medium_collapseEarlyThenExpandLate_steps2000_seed42.npy already done!
RUN combo expandEarlyThenCollapseLate
logs/1.0/logdet_orthonorm/copy_medium_medium_expandEarlyThenCollapseLate_steps2000_seed42.npy already done!
RUN combo collapseEarlyOnly
logs/1.0/logdet_orthonorm/copy_medium_medium_collapseEarlyOnly_step

Training: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2000/2000 [03:06<00:00, 10.74step/s, loss=0.0048, acc=1.0000]


‚úÖ Training complete in 186.2s
Saved combo log to logs/1.0/logdet_orthonorm/copy_medium_large_collapseEarlyOnly_steps2000_seed42.npy
RUN combo expandEarlyOnly
Model: 8 layers, 512d, 16 heads
Parameters: 25,547,840
Device: cuda


Training: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2000/2000 [02:56<00:00, 11.35step/s, loss=0.0000, acc=1.0000]


‚úÖ Training complete in 176.2s
Saved combo log to logs/1.0/logdet_orthonorm/copy_medium_large_expandEarlyOnly_steps2000_seed42.npy
RUN: task=sort size=medium arch=small
RUN combo collapseEarlyThenExpandLate
Model: 4 layers, 128d, 8 heads
Parameters: 875,328
Device: cuda


Training: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2000/2000 [00:50<00:00, 39.26step/s, loss=2.9652, acc=0.1323]


‚úÖ Training complete in 51.0s
Saved combo log to logs/1.0/logdet_orthonorm/sort_medium_small_collapseEarlyThenExpandLate_steps2000_seed42.npy
RUN combo expandEarlyThenCollapseLate
Model: 4 layers, 128d, 8 heads
Parameters: 875,328
Device: cuda


Training: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2000/2000 [00:51<00:00, 39.15step/s, loss=2.9498, acc=0.1457]


‚úÖ Training complete in 51.1s
Saved combo log to logs/1.0/logdet_orthonorm/sort_medium_small_expandEarlyThenCollapseLate_steps2000_seed42.npy
RUN combo collapseEarlyOnly
Model: 4 layers, 128d, 8 heads
Parameters: 875,328
Device: cuda


Training: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2000/2000 [00:48<00:00, 40.86step/s, loss=2.9250, acc=0.1318]


‚úÖ Training complete in 49.0s
Saved combo log to logs/1.0/logdet_orthonorm/sort_medium_small_collapseEarlyOnly_steps2000_seed42.npy
RUN combo expandEarlyOnly
Model: 4 layers, 128d, 8 heads
Parameters: 875,328
Device: cuda


Training: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2000/2000 [00:48<00:00, 41.45step/s, loss=2.1968, acc=0.3062]


‚úÖ Training complete in 48.3s
Saved combo log to logs/1.0/logdet_orthonorm/sort_medium_small_expandEarlyOnly_steps2000_seed42.npy
RUN: task=sort size=medium arch=medium
RUN combo collapseEarlyThenExpandLate
Model: 6 layers, 256d, 8 heads
Parameters: 4,902,976
Device: cuda


Training: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2000/2000 [01:20<00:00, 24.97step/s, loss=3.0866, acc=0.0577]


‚úÖ Training complete in 80.1s
Saved combo log to logs/1.0/logdet_orthonorm/sort_medium_medium_collapseEarlyThenExpandLate_steps2000_seed42.npy
RUN combo expandEarlyThenCollapseLate
Model: 6 layers, 256d, 8 heads
Parameters: 4,902,976
Device: cuda


Training: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2000/2000 [01:20<00:00, 24.69step/s, loss=3.3038, acc=0.1157]


‚úÖ Training complete in 81.0s
Saved combo log to logs/1.0/logdet_orthonorm/sort_medium_medium_expandEarlyThenCollapseLate_steps2000_seed42.npy
RUN combo collapseEarlyOnly
Model: 6 layers, 256d, 8 heads
Parameters: 4,902,976
Device: cuda


Training: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2000/2000 [01:19<00:00, 25.31step/s, loss=3.0422, acc=0.0588]


‚úÖ Training complete in 79.0s
Saved combo log to logs/1.0/logdet_orthonorm/sort_medium_medium_collapseEarlyOnly_steps2000_seed42.npy
RUN combo expandEarlyOnly
Model: 6 layers, 256d, 8 heads
Parameters: 4,902,976
Device: cuda


Training: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2000/2000 [01:15<00:00, 26.51step/s, loss=1.9416, acc=0.3214]


‚úÖ Training complete in 75.4s
Saved combo log to logs/1.0/logdet_orthonorm/sort_medium_medium_expandEarlyOnly_steps2000_seed42.npy
RUN: task=sort size=medium arch=large
RUN combo collapseEarlyThenExpandLate
Model: 8 layers, 512d, 16 heads
Parameters: 25,547,840
Device: cuda


Training: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2000/2000 [03:06<00:00, 10.71step/s, loss=3.2590, acc=0.0425]


‚úÖ Training complete in 186.8s
Saved combo log to logs/1.0/logdet_orthonorm/sort_medium_large_collapseEarlyThenExpandLate_steps2000_seed42.npy
RUN combo expandEarlyThenCollapseLate
Model: 8 layers, 512d, 16 heads
Parameters: 25,547,840
Device: cuda


Training: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2000/2000 [03:08<00:00, 10.61step/s, loss=3.6595, acc=0.0792]


‚úÖ Training complete in 188.6s
Saved combo log to logs/1.0/logdet_orthonorm/sort_medium_large_expandEarlyThenCollapseLate_steps2000_seed42.npy
RUN combo collapseEarlyOnly
Model: 8 layers, 512d, 16 heads
Parameters: 25,547,840
Device: cuda


Training: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2000/2000 [03:02<00:00, 10.96step/s, loss=3.1826, acc=0.0422]


‚úÖ Training complete in 182.4s
Saved combo log to logs/1.0/logdet_orthonorm/sort_medium_large_collapseEarlyOnly_steps2000_seed42.npy
RUN combo expandEarlyOnly
Model: 8 layers, 512d, 16 heads
Parameters: 25,547,840
Device: cuda


Training: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2000/2000 [02:57<00:00, 11.27step/s, loss=1.0948, acc=0.2845]


‚úÖ Training complete in 177.5s
Saved combo log to logs/1.0/logdet_orthonorm/sort_medium_large_expandEarlyOnly_steps2000_seed42.npy
RUN: task=tiny_stories size=medium arch=small
RUN combo collapseEarlyThenExpandLate


Token indices sequence length is longer than the specified maximum sequence length for this model (1087 > 1024). Running this sequence through the model will result in indexing errors


Model: 4 layers, 128d, 8 heads
Parameters: 13,774,929
Device: cuda


Training: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2000/2000 [01:10<00:00, 28.39step/s, loss=3.9438, acc=0.2525]


‚úÖ Training complete in 70.4s
Saved combo log to logs/1.0/logdet_orthonorm/tiny_stories_medium_small_collapseEarlyThenExpandLate_steps2000_seed42.npy
RUN combo expandEarlyThenCollapseLate
Model: 4 layers, 128d, 8 heads
Parameters: 13,774,929
Device: cuda


Training: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2000/2000 [01:10<00:00, 28.27step/s, loss=3.7951, acc=0.2850]


‚úÖ Training complete in 70.8s
Saved combo log to logs/1.0/logdet_orthonorm/tiny_stories_medium_small_expandEarlyThenCollapseLate_steps2000_seed42.npy
RUN combo collapseEarlyOnly
Model: 4 layers, 128d, 8 heads
Parameters: 13,774,929
Device: cuda


Training: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2000/2000 [01:09<00:00, 28.78step/s, loss=3.9119, acc=0.2586]


‚úÖ Training complete in 69.5s
Saved combo log to logs/1.0/logdet_orthonorm/tiny_stories_medium_small_collapseEarlyOnly_steps2000_seed42.npy
RUN combo expandEarlyOnly
Model: 4 layers, 128d, 8 heads
Parameters: 13,774,929
Device: cuda


Training: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2000/2000 [01:09<00:00, 28.72step/s, loss=3.4018, acc=0.3256]


‚úÖ Training complete in 69.6s
Saved combo log to logs/1.0/logdet_orthonorm/tiny_stories_medium_small_expandEarlyOnly_steps2000_seed42.npy
RUN: task=tiny_stories size=medium arch=medium
RUN combo collapseEarlyThenExpandLate
Model: 6 layers, 256d, 8 heads
Parameters: 30,651,985
Device: cuda


Training: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2000/2000 [01:58<00:00, 16.93step/s, loss=3.7691, acc=0.2698]


‚úÖ Training complete in 118.1s
Saved combo log to logs/1.0/logdet_orthonorm/tiny_stories_medium_medium_collapseEarlyThenExpandLate_steps2000_seed42.npy
RUN combo expandEarlyThenCollapseLate
Model: 6 layers, 256d, 8 heads
Parameters: 30,651,985
Device: cuda


Training: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2000/2000 [01:57<00:00, 16.95step/s, loss=3.8347, acc=0.2841]


‚úÖ Training complete in 118.0s
Saved combo log to logs/1.0/logdet_orthonorm/tiny_stories_medium_medium_expandEarlyThenCollapseLate_steps2000_seed42.npy
RUN combo collapseEarlyOnly
Model: 6 layers, 256d, 8 heads
Parameters: 30,651,985
Device: cuda


Training: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2000/2000 [01:57<00:00, 17.00step/s, loss=3.7340, acc=0.2737]


‚úÖ Training complete in 117.6s
Saved combo log to logs/1.0/logdet_orthonorm/tiny_stories_medium_medium_collapseEarlyOnly_steps2000_seed42.npy
RUN combo expandEarlyOnly
Model: 6 layers, 256d, 8 heads
Parameters: 30,651,985
Device: cuda


Training: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2000/2000 [01:56<00:00, 17.22step/s, loss=2.9782, acc=0.3791]


‚úÖ Training complete in 116.2s
Saved combo log to logs/1.0/logdet_orthonorm/tiny_stories_medium_medium_expandEarlyOnly_steps2000_seed42.npy
RUN: task=tiny_stories size=medium arch=large
RUN combo collapseEarlyThenExpandLate
Model: 8 layers, 512d, 16 heads
Parameters: 76,995,665
Device: cuda


Training: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2000/2000 [04:42<00:00,  7.07step/s, loss=3.7134, acc=0.2745]


‚úÖ Training complete in 283.0s
Saved combo log to logs/1.0/logdet_orthonorm/tiny_stories_medium_large_collapseEarlyThenExpandLate_steps2000_seed42.npy
RUN combo expandEarlyThenCollapseLate
Model: 8 layers, 512d, 16 heads
Parameters: 76,995,665
Device: cuda


Training: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2000/2000 [04:44<00:00,  7.02step/s, loss=4.0850, acc=0.2744]


‚úÖ Training complete in 284.7s
Saved combo log to logs/1.0/logdet_orthonorm/tiny_stories_medium_large_expandEarlyThenCollapseLate_steps2000_seed42.npy
RUN combo collapseEarlyOnly
Model: 8 layers, 512d, 16 heads
Parameters: 76,995,665
Device: cuda


Training: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2000/2000 [04:40<00:00,  7.12step/s, loss=3.6753, acc=0.2786]


‚úÖ Training complete in 280.9s
Saved combo log to logs/1.0/logdet_orthonorm/tiny_stories_medium_large_collapseEarlyOnly_steps2000_seed42.npy
RUN combo expandEarlyOnly
Model: 8 layers, 512d, 16 heads
Parameters: 76,995,665
Device: cuda


Training: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2000/2000 [04:32<00:00,  7.34step/s, loss=2.6413, acc=0.4295]


‚úÖ Training complete in 272.6s
Saved combo log to logs/1.0/logdet_orthonorm/tiny_stories_medium_large_expandEarlyOnly_steps2000_seed42.npy
RUN: task=copy size=medium arch=small
RUN combo collapseEarlyThenExpandLate
Model: 4 layers, 128d, 8 heads
Parameters: 875,328
Device: cuda


Training: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2000/2000 [01:07<00:00, 29.57step/s, loss=0.0035, acc=1.0000]


‚úÖ Training complete in 67.6s
Saved combo log to logs/1.0/logdet_spectral_norm/copy_medium_small_collapseEarlyThenExpandLate_steps2000_seed42.npy
RUN combo expandEarlyThenCollapseLate
Model: 4 layers, 128d, 8 heads
Parameters: 875,328
Device: cuda


Training: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2000/2000 [01:04<00:00, 30.82step/s, loss=0.0686, acc=1.0000]


‚úÖ Training complete in 64.9s
Saved combo log to logs/1.0/logdet_spectral_norm/copy_medium_small_expandEarlyThenCollapseLate_steps2000_seed42.npy
RUN combo collapseEarlyOnly
Model: 4 layers, 128d, 8 heads
Parameters: 875,328
Device: cuda


Training: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2000/2000 [00:48<00:00, 41.10step/s, loss=0.0044, acc=1.0000]


‚úÖ Training complete in 48.7s
Saved combo log to logs/1.0/logdet_spectral_norm/copy_medium_small_collapseEarlyOnly_steps2000_seed42.npy
RUN combo expandEarlyOnly
Model: 4 layers, 128d, 8 heads
Parameters: 875,328
Device: cuda


Training: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2000/2000 [01:05<00:00, 30.34step/s, loss=0.0007, acc=1.0000]


‚úÖ Training complete in 65.9s
Saved combo log to logs/1.0/logdet_spectral_norm/copy_medium_small_expandEarlyOnly_steps2000_seed42.npy
RUN: task=copy size=medium arch=medium
RUN combo collapseEarlyThenExpandLate
Model: 6 layers, 256d, 8 heads
Parameters: 4,902,976
Device: cuda


Training: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2000/2000 [02:31<00:00, 13.24step/s, loss=0.0029, acc=1.0000]


‚úÖ Training complete in 151.0s
Saved combo log to logs/1.0/logdet_spectral_norm/copy_medium_medium_collapseEarlyThenExpandLate_steps2000_seed42.npy
RUN combo expandEarlyThenCollapseLate
Model: 6 layers, 256d, 8 heads
Parameters: 4,902,976
Device: cuda


Training: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2000/2000 [02:26<00:00, 13.66step/s, loss=0.1373, acc=1.0000]


‚úÖ Training complete in 146.5s
Saved combo log to logs/1.0/logdet_spectral_norm/copy_medium_medium_expandEarlyThenCollapseLate_steps2000_seed42.npy
RUN combo collapseEarlyOnly
Model: 6 layers, 256d, 8 heads
Parameters: 4,902,976
Device: cuda


Training: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2000/2000 [01:16<00:00, 26.21step/s, loss=0.0036, acc=1.0000]


‚úÖ Training complete in 76.3s
Saved combo log to logs/1.0/logdet_spectral_norm/copy_medium_medium_collapseEarlyOnly_steps2000_seed42.npy
RUN combo expandEarlyOnly
Model: 6 layers, 256d, 8 heads
Parameters: 4,902,976
Device: cuda


Training: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2000/2000 [02:22<00:00, 14.01step/s, loss=0.0001, acc=1.0000]


‚úÖ Training complete in 142.7s
Saved combo log to logs/1.0/logdet_spectral_norm/copy_medium_medium_expandEarlyOnly_steps2000_seed42.npy
RUN: task=copy size=medium arch=large
RUN combo collapseEarlyThenExpandLate
Model: 8 layers, 512d, 16 heads
Parameters: 25,547,840
Device: cuda


Training: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2000/2000 [06:58<00:00,  4.78step/s, loss=0.0023, acc=1.0000]


‚úÖ Training complete in 418.8s
Saved combo log to logs/1.0/logdet_spectral_norm/copy_medium_large_collapseEarlyThenExpandLate_steps2000_seed42.npy
RUN combo expandEarlyThenCollapseLate
Model: 8 layers, 512d, 16 heads
Parameters: 25,547,840
Device: cuda


Training:   3%|‚ñé         | 65/2000 [00:44<15:31,  2.08step/s, loss=14.4380]