In [25]:
# Mount Google Drive at /content/drive; the project directory configured below lives on it.
from google.colab import drive
drive.mount("/content/drive")

# Project root on Drive. The %cd below makes it the working directory, so the
# relative "logs/..." output directories used in later cells resolve against it.
PROJECT = "/content/drive/MyDrive/DLFinalProject/rank-collapse"
# Alternative root for a checkout placed directly under MyDrive:
# PROJECT = "/content/drive/MyDrive/rank-collapse"
%cd {PROJECT}

# NOTE(review): later cells use names that are not defined in this notebook (os, device,
# TrainingScheduler, create_dataset, run_experiment, ...); presumably they all come from
# this wildcard import — confirm against utils.py.
from utils import *

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/MyDrive/DLFinalProject/rank-collapse


In [26]:
def run_collapse_expand_combos(
        task_name="copy", dataset_size_name="medium", arch="small",
        collapse_func=None, expand_func=None, steps=2000,
        batch_size=64, out_dir="logs/weight0.01", seed=42):
    """
    Train one model per collapse/expand schedule and save a log for each.

    Four schedules are run, in this order. Each is handed to
    TrainingScheduler as a list of (start, end, func) triples:
      - collapseEarlyThenExpandLate: collapse_func on the first-quarter
        window, expand_func on the last-quarter window
      - expandEarlyThenCollapseLate: same windows, roles swapped
      - collapseEarlyOnly: collapse_func on the first-quarter window
      - expandEarlyOnly: expand_func on the first-quarter window

    The first-quarter window is (0, int(0.25*steps)-1) and the last-quarter
    window is (int(0.75*steps), steps-1).

    Args:
        task_name: task identifier forwarded to create_dataset.
        dataset_size_name: "small", "medium" or "large" (1024 / 4096 / 8192
            examples). Any other value raises KeyError.
        arch: architecture name forwarded to instantiate_architecture.
        collapse_func: loss callable placed in the "collapse" windows.
        expand_func: loss callable placed in the "expand" windows.
        steps: number of training steps; also fixes the window boundaries.
        batch_size: forwarded to run_experiment.
        out_dir: directory receiving the per-combo .npy logs; created if
            it does not exist.
        seed: forwarded to run_experiment and embedded in the log name.

    A combo whose log file already exists is skipped *before* its scheduler,
    datasets and model are built, so re-running after an interruption only
    pays for the runs that are still missing.
    """
    # Window boundaries are shared by all four schedules; compute them once.
    early_end = int(0.25 * steps) - 1
    late_start = int(0.75 * steps)
    last_step = steps - 1

    combos = [
        ("collapseEarlyThenExpandLate",
         [(0, early_end, collapse_func), (late_start, last_step, expand_func)]),
        ("expandEarlyThenCollapseLate",
         [(0, early_end, expand_func), (late_start, last_step, collapse_func)]),
        ("collapseEarlyOnly", [(0, early_end, collapse_func)]),
        ("expandEarlyOnly", [(0, early_end, expand_func)]),
    ]
    sizes = {"small": 1024, "medium": 4096, "large": 8192}

    # Loop-invariant setup: resolve the size (fails fast on an unknown name)
    # and make sure the output directory exists.
    dataset_size = sizes[dataset_size_name]
    os.makedirs(out_dir, exist_ok=True)

    for combo_name, funcs in combos:
        print(f"RUN combo {combo_name}")

        log_name = f"{task_name}_{dataset_size_name}_{arch}_{combo_name}_steps{steps}_seed{seed}.npy"
        out_path = os.path.join(out_dir, log_name)

        # Check the cache first: building datasets and moving a model to the
        # device is wasted work when the result is already on disk.
        if os.path.exists(out_path):
            print(f"{out_path} already done!")
            continue

        # The scheduler object is passed straight to run_experiment as the loss.
        scheduler = TrainingScheduler(which_layers="all", base_func=cross_entropy, functions_at_times=funcs)

        # Fresh datasets and model for every combo so runs do not share state.
        ds_train, task_type, vocab_size, _ = create_dataset(
            task_name, seq_len=32, vocab_size=64, dataset_size=dataset_size,
            split="train")
        # The probe set is a (at most 512-example) draw from the same train split.
        ds_probe, _, _, _ = create_dataset(
            task_name, seq_len=32, vocab_size=64,
            dataset_size=min(512, dataset_size), split="train")
        model, arch_config = instantiate_architecture(arch, vocab_size=max(32, vocab_size), seq_len=32, task_type=task_type)
        model = model.to(device)

        run_experiment((ds_train, task_type, None, None),
            (ds_probe, task_type, None, None),
            model, arch_config,
            steps=steps, batch_size=batch_size,
            checkpoint_every=max(1, steps//10), lr=3e-4,
            seed=seed, loss_scheduler=scheduler, metric_func=full_spectrum_metrics,
            log_store_name=out_path, weight_decay=0.01)
        print(f"Saved combo log to {out_path}")

def run_collapse_expand_sweeps(
        collapse_func, expand_func, tasks=None, archs=None,
        dataset_sizes=None, steps=2000, batch_size=64,
        out_dir="logs", seed=42):
    """
    Run run_collapse_expand_combos for every task x dataset size x architecture.

    Args:
        collapse_func: loss callable forwarded as the "collapse" function.
        expand_func: loss callable forwarded as the "expand" function.
        tasks: task names; defaults to ["copy", "sort", "tiny_stories"].
        archs: architecture names; defaults to ["small", "medium", "large"].
        dataset_sizes: dataset size names; defaults to ["medium"].
        steps: training steps per run.
        batch_size: batch size per run.
        out_dir: directory that receives every run's log.
        seed: seed forwarded to every run.

    Returns:
        A list with one (task, size_name, arch) tuple per configuration that
        was handed to run_collapse_expand_combos, in execution order.
    """
    if tasks is None:
        tasks = ["copy", "sort", "tiny_stories"]
    if archs is None:
        archs = ["small", "medium", "large"]
    if dataset_sizes is None:
        dataset_sizes = ["medium"]

    all_runs = []

    for task in tasks:
        for size_name in dataset_sizes:
            for arch in archs:
                print(f"RUN: task={task} size={size_name} arch={arch}")
                run_collapse_expand_combos(
                    task, size_name, arch, collapse_func, expand_func, steps,
                    batch_size, out_dir=out_dir, seed=seed)
                # Record the configuration only after its combos returned, so
                # the result reflects what actually ran if a later one raises.
                all_runs.append((task, size_name, arch))
    return all_runs

Training:   2%|▏         | 43/2000 [00:25<19:37,  1.66step/s, loss=21.8371]
Training:   0%|          | 6/2000 [00:16<1:29:57,  2.71s/step, loss=62.2856]


In [27]:
def run_tests(weight=1.0):
    """
    Sweep every (collapse, expand) regularizer pairing at a given weight.

    For each pairing, run_collapse_expand_sweeps is called with its default
    tasks/architectures and logs are written under
    logs/<weight>/<collapse_name>_<expand_name>.

    Args:
        weight: multiplier applied to the collapse-side regularizer only;
            the expand-side regularizer is passed through unscaled.
            NOTE(review): confirm the asymmetry is intended.
    """
    # NOTE(review): the "collapse" names are looked up in
    # increase_rank_regularizers and the "expand" names in
    # decrease_rank_regularizers — confirm this mapping is intended.
    collapse_names = ["logdet", "erank"]
    expand_names = ["orthonorm", "spectral_norm"]

    def scaled(regularizer):
        # Bind the regularizer at creation time; a lambda closing over the
        # loop variable would see whatever name the loop last assigned.
        def weighted(logits, yb, **kwargs):
            return weight * regularizer(logits, yb, **kwargs)
        return weighted

    for collapse_name in collapse_names:
        collapse_func = scaled(increase_rank_regularizers[collapse_name])
        for expand_name in expand_names:
            expand_func = decrease_rank_regularizers[expand_name]
            # Build the path from the *names*: formatting the function objects
            # embeds "<function ... at 0x...>" reprs, which change between
            # sessions and defeat the "already done" check on existing logs.
            out_dir = f"logs/{weight}/{collapse_name}_{expand_name}"
            run_collapse_expand_sweeps(collapse_func, expand_func, out_dir=out_dir)

In [None]:
# Launch the full sweep with the collapse-side regularizer scaled by 0.1; logs go under logs/0.1/.
run_tests(0.1)

RUN: task=copy size=medium arch=small
RUN combo collapseEarlyThenExpandLate
Model: 4 layers, 128d, 8 heads
Parameters: 875,328
Device: cuda







Training:   0%|          | 0/2000 [00:00<?, ?step/s][A[A[A[A[A




Training:   0%|          | 1/2000 [00:00<01:08, 29.17step/s, loss=72.1028][A[A[A[A[A


Training:   1%|▏         | 29/2000 [00:13<01:16, 25.70step/s, loss=481.9613][A[A[A




Training:   0%|          | 1/2000 [00:02<1:19:08,  2.38s/step, loss=72.1028, acc=0.0193][A[A[A[A[A




Training:   0%|          | 2/2000 [00:02<40:04,  1.20s/step, loss=72.1028, acc=0.0193]  [A[A[A[A[A




Training:   0%|          | 2/2000 [00:02<40:04,  1.20s/step, loss=70.0541]            [A[A[A[A[A




Training:   0%|          | 3/2000 [00:02<40:03,  1.20s/step, loss=68.2029][A[A[A[A[A




Training:   0%|          | 4/2000 [00:02<40:02,  1.20s/step, loss=66.0722][A[A[A[A[A




Training:   0%|          | 5/2000 [00:02<40:01,  1.20s/step, loss=64.3164][A[A[A[A[A




Training:   0%|          | 6/2000 [00:02<11:03,  3.01step/s, loss=64.3164][A[A[A[A[A




Training:   0%|          | 6/2000 [00:02<11