In [1]:
import shutil
from pathlib import Path

import datasets
import torch as t
from sae_lens import (
    CacheActivationsRunner,
    CacheActivationsRunnerConfig,
)
from sae_lens.store.cached_activation_store import CachedActivationsStore

from crosscoder_lens.config import CrossCoderTrainerConfig
from crosscoder_lens.trainer import CrossCoderTrainingRunner

In [2]:
# --- Training schedule -------------------------------------------------------
# The token budget is derived from the number of optimizer steps and the number
# of activations consumed per step.
sae_batch_size = 4096
total_training_steps = 30_000
total_training_tokens = total_training_steps * sae_batch_size

lr_warm_up_steps = 0
lr_decay_steps = total_training_steps // 5  # last 20% of training
l1_warm_up_steps = total_training_steps // 20  # first 5% of training

# --- Activation geometry -----------------------------------------------------
d_in = 1024
context_size = 512
dtype = "float32"

# --- Compute device ----------------------------------------------------------
# Preference order: CUDA, then Apple MPS, then CPU. The MPS probe is only
# evaluated when CUDA is unavailable.
if t.cuda.is_available():
    device = "cuda"
elif t.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"

In [3]:
activations_path = Path("activations")

# Start every run from an empty cache directory: anything left over from a
# previous run is deleted before the directory is recreated.
stale_cache_present = activations_path.exists()
if stale_cache_present:
    print(f"Removing existing activations directory {activations_path}")
    shutil.rmtree(activations_path)

# `mkdir()` defaults to exist_ok=False, so this raises FileExistsError if the
# path is somehow still present at this point.
activations_path.mkdir()


def _create_dataset_from_hook(hook_name: str):
    """Run a ``CacheActivationsRunner`` for a single hook point.

    The runner is configured for the ``tiny-stories-1L-21M`` model (layer 0)
    on the pre-tokenized TinyStories dataset, with its cache path set to
    ``activations_path / hook_name``. Sizes, device and dtype come from the
    notebook-level configuration variables.

    Args:
        hook_name: Name of the hook point whose activations are cached.

    Returns:
        Whatever ``CacheActivationsRunner.run()`` returns. Callers in this
        notebook treat it as a Hugging Face dataset with a column named after
        ``hook_name`` (TODO confirm against the installed sae_lens version).
    """
    output_dir = activations_path / hook_name

    runner_settings = dict(
        # Where to write, and what to read from.
        new_cached_activations_path=str(output_dir),
        dataset_path="apollo-research/roneneldan-TinyStories-tokenizer-gpt2",
        model_name="tiny-stories-1L-21M",
        hook_name=hook_name,
        hook_layer=0,
        buffer_size_gb=2,
        # Run-size parameters.
        training_tokens=total_training_tokens,
        model_batch_size=256,
        context_size=context_size,
        # Activation layout and placement.
        d_in=d_in,
        # NOTE(review): left unshuffled, presumably so rows of the per-hook
        # datasets stay aligned when they are joined column-wise - confirm.
        shuffle=False,
        prepend_bos=False,
        device=device,
        dtype=dtype,
    )

    runner_cfg = CacheActivationsRunnerConfig(**runner_settings)
    runner = CacheActivationsRunner(runner_cfg)
    return runner.run()

# Map each hook point to the short column name used downstream.
hook_to_column = {
    "blocks.0.hook_resid_pre": "pre",
    "blocks.0.hook_resid_post": "post",
}

# Cache activations for each hook in turn ("pre" first, then "post") and rename
# the activation column from the hook name to its short alias.
dataset_pre, dataset_post = (
    _create_dataset_from_hook(hook).rename_column(hook, column)
    for hook, column in hook_to_column.items()
)

# axis=1 joins the two datasets side by side, giving one dataset with both a
# "pre" and a "post" column.
dataset = datasets.concatenate_datasets([dataset_pre, dataset_post], axis=1)
dataset

Removing existing activations directory activations
Loaded pretrained model tiny-stories-1L-21M into HookedTransformer


Generating train split: 0 examples [00:00, ? examples/s]



In [4]:
# Run the dataloader in the main process: worker processes don't play well
# with notebooks, and prefetch_factor is set to None alongside num_workers=0.
dataloader_options = {"num_workers": 0, "prefetch_factor": None}

cached_activations_store = CachedActivationsStore(
    ds=dataset,
    column_names=["pre", "post"],
    batch_size=sae_batch_size,
    context_size=context_size,
    dl_kwargs=dataloader_options,
)

# Smoke check: pull one batch to inspect its shape, then reset the store so
# that training does not start one batch in.
print(f"Batch shape: {cached_activations_store.next_batch().shape}")
cached_activations_store.reset_input_dataset()


Batch shape: torch.Size([32, 2, 1024])


In [6]:
# Optimizer / schedule / sparsity values below were copied from SAELens'
# training_a_sparse_autoencoder.ipynb.
optimizer_settings = dict(
    lr=5e-5,
    adam_beta1=0.9,
    adam_beta2=0.999,
    lr_scheduler_name="constant",
    lr_warm_up_steps=lr_warm_up_steps,
    lr_decay_steps=lr_decay_steps,
)
sparsity_settings = dict(
    l1_coefficient=5,
    l1_warm_up_steps=l1_warm_up_steps,
    lp_norm=1.0,
)
wandb_settings = dict(
    log_to_wandb=True,
    wandb_project="crosscoder_tiny_stories_1l_21M",
    wandb_log_frequency=30,
    eval_every_n_wandb_logs=20,
)

# Two activation sources ("pre" and "post") feed a single crosscoder.
trainer_cfg = CrossCoderTrainerConfig(
    n_models=2,
    d_in=d_in,
    expansion_factor=4,
    training_tokens=total_training_tokens,
    normalize_sae_decoder=False,  # has to be False for now
    device=device,
    checkpoint_path="checkpoints",
    **optimizer_settings,
    **sparsity_settings,
    **wandb_settings,
)

trainer = CrossCoderTrainingRunner(trainer_cfg, cached_activations_store)
sae = trainer.run()

Run name: 2048-L1-0.001-LR-0.0003-Tokens-5.000e+04
n_tokens_per_buffer (millions): 0.08192
Lower bound: n_contexts_per_buffer (millions): 0.00064
Total training steps: 12
Total wandb updates: 12
n_tokens_per_feature_sampling_window (millions): 1048.576
n_tokens_per_dead_feature_window (millions): 524.288
We will reset the sparsity calculation 0 times.
Number tokens in sparsity calculation window: 8.19e+06
Running training
<sae_lens.store.cached_activation_store.CachedActivationsStore object at 0x79a7645f76e0>
CrossCoder(
  (activation_fn): ReLU()
  (hook_sae_input): HookPoint()
  (hook_sae_acts_pre): HookPoint()
  (hook_sae_acts_post): HookPoint()
  (hook_sae_output): HookPoint()
  (hook_sae_recons): HookPoint()
  (hook_sae_error): HookPoint()
)


Training SAE:   0%|          | 0/50000 [00:00<?, ?it/s]

n_training_tokens: 32
n_training_tokens: 64
n_training_tokens: 96
n_training_tokens: 128
n_training_tokens: 160
n_training_tokens: 192
n_training_tokens: 224
n_training_tokens: 256
n_training_tokens: 288
n_training_tokens: 320
n_training_tokens: 352
n_training_tokens: 384
n_training_tokens: 416
n_training_tokens: 448
n_training_tokens: 480
n_training_tokens: 512
n_training_tokens: 544
n_training_tokens: 576
n_training_tokens: 608
n_training_tokens: 640
n_training_tokens: 672
n_training_tokens: 704
n_training_tokens: 736
n_training_tokens: 768
n_training_tokens: 800
n_training_tokens: 832
n_training_tokens: 864
n_training_tokens: 896
n_training_tokens: 928
n_training_tokens: 960
n_training_tokens: 992
n_training_tokens: 1024
n_training_tokens: 1056
n_training_tokens: 1088
n_training_tokens: 1120
n_training_tokens: 1152
n_training_tokens: 1184
n_training_tokens: 1216
n_training_tokens: 1248
n_training_tokens: 1280
n_training_tokens: 1312
n_training_tokens: 1344
n_training_tokens: 1376
n_

100| l1_loss: 0.37917 | mse_loss: 165.83426: : 409600it [00:00, 1387046.05it/s]       

n_training_tokens: 2272
n_training_tokens: 2304
n_training_tokens: 2336
n_training_tokens: 2368
n_training_tokens: 2400
n_training_tokens: 2432
n_training_tokens: 2464
n_training_tokens: 2496
n_training_tokens: 2528
n_training_tokens: 2560
n_training_tokens: 2592
n_training_tokens: 2624
n_training_tokens: 2656
n_training_tokens: 2688
n_training_tokens: 2720
n_training_tokens: 2752
n_training_tokens: 2784
n_training_tokens: 2816
n_training_tokens: 2848
n_training_tokens: 2880
n_training_tokens: 2912
n_training_tokens: 2944
n_training_tokens: 2976
n_training_tokens: 3008
n_training_tokens: 3040
n_training_tokens: 3072
n_training_tokens: 3104
n_training_tokens: 3136
n_training_tokens: 3168
n_training_tokens: 3200
n_training_tokens: 3232
n_training_tokens: 3264
n_training_tokens: 3296
n_training_tokens: 3328
n_training_tokens: 3360
n_training_tokens: 3392
n_training_tokens: 3424
n_training_tokens: 3456
n_training_tokens: 3488
n_training_tokens: 3520
n_training_tokens: 3552
n_training_token

200| l1_loss: 0.30657 | mse_loss: 107.76467: : 819200it [00:00, 1375161.05it/s]

n_training_tokens: 4384
n_training_tokens: 4416
n_training_tokens: 4448
n_training_tokens: 4480
n_training_tokens: 4512
n_training_tokens: 4544
n_training_tokens: 4576
n_training_tokens: 4608
n_training_tokens: 4640
n_training_tokens: 4672
n_training_tokens: 4704
n_training_tokens: 4736
n_training_tokens: 4768
n_training_tokens: 4800
n_training_tokens: 4832
n_training_tokens: 4864
n_training_tokens: 4896
n_training_tokens: 4928
n_training_tokens: 4960
n_training_tokens: 4992
n_training_tokens: 5024
n_training_tokens: 5056
n_training_tokens: 5088
n_training_tokens: 5120
n_training_tokens: 5152
n_training_tokens: 5184
n_training_tokens: 5216
n_training_tokens: 5248
n_training_tokens: 5280
n_training_tokens: 5312
n_training_tokens: 5344
n_training_tokens: 5376
n_training_tokens: 5408
n_training_tokens: 5440
n_training_tokens: 5472
n_training_tokens: 5504
n_training_tokens: 5536
n_training_tokens: 5568
n_training_tokens: 5600
n_training_tokens: 5632
n_training_tokens: 5664
n_training_token

300| l1_loss: 0.27198 | mse_loss: 109.28758: : 1228800it [00:00, 1363187.51it/s]

n_training_tokens: 8768
n_training_tokens: 8800
n_training_tokens: 8832
n_training_tokens: 8864
n_training_tokens: 8896
n_training_tokens: 8928
n_training_tokens: 8960
n_training_tokens: 8992
n_training_tokens: 9024
n_training_tokens: 9056
n_training_tokens: 9088
n_training_tokens: 9120
n_training_tokens: 9152
n_training_tokens: 9184
n_training_tokens: 9216
n_training_tokens: 9248
n_training_tokens: 9280
n_training_tokens: 9312
n_training_tokens: 9344
n_training_tokens: 9376
n_training_tokens: 9408
n_training_tokens: 9440
n_training_tokens: 9472
n_training_tokens: 9504
n_training_tokens: 9536
n_training_tokens: 9568
n_training_tokens: 9600
n_training_tokens: 9632
n_training_tokens: 9664
n_training_tokens: 9696
n_training_tokens: 9728
n_training_tokens: 9760
n_training_tokens: 9792
n_training_tokens: 9824
n_training_tokens: 9856
n_training_tokens: 9888
n_training_tokens: 9920
n_training_tokens: 9952
n_training_tokens: 9984
n_training_tokens: 10016
n_training_tokens: 10048
n_training_tok

400| l1_loss: 0.27594 | mse_loss: 97.91668: : 1638400it [00:01, 1370662.79it/s] 


n_training_tokens: 10912
n_training_tokens: 10944
n_training_tokens: 10976
n_training_tokens: 11008
n_training_tokens: 11040
n_training_tokens: 11072
n_training_tokens: 11104
n_training_tokens: 11136
n_training_tokens: 11168
n_training_tokens: 11200
n_training_tokens: 11232
n_training_tokens: 11264
n_training_tokens: 11296
n_training_tokens: 11328
n_training_tokens: 11360
n_training_tokens: 11392
n_training_tokens: 11424
n_training_tokens: 11456
n_training_tokens: 11488
n_training_tokens: 11520
n_training_tokens: 11552
n_training_tokens: 11584
n_training_tokens: 11616
n_training_tokens: 11648
n_training_tokens: 11680
n_training_tokens: 11712
n_training_tokens: 11744
n_training_tokens: 11776
n_training_tokens: 11808
n_training_tokens: 11840
n_training_tokens: 11872
n_training_tokens: 11904
n_training_tokens: 11936
n_training_tokens: 11968
n_training_tokens: 12000
n_training_tokens: 12032
n_training_tokens: 12064
n_training_tokens: 12096
n_training_tokens: 12128
n_training_tokens: 12160

500| l1_loss: 0.29264 | mse_loss: 75.80750: : 2048000it [00:01, 1379520.52it/s]

n_training_tokens: 15328
n_training_tokens: 15360
n_training_tokens: 15392
n_training_tokens: 15424
n_training_tokens: 15456
n_training_tokens: 15488
n_training_tokens: 15520
n_training_tokens: 15552
n_training_tokens: 15584
n_training_tokens: 15616
n_training_tokens: 15648
n_training_tokens: 15680
n_training_tokens: 15712
n_training_tokens: 15744
n_training_tokens: 15776
n_training_tokens: 15808
n_training_tokens: 15840
n_training_tokens: 15872
n_training_tokens: 15904
n_training_tokens: 15936
n_training_tokens: 15968
n_training_tokens: 16000
n_training_tokens: 16032
n_training_tokens: 16064
n_training_tokens: 16096
n_training_tokens: 16128
n_training_tokens: 16160
n_training_tokens: 16192
n_training_tokens: 16224
n_training_tokens: 16256
n_training_tokens: 16288
n_training_tokens: 16320
n_training_tokens: 16352
n_training_tokens: 16384
n_training_tokens: 16416
n_training_tokens: 16448
n_training_tokens: 16480
n_training_tokens: 16512
n_training_tokens: 16544
n_training_tokens: 16576


600| l1_loss: 0.32920 | mse_loss: 55.34613: : 2457600it [00:01, 1380538.08it/s]

n_training_tokens: 17504
n_training_tokens: 17536
n_training_tokens: 17568
n_training_tokens: 17600
n_training_tokens: 17632
n_training_tokens: 17664
n_training_tokens: 17696
n_training_tokens: 17728
n_training_tokens: 17760
n_training_tokens: 17792
n_training_tokens: 17824
n_training_tokens: 17856
n_training_tokens: 17888
n_training_tokens: 17920
n_training_tokens: 17952
n_training_tokens: 17984
n_training_tokens: 18016
n_training_tokens: 18048
n_training_tokens: 18080
n_training_tokens: 18112
n_training_tokens: 18144
n_training_tokens: 18176
n_training_tokens: 18208
n_training_tokens: 18240
n_training_tokens: 18272
n_training_tokens: 18304
n_training_tokens: 18336
n_training_tokens: 18368
n_training_tokens: 18400
n_training_tokens: 18432
n_training_tokens: 18464
n_training_tokens: 18496
n_training_tokens: 18528
n_training_tokens: 18560
n_training_tokens: 18592
n_training_tokens: 18624
n_training_tokens: 18656
n_training_tokens: 18688
n_training_tokens: 18720
n_training_tokens: 18752


700| l1_loss: 0.28444 | mse_loss: 62.48945: : 2867200it [00:02, 1382245.81it/s]

n_training_tokens: 21888
n_training_tokens: 21920
n_training_tokens: 21952
n_training_tokens: 21984
n_training_tokens: 22016
n_training_tokens: 22048
n_training_tokens: 22080
n_training_tokens: 22112
n_training_tokens: 22144
n_training_tokens: 22176
n_training_tokens: 22208
n_training_tokens: 22240
n_training_tokens: 22272
n_training_tokens: 22304
n_training_tokens: 22336
n_training_tokens: 22368
n_training_tokens: 22400
n_training_tokens: 22432
n_training_tokens: 22464
n_training_tokens: 22496
n_training_tokens: 22528
n_training_tokens: 22560
n_training_tokens: 22592
n_training_tokens: 22624
n_training_tokens: 22656
n_training_tokens: 22688
n_training_tokens: 22720
n_training_tokens: 22752
n_training_tokens: 22784
n_training_tokens: 22816
n_training_tokens: 22848
n_training_tokens: 22880
n_training_tokens: 22912
n_training_tokens: 22944
n_training_tokens: 22976
n_training_tokens: 23008
n_training_tokens: 23040
n_training_tokens: 23072
n_training_tokens: 23104
n_training_tokens: 23136


800| l1_loss: 0.29014 | mse_loss: 59.57063: : 3276800it [00:02, 1375230.91it/s]

n_training_tokens: 24064
n_training_tokens: 24096
n_training_tokens: 24128
n_training_tokens: 24160
n_training_tokens: 24192
n_training_tokens: 24224
n_training_tokens: 24256
n_training_tokens: 24288
n_training_tokens: 24320
n_training_tokens: 24352
n_training_tokens: 24384
n_training_tokens: 24416
n_training_tokens: 24448
n_training_tokens: 24480
n_training_tokens: 24512
n_training_tokens: 24544
n_training_tokens: 24576
n_training_tokens: 24608
n_training_tokens: 24640
n_training_tokens: 24672
n_training_tokens: 24704
n_training_tokens: 24736
n_training_tokens: 24768
n_training_tokens: 24800
n_training_tokens: 24832
n_training_tokens: 24864
n_training_tokens: 24896
n_training_tokens: 24928
n_training_tokens: 24960
n_training_tokens: 24992
n_training_tokens: 25024
n_training_tokens: 25056
n_training_tokens: 25088
n_training_tokens: 25120
n_training_tokens: 25152
n_training_tokens: 25184
n_training_tokens: 25216
n_training_tokens: 25248
n_training_tokens: 25280
n_training_tokens: 25312


900| l1_loss: 0.33565 | mse_loss: 45.32476: : 3686400it [00:02, 1369355.50it/s]

n_training_tokens: 28352
n_training_tokens: 28384
n_training_tokens: 28416
n_training_tokens: 28448
n_training_tokens: 28480
n_training_tokens: 28512
n_training_tokens: 28544
n_training_tokens: 28576
n_training_tokens: 28608
n_training_tokens: 28640
n_training_tokens: 28672
n_training_tokens: 28704
n_training_tokens: 28736
n_training_tokens: 28768
n_training_tokens: 28800
n_training_tokens: 28832
n_training_tokens: 28864
n_training_tokens: 28896
n_training_tokens: 28928
n_training_tokens: 28960
n_training_tokens: 28992
n_training_tokens: 29024
n_training_tokens: 29056
n_training_tokens: 29088
n_training_tokens: 29120
n_training_tokens: 29152
n_training_tokens: 29184
n_training_tokens: 29216
n_training_tokens: 29248
n_training_tokens: 29280
n_training_tokens: 29312
n_training_tokens: 29344
n_training_tokens: 29376
n_training_tokens: 29408
n_training_tokens: 29440
n_training_tokens: 29472
n_training_tokens: 29504
n_training_tokens: 29536
n_training_tokens: 29568
n_training_tokens: 29600


1000| l1_loss: 0.33065 | mse_loss: 42.75666: : 4096000it [00:02, 1367955.26it/s]

n_training_tokens: 30496
n_training_tokens: 30528
n_training_tokens: 30560
n_training_tokens: 30592
n_training_tokens: 30624
n_training_tokens: 30656
n_training_tokens: 30688
n_training_tokens: 30720
n_training_tokens: 30752
n_training_tokens: 30784
n_training_tokens: 30816
n_training_tokens: 30848
n_training_tokens: 30880
n_training_tokens: 30912
n_training_tokens: 30944
n_training_tokens: 30976
n_training_tokens: 31008
n_training_tokens: 31040
n_training_tokens: 31072
n_training_tokens: 31104
n_training_tokens: 31136
n_training_tokens: 31168
n_training_tokens: 31200
n_training_tokens: 31232
n_training_tokens: 31264
n_training_tokens: 31296
n_training_tokens: 31328
n_training_tokens: 31360
n_training_tokens: 31392
n_training_tokens: 31424
n_training_tokens: 31456
n_training_tokens: 31488
n_training_tokens: 31520
n_training_tokens: 31552
n_training_tokens: 31584
n_training_tokens: 31616
n_training_tokens: 31648
n_training_tokens: 31680
n_training_tokens: 31712
n_training_tokens: 31744


1100| l1_loss: 0.36365 | mse_loss: 44.45412: : 4505600it [00:03, 1368132.79it/s]

n_training_tokens: 34880
n_training_tokens: 34912
n_training_tokens: 34944
n_training_tokens: 34976
n_training_tokens: 35008
n_training_tokens: 35040
n_training_tokens: 35072
n_training_tokens: 35104
n_training_tokens: 35136
n_training_tokens: 35168
n_training_tokens: 35200
n_training_tokens: 35232
n_training_tokens: 35264
n_training_tokens: 35296
n_training_tokens: 35328
n_training_tokens: 35360
n_training_tokens: 35392
n_training_tokens: 35424
n_training_tokens: 35456
n_training_tokens: 35488
n_training_tokens: 35520
n_training_tokens: 35552
n_training_tokens: 35584
n_training_tokens: 35616
n_training_tokens: 35648
n_training_tokens: 35680
n_training_tokens: 35712
n_training_tokens: 35744
n_training_tokens: 35776
n_training_tokens: 35808
n_training_tokens: 35840
n_training_tokens: 35872
n_training_tokens: 35904
n_training_tokens: 35936
n_training_tokens: 35968
n_training_tokens: 36000
n_training_tokens: 36032
n_training_tokens: 36064
n_training_tokens: 36096
n_training_tokens: 36128


1200| l1_loss: 0.33380 | mse_loss: 37.12907: : 4915200it [00:03, 1368427.67it/s]

n_training_tokens: 37024
n_training_tokens: 37056
n_training_tokens: 37088
n_training_tokens: 37120
n_training_tokens: 37152
n_training_tokens: 37184
n_training_tokens: 37216
n_training_tokens: 37248
n_training_tokens: 37280
n_training_tokens: 37312
n_training_tokens: 37344
n_training_tokens: 37376
n_training_tokens: 37408
n_training_tokens: 37440
n_training_tokens: 37472
n_training_tokens: 37504
n_training_tokens: 37536
n_training_tokens: 37568
n_training_tokens: 37600
n_training_tokens: 37632
n_training_tokens: 37664
n_training_tokens: 37696
n_training_tokens: 37728
n_training_tokens: 37760
n_training_tokens: 37792
n_training_tokens: 37824
n_training_tokens: 37856
n_training_tokens: 37888
n_training_tokens: 37920
n_training_tokens: 37952
n_training_tokens: 37984
n_training_tokens: 38016
n_training_tokens: 38048
n_training_tokens: 38080
n_training_tokens: 38112
n_training_tokens: 38144
n_training_tokens: 38176
n_training_tokens: 38208
n_training_tokens: 38240
n_training_tokens: 38272


1300| l1_loss: 0.37713 | mse_loss: 33.79031: : 5324800it [00:03, 1370088.90it/s]

n_training_tokens: 41408
n_training_tokens: 41440
n_training_tokens: 41472
n_training_tokens: 41504
n_training_tokens: 41536
n_training_tokens: 41568
n_training_tokens: 41600
n_training_tokens: 41632
n_training_tokens: 41664
n_training_tokens: 41696
n_training_tokens: 41728
n_training_tokens: 41760
n_training_tokens: 41792
n_training_tokens: 41824
n_training_tokens: 41856
n_training_tokens: 41888
n_training_tokens: 41920
n_training_tokens: 41952
n_training_tokens: 41984
n_training_tokens: 42016
n_training_tokens: 42048
n_training_tokens: 42080
n_training_tokens: 42112
n_training_tokens: 42144
n_training_tokens: 42176
n_training_tokens: 42208
n_training_tokens: 42240
n_training_tokens: 42272
n_training_tokens: 42304
n_training_tokens: 42336
n_training_tokens: 42368
n_training_tokens: 42400
n_training_tokens: 42432
n_training_tokens: 42464
n_training_tokens: 42496
n_training_tokens: 42528
n_training_tokens: 42560
n_training_tokens: 42592
n_training_tokens: 42624
n_training_tokens: 42656


1400| l1_loss: 0.37123 | mse_loss: 31.01844: : 5734400it [00:04, 1374439.50it/s]

n_training_tokens: 43552
n_training_tokens: 43584
n_training_tokens: 43616
n_training_tokens: 43648
n_training_tokens: 43680
n_training_tokens: 43712
n_training_tokens: 43744
n_training_tokens: 43776
n_training_tokens: 43808
n_training_tokens: 43840
n_training_tokens: 43872
n_training_tokens: 43904
n_training_tokens: 43936
n_training_tokens: 43968
n_training_tokens: 44000
n_training_tokens: 44032
n_training_tokens: 44064
n_training_tokens: 44096
n_training_tokens: 44128
n_training_tokens: 44160
n_training_tokens: 44192
n_training_tokens: 44224
n_training_tokens: 44256
n_training_tokens: 44288
n_training_tokens: 44320
n_training_tokens: 44352
n_training_tokens: 44384
n_training_tokens: 44416
n_training_tokens: 44448
n_training_tokens: 44480
n_training_tokens: 44512
n_training_tokens: 44544
n_training_tokens: 44576
n_training_tokens: 44608
n_training_tokens: 44640
n_training_tokens: 44672
n_training_tokens: 44704
n_training_tokens: 44736
n_training_tokens: 44768
n_training_tokens: 44800


1500| l1_loss: 0.37315 | mse_loss: 34.35598: : 6144000it [00:04, 1369967.10it/s]

n_training_tokens: 47936
n_training_tokens: 47968
n_training_tokens: 48000
n_training_tokens: 48032
n_training_tokens: 48064
n_training_tokens: 48096
n_training_tokens: 48128
n_training_tokens: 48160
n_training_tokens: 48192
n_training_tokens: 48224
n_training_tokens: 48256
n_training_tokens: 48288
n_training_tokens: 48320
n_training_tokens: 48352
n_training_tokens: 48384
n_training_tokens: 48416
n_training_tokens: 48448
n_training_tokens: 48480
n_training_tokens: 48512
n_training_tokens: 48544
n_training_tokens: 48576
n_training_tokens: 48608
n_training_tokens: 48640
n_training_tokens: 48672
n_training_tokens: 48704
n_training_tokens: 48736
n_training_tokens: 48768
n_training_tokens: 48800
n_training_tokens: 48832
n_training_tokens: 48864
n_training_tokens: 48896
n_training_tokens: 48928
n_training_tokens: 48960
n_training_tokens: 48992
n_training_tokens: 49024
n_training_tokens: 49056
n_training_tokens: 49088
n_training_tokens: 49120
n_training_tokens: 49152
n_training_tokens: 49184


1500| l1_loss: 0.37315 | mse_loss: 34.35598: : 6144000it [00:05, 1201630.96it/s]


VBox(children=(Label(value='1.656 MB of 16.090 MB uploaded\r'), FloatProgress(value=0.10291741191242548, max=1…

0,1
details/current_l1_coefficient,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
details/current_learning_rate,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
details/n_training_tokens,▁▁▁▁▁▂▂▂▂▂▂▂▂▂▃▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▆▇▇▇████
losses/l1_loss,█▆▄▃▂▂▂▂▂▂▁▂▁▂▁▁▁▁▂▂▂▂▂▂▂▂▂▂▂▂▂▃▃▃▃▃▂▂▃▃
losses/mse_loss,█▇▆▇▇▅▅▄▃▄▃▃▃▃▃▂▂▂▂▂▂▂▂▂▂▂▂▂▁▂▁▁▂▁▁▁▁▁▁▁
losses/overall_loss,█▅▄▄▃▃▂▂▂▂▂▂▂▂▁▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
losses/raw_l1_loss,██▆▆▄▅▄▁▂▂▂▁▂▁▂▂▂▃▃▃▃▃▄▄▄▅▅▅▅▄▅▄▅▅▅▅▅▅▅▅
metrics/explained_variance,▁▆▇█▇███████████████████████████████████
metrics/explained_variance_std,█▃▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
metrics/l0,█▅▄▄▄▃▂▂▂▁▁▁▁▁▁▂▂▂▂▂▂▂▂▂▃▃▃▃▃▃▃▄▄▄▄▄▄▄▄▄

0,1
details/current_l1_coefficient,0.001
details/current_learning_rate,0.0003
details/n_training_tokens,50016.0
losses/l1_loss,361.56911
losses/mse_loss,36.51381
losses/overall_loss,36.87537
losses/raw_l1_loss,0.36157
metrics/explained_variance,0.15195
metrics/explained_variance_std,0.78309
metrics/l0,574.0625
