# Auto resume checkpoint testing

This notebook helps validate that various commands can be "auto-resumed" after different kinds of interruptions.

Manual interruption may be needed for several segments.

In [1]:
# Create all of the required directories up front, in a single call
# (`-p` means no error is raised when a directory already exists)
!mkdir -p ../../model/ ../../datapath/ ../../checkpoint/

# Model init, with skip on rerun

We initialize a model, and skip the initialization if the model file already exists.

In [4]:
# Let's initialize the L6-D512 model with the init_model.py code.
# The first run should do the full init, if the file does not exist yet
# (--skip-if-exists makes a later rerun skip the init instead).
!cd ../../RWKV-v4neo/ && python3 init_model.py --skip-if-exists --n_layer 6 --n_embd 512 --vocab_size neox ../model/L6-D512-neox-init.pth

[2023-08-03 07:21:07,341] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[RWKV.model] Running RWKV model using 'torch-jit' with torch '2.1.0.dev20230706'
---- Initializing model ----
No of layers: 6
Embedding size: 512
Output model path: ../model/L6-D512-neox-init.pth
Vocab size: 50277
---- ----- ----
Using /home/ubuntu/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...
Detected CUDA files, patching ldflags
Emitting ninja build file /home/ubuntu/.cache/torch_extensions/py311_cu118/wkv_1_bf16/build.ninja...
Building extension module wkv_1_bf16...
Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)
ninja: no work to do.
Loading extension module wkv_1_bf16...
[RWKV.model]: Finished initial model load
50277 512   -0.1 emb.weight
512   512   0    blocks.0.att.key.weight
512   512   1.0  blocks.0.att.value.weight
512   512   0    blocks.0.att.receptance.weight
512   512

In [5]:
# Let's run it again with the same arguments; it should skip the init this time round,
# because the output model file now exists.
!cd ../../RWKV-v4neo/ && python3 init_model.py --skip-if-exists --n_layer 6 --n_embd 512 --vocab_size neox ../model/L6-D512-neox-init.pth

[2023-08-03 07:21:25,807] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[RWKV.model] Running RWKV model using 'torch-jit' with torch '2.1.0.dev20230706'
---- Initializing model ----
No of layers: 6
Embedding size: 512
Output model path: ../model/L6-D512-neox-init.pth
Vocab size: 50277
---- ----- ----
Model exists, skipping init_model


# Datapath setup, cache skip if it's done

The datapath setup is, by design, already safe to rerun.

In [6]:
# Let's preload the required dataset, as described by the trainer config file
!cd ../../RWKV-v4neo && python3 preload_datapath.py ../notebook/trainer-validation/config/ckpt-auto-resume-test-1024.yaml

Found cached dataset parquet (/home/ubuntu/.cache/huggingface/datasets/teven___parquet/teven--enwiki_10k-de63a925546e70ab/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7)
100%|███████████████████████████████████████████| 1/1 [00:00<00:00, 1002.94it/s]
                                                                                

In [7]:
# Let's run it again with the same config! (Quick cache validation!)
!cd ../../RWKV-v4neo && python3 preload_datapath.py ../notebook/trainer-validation/config/ckpt-auto-resume-test-1024.yaml

Found cached dataset parquet (/home/ubuntu/.cache/huggingface/datasets/teven___parquet/teven--enwiki_10k-de63a925546e70ab/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7)
100%|████████████████████████████████████████████| 1/1 [00:00<00:00, 999.36it/s]
Loading cached processed dataset at /home/ubuntu/.cache/huggingface/datasets/teven___parquet/teven--enwiki_10k-de63a925546e70ab/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7/cache-01e7352a6ba33e03_*_of_00032.arrow
Loading cached processed dataset at /home/ubuntu/.cache/huggingface/datasets/teven___parquet/teven--enwiki_10k-de63a925546e70ab/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7/cache-295f4a3681ee7c83_*_of_00032.arrow
Loading cached processed dataset at /home/ubuntu/.cache/huggingface/datasets/teven___parquet/teven--enwiki_10k-de63a925546e70ab/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7/cache-aecf54c7557ed3b6_*_of_00032.arrow
        

# Model training run!

Everything above should have been done with manual supervision (to make sure it has all completed cleanly). The remaining steps can, in concept, be left to "auto run" / "auto resume" on a pre-emptible instance.

In [9]:
# Let's start a simple run. Interrupt this manually after 32+ steps,
# at which point it should have created a checkpoint.
#
# With --auto-resume-ckpt-dir "auto", the trainer extracts the checkpoint dir from the config file.
!cd ../../RWKV-v4neo && python3 lightning_trainer.py fit \
    -c ../notebook/trainer-validation/config/ckpt-auto-resume-test-1024.yaml \
    --auto-resume-ckpt-dir "auto"

[RWKV.lightning_trainer.py]: Running with PYTORCH_CUDA_ALLOC_CONF=backend:cudaMallocAsync
[RWKV.lightning_trainer.py] Extracting checkpoint dir from config, for --auto-resume-ckpt-dir=auto
[RWKV.lightning_trainer.py] Enabling --auto-resume-ckpt-dir=../checkpoint/trainer-validaiton/ckpt-auto-resume-test --auto-resume-ckpt-mode=2nd-last
[RWKV.lightning_trainer.py] No checkpoints found in '../checkpoint/trainer-validaiton/ckpt-auto-resume-test', starting from scratch


[2023-08-03 07:41:08,278] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[RWKV.model] Running RWKV model using 'torch-jit' with torch '2.1.0.dev20230706'
  rank_zero_warn(
  rank_zero_warn(f"No seed found, seed set to {seed}")
Global seed set to 1114373010
[34m[1mwandb[0m: Currently logged in as: [33mpicocreator[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: wandb version 0.15.8 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade
[34m[1mwandb[0m: Tracking run with wandb version 0.15.5
[34m[1mwandb[0m: Run data is saved locally in [35m[1m./wandb/run-20230803_074110-aacg83ak[0m
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.
[34m[1mwandb[0m: Syncing run [33minfctx-validation ckpt-test (train-ctx=1024, data-ctx=1024)[0m
[34m[1mwandb[0m: ⭐️ View project at [34m[4mhttps://wandb.ai/picocreator/RWKV-InfCtx-Validation[0m
[34m[1mwandb

In [10]:
# Let's run it again, and confirm that it resumes from the checkpoint.
# Interrupt this manually again at a later stage.
!cd ../../RWKV-v4neo && python3 lightning_trainer.py fit \
    -c ../notebook/trainer-validation/config/ckpt-auto-resume-test-1024.yaml \
    --auto-resume-ckpt-dir "auto"

[RWKV.lightning_trainer.py] Running with PYTORCH_CUDA_ALLOC_CONF=backend:cudaMallocAsync
[RWKV.lightning_trainer.py] Extracting checkpoint dir from config, for --auto-resume-ckpt-dir=auto
[RWKV.lightning_trainer.py] Enabling --auto-resume-ckpt-dir=../checkpoint/trainer-validaiton/ckpt-auto-resume-test --auto-resume-ckpt-mode=2nd-last
[RWKV.lightning_trainer.py] Found 5 checkpoints in '../checkpoint/trainer-validaiton/ckpt-auto-resume-test', using 'last.ckpt'
[2023-08-03 07:43:47,587] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[RWKV.model] Running RWKV model using 'torch-jit' with torch '2.1.0.dev20230706'
  rank_zero_warn(
  rank_zero_warn(f"No seed found, seed set to {seed}")
Global seed set to 636907093
[34m[1mwandb[0m: Currently logged in as: [33mpicocreator[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: wandb version 0.15.8 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install 

In [11]:
# Let's run it again, and this time let it run to completion (no manual interruption)
!cd ../../RWKV-v4neo && python3 lightning_trainer.py fit \
    -c ../notebook/trainer-validation/config/ckpt-auto-resume-test-1024.yaml \
    --auto-resume-ckpt-dir "auto"

[RWKV.lightning_trainer.py] Running with PYTORCH_CUDA_ALLOC_CONF=backend:cudaMallocAsync
[RWKV.lightning_trainer.py] Extracting checkpoint dir from config, for --auto-resume-ckpt-dir=auto
[RWKV.lightning_trainer.py] Enabling --auto-resume-ckpt-dir=../checkpoint/trainer-validaiton/ckpt-auto-resume-test --auto-resume-ckpt-mode=2nd-last
[RWKV.lightning_trainer.py] Found 5 checkpoints in '../checkpoint/trainer-validaiton/ckpt-auto-resume-test', using 'last.ckpt'
[2023-08-03 07:46:48,809] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[RWKV.model] Running RWKV model using 'torch-jit' with torch '2.1.0.dev20230706'
  rank_zero_warn(
  rank_zero_warn(f"No seed found, seed set to {seed}")
Global seed set to 655468113
[34m[1mwandb[0m: Currently logged in as: [33mpicocreator[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: wandb version 0.15.8 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install 

In [12]:
# Let's see if there are any issues when rerunning after completion
!cd ../../RWKV-v4neo && python3 lightning_trainer.py fit \
    -c ../notebook/trainer-validation/config/ckpt-auto-resume-test-1024.yaml \
    --auto-resume-ckpt-dir "auto"

[RWKV.lightning_trainer.py] Running with PYTORCH_CUDA_ALLOC_CONF=backend:cudaMallocAsync
[RWKV.lightning_trainer.py] Extracting checkpoint dir from config, for --auto-resume-ckpt-dir=auto
[RWKV.lightning_trainer.py] Enabling --auto-resume-ckpt-dir=../checkpoint/trainer-validaiton/ckpt-auto-resume-test --auto-resume-ckpt-mode=2nd-last
[RWKV.lightning_trainer.py] Found 5 checkpoints in '../checkpoint/trainer-validaiton/ckpt-auto-resume-test', using 'epoch=0-step=332.ckpt'
[2023-08-03 08:09:00,684] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[RWKV.model] Running RWKV model using 'torch-jit' with torch '2.1.0.dev20230706'
  rank_zero_warn(
  rank_zero_warn(f"No seed found, seed set to {seed}")
Global seed set to 663606280
[34m[1mwandb[0m: Currently logged in as: [33mpicocreator[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: wandb version 0.15.8 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ 

In [13]:
# The downside of using "2nd last" is that the `save_on_train_epoch_end: true` flag will only update the last.ckpt,
# meaning we may end up retraining the last few steps repeatedly, if not handled carefully in notebooks.
#
# In most cases this is not an issue, but you can trade away that resilience protection
# in exchange for truly loading the last checkpoint instead, if needed
# (by passing --auto-resume-ckpt-mode "last", as done below).
!cd ../../RWKV-v4neo && python3 lightning_trainer.py fit \
    -c ../notebook/trainer-validation/config/ckpt-auto-resume-test-1024.yaml \
    --auto-resume-ckpt-dir "auto" \
    --auto-resume-ckpt-mode "last"

[RWKV.lightning_trainer.py] Running with PYTORCH_CUDA_ALLOC_CONF=backend:cudaMallocAsync
[RWKV.lightning_trainer.py] Extracting checkpoint dir from config, for --auto-resume-ckpt-dir=auto
[RWKV.lightning_trainer.py] Enabling --auto-resume-ckpt-dir=../checkpoint/trainer-validaiton/ckpt-auto-resume-test --auto-resume-ckpt-mode=last
[RWKV.lightning_trainer.py] Found 5 checkpoints in '../checkpoint/trainer-validaiton/ckpt-auto-resume-test', using 'last.ckpt'
[2023-08-03 08:17:08,832] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[RWKV.model] Running RWKV model using 'torch-jit' with torch '2.1.0.dev20230706'
  rank_zero_warn(
  rank_zero_warn(f"No seed found, seed set to {seed}")
Global seed set to 4260174562
[34m[1mwandb[0m: Currently logged in as: [33mpicocreator[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: wandb version 0.15.8 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wan