# Training validation

CUDA-based training, used to compare and validate loss curves across the various kernel implementations.

In [1]:
# Training-validation cell: builds an RWKV7 Goose model with the selected
# time-mix backend and runs SimpleTestTrainer on it, so the resulting loss curve
# can be compared against runs that use other kernel backends.
import os
import sys
import time

import torch

# Make the project root and its test helpers importable from this notebook
sys.path.append('../../')
sys.path.append('../../test')

# Ask the PyTorch CUDA caching allocator to use expandable segments
# (intended to reduce memory fragmentation; must be set before CUDA memory is allocated)
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

# Ensure the CUDA binaries are discoverable when the kernel extension is built.
#
# This has to go on the process PATH environment variable: sys.path only
# controls where Python looks for modules and has no effect on executable lookup.
# The directory is appended (not prepended) so an already configured toolchain
# keeps priority, and the membership check keeps cell re-runs idempotent.
CUDA_BIN_DIR = "/usr/local/cuda/bin"
current_path = os.environ.get("PATH", "")
if CUDA_BIN_DIR not in current_path.split(os.pathsep):
    os.environ["PATH"] = (
        f"{current_path}{os.pathsep}{CUDA_BIN_DIR}" if current_path else CUDA_BIN_DIR
    )

# Import the model classes
from rwkv_block.v7_goose.model.rwkv7_goose_model import RWKV7GooseModel
from trainer.SimpleTestTrainer import SimpleTestTrainer

# Device to run on
RUN_DEVICE = "cuda:0"

# If multiple cuda devices are available (8 or more),
# we use a different device, so that multiple notebooks can run in parallel
#
# Comment out this logic if you intend to manually set the device
if torch.cuda.device_count() >= 8:
    RUN_DEVICE = "cuda:2"

# Training batch size
# NOTE(review): the saved cell output reports `batch_size: 2`, so it was
# presumably produced with a different value - re-run the cell to refresh it.
BATCH_SIZE = 4

# Model shape and size
LAYER_COUNT = 12
DIM_SIZE = 512
TMIX_BACKEND = "cuda_ref"

# Create and initialize the model
model = RWKV7GooseModel({
    "n_layer": LAYER_COUNT,
    "n_dim": DIM_SIZE,
    "tmix_backend": TMIX_BACKEND,
    "device": RUN_DEVICE,
    "dtype": "bfloat16",
    # presumably sized for the tokenizer the trainer loads - TODO confirm
    "n_vocab": 50432
})
model.init_parameters()

# Setup the trainer
trainer = SimpleTestTrainer(model, device=RUN_DEVICE, batch_size=BATCH_SIZE)

# Trigger the train process
trainer.train()

  from .autonotebook import tqdm as notebook_tqdm


---------------------------------------------
[SimpleTestTrainer] Initializing the trainer for:  RWKV-Block.SimpleTestTrainer
- hf_dataset:          recursal/SuperWiki-Tiny
- dataset_ctx_length:  4096
- dataset_min_length:  4096
- tokenizer_name:      EleutherAI/gpt-neox-20b
- batch_size:          2
- learning_rate:       0.001
- num_epochs:          1
---------------------------------------------
[SimpleTestTrainer] Loading the tokenizer:  EleutherAI/gpt-neox-20b ...
[SimpleTestTrainer] Loading the dataset:  recursal/SuperWiki-Tiny ...


Generating train split: 718763 examples [00:12, 55543.29 examples/s]


[SimpleTestTrainer] Preparing the training dataset...


Map (num_proc=256): 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 718763/718763 [03:29<00:00, 3427.08 examples/s]
Filter (num_proc=256): 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 718763/718763 [00:20<00:00, 35087.79 examples/s]
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mpicocreator[0m ([33mrwkv-x-dev[0m). Use [1m`wandb login --relogin`[0m to force relogin


[SimpleTestTrainer] Training dataset size:    718044
[SimpleTestTrainer] Validation dataset size:  719
[SimpleTestTrainer] Preparing the data loaders...
[SimpleTestTrainer] Training batch count:    359022
[SimpleTestTrainer] Validation batch count:  359
[SimpleTestTrainer] Setting up the optimizer, loss function...
[SimpleTestTrainer] Initializing wandb...


[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


[SimpleTestTrainer] wandb is logged in.


[SimpleTestTrainer] Initialization complete.
---------------------------------------------
Epoch 1/1


Training:   0%|                                                                                                                                                          | 0/359022 [00:00<?, ?it/s]



Using /home/recursal/.cache/torch_extensions/py312_cu121 as PyTorch extensions root...
Detected CUDA files, patching ldflags
Emitting ninja build file /home/recursal/.cache/torch_extensions/py312_cu121/wind_backstepping/build.ninja...
If this is not desired, please set os.environ['TORCH_CUDA_ARCH_LIST'].
Building extension module wind_backstepping...
Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)


[1/3] /home/recursal/miniconda3/envs/py-3-12/bin/nvcc --generate-dependencies-with-compile --dependency-output wkv7_cuda.cuda.o.d -ccbin /home/recursal/miniconda3/envs/py-3-12/bin/x86_64-conda-linux-gnu-cc -DTORCH_EXTENSION_NAME=wind_backstepping -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\"_gcc\" -DPYBIND11_STDLIB=\"_libstdcpp\" -DPYBIND11_BUILD_ABI=\"_cxxabi1011\" -isystem /home/recursal/miniconda3/envs/py-3-12/lib/python3.12/site-packages/torch/include -isystem /home/recursal/miniconda3/envs/py-3-12/lib/python3.12/site-packages/torch/include/torch/csrc/api/include -isystem /home/recursal/miniconda3/envs/py-3-12/lib/python3.12/site-packages/torch/include/TH -isystem /home/recursal/miniconda3/envs/py-3-12/lib/python3.12/site-packages/torch/include/THC -isystem /home/recursal/miniconda3/envs/py-3-12/include -isystem /home/recursal/miniconda3/envs/py-3-12/include/python3.12 -D_GLIBCXX_USE_CXX11_ABI=0 -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_B

Loading extension module wind_backstepping...
Training:   0%|                                                                                                                                  | 331/359022 [01:25<25:42:32,  3.88it/s, loss=3.08]


KeyboardInterrupt: 