In [1]:
# Configure the parent path to be the proj folder
import sys, os, torch, time
sys.path.append('../../')

# Import the model classes
from rwkv.model.rwkv5_eagle_model import RWKV5EagleModel
from rwkv.model.rwkv5_eagle_config_map import RWKV5EagleConfigMap

# File to load
MODEL_FILENAME="v5-0B4.pth"

# Location of the weights file, built once so the existence check and the
# actual load below always refer to the same path
MODEL_PATH = f"./.model/{MODEL_FILENAME}"

# Run device and run dtype to use: the first CUDA device when one is
# available, otherwise the CPU
RUN_DEVICE = "cuda:0" if torch.cuda.is_available() else "cpu"
RUN_DTYPE = torch.bfloat16

# Check that the reference weights exist before attempting to load them
assert os.path.exists(MODEL_PATH), "The reference weights does not exist. Please download it first (00-model-download.ipynb)"

# Load the model weights (mapped to the CPU; the run device is applied later
# when the model config is built)
model_weight = torch.load(MODEL_PATH, map_location='cpu', weights_only=True, mmap=True)

# Model filename
print(f"### Model filename: {MODEL_FILENAME}")

# Get n_dim from the second dimension of the embedding matrix
n_dim = model_weight['emb.weight'].shape[1]
print(f"### Model n_dim: {n_dim}")

# List the model weights keys, with the shape and dtype of each tensor
print("### model weights keys:")
for key, weight in model_weight.items():
    print(f"{key}: {weight.shape} - {weight.dtype}")

### Model filename: v5-0B4.pth
### Model n_dim: 1024
### model weights keys:
emb.weight: torch.Size([65536, 1024]) - torch.bfloat16
blocks.0.ln1.weight: torch.Size([1024]) - torch.bfloat16
blocks.0.ln1.bias: torch.Size([1024]) - torch.bfloat16
blocks.0.ln2.weight: torch.Size([1024]) - torch.bfloat16
blocks.0.ln2.bias: torch.Size([1024]) - torch.bfloat16
blocks.0.ln0.weight: torch.Size([1024]) - torch.bfloat16
blocks.0.ln0.bias: torch.Size([1024]) - torch.bfloat16
blocks.0.att.time_mix_k: torch.Size([1, 1, 1024]) - torch.bfloat16
blocks.0.att.time_mix_v: torch.Size([1, 1, 1024]) - torch.bfloat16
blocks.0.att.time_mix_r: torch.Size([1, 1, 1024]) - torch.bfloat16
blocks.0.att.time_mix_g: torch.Size([1, 1, 1024]) - torch.bfloat16
blocks.0.att.time_decay: torch.Size([16, 64]) - torch.bfloat16
blocks.0.att.time_faaaa: torch.Size([16, 64]) - torch.bfloat16
blocks.0.att.receptance.weight: torch.Size([1024, 1024]) - torch.bfloat16
blocks.0.att.key.weight: torch.Size([1024, 1024]) - torch.bfloat16

In [2]:
BATCH_SIZE=1
TEST_COUNT=1000
TEST_LOOP=1
# GPU_COUNT=1

@torch.inference_mode()
def testForwardPass(smodel, compiled=False):
    # Lets prepare the states accordingly
    in_state = smodel.init_state(BATCH_SIZE)
    out_state = smodel.init_state(BATCH_SIZE)
    x_tokens = torch.ones(BATCH_SIZE, 1, device=smodel.emb.weight.device, dtype=torch.int)
    # out_emb = torch.zeros(BATCH_SIZE, 1, n_dim, device=smodel.emb.weight.device, dtype=smodel.emb.weight.dtype)

    # Lets test more aggressively
    time0 = time.time()
    if compiled:
        for i in range(TEST_COUNT):
            smodel.forward_with_compile(x_tokens, in_state, out_state)
    else:
        for i in range(TEST_COUNT):
            smodel.forward(x_tokens, in_state, out_state)
    time1 = time.time()

    print("--")
    print(f"### Compiled: {compiled}")
    print("--")
    print(f"### (warmup) Avg time per token batch ({BATCH_SIZE}):", (time1-time0)*1000/TEST_COUNT, "ms")
    print(f"### (warmup) Avg tok/s batch ({BATCH_SIZE}) :", 1000/((time1-time0)*1000/TEST_COUNT), "tok/s")
    print(f"### (warmup) Avg time per token unbatched :", (time1-time0)*1000/TEST_COUNT/BATCH_SIZE, "ms")
    print(f"### (warmup) Avg tok/s unbatched :", 1000/((time1-time0)*1000/TEST_COUNT/BATCH_SIZE), "tok/s")
    # print(f"### (warmup) Avg tok/s unbatched / gpu :", (1000/((time1-time0)*1000/TEST_COUNT/BATCH_SIZE))/GPU_COUNT, "tok/s")

    for i in range(TEST_LOOP):
        time0 = time.time()
        if compiled:
            for i in range(TEST_COUNT):
                smodel.forward_with_compile(x_tokens, in_state, out_state)
        else:
            for i in range(TEST_COUNT):
                smodel.forward(x_tokens, in_state, out_state)
        time1 = time.time()
        print("--")
        print(f"### (actual) Avg time per token batch ({BATCH_SIZE}):", (time1-time0)*1000/TEST_COUNT, "ms")
        print(f"### (actual) Avg tok/s batch ({BATCH_SIZE}) :", 1000/((time1-time0)*1000/TEST_COUNT), "tok/s")
        print(f"### (actual) Avg time per token unbatched :", (time1-time0)*1000/TEST_COUNT/BATCH_SIZE, "ms")
        print(f"### (actual) Avg tok/s unbatched :", 1000/((time1-time0)*1000/TEST_COUNT/BATCH_SIZE), "tok/s")
        # print(f"### (actual) Avg tok/s unbatched / gpu :", (1000/((time1-time0)*1000/TEST_COUNT/BATCH_SIZE))/GPU_COUNT, "tok/s")

# Derive the model configuration from the loaded weights, targeting the
# selected run device and dtype
model_config = RWKV5EagleConfigMap.from_model_state_dict(
    model_weight,
    device=RUN_DEVICE,
    dtype=RUN_DTYPE,
)

# Show the derived configuration
print("### Model Config:")
print(model_config)

# Build the model from the config, load the reference weights into it, and
# grab the resulting state dict for inspection
model_inst = RWKV5EagleModel(model_config)
model_inst.load_from_model_state_dict(model_weight)
model_state = model_inst.state_dict()

# Dump every entry of the instantiated model's state dict with its shape and dtype
print("### model weights keys:")
for key, tensor in model_state.items():
    print(f"{key}: {tensor.shape} - {tensor.dtype}")


### Model Config:
RWKV5EagleConfigMap(n_layer=24, n_dim=1024, head_size=64, head_size_divisor=8, dropout_rate=0.0, n_dim_ffn=3584, n_dim_att=1024, layer_id=None, n_head=None, device='cuda:0', dtype=torch.bfloat16, n_vocab=65536, init_state_wkv=False)


### model weights keys:
emb.weight: torch.Size([65536, 1024]) - torch.bfloat16
blocks.0.ln1.weight: torch.Size([1024]) - torch.bfloat16
blocks.0.ln1.bias: torch.Size([1024]) - torch.bfloat16
blocks.0.ln2.weight: torch.Size([1024]) - torch.bfloat16
blocks.0.ln2.bias: torch.Size([1024]) - torch.bfloat16
blocks.0.ln0.weight: torch.Size([1024]) - torch.bfloat16
blocks.0.ln0.bias: torch.Size([1024]) - torch.bfloat16
blocks.0.att.time_mix_k: torch.Size([1, 1, 1024]) - torch.bfloat16
blocks.0.att.time_mix_v: torch.Size([1, 1, 1024]) - torch.bfloat16
blocks.0.att.time_mix_r: torch.Size([1, 1, 1024]) - torch.bfloat16
blocks.0.att.time_mix_g: torch.Size([1, 1, 1024]) - torch.bfloat16
blocks.0.att.time_decay: torch.Size([16, 64]) - torch.bfloat16
blocks.0.att.time_faaaa: torch.Size([16, 64]) - torch.bfloat16
blocks.0.att.receptance.weight: torch.Size([1024, 1024]) - torch.bfloat16
blocks.0.att.key.weight: torch.Size([1024, 1024]) - torch.bfloat16
blocks.0.att.value.weight: torch.Size([1024, 1024]) - torch.bfloat16

In [3]:
# Benchmark the single token forward pass through the plain (non-compiled) forward
testForwardPass(model_inst, compiled=False)

--
### Compiled: False
--
### (warmup) Avg time per token batch (1): 16.194694757461548 ms
### (warmup) Avg tok/s batch (1) : 61.74861675236328 tok/s
### (warmup) Avg time per token unbatched : 16.194694757461548 ms
### (warmup) Avg tok/s unbatched : 61.74861675236328 tok/s


In [None]:
# Benchmark the single token forward pass through forward_with_compile
# NOTE(review): the recorded output of this cell is a TorchRuntimeError raised
# inside forward_with_compile (out_emb.copy_ between a 1024-wide and a
# 65536-wide tensor) - the model code needs fixing before this benchmark runs.
testForwardPass(model_inst, compiled=True)

TorchRuntimeError: Failed running call_method copy_(*(FakeTensor(..., device='cuda:0', size=(1, 1, 1024), dtype=torch.bfloat16), FakeTensor(..., device='cuda:0', size=(1, 1, 65536), dtype=torch.bfloat16)), **{'non_blocking': True}):
expand: attempting to expand a dimension of length 65536!

from user code:
   File "/home/picocreator/recursal/RWKV-block/test/v5_eagle/../../rwkv/model/rwkv5_eagle_model.py", line 196, in forward_with_compile
    out_emb.copy_(x_emb, non_blocking=True)

Set TORCH_LOGS="+dynamo" and TORCHDYNAMO_VERBOSE=1 for more information


You can suppress this exception and fall back to eager by setting:
    import torch._dynamo
    torch._dynamo.config.suppress_errors = True
