In [1]:

import torch
def load_tensor(path):
    tensors = {}
    try:
        tensors["hidden_states"] = torch.load(path + "hidden_states.pt").to("cuda:1")
        tensors["w1"] = torch.load(path + "w1.pt").to("cuda:1")
        tensors["w2"] = torch.load(path + "w2.pt").to("cuda:1")
        tensors["topk_weights"] = torch.load(path + "topk_weights.pt").to("cuda:1")
        tensors["topk_ids"] = torch.load(path + "topk_ids.pt").to("cuda:1")
        tensors["expert_map"] = torch.load(path + "expert_map.pt").to("cuda:1")
        tensors["out_hidden_states"] = torch.load(path + "out_hidden_states.pt").to("cuda:1")
        tensors["final_hidden_states"] = torch.load(path + "final_hidden_states.pt").to("cuda:1")
    except FileNotFoundError as e:
        print(f"Error: File not found - {e}")
    except RuntimeError as e:
        print(f"Error: CUDA runtime issue - {e}")
    except Exception as e:
        print(f"Unexpected error: {e}")

    return tensors
rank1 = load_tensor("/home/ubuntu/vllm_test_field/saved_tensors/rank_1/")
rank0 = load_tensor("/home/ubuntu/vllm_test_field/saved_tensors/rank_0/")


In [2]:
print( torch.equal(rank1["hidden_states"], rank0["hidden_states"]) )
print( torch.equal(rank1["topk_weights"], rank0["topk_weights"]) )
print( torch.equal(rank1["topk_ids"], rank0["topk_ids"]) )
print( torch.equal(rank1["final_hidden_states"], rank0["final_hidden_states"]) )

True
False
True
True


In [3]:
rank0["shared_output"] = torch.load("/home/ubuntu/vllm_test_field/saved_tensors/rank_0/shared_output.pt").to("cuda:1")
rank1["shared_output"] = torch.load("/home/ubuntu/vllm_test_field/saved_tensors/rank_1/shared_output.pt").to("cuda:1")


In [4]:
print( torch.equal(rank1["w1"], rank0["w1"]) )
print( torch.equal(rank1["w2"], rank0["w2"]) )
print( torch.equal(rank1["expert_map"], rank0["expert_map"]) )
print( torch.equal(rank1["out_hidden_states"], rank0["out_hidden_states"]) )
print( torch.equal(rank1["shared_output"], rank0["shared_output"]) )
print(rank0["expert_map"])
print(rank1["expert_map"])

False
False
False
False
False
tensor([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
        18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, -1, -1, -1, -1, -1, -1,
        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
        -1, -1, -1, -1, -1, -1], device='cuda:1', dtype=torch.int32)
tensor([-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,  0,  1,  2,  3,  4,  5,
         6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23,
        24, 25, 26, 27, 28, 29], device='cuda:1', dtype=torch.int32)


In [5]:
print(rank1["topk_ids"])

tensor([[27, 49, 41,  2],
        [59,  3, 31, 41],
        [24, 11, 52, 48],
        [29, 53, 47, 13],
        [13, 11, 52, 16],
        [ 1, 11, 26, 17]], device='cuda:1', dtype=torch.int32)


In [6]:
# replica expert 59 , add new expert id = 60
# modify rank0 weight
print(rank1["w1"].shape) # torch.Size([30, 2816, 2048])
new_tensor = rank1["w1"][-1, :, :]
new_tensor = new_tensor.to(rank1["w1"].device)
rank1["w1"] = torch.cat((rank1["w1"], new_tensor.unsqueeze(0)), dim=0)
print(rank1["w1"].shape) # torch.Size([31, 2816, 2048])

print(rank1["w2"].shape) # torch.Size([30, 2048, 1408])
new_tensor = rank1["w2"][-1, :, :]
new_tensor = new_tensor.to(rank1["w2"].device)
rank1["w2"] = torch.cat((rank1["w2"], new_tensor.unsqueeze(0)), dim=0)
print(rank1["w2"].shape) # torch.Size([31, 2048, 1408])

#  modify topk_ids, 59-->60
print(rank1["topk_ids"])
rank0["topk_ids"][1][1]= 60
rank1["topk_ids"][1][1] = 60
print(rank1["topk_ids"])
# modify expert_map
rank0["expert_map"] = torch.cat((rank0["expert_map"], torch.tensor([-1]).to(rank0["expert_map"].device)))
print(rank0["expert_map"])
rank1["expert_map"] = torch.cat((rank1["expert_map"], torch.tensor([30]).to(rank1["expert_map"].device)))
print(rank1["expert_map"])

torch.Size([30, 2816, 2048])
torch.Size([31, 2816, 2048])
torch.Size([30, 2048, 1408])
torch.Size([31, 2048, 1408])
tensor([[27, 49, 41,  2],
        [59,  3, 31, 41],
        [24, 11, 52, 48],
        [29, 53, 47, 13],
        [13, 11, 52, 16],
        [ 1, 11, 26, 17]], device='cuda:1', dtype=torch.int32)
tensor([[27, 49, 41,  2],
        [59, 60, 31, 41],
        [24, 11, 52, 48],
        [29, 53, 47, 13],
        [13, 11, 52, 16],
        [ 1, 11, 26, 17]], device='cuda:1', dtype=torch.int32)
tensor([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
        18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, -1, -1, -1, -1, -1, -1,
        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
        -1, -1, -1, -1, -1, -1, -1], device='cuda:1')
tensor([-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,  0,  1,  2,  3,  4,  5,
         6,  7,  8,  9, 10, 11, 12, 13, 14, 

In [7]:
from vllm.model_executor.layers.fused_moe.fused_moe import fused_experts_impl 
# forward of rank0 moe layer
rank0_output = fused_experts_impl(
    hidden_states = rank0["hidden_states"],
    w1 = rank0["w1"],
    w2 = rank0["w2"],
    topk_weights = rank0["topk_weights"],
    topk_ids = rank0["topk_ids"],
    inplace = True,
    activation = "silu",
    expert_map = rank0["expert_map"],
    global_num_experts = 61 
)
print(rank0_output.shape)
print( torch.equal(rank0_output , rank0["out_hidden_states"]) )

  from .autonotebook import tqdm as notebook_tqdm


INFO 05-08 15:42:35 [__init__.py:239] Automatically detected platform cuda.


2025-05-08 15:42:36,573	INFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.


INFO 05-08 15:42:37 [fused_moe.py:1658] expert_ids.shape: torch.Size([59])


RuntimeError: Triton Error [CUDA]: an illegal memory access was encountered

In [7]:
# forward of rank1 moe layer
rank1_output = fused_experts_impl(
    hidden_states = rank1["hidden_states"],
    w1 = rank1["w1"],
    w2 = rank1["w2"],
    topk_weights = rank1["topk_weights"],
    topk_ids = rank1["topk_ids"],
    inplace = True,
    activation = "silu",
    expert_map = rank1["expert_map"],
    global_num_experts = 61
)
print(rank1_output.shape)
print( torch.equal(rank1_output , rank1["out_hidden_states"]) )

INFO 05-08 15:26:59 [fused_moe.py:1658] expert_ids.shape: torch.Size([59])
torch.Size([6, 2048])
False


In [18]:
rank0_final_hidden_states = rank0_output + rank0["shared_output"]
rank1_final_hidden_states = rank1_output + rank1["shared_output"]
# all reduce 
reduced_result = rank0_final_hidden_states + rank1_final_hidden_states
print( torch.equal(reduced_result, 
                   rank0["final_hidden_states"]) )

print( torch.equal(reduced_result, 
                   rank1["final_hidden_states"]) )

True
True
