## See how many GPUs we have

In [1]:
import torch

def check_gpu_memory_simple():
    if not torch.cuda.is_available():
        print("No available CUDA GPU")
        return
    
    for i in range(torch.cuda.device_count()):
        name = torch.cuda.get_device_name(i)
        total = torch.cuda.get_device_properties(i).total_memory / (1024 ** 3)  # GB
        allocated = torch.cuda.memory_allocated(i) / (1024 ** 3)  # GB
        free = (torch.cuda.get_device_properties(i).total_memory - torch.cuda.memory_reserved(i)) / (1024 ** 3)  # GB
        
        print(f"GPU {i} ({name}):")
        print(f"  Total Memory: {total:.2f} GB")
        print(f"  Used Memory: {allocated:.2f} GB")
        print(f"  Available Memory: {free:.2f} GB")

check_gpu_memory_simple()

GPU 0 (NVIDIA A10G):
  Total Memory: 21.99 GB
  Used Memory: 0.00 GB
  Available Memory: 21.99 GB


## Plot Expert load function

In [2]:
from collections import Counter
import matplotlib.pyplot as plt
import os
 
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

def my_plot(topk_ids, selected=None, new_id=None):
    count = Counter(topk_ids.flatten().tolist())
    sorted_counts = sorted(count.items(), key=lambda x: x[0])
    elements = [k for k, v in sorted_counts]
    values = [v for k, v in sorted_counts] 
    colors = [  'skyblue' for v in values]
    if selected != None:
        for s in selected:
            colors[s] = 'green'
    if new_id!= None:
        for n in new_id:
            colors[n] = 'red'
    # 绘制柱状图
    plt.figure(figsize=(8, 5))
    plt.bar(elements, values, color=colors)
    plt.title('Frequency of Each Element', fontsize=14)
    plt.xlabel('Element', fontsize=12)
    plt.ylabel('Frequency', fontsize=12)
    for i, value in enumerate(values):
        plt.text(elements[i], value + 0.1, str(value), ha='center', va='bottom', fontsize=10)

    plt.grid(True, linestyle='--', alpha=0.5)
    plt.tight_layout()
    plt.show()


## Initialize workers

In [3]:

from vllm.model_executor.layers.fused_moe.fused_moe import fused_experts_impl 
import transformers
from transformers import Qwen2MoeConfig 
import torch
import os

# 假设已经加载了 model
from transformers import AutoTokenizer, AutoModelForCausalLM

model_name = "Qwen/Qwen1.5-MoE-A2.7B"
config = transformers.AutoConfig.from_pretrained("Qwen/Qwen1.5-MoE-A2.7B")


WORLD_SIZE =  4
TOKEN_NUM= 1000
workers = [dict() for _ in range(WORLD_SIZE)]
for i in range(0,len(workers)):
    workers[i]["world_size"] = WORLD_SIZE
    workers[i]["rank"] =   i
print(workers)
 


  from .autonotebook import tqdm as notebook_tqdm


INFO 05-18 13:42:17 [__init__.py:239] Automatically detected platform cuda.


2025-05-18 13:42:18,365	INFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.


[{'world_size': 4, 'rank': 0}, {'world_size': 4, 'rank': 1}, {'world_size': 4, 'rank': 2}, {'world_size': 4, 'rank': 3}]


## Initialize rank weight in the original track

In [4]:
def init_rank_weight(world_size, config ):
    expert_num = config.num_experts
    avg_expert = expert_num // world_size
    w13_weight = torch.nn.Parameter(torch.randn(
            avg_expert,
            2 * config.moe_intermediate_size,
        config.hidden_size,
        dtype=torch.bfloat16), requires_grad=False).to("cuda")
    print(w13_weight.shape)
    w2_weight =torch.nn.Parameter(torch.randn(
        avg_expert,
        config.hidden_size,
        config.moe_intermediate_size ,
        dtype=torch.bfloat16),
        requires_grad=False).to("cuda")
    print(w2_weight.shape) 
    return w13_weight, w2_weight

for worker in workers:
    worker["w1"] , worker["w2"] = init_rank_weight(WORLD_SIZE, config)


torch.Size([15, 2816, 2048])
torch.Size([15, 2048, 1408])
torch.Size([15, 2816, 2048])
torch.Size([15, 2048, 1408])
torch.Size([15, 2816, 2048])
torch.Size([15, 2048, 1408])
torch.Size([15, 2816, 2048])
torch.Size([15, 2048, 1408])


## Prepare input for normal vllm track and expert-duplication containers

In [5]:


def create_input(token_num, config):
    topk_ids = torch.randint(0, config.num_experts, (token_num, config.num_experts_per_tok), dtype=torch.int32).to("cuda")
    topk_weights = torch.randn(token_num, config.num_experts_per_tok, dtype=torch.float32).to("cuda")
    hidden_states = torch.randn( token_num, config.hidden_size, dtype=torch.bfloat16).to("cuda") # 
    print("topk_ids.shape", topk_ids.shape, topk_ids.dtype)
    print("topk_weights.shape", topk_weights.shape, topk_weights.dtype)
    print("hidden_states.shape", hidden_states.shape, hidden_states.dtype)
    return {"topk_ids":topk_ids,"topk_weights":topk_weights, "hidden_states":hidden_states }

inputs = create_input( TOKEN_NUM, config)


topk_ids.shape torch.Size([1000, 4]) torch.int32
topk_weights.shape torch.Size([1000, 4]) torch.float32
hidden_states.shape torch.Size([1000, 2048]) torch.bfloat16


## Initialize expert map for boths path

In [6]:
def init_expert_map(rank, world_size, total_expert_num):
    avg_expert_num = total_expert_num // world_size
    real_local_expert_ids = list(range(rank * avg_expert_num, (rank + 1) * avg_expert_num))
    expert_map = [-1] * total_expert_num
    for local_id, global_id in enumerate(real_local_expert_ids):
        expert_map[global_id] = local_id
    return torch.tensor(expert_map, device="cuda", dtype=torch.int32)

for worker in workers:
    worker["expert_map"]  = init_expert_map( worker["rank"],WORLD_SIZE,  config.num_experts)
    print(worker["expert_map"], worker["expert_map"].shape)
    


tensor([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, -1, -1, -1,
        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
        -1, -1, -1, -1, -1, -1], device='cuda:0', dtype=torch.int32) torch.Size([60])
tensor([-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,  0,  1,  2,
         3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, -1, -1, -1, -1, -1, -1,
        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
        -1, -1, -1, -1, -1, -1], device='cuda:0', dtype=torch.int32) torch.Size([60])
tensor([-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,  0,  1,  2,  3,  4,  5,
         6,  7,  8,  9, 10, 11, 12, 13, 14, -1, -1, -1, -1, -1, -1, -1, -1, -1,
        -1, -1, -1, -1, -1, -1], device='cuda:0', dtype=torch.int32) torch.Size([60])
tensor([-1, -1, -1, -1

## Warm up

In [7]:
def moe_forward(worker, inputs ):
    start_event = torch.cuda.Event(enable_timing=True)
    end_event = torch.cuda.Event(enable_timing=True)
    start_event.record()
    output = fused_experts_impl(
        hidden_states = inputs["hidden_states"],
        w1 = worker["w1"],
        w2 = worker["w2"],
        topk_weights = inputs["topk_weights"],
        topk_ids = inputs["topk_ids"],
        inplace = True,
        activation = "silu",
        expert_map = worker["expert_map"],
        global_num_experts =worker["expert_map"].shape[0]
    )
    end_event.record()
    torch.cuda.synchronize()
    latency_ms = start_event.elapsed_time(end_event)
    return output,latency_ms

make_inputs={"topk_ids": torch.randint(low=0, high=60, size=inputs["topk_ids"].shape, dtype=inputs["topk_ids"].dtype,device=inputs["topk_ids"].device),
             "topk_weights": torch.rand_like(inputs["topk_weights"], dtype=inputs["topk_weights"].dtype,device=inputs["topk_weights"].device),
             "hidden_states": torch.rand_like(inputs["hidden_states"], dtype=inputs["hidden_states"].dtype,device=inputs["hidden_states"].device)}

for worker in workers:
    moe_forward (worker, make_inputs)

INFO 05-18 13:42:34 [fused_moe.py:1658] expert_ids.shape: torch.Size([122])
INFO 05-18 13:42:34 [fused_moe.py:1658] expert_ids.shape: torch.Size([122])
INFO 05-18 13:42:34 [fused_moe.py:1658] expert_ids.shape: torch.Size([122])
INFO 05-18 13:42:34 [fused_moe.py:1658] expert_ids.shape: torch.Size([122])


## without expert duplication

In [8]:
inputs["hidden_states"].shape

torch.Size([1000, 2048])

In [9]:

import time

start_time = time.time() 
without_expert_duplication_time_cost=[]
for worker in workers:
    worker["output"],latency_ms = moe_forward (worker, inputs)
    without_expert_duplication_time_cost.append(latency_ms)
    print(worker["output"].shape)

for i in range(len(without_expert_duplication_time_cost)):
    print(f"Rank {i} latency: {without_expert_duplication_time_cost[i]} ms")
end_time = time.time()  # 记录结束时间
elapsed_time = end_time - start_time  # 计算耗时


print(f"End to End time is: {elapsed_time*1000:.2f} ms")


INFO 05-18 13:42:34 [fused_moe.py:1658] expert_ids.shape: torch.Size([122])
torch.Size([1000, 2048])
INFO 05-18 13:42:34 [fused_moe.py:1658] expert_ids.shape: torch.Size([122])
torch.Size([1000, 2048])
INFO 05-18 13:42:34 [fused_moe.py:1658] expert_ids.shape: torch.Size([122])
torch.Size([1000, 2048])
INFO 05-18 13:42:34 [fused_moe.py:1658] expert_ids.shape: torch.Size([122])
torch.Size([1000, 2048])
Rank 0 latency: 2.0346879959106445 ms
Rank 1 latency: 1.5892479419708252 ms
Rank 2 latency: 1.6076799631118774 ms
Rank 3 latency: 1.58515202999115 ms
End to End time is: 7.54 ms


In [10]:
print("finished")

finished
