In [1]:
import torch

def check_gpu_memory_simple():
    """Print a short memory report for every CUDA device visible to PyTorch.

    For each device the report shows the total device memory, the memory
    currently allocated by this process's PyTorch tensors ("Used"), and the
    total minus the memory reserved by PyTorch's caching allocator
    ("Available"). All figures are printed in GiB with two decimals.

    Prints "No available CUDA GPU" and returns immediately when CUDA is not
    available. Always returns None.
    """
    if not torch.cuda.is_available():
        print("No available CUDA GPU")
        return

    bytes_per_gb = 1024 ** 3
    for device_index in range(torch.cuda.device_count()):
        device_name = torch.cuda.get_device_name(device_index)
        # Query the device properties once and reuse the byte count below.
        total_bytes = torch.cuda.get_device_properties(device_index).total_memory
        used_bytes = torch.cuda.memory_allocated(device_index)
        reserved_bytes = torch.cuda.memory_reserved(device_index)

        total = total_bytes / bytes_per_gb
        allocated = used_bytes / bytes_per_gb
        free = (total_bytes - reserved_bytes) / bytes_per_gb

        report_lines = [
            f"GPU {device_index} ({device_name}):",
            f"  Total Memory: {total:.2f} GB",
            f"  Used Memory: {allocated:.2f} GB",
            f"  Available Memory: {free:.2f} GB",
        ]
        print("\n".join(report_lines))

# Print the per-GPU memory summary for this machine.
check_gpu_memory_simple()

GPU 0 (NVIDIA A10G):
  Total Memory: 21.99 GB
  Used Memory: 0.00 GB
  Available Memory: 21.99 GB
GPU 1 (NVIDIA A10G):
  Total Memory: 21.99 GB
  Used Memory: 0.00 GB
  Available Memory: 21.99 GB
GPU 2 (NVIDIA A10G):
  Total Memory: 21.99 GB
  Used Memory: 0.00 GB
  Available Memory: 21.99 GB
GPU 3 (NVIDIA A10G):
  Total Memory: 21.99 GB
  Used Memory: 0.00 GB
  Available Memory: 21.99 GB


In [2]:
import os
# Ask CUDA to run kernels synchronously so errors surface at the failing call.
# NOTE(review): this variable is normally read when CUDA is initialised, and the
# previous cell already calls torch.cuda functions — so setting it here likely has
# no effect in this kernel session. Confirm, and consider moving it to the very
# first cell (before torch touches CUDA).
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

## Launch the fused MoE as a Flask server

## Define an API that receives hidden-layer data and returns the processed result

## Load all weights using the given instructions

## Naive implementation (using Bei's code)

In [5]:
def load_tensor(path):
    """Load the saved MoE tensors found under ``path`` onto the GPU.

    ``path`` is used as a plain string prefix (``path + "<name>.pt"``), so it
    must end with a path separator. Each file is read with ``torch.load``
    mapped to ``cuda:0`` and then moved to the current CUDA device.

    Returns a dict keyed by tensor name. Loading is best-effort: on the first
    failure an error message is printed, loading stops, and the dict holding
    whatever was loaded up to that point is returned.
    """
    tensor_names = (
        "hidden_states",
        "w1",
        "w2",
        "topk_weights",
        "topk_ids",
        "expert_map",
        "out_hidden_states",
        "final_hidden_states",
    )
    tensors = {}
    try:
        # Files are read in a fixed order; a failure aborts the remaining loads.
        for tensor_name in tensor_names:
            file_path = path + tensor_name + ".pt"
            loaded = torch.load(file_path, map_location="cuda:0")
            tensors[tensor_name] = loaded.to("cuda")
    except FileNotFoundError as err:
        print(f"Error: File not found - {err}")
    except RuntimeError as err:
        print(f"Error: CUDA runtime issue - {err}")
    except Exception as err:
        print(f"Unexpected error: {err}")

    return tensors
# Load the tensors saved for each rank (rank 1 first, then rank 0).
# The trailing "/" matters: load_tensor appends file names by string concatenation.
rank1 = load_tensor("/home/ubuntu/vllm_test_field/saved_tensors/rank_1/")
rank0 = load_tensor("/home/ubuntu/vllm_test_field/saved_tensors/rank_0/")

### load shared experts

In [6]:
# Add each rank's saved shared_output tensor to its dict; unlike load_tensor,
# these loads are not wrapped in error handling, so a missing file raises here.
rank0["shared_output"] = torch.load("/home/ubuntu/vllm_test_field/saved_tensors/rank_0/shared_output.pt",map_location="cuda:0").to("cuda")
rank1["shared_output"] = torch.load("/home/ubuntu/vllm_test_field/saved_tensors/rank_1/shared_output.pt",map_location="cuda:0").to("cuda")

In [6]:
# List which tensors are present for rank 0 (load_tensor may return a partial dict).
rank0.keys()

dict_keys(['hidden_states', 'w1', 'w2', 'topk_weights', 'topk_ids', 'expert_map', 'out_hidden_states', 'final_hidden_states', 'shared_output'])

### validation

In [7]:
# Inspect the shape of rank 1's w1 tensor.
rank1["w1"].shape

torch.Size([30, 2816, 2048])

In [None]:
# Check whether the two ranks hold identical tensors (exact, element-wise equality).
print( torch.equal(rank1["w1"], rank0["w1"]) )
print( torch.equal(rank1["w2"], rank0["w2"]) )
print( torch.equal(rank1["expert_map"], rank0["expert_map"]) )
print( torch.equal(rank1["out_hidden_states"], rank0["out_hidden_states"]) )
print( torch.equal(rank1["shared_output"], rank0["shared_output"]) )
# Show each rank's expert_map; an entry of -1 appears for experts the rank does not map
# to a local index (see the printed tensors below).
print(rank0["expert_map"])
print(rank1["expert_map"])

False
False
False
False
False
tensor([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
        18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, -1, -1, -1, -1, -1, -1,
        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
        -1, -1, -1, -1, -1, -1], device='cuda:0', dtype=torch.int32)
tensor([-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,  0,  1,  2,  3,  4,  5,
         6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23,
        24, 25, 26, 27, 28, 29], device='cuda:0', dtype=torch.int32)


### run rank 0

In [12]:
# Display the top-k expert ids saved for both ranks side by side.
rank0["topk_ids"], rank1["topk_ids"]

(tensor([[27, 49, 41,  2],
         [59,  3, 31, 41],
         [24, 11, 52, 48],
         [29, 53, 47, 13],
         [13, 11, 52, 16],
         [ 1, 11, 26, 17]], device='cuda:0', dtype=torch.int32),
 tensor([[27, 49, 41,  2],
         [59,  3, 31, 41],
         [24, 11, 52, 48],
         [29, 53, 47, 13],
         [13, 11, 52, 16],
         [ 1, 11, 26, 17]], device='cuda:0', dtype=torch.int32))

In [None]:
from vllm.model_executor.layers.fused_moe.fused_moe import fused_experts_impl 
# Forward pass of the rank-0 MoE layer, replayed from the tensors saved for that rank.
# NOTE(review): inplace=True presumably lets the kernel write its result into
# rank0["hidden_states"]; if so, re-running this cell would use the previous output
# as input. Confirm, and reload the tensors before re-running.
rank0_output = fused_experts_impl(
    hidden_states = rank0["hidden_states"],
    w1 = rank0["w1"],
    w2 = rank0["w2"],
    topk_weights = rank0["topk_weights"],
    topk_ids = rank0["topk_ids"],
    inplace = True,
    activation = "silu",
    expert_map = rank0["expert_map"],
    # 60 matches the number of entries in the expert_map tensors printed earlier.
    global_num_experts = 60 
)
print(rank0_output.shape)
# Exact comparison against the output tensor saved for rank 0.
print( torch.equal(rank0_output , rank0["out_hidden_states"]) )

  from .autonotebook import tqdm as notebook_tqdm


INFO 05-06 07:28:20 [__init__.py:239] Automatically detected platform cuda.


2025-05-06 07:28:21,720	INFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.


INFO 05-06 07:28:22 [fused_moe.py:1658] expert_ids.shape: torch.Size([58])
torch.Size([6, 2048])
True


### run rank 1

In [None]:
# forward of rank1 moe layer
rank1_output = fused_experts_impl(
    hidden_states = rank1["hidden_states"],
    w1 = rank1["w1"],
    w2 = rank1["w2"],
    topk_weights = rank1["topk_weights"],
    topk_ids = rank1["topk_ids"],
    inplace = True,
    activation = "silu",
    expert_map = rank1["expert_map"],
    global_num_experts = 60
)
print(rank1_output.shape)
print( torch.equal(rank1_output , rank1["out_hidden_states"]) )

INFO 05-06 07:28:26 [fused_moe.py:1658] expert_ids.shape: torch.Size([58])
torch.Size([6, 2048])
True


### merge and validate

In [None]:
# Per-rank result: the routed-expert output plus that rank's saved shared_output.
rank0_final_hidden_states = rank0_output + rank0["shared_output"]
rank1_final_hidden_states = rank1_output + rank1["shared_output"]
# Emulate the cross-rank all-reduce by summing the two per-rank results.
reduced_result = rank0_final_hidden_states + rank1_final_hidden_states
# Exact comparison against the final hidden states saved for rank 0.
print( torch.equal(reduced_result, 
                   rank0["final_hidden_states"]) )

True


## Try loading weights using IPC

In [3]:
from transformers import LlamaConfig, AutoTokenizer
import torch
import ctypes
import os, time, json, sys
import cupy as cp
import numpy as np 


  from .autonotebook import tqdm as notebook_tqdm


In [4]:

def load_tensor_info():
    """Read ``tensor_info.json`` from the current working directory.

    The JSON file maps each tensor key to a record with ``size``, ``dtype``
    and ``numel``. The returned dict has the same keys, with each record
    converted back to Python/PyTorch types:

    * ``size``  -> tuple
    * ``dtype`` -> ``torch.dtype`` (looked up on ``torch`` by the text after
      the last ``.`` in the stored string, e.g. ``"torch.float16"``)
    * ``numel`` -> unchanged
    """
    info_file = os.path.join(os.getcwd(), "tensor_info.json")
    with open(info_file, 'r') as handle:
        raw_records = json.load(handle)

    def _convert(record):
        # Restore the types that JSON serialisation flattened.
        dtype_name = record['dtype'].split('.')[-1]
        return {
            'size': tuple(record['size']),
            'dtype': getattr(torch, dtype_name),
            'numel': record['numel'],
        }

    return {key: _convert(record) for key, record in raw_records.items()}


# Use the current working directory (os.getcwd(), not necessarily the notebook's location).
current_dir = os.getcwd()
# Build the absolute path to the shared library.
lib_path = os.path.join(current_dir, "ipc_handle.so")
# Load the shared library.
lib = ctypes.CDLL(lib_path)
#lib = ctypes.CDLL("./libipc_handle.so")

# Define the function types
# open_ipc_handle takes a C string (the handle file path is passed by the caller below)
# and returns a pointer; declaring c_void_p keeps ctypes from treating the result as a C int.
lib.open_ipc_handle.restype = ctypes.c_void_p
lib.open_ipc_handle.argtypes = [ctypes.c_char_p]

# Initialise a placeholder state_dict that holds only the keys (every value is None).
tensor_info = load_tensor_info()
state_dict = {key: None for key in tensor_info.keys()}

# Directory under which the per-tensor IPC handle files are looked up.
handle_directory = "/home/ubuntu/vllm_test_field/vllm/demo/handles"

In [18]:
# Sorted list of every state_dict key that does not contain the substring "att".
qqqq = sorted(key for key in state_dict.keys() if "att" not in key)


In [5]:
state_dict={}
def load_model_shard_by_key(key,state_dict):
    """Map one tensor exported via a CUDA IPC handle and store it in ``state_dict``.

    The handle file is looked up at
    ``{handle_directory}/device_0/{key with '.' replaced by '_'}.bin.ipc``.
    The raw device pointer returned by ``lib.open_ipc_handle`` is wrapped in a
    CuPy array (as unowned memory), converted to a PyTorch tensor through
    DLPack, and reshaped to the size recorded in ``tensor_info``.

    Args:
        key: Tensor name; must be present in the module-level ``tensor_info``.
        state_dict: Dict that receives ``state_dict[key] = tensor`` on success.

    Returns:
        None. Every failure (missing handle file, missing metadata,
        unsupported dtype, handle that cannot be opened, DLPack/reshape
        error) is reported with ``print`` and leaves ``state_dict`` untouched.

    Notes:
        * Metadata and dtype are validated *before* the IPC handle is opened,
          so an unsupported tensor no longer opens a handle and then crashes.
        * The opened handle is never closed here, and the memory is wrapped
          as unowned. NOTE(review): the tensor is presumably only valid while
          the exporting process keeps the allocation alive — confirm.
    """
    handle_file = f"{handle_directory}/device_0/{key.replace('.', '_')}.bin.ipc"
    if not os.path.exists(handle_file):
        print(f"The specified IPC file does not exist: {handle_file}")
        return

    info = tensor_info.get(key)
    if info is None:
        print(f"No tensor_info entry for {key}")
        return

    # Torch dtype -> CuPy dtype with the same element width, used to wrap the raw pointer.
    # CuPy has no bfloat16, so bfloat16 data is wrapped as 16-bit integers and
    # reinterpreted on the torch side below.
    dtype_map = {
        torch.float32: cp.float32,
        torch.float64: cp.float64,
        torch.int32: cp.int32,
        torch.int64: cp.int64,
        torch.uint8: cp.uint8,
        torch.int8: cp.int8,
        torch.int16: cp.int16,
        torch.float16: cp.float16,
        torch.bfloat16: cp.int16,
    }
    torch_dtype = info['dtype']
    cp_dtype = dtype_map.get(torch_dtype)
    if cp_dtype is None:
        # Previously this fell through and failed with "'NoneType' object is not callable".
        print(f"Unsupported dtype {torch_dtype} for {key}")
        return

    device_ptr = lib.open_ipc_handle(handle_file.encode('utf-8'))
    if not device_ptr:
        print(f"Failed to open IPC handle for {key}")
        return

    tensor_size = info['numel']

    # Wrap the raw GPU pointer using CuPy (flat 1-D view over numel elements).
    unowned = cp.cuda.UnownedMemory(device_ptr, tensor_size * cp_dtype().itemsize, None)
    gpu_array = cp.ndarray((tensor_size,), dtype=cp_dtype,
                           memptr=cp.cuda.MemoryPointer(unowned, 0))

    # Convert CuPy array to PyTorch tensor using DLPack (no copy).
    try:
        torch_tensor = torch.utils.dlpack.from_dlpack(gpu_array.toDlpack())
        if torch_tensor.dtype != torch_dtype:
            # Same element width, different interpretation (e.g. int16 -> bfloat16).
            torch_tensor = torch_tensor.view(torch_dtype)
        state_dict[key] = torch_tensor.view(info['size'])
    except RuntimeError as e:
        print(f"Error with key {key}: {e}")


In [7]:
from vllm.model_executor.layers.fused_moe import FusedMoE
import transformers

# Load the model configuration (AutoConfig.from_pretrained with a hub model id).
config = transformers.AutoConfig.from_pretrained("Qwen/Qwen1.5-MoE-A2.7B")
# Build a standalone fused-MoE layer whose sizes are taken from the config.
experts = FusedMoE(num_experts=config.num_experts,
                        top_k=config.num_experts_per_tok,
                        hidden_size=config.hidden_size,
                        intermediate_size=config.moe_intermediate_size,
                        reduce_results=False,
                        renormalize=config.norm_topk_prob,
                        tp_size=1,
                        dp_size=1)

# List of (param_name, weight_name, expert_id, shard_id) tuples — see the sample printed
# from expert_params_mapping[-3:] in a later cell.
expert_params_mapping = FusedMoE.make_expert_params_mapping(
            ckpt_gate_proj_name="gate_proj",
            ckpt_down_proj_name="down_proj",
            ckpt_up_proj_name="up_proj",
            num_experts=config.num_experts)

INFO 05-07 03:34:32 [__init__.py:239] Automatically detected platform cuda.


2025-05-07 03:34:33,353	INFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.




In [6]:
# Checkpoint keys to map through CUDA IPC; results are collected in a fresh state_dict.
state_dict_keys = [
    "model.layers.23.mlp.experts.16.down_proj.weight",
]
state_dict = {}
for k in state_dict_keys:
    load_model_shard_by_key(k, state_dict)

Failed to open IPC handle: invalid resource handle


Failed to open IPC handle for model.layers.23.mlp.experts.16.down_proj.weight


In [10]:
# Try loading a single expert shard into the fused layer by hand.
params_dict = dict(experts.named_parameters())
print(params_dict.keys())
name = ""
shard_id = "w2"
expert_id = 0

# The original call had an empty argument slot (`params_dict["w2_weight"],,name`),
# which is a SyntaxError and prevented the whole cell from running. The missing
# second argument is the checkpoint tensor (`loaded_weight` in the later
# weight_loader calls in this notebook).
# NOTE(review): the only tensor loaded via IPC in this notebook is the expert-16
# down_proj weight, while expert_id here is 0 — confirm which expert is intended.
weight_key = "model.layers.23.mlp.experts.16.down_proj.weight"
loaded_weight = state_dict.get(weight_key)
if loaded_weight is None:
    # The IPC load may have failed (state_dict then lacks the key or holds None).
    print(f"{weight_key} is not loaded in state_dict; skipping weight_loader")
else:
    experts.weight_loader(params_dict["w2_weight"], loaded_weight, name, shard_id, expert_id)


dict_keys(['w13_weight', 'w2_weight'])

In [13]:
# Peek at the last three (param_name, weight_name, expert_id, shard_id) mapping entries.
expert_params_mapping[-3:]

[('experts.w13_', 'experts.59.gate_proj.', 59, 'w1'),
 ('experts.w2_', 'experts.59.down_proj.', 59, 'w2'),
 ('experts.w13_', 'experts.59.up_proj.', 59, 'w3')]

In [5]:
import pickle
# NOTE(security): pickle.load can execute arbitrary code while deserialising;
# only load this file if it was produced by a trusted source.
with open ("qwen2moeparams_dict.pkl", "rb") as f:
    params_dict = pickle.load(f)
    
# Keep only the entries whose name contains "mlp.experts".
# Note: this rebinds params_dict to a list of names, so it can no longer be
# indexed by parameter name after this cell.
params_dict=[i for i in params_dict if "mlp.experts" in i] 
params_dict

['model.layers.0.mlp.experts.w13_weight',
 'model.layers.0.mlp.experts.w2_weight',
 'model.layers.1.mlp.experts.w13_weight',
 'model.layers.1.mlp.experts.w2_weight',
 'model.layers.2.mlp.experts.w13_weight',
 'model.layers.2.mlp.experts.w2_weight',
 'model.layers.3.mlp.experts.w13_weight',
 'model.layers.3.mlp.experts.w2_weight',
 'model.layers.4.mlp.experts.w13_weight',
 'model.layers.4.mlp.experts.w2_weight',
 'model.layers.5.mlp.experts.w13_weight',
 'model.layers.5.mlp.experts.w2_weight',
 'model.layers.6.mlp.experts.w13_weight',
 'model.layers.6.mlp.experts.w2_weight',
 'model.layers.7.mlp.experts.w13_weight',
 'model.layers.7.mlp.experts.w2_weight',
 'model.layers.8.mlp.experts.w13_weight',
 'model.layers.8.mlp.experts.w2_weight',
 'model.layers.9.mlp.experts.w13_weight',
 'model.layers.9.mlp.experts.w2_weight',
 'model.layers.10.mlp.experts.w13_weight',
 'model.layers.10.mlp.experts.w2_weight',
 'model.layers.11.mlp.experts.w13_weight',
 'model.layers.11.mlp.experts.w2_weight',


In [None]:
from vllm.model_executor.models.utils import (extract_layer_index, is_pp_missing_parameter,
                    make_empty_intermediate_tensors_factory, make_layers,
                    maybe_prefix)
from typing import Any, Dict, Iterable, Optional, Set, Tuple, Union

# Hand one checkpoint tensor to the fused-MoE layer's weight loader.
# NOTE(review): `param` and `loaded_weight` are not assigned in any cell visible
# here; this call depends on leftover kernel state and fails on a fresh kernel.
experts.weight_loader(param,
                    loaded_weight,
                    name,
                    shard_id=shard_id,
                    expert_id=expert_id)

In [None]:


import pickle

# The parameter table lives in a pickle file (an earlier cell reads the same file
# with pickle.load in binary mode). Opening it in text mode and parsing it with
# json.load, as this cell used to do, cannot succeed.
# NOTE(security): pickle.load can execute arbitrary code while deserialising;
# only load this file if it was produced by a trusted source.
with open("qwen2moeparams_dict.pkl", "rb") as f:
    params_dict = pickle.load(f)

# Find the first expert mapping whose checkpoint fragment occurs in `name`,
# rewrite `name` to the fused parameter name, and load the weight into it.
# NOTE(review): `name` and `loaded_weight` must already exist in the kernel, and
# params_dict[name] must be a parameter object exposing `.weight_loader` — confirm
# that the pickle stores parameters rather than just their names.
for mapping in expert_params_mapping:
    param_name, weight_name, expert_id, shard_id = mapping
    if weight_name not in name:
        continue
    name = name.replace(weight_name, param_name)
    # Pipeline-parallel check kept disabled, as in the original cell:
    # if is_pp_missing_parameter(name, self):
    #     continue
    # Skip loading extra bias for GPTQ models.
    if name.endswith((".bias", "_bias")) and name not in params_dict:
        continue
    param = params_dict[name]
    weight_loader = param.weight_loader
    weight_loader(param,
                  loaded_weight,
                  name,
                  shard_id=shard_id,
                  expert_id=expert_id)
    # Only the first matching mapping is applied.
    break


## Try loading weights using only .pt files