## Container Preparation

### Load dependencies

In [1]:
import docker
import requests
import torch
import time
import random

# Docker SDK client configured from the environment.
client = docker.from_env()

# --- Configs ---

# Release PyTorch's cached CUDA memory and wait for pending GPU work to finish
# before the expert containers are started.
torch.cuda.empty_cache()
torch.cuda.synchronize()
NUM_CONTAINERS = 2  # number of expert containers launched by the launch cell
IMAGE_NAME = "moe_expert"  # image tag handed to `docker run`
EXPERT_TIMEOUT = 20  # seconds



### Build Image

In [2]:
! docker build -t moe_expert ~/vllm_test_field/vllm/flask_docker_app/.

DEPRECATED: The legacy builder is deprecated and will be removed in a future release.
            Install the buildx component to build images with BuildKit:
            https://docs.docker.com/go/buildx/

Sending build context to Docker daemon  303.1kB
Step 1/7 : FROM nvidia/cuda:12.1.1-base-ubuntu22.04
 ---> 72d1c5868625
Step 2/7 : WORKDIR /app
 ---> Using cache
 ---> a83412ad1a8a
Step 3/7 : RUN apt-get update && apt-get install -y git python3 python3-pip python3-dev && rm -rf /var/lib/apt/lists/*
 ---> Using cache
 ---> 024bf909d6ed
Step 4/7 : RUN pip3 install -v flask
 ---> Using cache
 ---> 6101706050d4
Step 5/7 : RUN pip3 install -v vllm
 ---> Using cache
 ---> b8e07ac7f122
Step 6/7 : COPY expert.py /app
 ---> 8b09389f2fb1
Step 7/7 : CMD ["python3", "./expert.py"]
 ---> Running in de972e1ed3d5


 ---> Removed intermediate container de972e1ed3d5
 ---> 26ebb4b64c05
Successfully built 26ebb4b64c05
Successfully tagged moe_expert:latest


## Launching instances

In [None]:
import subprocess

# --- Step 2: Launch Expert Containers ---
# Start NUM_CONTAINERS detached containers from IMAGE_NAME. Each container is
# given an equally sized, contiguous slice of the experts through environment
# variables, and its port 5000 is published on host port 5000 + i.
TOTAL_EXPERTS = 60  # expert count split across the containers (previously a bare literal)
containers = []     # names of the containers that were started successfully
layer = 23
gpu_idx = 0         # every container receives the same GPU index

experts_count_per_container = TOTAL_EXPERTS // NUM_CONTAINERS
if TOTAL_EXPERTS % NUM_CONTAINERS:
    # Integer division drops the remainder, so make the gap visible instead of silent.
    print(
        f"Warning: {TOTAL_EXPERTS} experts do not divide evenly across "
        f"{NUM_CONTAINERS} containers; the last {TOTAL_EXPERTS % NUM_CONTAINERS} "
        "expert(s) are not assigned to any container"
    )

for i in range(NUM_CONTAINERS):
    first_expert = i * experts_count_per_container
    last_expert = first_expert + experts_count_per_container - 1
    # Build the name once so the `docker run` argument and the log line cannot drift apart.
    container_name = f"fused_moe_layer_{layer}_exp_{first_expert}_{last_expert}"
    cmd = [
        "docker", "run", "-d",
        "--name", container_name,
        "--gpus", "all",
        "--rm",
        "-p", f"{5000+i}:5000",
        "-v", "/home/ubuntu/vllm_test_field/vllm/ipc_handler_demo/weights:/app/weights",
        "-e", f"RANK={i}",
        "-e", f"NUM_EXPERTS={experts_count_per_container}",
        "-e", f"GPU_IDX={gpu_idx}",
        "-e", "WEIGHT_PATH=/app/weights",
        "-e", f"LAYER={layer}",
        IMAGE_NAME,
    ]
    try:
        # `docker run -d` prints the new container ID to stdout; check=True
        # raises CalledProcessError when docker exits with a non-zero status.
        subprocess.run(cmd, check=True)
    except subprocess.CalledProcessError as e:
        print(f"启动失败: {e}")
    else:
        containers.append(container_name)
        print(f"{container_name}\n容器启动成功！")
    




09770c2cbc63eb1f735b9711de6d757dd1048c3e4d4674e656feef1f7d3fb76d


fused_moe_layer_23_exp_0_29
容器启动成功！
76350462c9e7c744d957a38dd3a53f73fe4465405af9c94393916327dfb4f5ea
fused_moe_layer_23_exp_30_59
容器启动成功！


### Check that the instances are running correctly

In [4]:
! docker ps

CONTAINER ID   IMAGE        COMMAND                 CREATED        STATUS                  PORTS                                       NAMES
c3ae387a3bd9   moe_expert   "python3 ./expert.py"   1 second ago   Up Less than a second   0.0.0.0:5001->5000/tcp, :::5001->5000/tcp   fused_moe_layer_23_exp_30_59
5a27ecb9280a   moe_expert   "python3 ./expert.py"   1 second ago   Up Less than a second   0.0.0.0:5000->5000/tcp, :::5000->5000/tcp   fused_moe_layer_23_exp_0_29


## Testing container function

### Load Inputs

In [4]:
import torch
import requests
def load_tensor(path):
    """Load the saved tensors of one rank from the directory ``path``.

    The files ``hidden_states.pt``, ``w1.pt``, ``w2.pt``, ``topk_weights.pt``,
    ``topk_ids.pt``, ``expert_map.pt``, ``out_hidden_states.pt`` and
    ``final_hidden_states.pt`` are read in that order with ``torch.load``.

    Args:
        path: Directory holding the ``.pt`` files. A trailing path separator
            is optional; the file names are joined with ``os.path.join``.

    Returns:
        dict mapping each file's stem to the loaded object. Loading is
        best-effort: the first failure is printed and loading stops, so the
        returned dict may contain only the entries loaded before the error.
    """
    import os  # local import so the function does not depend on another cell

    names = (
        "hidden_states",
        "w1",
        "w2",
        "topk_weights",
        "topk_ids",
        "expert_map",
        "out_hidden_states",
        "final_hidden_states",
    )
    tensors = {}
    try:
        for name in names:
            # NOTE: torch.load unpickles the file; only point this at trusted data.
            tensors[name] = torch.load(os.path.join(path, name + ".pt"))
    except FileNotFoundError as e:
        print(f"Error: File not found - {e}")
    except RuntimeError as e:
        print(f"Error: CUDA runtime issue - {e}")
    except Exception as e:
        print(f"Unexpected error: {e}")
    return tensors

# Load the saved inputs and reference outputs for both ranks; each result is a
# dict keyed by tensor name (it may be partial if load_tensor printed an error).
rank0 = load_tensor("/home/ubuntu/vllm_test_field/saved_tensors/rank_0/")
rank1 = load_tensor("/home/ubuntu/vllm_test_field/saved_tensors/rank_1/")

### Prepare IPC handles and metadata

In [None]:
import torch
import ctypes

# Load the shared library that exposes `export_ipc_handle`.
lib = ctypes.CDLL('./cuda_tools/libipc_tensor_tool.so')
# The function is called with (device pointer, output buffer) and returns an
# int status code; callers treat any non-zero value as failure.
lib.export_ipc_handle.argtypes = [ctypes.c_void_p, ctypes.c_void_p]
lib.export_ipc_handle.restype = ctypes.c_int

def get_ipc_handle(tensor: torch.Tensor) -> "tuple[bytes, dict]":
    """Export an IPC handle for a CUDA tensor together with its metadata.

    Args:
        tensor: Tensor located on a CUDA device.

    Returns:
        ``(handle, meta)`` where ``handle`` is the 64-byte buffer filled in by
        ``lib.export_ipc_handle`` and ``meta`` is a JSON-serializable dict with
        the tensor's ``shape`` (list of ints), ``dtype`` (string) and
        ``device`` (CUDA device index).

    Raises:
        ValueError: If ``tensor`` is not on a CUDA device.
        RuntimeError: If ``export_ipc_handle`` returns a non-zero status.
    """
    # Validate first: a CPU tensor has ``device.index`` of None, so building
    # the metadata before this check would fail with an unrelated TypeError.
    if not tensor.is_cuda:
        raise ValueError("Tensor must be on CUDA device")

    meta = {
        "shape": list(tensor.shape),
        "dtype": str(tensor.dtype),
        "device": int(tensor.device.index),
    }

    handle_size = 64  # size in bytes of the buffer the library writes the handle into
    # NOTE(review): only the raw data pointer is exported; presumably the
    # receiver assumes a contiguous tensor with no extra offset — confirm.
    dev_ptr = tensor.data_ptr()
    out = ctypes.create_string_buffer(handle_size)

    result = lib.export_ipc_handle(ctypes.c_void_p(dev_ptr), out)
    if result != 0:
        raise RuntimeError(f"export_ipc_handle failed with code {result}")

    return out.raw, meta

# Export an IPC handle and metadata for each rank-0 input tensor.
hidden_states_handler, hidden_states_meta = get_ipc_handle(rank0["hidden_states"])
topk_weights_handler, topk_weights_meta = get_ipc_handle(rank0["topk_weights"])
topk_ids_handler, topk_ids_meta = get_ipc_handle(rank0["topk_ids"])

### Sending hidden states to instances for results

In [None]:
import json

url1 = "http://localhost:5000/forward"
# url2 = "http://localhost:5001/forward"

# Tensor metadata travels as JSON-encoded form fields; the raw IPC handles are
# attached as binary file parts of the same multipart request.
response1 = requests.post(
    url1,
    data={
        'hidden_states_meta': json.dumps(hidden_states_meta),
        'topk_weights_meta': json.dumps(topk_weights_meta),
        'topk_ids_meta': json.dumps(topk_ids_meta),
    },
    files={
        'hidden_states_handler': ('data.bin', hidden_states_handler, 'application/octet-stream'),
        'topk_weights_handler': ('data.bin', topk_weights_handler, 'application/octet-stream'),
        'topk_ids_handler': ('data.bin', topk_ids_handler, 'application/octet-stream'),
    },
    # Without a timeout a hung container would block the notebook indefinitely.
    timeout=EXPERT_TIMEOUT,
)
# Fail with the HTTP status instead of an opaque JSON/KeyError further down.
response1.raise_for_status()

# response2 = requests.post(url2, json={
#         "hidden_states":rank1["hidden_states"].cpu().tolist(),
#         "topk_weights": rank1["topk_weights"].cpu().tolist(),
#         "topk_ids": rank1["topk_ids"].cpu().tolist()
#         })

output1 = torch.tensor(response1.json()["hidden_output"],dtype=torch.bfloat16,device="cuda:0")
# FIXME: `output2` is not produced while the second request stays disabled, yet
# later cells read it — they only work with state left over from an earlier run.
# output2 = torch.tensor(response2.json()["hidden_output"],dtype=torch.bfloat16,device="cuda:0")

In [6]:
# Compare each container's output with the saved reference for that rank.
# FIXME: `output2` is only assigned in commented-out code, so on a fresh kernel
# the second line raises NameError.
print( torch.equal(output1 , rank0["out_hidden_states"]) )
print( torch.equal(output2 , rank1["out_hidden_states"].cuda()) )

True
True


In [7]:
# Load each rank's saved shared_output tensor and add it to that rank's
# container output.
# FIXME: `output2` is only assigned in commented-out code; this cell depends on
# kernel state from an earlier run.
rank0["shared_output"] = torch.load("/home/ubuntu/vllm_test_field/saved_tensors/rank_0/shared_output.pt")
rank1["shared_output"] = torch.load("/home/ubuntu/vllm_test_field/saved_tensors/rank_1/shared_output.pt")
rank0_final_hidden_states = output1 + rank0["shared_output"]
rank1_final_hidden_states = output2 + rank1["shared_output"].cuda()
# Stand-in for the all-reduce: sum the two ranks' results on rank 0's device.
reduced_result = rank0_final_hidden_states + rank1_final_hidden_states.to(rank0_final_hidden_states.device)
# The sum must match the final_hidden_states saved for both ranks.
print( torch.equal(reduced_result, 
                   rank0["final_hidden_states"]) )

print( torch.equal(reduced_result, 
                   rank1["final_hidden_states"].cuda()) )

True
True


## Turn off the instances

In [8]:
def stop_fused_moe_containers():
    """Stop every running container whose name matches ``fused_moe_layer``.

    Container IDs are listed with ``docker ps --filter name=fused_moe_layer -q``
    and passed to a single ``docker stop`` call. Commands are run as argument
    lists (no shell), so no shell quoting or interpolation is involved.

    Returns:
        None. Progress and failures are reported with ``print``; a failing
        docker command (or a missing ``docker`` executable) does not raise.
    """
    import subprocess  # local import: do not rely on another cell having imported it

    try:
        # 1. Collect the IDs of all containers whose name contains 'fused_moe_layer'.
        list_cmd = ["docker", "ps", "--filter", "name=fused_moe_layer", "-q"]
        # split() without arguments yields [] for empty output, one ID per line otherwise.
        container_ids = subprocess.check_output(list_cmd, text=True).split()

        # 2. Stop all matching containers in one call.
        if container_ids:
            subprocess.run(["docker", "stop", *container_ids], check=True)
            print(f"已停止 {len(container_ids)} 个容器: {container_ids}")
        else:
            print("没有找到名称包含 'fused_moe_layer' 的容器")

    except (subprocess.CalledProcessError, OSError) as e:
        # OSError covers a missing docker binary, which the shell-based version
        # surfaced as a non-zero exit status.
        print(f"操作失败: {e}")

# Run the cleanup.
stop_fused_moe_containers()

76350462c9e7
09770c2cbc63
已停止 2 个容器: ['76350462c9e7', '09770c2cbc63']
