## Test if my GPU supports P2P

In [1]:
import torch

def check_p2p_support():
    """Print whether each ordered pair of distinct GPUs supports peer-to-peer access.

    Prints a notice and returns early when CUDA is unavailable. Otherwise
    prints the GPU count followed by one line per (i, j) pair with i != j.

    This is a pure capability query: it does not enable peer access and does
    not change the current CUDA device.

    Returns:
        None. All results are reported on stdout.
    """
    if not torch.cuda.is_available():
        print("CUDA is not available. No GPUs detected.")
        return

    num_gpus = torch.cuda.device_count()
    print(f"Found {num_gpus} GPU(s)")

    for i in range(num_gpus):
        for j in range(num_gpus):
            if i == j:
                continue  # A device is not its own peer.

            try:
                # Both devices are passed explicitly, so selecting GPU i via
                # torch.cuda.set_device() is unnecessary; doing so would leave
                # the process-wide current device on the last GPU afterwards.
                can_access = torch.cuda.can_device_access_peer(i, j)
                print(f"GPU {i} can access GPU {j} via P2P: {'✅ Yes' if can_access else '❌ No'}")
            except RuntimeError as e:
                print(f"P2P between GPU {i} and GPU {j} not supported: {e}")

# Print the P2P capability report for every pair of detected GPUs.
check_p2p_support()

Found 6 GPU(s)
GPU 0 can access GPU 1 via P2P: ❌ No
GPU 0 can access GPU 2 via P2P: ✅ Yes
GPU 0 can access GPU 3 via P2P: ❌ No
GPU 0 can access GPU 4 via P2P: ❌ No
GPU 0 can access GPU 5 via P2P: ❌ No
GPU 1 can access GPU 0 via P2P: ❌ No
GPU 1 can access GPU 2 via P2P: ❌ No
GPU 1 can access GPU 3 via P2P: ✅ Yes
GPU 1 can access GPU 4 via P2P: ❌ No
GPU 1 can access GPU 5 via P2P: ❌ No
GPU 2 can access GPU 0 via P2P: ✅ Yes
GPU 2 can access GPU 1 via P2P: ❌ No
GPU 2 can access GPU 3 via P2P: ❌ No
GPU 2 can access GPU 4 via P2P: ❌ No
GPU 2 can access GPU 5 via P2P: ❌ No
GPU 3 can access GPU 0 via P2P: ❌ No
GPU 3 can access GPU 1 via P2P: ✅ Yes
GPU 3 can access GPU 2 via P2P: ❌ No
GPU 3 can access GPU 4 via P2P: ❌ No
GPU 3 can access GPU 5 via P2P: ❌ No
GPU 4 can access GPU 0 via P2P: ❌ No
GPU 4 can access GPU 1 via P2P: ❌ No
GPU 4 can access GPU 2 via P2P: ❌ No
GPU 4 can access GPU 3 via P2P: ❌ No
GPU 4 can access GPU 5 via P2P: ✅ Yes
GPU 5 can access GPU 0 via P2P: ❌ No
GPU 5 can access G

In [4]:
import torch
import ctypes

# Open the native helper library (path is relative to the working directory)
# and declare the C signature of its exporter so ctypes converts arguments
# and the return value correctly.
lib = ctypes.CDLL('./cuda_tools/libipc_tensor_tool.so')
lib.export_ipc_handle.restype = ctypes.c_int
lib.export_ipc_handle.argtypes = [ctypes.c_void_p] * 2

def get_ipc_handle(tensor: torch.Tensor) -> "tuple[bytes, dict]":
    """Export an IPC handle for ``tensor`` together with descriptive metadata.

    Args:
        tensor: A tensor residing on a CUDA device.

    Returns:
        A ``(handle, meta)`` pair. ``handle`` is the raw 64-byte buffer filled
        in by ``lib.export_ipc_handle``; ``meta`` is a dict with the tensor's
        ``shape``, ``dtype`` (as a string), ``device`` index and ``numel``.

    Raises:
        ValueError: If ``tensor`` is not on a CUDA device.
        RuntimeError: If the native exporter returns a non-zero status code.
    """
    # Validate before building the metadata: a CPU tensor has
    # ``device.index`` of None, and ``int(None)`` would otherwise surface as
    # an unrelated TypeError instead of the intended ValueError.
    if not tensor.is_cuda:
        raise ValueError("Tensor must be on CUDA device")

    meta = {
        "shape": tensor.shape,
        "dtype": str(tensor.dtype),
        "device": int(tensor.device.index),
        "numel": tensor.numel(),
    }

    dev_ptr = tensor.data_ptr()
    # 64-byte output buffer; presumably sizeof(cudaIpcMemHandle_t) — confirm
    # against the C helper's expectations.
    out = ctypes.create_string_buffer(64)

    result = lib.export_ipc_handle(ctypes.c_void_p(dev_ptr), out)
    if result != 0:
        raise RuntimeError(f"export_ipc_handle failed with code {result}")

    return out.raw, meta  # (raw handle bytes, metadata dict)

In [1]:
from tensor_utils_pybind import get_ipc_handle_pybind, tensor_restore_from_handler_pybind
import torch
TOKEN_NUM = 128


def create_input(token_num, num_ids=60, top_k=4, hidden_size=2048, device="cuda"):
    """Build a dict of random test tensors and print each one's shape and dtype.

    The defaults reproduce the previously hard-coded sizes and device.

    Args:
        token_num: Number of rows in every generated tensor.
        num_ids: Exclusive upper bound for the integers drawn into
            ``topk_ids`` (values lie in ``[0, num_ids)``).
        top_k: Number of columns in ``topk_ids`` and ``topk_weights``.
        hidden_size: Number of columns in ``hidden_states``.
        device: Device the tensors are moved to after being generated.

    Returns:
        A dict with keys ``"topk_ids"`` (int32, ``(token_num, top_k)``),
        ``"topk_weights"`` (float32, ``(token_num, top_k)``) and
        ``"hidden_states"`` (bfloat16, ``(token_num, hidden_size)``).
    """
    # NOTE(review): randint draws every entry independently, so a row of
    # topk_ids may contain the same id more than once — confirm the consumer
    # tolerates duplicates.
    topk_ids = torch.randint(0, num_ids, (token_num, top_k), dtype=torch.int32).to(device)
    topk_weights = torch.randn(token_num, top_k, dtype=torch.float32).to(device)
    hidden_states = torch.randn(token_num, hidden_size, dtype=torch.bfloat16).to(device)

    print("topk_ids.shape", topk_ids.shape, topk_ids.dtype)
    print("topk_weights.shape", topk_weights.shape, topk_weights.dtype)
    print("hidden_states.shape", hidden_states.shape, hidden_states.dtype)

    return {
        "topk_ids": topk_ids,
        "topk_weights": topk_weights,
        "hidden_states": hidden_states,
    }

inputs = create_input(TOKEN_NUM)

# Export a (handler, meta) pair for each input tensor, in the same order as
# before, then unpack the pairs into the names used by later cells.
_exported = {
    key: get_ipc_handle_pybind(inputs[key])
    for key in ("hidden_states", "topk_ids", "topk_weights")
}
hidden_states_handler, hidden_states_meta = _exported["hidden_states"]
topk_ids_handler, topk_ids_meta = _exported["topk_ids"]
topk_weights_handler, topk_weights_meta = _exported["topk_weights"]

topk_ids.shape torch.Size([128, 4]) torch.int32
topk_weights.shape torch.Size([128, 4]) torch.float32
hidden_states.shape torch.Size([128, 2048]) torch.bfloat16


In [2]:
# Display the dict of generated tensors (bare last expression -> cell output).
inputs

{'topk_ids': tensor([[35,  8, 42, 45],
         [ 1,  3, 31, 23],
         [14, 41, 47, 13],
         [36,  7, 46, 30],
         [22, 20, 32, 11],
         [ 9,  6, 43, 51],
         [30, 15, 54, 13],
         [46, 58, 42, 23],
         [48, 10, 16, 43],
         [ 1, 46, 27, 14],
         [50, 33, 49, 31],
         [56, 17,  6, 35],
         [24, 56, 26, 40],
         [44, 16, 35, 25],
         [50, 11, 37, 41],
         [43, 25, 25, 48],
         [33,  7, 32, 42],
         [ 8, 57, 57, 55],
         [57, 20, 28, 55],
         [11,  9, 54,  5],
         [ 3, 35, 53,  1],
         [50, 19, 11, 51],
         [54,  1, 42, 57],
         [21,  5, 40, 40],
         [55, 59, 47, 10],
         [28, 38, 55, 46],
         [26, 46,  2, 58],
         [27, 46, 18,  4],
         [57, 38, 20, 20],
         [ 2, 15, 13, 19],
         [ 3, 15, 57,  9],
         [28, 48, 25,  8],
         [23,  5, 30,  1],
         [33, 13,  6, 25],
         [57, 55, 28,  4],
         [ 7, 43, 45,  3],
         [59,  4

In [3]:
import json
import requests
url1 = "http://localhost:5000/forward"
# url2 = "http://localhost:5001/forward"

# One multipart/form-data request: the *_meta values travel as JSON-encoded
# form fields, the *_handler values as binary file parts.
response1 = requests.post(
    url1,
    data={
        'hidden_states_meta': json.dumps(hidden_states_meta),
        'topk_weights_meta': json.dumps(topk_weights_meta),
        'topk_ids_meta': json.dumps(topk_ids_meta),
    },
    files={
        'topk_ids_handler': ('topk_ids_handler.bin', topk_ids_handler, 'application/octet-stream'),
        'hidden_states_handler': ('hidden_states_handler.bin', hidden_states_handler, 'application/octet-stream'),
        'topk_weights_handler': ('topk_weights_handler.bin', topk_weights_handler, 'application/octet-stream'),
    },
    # (connect, read) timeout in seconds. Without one, requests waits
    # indefinitely on an unresponsive server; raise the read value if the
    # endpoint legitimately takes longer.
    timeout=(5, 120),
)
# Surface HTTP 4xx/5xx responses here rather than at a later use of response1.
response1.raise_for_status()

ConnectionError: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))

In [9]:
import requests
import json

# Prepare the payload.
# NOTE(review): `handler`, `meta` and `tensor` must already exist in the
# kernel; no cell visible here defines them — confirm they come from an
# earlier cell so the notebook survives Restart & Run All.
byte_data = handler  # example bytes payload
map_data = meta  # example mapping payload

# Build the multipart/form-data request parts.
files = {
    'byte_data': ('data.bin', byte_data, 'application/octet-stream')
}

data = {
    'map_data': json.dumps(map_data)
}

# Send the POST request.
response = requests.post(
    'http://localhost:1177/upload',
    files=files,
    data=data,
    # (connect, read) timeout in seconds. Without one, requests waits
    # indefinitely on an unresponsive server; raise the read value if the
    # endpoint legitimately takes longer.
    timeout=(5, 120),
)
# Fail here on HTTP 4xx/5xx instead of with a confusing JSON/KeyError below.
response.raise_for_status()

# Rebuild the tensor from the JSON response and compare it with the original.
restored_tensor = torch.tensor(response.json()['restored_tensor'], device='cuda:0')
torch.equal(tensor, restored_tensor)


True