## Test if my GPU supports P2P

In [1]:
import torch

def check_p2p_support():
    """Report, for every ordered pair of distinct GPUs, whether P2P access is possible.

    Prints a notice and returns early when CUDA is unavailable. Otherwise prints
    the GPU count followed by one line per (source, peer) pair.

    The check is a pure query: it does not enable peer access and it does not
    change the current CUDA device, so later cells that allocate on "cuda"
    are unaffected by running it.

    Returns:
        None. All results are reported through print().
    """
    if not torch.cuda.is_available():
        print("CUDA is not available. No GPUs detected.")
        return

    num_gpus = torch.cuda.device_count()
    print(f"Found {num_gpus} GPU(s)")

    for i in range(num_gpus):
        for j in range(num_gpus):
            if i == j:
                continue  # A device is never its own peer

            try:
                # Both devices are passed explicitly, so there is no need to
                # switch the current device before asking.
                can_access = torch.cuda.can_device_access_peer(i, j)
                print(f"GPU {i} can access GPU {j} via P2P: {'✅ Yes' if can_access else '❌ No'}")
            except RuntimeError as e:
                # Keep scanning the remaining pairs if one query fails.
                print(f"P2P between GPU {i} and GPU {j} not supported: {e}")

# Run the P2P check; the per-pair report is printed as this cell's output.
check_p2p_support()

Found 6 GPU(s)
GPU 0 can access GPU 1 via P2P: ❌ No
GPU 0 can access GPU 2 via P2P: ✅ Yes
GPU 0 can access GPU 3 via P2P: ❌ No
GPU 0 can access GPU 4 via P2P: ❌ No
GPU 0 can access GPU 5 via P2P: ❌ No
GPU 1 can access GPU 0 via P2P: ❌ No
GPU 1 can access GPU 2 via P2P: ❌ No
GPU 1 can access GPU 3 via P2P: ✅ Yes
GPU 1 can access GPU 4 via P2P: ❌ No
GPU 1 can access GPU 5 via P2P: ❌ No
GPU 2 can access GPU 0 via P2P: ✅ Yes
GPU 2 can access GPU 1 via P2P: ❌ No
GPU 2 can access GPU 3 via P2P: ❌ No
GPU 2 can access GPU 4 via P2P: ❌ No
GPU 2 can access GPU 5 via P2P: ❌ No
GPU 3 can access GPU 0 via P2P: ❌ No
GPU 3 can access GPU 1 via P2P: ✅ Yes
GPU 3 can access GPU 2 via P2P: ❌ No
GPU 3 can access GPU 4 via P2P: ❌ No
GPU 3 can access GPU 5 via P2P: ❌ No
GPU 4 can access GPU 0 via P2P: ❌ No
GPU 4 can access GPU 1 via P2P: ❌ No
GPU 4 can access GPU 2 via P2P: ❌ No
GPU 4 can access GPU 3 via P2P: ❌ No
GPU 4 can access GPU 5 via P2P: ✅ Yes
GPU 5 can access GPU 0 via P2P: ❌ No
GPU 5 can access G

## ctypes way

In [1]:
import torch
import ctypes

# Load the shared library through ctypes. The path is relative, so the
# notebook must be run from the directory that contains `cuda_tools/`.
lib = ctypes.CDLL('./cuda_tools/libipc_tensor_tool.so')
# Declare the C signature of `export_ipc_handle`: two pointer arguments and an
# int return value. get_ipc_handle() passes the tensor's device pointer and an
# output buffer, and treats a non-zero return as failure.
lib.export_ipc_handle.argtypes = [ctypes.c_void_p, ctypes.c_void_p]
lib.export_ipc_handle.restype = ctypes.c_int

def get_ipc_handle(tensor: torch.Tensor) -> "tuple[bytes, dict]":
    """Export an IPC handle for a CUDA tensor's memory, plus descriptive metadata.

    Args:
        tensor: A tensor that lives on a CUDA device.

    Returns:
        A ``(handle, meta)`` pair. ``handle`` is the raw 64-byte buffer filled
        in by ``lib.export_ipc_handle``; ``meta`` is a dict with the keys
        ``"shape"``, ``"dtype"``, ``"device"`` and ``"numel"``.

    Raises:
        ValueError: If ``tensor`` is not on a CUDA device.
        RuntimeError: If the native ``export_ipc_handle`` call returns non-zero.
    """
    # Validate before touching device metadata: a CPU tensor has
    # `device.index is None`, so building `meta` first would fail with a
    # confusing TypeError instead of this ValueError.
    if not tensor.is_cuda:
        raise ValueError("Tensor must be on CUDA device")

    meta = {
        "shape": tensor.shape,
        "dtype": str(tensor.dtype),
        "device": int(tensor.device.index),
        "numel": tensor.numel()
    }

    # NOTE(review): only the raw data pointer is exported; presumably the
    # receiver expects a contiguous tensor that starts at the beginning of its
    # allocation — confirm against the C side.
    dev_ptr = tensor.data_ptr()
    # 64-byte output buffer — presumably sizeof(cudaIpcMemHandle_t); confirm
    # against the shared library's implementation.
    out = ctypes.create_string_buffer(64)

    result = lib.export_ipc_handle(ctypes.c_void_p(dev_ptr), out)
    if result != 0:
        raise RuntimeError(f"export_ipc_handle failed with code {result}")

    # `.raw` returns all 64 bytes, including embedded NUL bytes.
    return out.raw, meta

## Pybind way

In [None]:
from tensor_utils_pybind import get_ipc_handle_pybind, tensor_restore_from_handler_pybind
import torch
TOKEN_NUM = 128
def create_input(token_num):
    """Create random test tensors on the GPU and export a handle for each.

    Args:
        token_num: Number of rows in both generated tensors.

    Returns:
        The 4-tuple ``(hidden_states_handler, hidden_states_meta,
        topk_weights_handler, topk_weights_meta)``, where each handler/meta
        pair is what ``get_ipc_handle_pybind`` returned for a
        ``(token_num, 2048)`` bfloat16 tensor and a ``(token_num, 4)`` float32
        tensor respectively.
    """
    # topk_ids generation is currently disabled here; the earlier recipe was
    # torch.randint(0, 60, (token_num, 4), dtype=torch.int32).to("cuda").
    target = "cuda"

    # Each tensor is sampled on the CPU and then copied to the CUDA device.
    hs = torch.randn(token_num, 2048, dtype=torch.bfloat16)
    hs = hs.to(target)
    hs_export = get_ipc_handle_pybind(hs)
    hs_handler, hs_meta = hs_export

    weights = torch.randn(token_num, 4, dtype=torch.float32)
    weights = weights.to(target)
    weights_export = get_ipc_handle_pybind(weights)
    weights_handler, weights_meta = weights_export

    # Wait for all queued GPU work before the tensors are printed and the
    # handles are handed out.
    torch.cuda.synchronize()

    print(f"hidden_states:{hs}")
    print(f"topk_weights:{weights}")

    # NOTE(review): the tensors are locals and go out of scope on return;
    # presumably the exported handles are only meaningful while the underlying
    # GPU memory stays alive — confirm who keeps it alive.
    return hs_handler, hs_meta, weights_handler, weights_meta

hidden_states_handler, hidden_states_meta, topk_weights_handler, topk_weights_meta = create_input(TOKEN_NUM)

# The request cell that follows also sends `topk_ids_handler` and
# `topk_ids_meta`. Define them here so the notebook runs on a fresh kernel
# instead of relying on names left over from an earlier session.
# `topk_ids` is kept at module level so the tensor stays referenced while its
# handle is in use.
topk_ids = torch.randint(0, 60, (TOKEN_NUM, 4), dtype=torch.int32).to("cuda")
topk_ids_handler, topk_ids_meta = get_ipc_handle_pybind(topk_ids)
torch.cuda.synchronize()

hidden_states:tensor([[ 0.5078, -0.0164, -0.2617,  ...,  0.1631,  0.8047, -0.3867],
        [ 0.9766,  1.0156,  0.4004,  ...,  0.7969, -1.3750, -0.5391],
        [-1.5625,  0.1074,  0.3086,  ..., -1.3125,  0.7812,  0.2910],
        ...,
        [ 0.0630,  0.8438, -0.8984,  ..., -1.1250,  1.1016, -0.1187],
        [ 0.4922,  0.5234,  1.1797,  ...,  1.2031,  1.5703,  0.3262],
        [-1.0469, -0.7461, -0.0859,  ...,  0.5000,  1.2969, -0.5234]],
       device='cuda:0', dtype=torch.bfloat16)
topk_weights:tensor([[-0.3725,  0.3987, -0.3490,  0.1184],
        [-0.0544,  1.1020,  0.0609, -0.4153],
        [-0.0889, -1.4458, -0.7052,  0.4264],
        [ 0.7858, -0.6384,  1.8901, -0.6639],
        [ 0.6100, -0.4221,  0.4014, -0.3279],
        [-0.2314,  1.5952,  1.6162, -0.1776],
        [-0.4671, -0.4409, -0.0951,  1.5733],
        [-0.4426, -0.7888,  0.0384, -0.2229],
        [ 0.3285,  1.2358,  1.9886, -1.4418],
        [-1.1658,  1.2076, -0.0545, -1.0676],
        [ 1.6244, -0.6511, -1.238

In [2]:
import json
import requests
url1 = "http://localhost:5000/forward"
# url2 = "http://localhost:5001/forward"

# Seconds to wait for the server before giving up; without a timeout a dead or
# wedged server would hang this cell indefinitely.
FORWARD_TIMEOUT_S = 60

# NOTE: every `*_meta` / `*_handler` name used below must already exist in the
# kernel; this cell does not define any of them.
# Metadata travels as JSON-encoded form fields; the raw handle bytes travel as
# multipart file parts.
response1 = requests.post(
    url1,
    data={
        'hidden_states_meta': json.dumps(hidden_states_meta),
        'topk_weights_meta': json.dumps(topk_weights_meta),
        'topk_ids_meta': json.dumps(topk_ids_meta),
    },
    files={
        'topk_ids_handler': ('topk_ids_handler.bin', topk_ids_handler, 'application/octet-stream'),
        'hidden_states_handler': ('hidden_states_handler.bin', hidden_states_handler, 'application/octet-stream'),
        'topk_weights_handler': ('topk_weights_handler.bin', topk_weights_handler, 'application/octet-stream'),
    },
    timeout=FORWARD_TIMEOUT_S,
)
# Surface 4xx/5xx responses here rather than in whatever reads the body later.
response1.raise_for_status()

ConnectionError: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))

In [9]:
import requests
import json

# Prepare the payload.
# NOTE(review): `handler`, `meta` and `tensor` are not defined in this cell and
# must come from an earlier one — presumably `handler, meta = get_ipc_handle(tensor)`;
# confirm and make that step explicit so the notebook runs on a fresh kernel.
byte_data = handler  # raw handle bytes
map_data = meta  # metadata mapping

# Build the multipart/form-data request.
files = {
    'byte_data': ('data.bin', byte_data, 'application/octet-stream')
}

data = {
    'map_data': json.dumps(map_data)
}

# Send the POST request. The timeout keeps the cell from hanging forever if
# the server is down or stuck.
response = requests.post(
    'http://localhost:1177/upload',
    files=files,
    data=data,
    timeout=60,
)
# Fail loudly on a 4xx/5xx response instead of on a confusing JSON/KeyError below.
response.raise_for_status()

# Rebuild the tensor from the values echoed back by the server and compare it
# with the original; the comparison result is the cell's displayed output.
restored_tensor = torch.tensor(response.json()['restored_tensor'], device='cuda:0')
torch.equal(tensor, restored_tensor)


True