## Test if my GPU supports P2P

In [1]:
import torch

def check_p2p_support():
    if not torch.cuda.is_available():
        print("CUDA is not available. No GPUs detected.")
        return

    num_gpus = torch.cuda.device_count()
    print(f"Found {num_gpus} GPU(s)")

    for i in range(num_gpus):
        for j in range(num_gpus):
            if i == j:
                continue  # Skip self-check

            # Check if P2P access is possible
            try:
                # Enable P2P access (temporarily)
                torch.cuda.set_device(i)  # Set current GPU
                can_access = torch.cuda.can_device_access_peer(i,j)
                print(f"GPU {i} can access GPU {j} via P2P: {'✅ Yes' if can_access else '❌ No'}")
            except RuntimeError as e:
                print(f"P2P between GPU {i} and GPU {j} not supported: {e}")

check_p2p_support()

Found 2 GPU(s)
GPU 0 can access GPU 1 via P2P: ❌ No
GPU 1 can access GPU 0 via P2P: ❌ No


## Pybind Import

In [2]:
from tensor_utils_pybind import get_ipc_handle_pybind, tensor_restore_from_handler_pybind,merge_tensors_and_export_ipc_handle
import torch
token_num = 4096

def get_dtype_size(dtype):
    """获取dtype的字节大小"""
    return torch.tensor([], dtype=dtype).element_size()

## handler preparation

In [3]:
token_num = 1024
hidden_states = torch.randn( token_num, 2048, dtype=torch.float32).to("cuda")
topk_ids = torch.randint(0, 60, (token_num, 4), dtype=torch.int32).to("cuda")
topk_weights = torch.randn(token_num, 4, dtype=torch.bfloat16).to("cuda")

tensors=[hidden_states,topk_weights,topk_ids]

handler = merge_tensors_and_export_ipc_handle(tensors,hidden_states[0].device.index)
torch.cuda.synchronize()



## metadata preparation


In [4]:
max_dtype = max(tensors, key=lambda t: get_dtype_size(t.dtype)).dtype
max_dtype_size = get_dtype_size(max_dtype)
print(f"max_dtype: {max_dtype}, max_dtype_size: {max_dtype_size} ")

# 2. 计算总元素数（考虑对齐）
total_elements = 0
metadata = []
offset_bytes = 0
for tensor in tensors:
    # 计算当前张量需要的元素数（考虑对齐）
    tensor_bytes = tensor.numel() * get_dtype_size(tensor.dtype)
    elements_needed = (tensor_bytes + max_dtype_size - 1) // max_dtype_size
    
    # 记录元数据
    metadata.append({
        'dtype': str(tensor.dtype),
        'shape': tensor.shape,
        'device': tensor.device.index,
        'offset_bytes':offset_bytes
    })
    offset_bytes += tensor_bytes
    total_elements += elements_needed
print(f"total_elements: {total_elements}")


max_dtype: torch.float32, max_dtype_size: 4 
total_elements: 2103296


## Send requests

### Data check

In [5]:
print(f"tensors: {tensors}")
print(f"metadata: {metadata}")

tensors: [tensor([[-0.4055, -0.5077,  1.3535,  ..., -0.4322, -0.6301, -0.6791],
        [-0.2482, -0.2057, -0.9148,  ..., -0.4003, -1.0719,  0.2630],
        [ 1.6116,  1.6385,  0.9094,  ...,  0.5347, -0.6238, -0.7083],
        ...,
        [ 0.5343, -0.9832,  0.9822,  ...,  0.7232, -0.1213, -1.6530],
        [-0.1056,  0.0625,  0.4504,  ...,  0.2294, -2.3053,  1.1465],
        [ 0.0213, -0.1226, -2.0355,  ...,  0.0794,  1.3142,  0.7398]],
       device='cuda:1'), tensor([[ 0.0153,  0.2168,  0.3789,  0.7266],
        [ 0.2334, -1.7578,  1.6328,  0.0918],
        [-0.1523, -0.1279,  0.9219, -0.4004],
        ...,
        [-1.4062, -0.1504,  0.1631, -0.4902],
        [-0.8867, -0.3945, -0.1816, -1.1797],
        [ 0.5547,  1.2031,  0.9336,  0.1338]], device='cuda:1',
       dtype=torch.bfloat16), tensor([[45, 40, 47, 27],
        [24, 41, 45, 17],
        [16,  6, 25, 13],
        ...,
        [17, 48,  8,  2],
        [17, 10, 23, 56],
        [38, 19, 14,  1]], device='cuda:1', dtype=t

### Sending requests and in the server side restore multiple tensors

In [6]:
import json
import requests
url1 = "http://localhost:1177/merged_handler"

#2101248

response1 = requests.post(url1, 
  data={
        'hidden_states_meta': json.dumps(metadata[0]),
        'topk_weights_meta': json.dumps(metadata[1]),
        'topk_ids_meta': json.dumps(metadata[2]),
},
                          
files={
        'handler': ('handler.bin', handler, 'application/octet-stream'),
})


### Sending requests and in the server side restore 1 tensor

In [7]:
# md={'dtype': 'torch.float32',
#  'shape': [1024*2048+4096],
#  'offset': 0,
#  'elements': 1024*2048+4096,
#  'device': 1,
#  'offset_bytes': 0}

In [8]:
# import json
# import requests
# url1 = "http://localhost:1177/merged_single"

# #2101248

# response1 = requests.post(url1, 
#   data={
#         'hidden_states_meta': json.dumps(md),
#         # 'topk_ids_meta': json.dumps(metadata[2]),
# },
                          
# files={
#         'merged_handler': ('merged_handler.bin', merged_handler, 'application/octet-stream'),
# })
