In [1]:
%load_ext autoreload
%autoreload 2
import os
import sys
sys.path.append(os.path.join(os.path.dirname('/home/ntmduy/GraphAE/src/model')))
sys.path.append(os.path.join(os.path.dirname('/home/ntmduy/GraphAE/src/utils')))

In [27]:
from model.GAE_Projection_Att import GAE_CLS_Link_NODE_Cosine_SupCon_2
from model.resnet_big import SupCEResNet, SupConResNet, LinearClassifier, SupIncepResnet
import torch
from torch_geometric.nn import summary
from thop import profile
import numpy as np
import time
from torch_geometric.loader import DataLoader
from utils.data import load_and_split_graphs
from torch.profiler import profile, record_function, ProfilerActivity

In [3]:
from IPython.core.interactiveshell import InteractiveShell

InteractiveShell.ast_node_interactivity = "all"

In [31]:
def measure_time_gpu(dummy_input, model, device, rep, none_gnn=False):
    model = model.to(device=device)
    # dummy_input = torch.randn(1, 1, 29, 29, dtype=torch.float).to(device)
    # INIT LOGGERS
    starter, ender = torch.cuda.Event(enable_timing=True), torch.cuda.Event(enable_timing=True)
    repetitions = rep
    timings=np.zeros((repetitions,1))
    #GPU-WARM-UP
    for _ in range(100):
        if (none_gnn):
            _ = model(dummy_input)
        else:
            _ = model(dummy_input, device, acummulate = True, remove_random=True)
    # MEASURE PERFORMANCE
    with torch.no_grad():
        for rep in range(repetitions):
            starter.record()
            if (none_gnn):
                _ = model(dummy_input)
            else:
                _ = model(dummy_input, device, acummulate = True, remove_random=True)
            ender.record()
            # WAIT FOR GPU SYNC
            torch.cuda.synchronize()
            curr_time = starter.elapsed_time(ender)
            timings[rep] = curr_time
    mean_syn = np.sum(timings) / repetitions
    std_syn = np.std(timings)
    return mean_syn, std_syn

In [5]:
def measure_time_cpu(model, device, rep = 10):
    model = model.to(device=device)
    x = torch.rand((1, 1, 29, 29), device=device)
    timings=np.zeros((rep,1))
    for i in range(rep):    
        start_time = time.time()
        out = model(x)
        timings[i] = time.time() - start_time
    mean_syn = np.sum(timings) / rep
    std_syn = np.std(timings)
    return mean_syn, std_syn

In [14]:
def analyze_performance(model, test_loader, device='cuda', num_warmup=10, num_repeats=100):
    model = model.to(device)
    model.eval()

    times = []
    with torch.no_grad():
        # Warm-up
        print(f"Warming up for {num_warmup} iterations...")
        for i, batch in enumerate(test_loader):
            batch = batch.to(device)
            _ = model(batch, device, acummulate = True, remove_random=True)
            if i >= num_warmup - 1:
                break

        torch.cuda.synchronize()
        print(f"Starting timed inference over {num_repeats} iterations...")
        for i, batch in enumerate(test_loader):
            if i >= num_repeats:
                break
            batch = batch.to(device)
            torch.cuda.synchronize()
            start_time = time.perf_counter()
            _ = model(batch, device, acummulate = True, remove_random=True)
            torch.cuda.synchronize()
            end_time = time.perf_counter()
            times.append(end_time - start_time)

        times = np.array(times)
        print(f"Tested {len(times)} samples.")
        print(f"Mean Latency: {np.mean(times)*1000:.2f} ms")
        print(f"Median Latency: {np.median(times)*1000:.2f} ms")
        print(f"Throughput: {1/np.mean(times):.2f} samples/sec")

        if torch.cuda.is_available():
            mem = torch.cuda.max_memory_allocated() / (1024**2)
            print(f"Max GPU memory used: {mem:.2f} MB")
            torch.cuda.reset_peak_memory_stats()

In [6]:
path = f'/home/ntmduy/GraphAE/data/mas/WS_300/step_300/9/edge_features_3/normalized_except_id/raw/sort_seperated'

train_graphs, test_graphs, graphs_names = load_and_split_graphs(path, exclude=[], train_ratio=0.8, seed=2025)

In [11]:
# Load data
warmup_loader = DataLoader([train_graphs[0]], batch_size=1, shuffle=False)

for graph in test_graphs:
    graph.edge_attr = torch.tensor(graph.edge_attr[:, 0].reshape(-1, 1), dtype=torch.float32)

test_loader = DataLoader(test_graphs, batch_size=64, shuffle=False)

dummy_input = next(iter(warmup_loader)).to('cuda')
dummy_input.edge_attr = torch.tensor(dummy_input.edge_attr[:, 0].reshape(-1, 1), dtype=torch.float32)

  graph.edge_attr = torch.tensor(graph.edge_attr[:, 0].reshape(-1, 1), dtype=torch.float32)
  dummy_input.edge_attr = torch.tensor(dummy_input.edge_attr[:, 0].reshape(-1, 1), dtype=torch.float32)


In [15]:
model = GAE_CLS_Link_NODE_Cosine_SupCon_2(num_features=9, embedding_size=32, projection_emb=128, activate='gelu', layer_type='gatv2', num_layers=2, directed=False, id_dim=1, num_classes = 5, linear_node=True, num_id_embeddings=2048, attn_head=1)
measure_time_gpu(dummy_input, model, 'cuda', rep=1000)

(3.7600102066993712, 0.20232976952404977)

In [32]:
incep = SupIncepResnet(num_classes=5)
measure_time_gpu(torch.randn(1, 1, 29, 29, dtype=torch.float).cuda(), incep, 'cuda', rep=1000, none_gnn=True)

(2.999163134098053, 0.23098467288391075)

In [16]:
analyze_performance(model, test_loader, device='cuda', num_warmup=10, num_repeats=100)

Warming up for 10 iterations...
Starting timed inference over 100 iterations...
Tested 19 samples.
Mean Latency: 3.83 ms
Median Latency: 3.82 ms
Throughput: 261.19 samples/sec
Max GPU memory used: 17.34 MB


In [23]:
def profile_gnn_model(model, example_data, device=None, repeat=10):
    if device is None:
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = model.to(device).eval()
    example_data = example_data.to(device)
    with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
                 record_shapes=True,
                 profile_memory=True,
                 with_flops=True) as prof:
        with torch.no_grad():
            for _ in range(repeat):
                with record_function("model_inference"):
                    model(example_data)
    print(prof.key_averages().table(sort_by="self_cuda_time_total", row_limit=15))
    print(prof.key_averages().table(sort_by="flops", row_limit=15))
    # For total FLOPs (note: double to get MACs for real-valued ops)
    total_flops = sum([item.flops for item in prof.key_averages() if hasattr(item, 'flops')])
    print(f"Estimated total FLOPs: {total_flops}")
    print(f"Estimated total MACs: {total_flops/2}")

In [24]:
dummy_input = dummy_input.to(device='cpu')
model = model.to(device='cpu')
profile_gnn_model(model, dummy_input)

STAGE:2025-06-04 11:58:42 1943430:1943430 ActivityProfilerController.cpp:314] Completed Stage: Warm Up
STAGE:2025-06-04 11:58:42 1943430:1943430 ActivityProfilerController.cpp:320] Completed Stage: Collection
STAGE:2025-06-04 11:58:42 1943430:1943430 ActivityProfilerController.cpp:324] Completed Stage: Post Processing


-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg       CPU Mem  Self CPU Mem      CUDA Mem  Self CUDA Mem    # of Calls  Total KFLOPs  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                     aten::scatter_add_         2.10%       2.485ms         2.96%       3.495ms      29.125us     180.000us        16.59%     180.000us       1.500us           0 

In [10]:
summary(model.cuda(), dummy_input.cuda(), device='cuda')

'+-----------------------------------+------------------------------+-------------------------------+----------+\n| Layer                             | Input Shape                  | Output Shape                  | #Param   |\n|-----------------------------------+------------------------------+-------------------------------+----------|\n| GAE_CLS_Link_NODE_Cosine_SupCon_2 | [72, 72]                     | [72, 32]                      | 74,715   |\n| ├─(id_embedding)Embedding         | --                           | --                            | 2,048    |\n| ├─(encoder)Graph_Encoder_Norm     | [72, 9], [2, 136], [136, 1]  | [72, 32], [72, 32], [2304, 2] | 3,154    |\n| │    └─(bn)BatchNorm1d            | --                           | --                            | 18       |\n| │    └─(convs)ModuleList          | --                           | --                            | 2,944    |\n| │    │    └─(0)GATv2Conv          | [72, 9], [2, 136], [136, 1]  | [72, 32]                  