In [9]:
import time
import torch
import torch.nn as nn
torch.manual_seed(0)
from custom_torchinfo import custom_summary

In [10]:
class SimpleMLP(nn.Module):
    """Three-layer fully connected network with ReLU after the first two layers.

    Args:
        input_size: Number of features in each input row.
        hidden_size: Width of both hidden layers.
        output_size: Number of values produced per input row.
    """

    def __init__(self, input_size=128, hidden_size=516, output_size=1):
        super().__init__()
        # Submodules are registered in the order fc1, relu1, fc2, relu2, fc3;
        # that order determines how the layers are listed and initialised.
        self.fc1 = nn.Linear(in_features=input_size, out_features=hidden_size)
        self.relu1 = nn.ReLU()
        self.fc2 = nn.Linear(in_features=hidden_size, out_features=hidden_size)
        self.relu2 = nn.ReLU()
        self.fc3 = nn.Linear(in_features=hidden_size, out_features=output_size)

    def forward(self, x):
        """Apply fc1 -> ReLU -> fc2 -> ReLU -> fc3 to ``x`` and return the result."""
        hidden = self.fc1(x)
        hidden = self.relu1(hidden)
        hidden = self.fc2(hidden)
        hidden = self.relu2(hidden)
        return self.fc3(hidden)

# Pick the GPU when one is available, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using device: {device}")

# Build the network and place it on the selected device.
model = SimpleMLP().to(device)

# Kept as the final expression so the cell still displays the returned summary.
custom_summary(model, input_size=(64, 128))

Using device: cuda
Layer (type:depth-idx)                   Input Shape               Output Shape              Param #                   Trainable
SimpleMLP                                [64, 128]                 [64, 1]                   --                        True
├─Linear: 1-1                            [64, 128]                 [64, 516]                 66,564                    True
├─ReLU: 1-2                              [64, 516]                 [64, 516]                 --                        --
├─Linear: 1-3                            [64, 516]                 [64, 516]                 266,772                   True
├─ReLU: 1-4                              [64, 516]                 [64, 516]                 --                        --
├─Linear: 1-5                            [64, 516]                 [64, 1]                   517                       True
Total params: 333,853
Trainable params: 333,853
Non-trainable params: 0
Total mult-adds (Units.MEGABYTES): 21.37

Layer (type:depth-idx)                   Input Shape               Output Shape              Param #                   Trainable
SimpleMLP                                [64, 128]                 [64, 1]                   --                        True
├─Linear: 1-1                            [64, 128]                 [64, 516]                 66,564                    True
├─ReLU: 1-2                              [64, 516]                 [64, 516]                 --                        --
├─Linear: 1-3                            [64, 516]                 [64, 516]                 266,772                   True
├─ReLU: 1-4                              [64, 516]                 [64, 516]                 --                        --
├─Linear: 1-5                            [64, 516]                 [64, 1]                   517                       True
Total params: 333,853
Trainable params: 333,853
Non-trainable params: 0
Total mult-adds (Units.MEGABYTES): 21.37
Input size (MB): 0

In [3]:
def model_size_bytes(model):
    """Return model size in bytes (params + buffers).

    Each tensor contributes its element count multiplied by the size in bytes
    of one element.
    """
    tensor_groups = (model.parameters(), model.buffers())
    return sum(
        tensor.numel() * tensor.element_size()
        for group in tensor_groups
        for tensor in group
    )

# Total bytes held by the model's parameters and buffers.
size_bytes = model_size_bytes(model)

# Show output: the exact byte count plus the same value divided by 1024**2.
print(f"[Model Size] {size_bytes:,} bytes ~ {size_bytes / (1024**2):.3f} MB")

[Model Size] 1,335,412 bytes ~ 1.274 MB


In [11]:
# Create test batches
# Both batches are built from one random row, so every row in x1 and x2 is
# identical and only the batch size differs between them.
batch_size1, batch_size2 = 64, 8

x_base = torch.randn(1, 128, device=device)
x1 = x_base.expand(batch_size1, -1).contiguous()
x2 = x_base.expand(batch_size2, -1).contiguous()

In [12]:
def measure_latency(model, x, iters=50, warmup=10):
    """Return average latency (ms) per forward pass.

    Args:
        model: Module to time. It is switched to eval mode and left there.
        x: Input passed to ``model`` on every call.
        iters: Number of timed forward passes; must be at least 1.
        warmup: Number of untimed forward passes run before the clock starts,
            so one-off start-up work on the first calls is not averaged into
            the reported latency. Pass 0 to disable.

    Returns:
        Mean wall-clock time of one timed forward pass, in milliseconds.

    Raises:
        ValueError: If ``iters`` is less than 1 or ``warmup`` is negative.
    """
    if iters < 1:
        raise ValueError(f"iters must be >= 1, got {iters}")
    if warmup < 0:
        raise ValueError(f"warmup must be >= 0, got {warmup}")

    on_cuda = x.device.type == "cuda"
    model.eval()
    with torch.inference_mode():
        for _ in range(warmup):
            model(x)
        if on_cuda:
            # Drain any queued GPU work (including the warm-up passes) so it
            # is not charged to the timed region.
            torch.cuda.synchronize()

        start = time.perf_counter()
        for _ in range(iters):
            model(x)
            if on_cuda:
                # Wait for the forward pass to finish before the next
                # iteration so each pass is fully included in the timing.
                torch.cuda.synchronize()
        elapsed = time.perf_counter() - start

    return (elapsed / iters) * 1000  # Convert seconds to milliseconds

# Time the same model on both test batches (x1 has batch_size1 rows, x2 has batch_size2 rows).
latency_x1 = measure_latency(model, x1) 
latency_x2 = measure_latency(model, x2)

# Report the average per-forward latency next to the batch size it was measured with.
print(f"[Latency] {latency_x1:.3f} ms per forward pass (batch size={batch_size1})")
print(f"[Latency] {latency_x2:.3f} ms per forward pass (batch size={batch_size2})")

[Latency] 0.342 ms per forward pass (batch size=64)
[Latency] 0.312 ms per forward pass (batch size=8)


In [6]:
def measure_gpu_memory(model, x):
    """Return current and peak GPU memory in MB after one forward.

    Statistics are read from the CUDA device that holds ``x``. If ``x`` is not
    a CUDA tensor, the current CUDA device is used instead.

    Args:
        model: Module to run. It is switched to eval mode and left there.
        x: Input passed to ``model`` for a single forward pass.

    Returns:
        A ``(current, peak)`` tuple of allocated memory, each in bytes divided
        by 1024**2. ``(0.0, 0.0)`` is returned when CUDA is not available.
    """
    if not torch.cuda.is_available():
        return (0.0, 0.0)  # Return zeros if CUDA not available

    # Read statistics from the device the input lives on rather than the
    # implicit current device, which can differ on multi-GPU machines.
    x_device = getattr(x, "device", None)
    if x_device is not None and x_device.type == "cuda":
        device = x_device
    else:
        device = torch.cuda.current_device()

    bytes_per_mb = 1024 ** 2

    torch.cuda.empty_cache()
    try:
        torch.cuda.reset_peak_memory_stats(device)
    except AttributeError:
        # Best effort: tolerate a torch build without this function. The peak
        # value may then include allocations made before this call.
        pass

    model.eval()
    with torch.inference_mode():
        # Keep a reference to the output so it is still allocated when
        # `current` is read below.
        output = model(x)

    torch.cuda.synchronize(device)
    current = torch.cuda.memory_allocated(device) / bytes_per_mb
    peak = torch.cuda.max_memory_allocated(device) / bytes_per_mb
    del output
    return current, peak

# GPU memory is only reported when CUDA is present; otherwise print a notice.
if not torch.cuda.is_available():
    print("[GPU Memory] CUDA not available - running on CPU (no GPU memory tracking)")
else:
    # Each result is a (current, peak) pair, measured once per batch.
    mem1 = measure_gpu_memory(model, x1)
    mem2 = measure_gpu_memory(model, x2)
    print(f"[GPU Memory] Current allocated: {mem1[0]:.2f} MB | Peak during forward: {mem1[1]:.2f} MB (batch size={batch_size1})")
    print(f"[GPU Memory] Current allocated: {mem2[0]:.2f} MB | Peak during forward: {mem2[1]:.2f} MB (batch size={batch_size2})")

[GPU Memory] Current allocated: 10.44 MB | Peak during forward: 10.81 MB (batch size=64)
[GPU Memory] Current allocated: 10.44 MB | Peak during forward: 10.48 MB (batch size=8)


In [7]:
import gc, torch

# Drop the benchmark objects from the notebook namespace. Each name is popped
# individually instead of using a bare `del`, so re-running this cell (or
# running it when some of the names were never created) does not raise
# NameError and still removes whichever names do exist.
for _stale_name in ("model", "x1", "x2"):
    globals().pop(_stale_name, None)

# Collect the now-unreferenced objects before asking the CUDA allocator to
# release its cached blocks.
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()
    torch.cuda.reset_peak_memory_stats()
    torch.cuda.synchronize()