In [2]:
# Presumably a quick smoke test that the kernel executes cells — TODO confirm it is still needed.
print("hello again?")

hello again?


In [None]:
# Activate the virtual environment before starting the notebook:
#   macOS / Linux: source .venv/bin/activate
#   Windows:       .venv\Scripts\activate

In [3]:
# Import libraries and report the installed PyTorch version
print("importing libraries...")
import time
import platform
import torch
print(f"PyTorch version: {torch.__version__}")

importing libraries...
PyTorch version: 2.10.0+cpu


In [4]:
# Check whether CUDA is available; if it is, also print the name of device 0
print(f"CUDA is available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"CUDA device name: {torch.cuda.get_device_name(0)}")

CUDA is available: False


In [None]:
# macOS version (MPS benchmark, followed by a CPU run)
import time
import platform
import torch


def device_sync(device: torch.device, tensor_for_sync: torch.Tensor | None = None) -> None:
    """
    장치별 동기화.
    - CUDA: torch.cuda.synchronize()
    - MPS: torch.mps.synchronize() (가능한 경우)
    - fallback: tensor_for_sync를 CPU로 한번 가져와 완료를 강제
    """
    if device.type == "cuda":
        torch.cuda.synchronize()
        return

    if device.type == "mps":
        # PyTorch 버전에 따라 torch.mps.synchronize()가 있을 수 있음
        if hasattr(torch, "mps") and hasattr(torch.mps, "synchronize"):
            torch.mps.synchronize()
            return

    # CPU 또는 기타: 특별한 동기화 필요 없음
    # 다만 MPS에서 synchronize가 없다면 fallback으로 결과를 CPU로 가져와 완료를 강제
    if device.type == "mps" and tensor_for_sync is not None:
        _ = tensor_for_sync.float().mean().cpu().item()


def print_system_info():
    """
    Print a short environment report to stdout.

    Reports the platform string, Python and PyTorch versions, whether the MPS
    backend is built and available, and PyTorch's intra-op / inter-op thread
    counts. Returns nothing.
    """
    print("========== System Info ==========")

    # Host, interpreter and framework versions
    os_description = platform.platform()
    print(f"Platform       : {os_description}")
    python_version = platform.python_version()
    print(f"Python         : {python_version}")
    torch_version = torch.__version__
    print(f"PyTorch        : {torch_version}")

    # MPS backend status
    mps_backend = torch.backends.mps
    print(f"MPS built      : {mps_backend.is_built()}")
    print(f"MPS available  : {mps_backend.is_available()}")

    # CPU threading configuration
    intra_op_threads = torch.get_num_threads()
    print(f"CPU threads    : {intra_op_threads} (intra-op)")
    inter_op_threads = torch.get_num_interop_threads()
    print(f"Interop threads: {inter_op_threads}")

    print("=================================\n")


@torch.no_grad()
def gemm_bench_exact(
    device: torch.device,
    seconds: float = 10.0,
    size: int = 4096,
    dtype: torch.dtype = torch.float16,
    use_relu: bool = True,
):
    """
    Time repeated ``size x size`` matmuls (optionally followed by in-place ReLU).

    ``device_sync`` is called after every iteration so that queued work cannot
    pile up and each iteration is fully finished before the clock is read.

    - Roughly once per second, prints iterations/sec and the estimated TFLOPS
      for the interval just completed.
    - After the loop, prints a summary with total FLOPs and average throughput.

    Args:
        device: Device on which the operand matrices are created.
        seconds: Wall-clock budget; the loop stops after the first iteration
            that finishes at or beyond this many seconds (at least one
            iteration always runs).
        size: Edge length of the square operand matrices.
        dtype: Element type of the operand matrices.
        use_relu: Whether to apply an in-place ReLU to each matmul result.

    Returns:
        dict with keys ``device``, ``size``, ``dtype``, ``seconds``,
        ``total_time``, ``iters`` and ``achieved_tflops``.
    """
    print(f"--- Benchmark start: device={device}, size={size}, dtype={dtype}, seconds={seconds} ---")

    # Author's note on dtype: MPS supports float16, but some operations may be
    # more stable in float32. The requested dtype is used as-is; switch to
    # float32 if problems appear.
    matrix_shape = (size, size)
    lhs = torch.randn(matrix_shape, device=device, dtype=dtype)
    rhs = torch.randn(matrix_shape, device=device, dtype=dtype)

    # Warm-up pass (not counted in the measurements)
    product = torch.matmul(lhs, rhs)
    if use_relu:
        product = product.relu_()
    device_sync(device, product)

    # FLOPs model: GEMM = 2*N^3, ReLU ~ N^2
    flops_matmul = 2 * (size ** 3)
    flops_relu = (size ** 2) if use_relu else 0
    flops_per_iter = flops_matmul + flops_relu

    bench_start = time.perf_counter()
    report_deadline = bench_start + 1.0
    window_start = bench_start

    done_total = 0
    done_in_window = 0

    keep_running = True
    while keep_running:
        # 1) Run the workload
        product = torch.matmul(lhs, rhs)
        if use_relu:
            product = product.relu_()

        # 2) Wait for completion (the key to an accurate measurement)
        device_sync(device, product)

        done_total += 1
        done_in_window += 1

        tick = time.perf_counter()
        elapsed = tick - bench_start

        # 3) Periodic report
        if tick >= report_deadline:
            window_length = tick - window_start
            rate = done_in_window / window_length
            est_tflops = (flops_per_iter * rate) / 1e12

            print(f"[{elapsed:6.2f}s] iters/sec: {rate:8.2f} | est TFLOPS: {est_tflops:6.2f}")

            window_start = tick
            done_in_window = 0
            # Skip any whole-second deadlines that have already passed.
            while report_deadline <= tick:
                report_deadline += 1.0

        # 4) Stop once the time budget is used up
        keep_running = not (elapsed >= seconds)

    total_time = time.perf_counter() - bench_start
    total_flops = flops_per_iter * done_total
    achieved_tflops = (total_flops / total_time) / 1e12

    per_iter_tera = flops_per_iter / 1e12
    matmul_tera = flops_matmul / 1e12
    relu_tera = flops_relu / 1e12
    total_tera = total_flops / 1e12

    print("\n========== Summary ==========")
    print(f"Device             : {device}")
    print(f"Total time         : {total_time:.3f} sec")
    print(f"Total iters(matmul): {done_total}")
    print(f"FLOPs per iter     : {per_iter_tera:.6f} TFLOPs "
          f"(matmul {matmul_tera:.6f} + relu {relu_tera:.6f})")
    print(f"Total FLOPs        : {total_tera:.3f} TFLOPs")
    print(f"Achieved TFLOPS    : {achieved_tflops:.3f}")
    print("=============================\n")

    summary = {
        "device": str(device),
        "size": size,
        "dtype": str(dtype),
        "seconds": seconds,
        "total_time": total_time,
        "iters": done_total,
        "achieved_tflops": achieved_tflops,
    }
    return summary


def main():
    """
    Print system info, then benchmark MPS (when available) followed by the CPU.

    The MPS run uses float16 4096x4096 matrices; the CPU run uses float32
    2048x2048 matrices. Both run for 10 seconds with ReLU enabled.
    """
    print_system_info()

    # 1) MPS first, when it is available
    if not torch.backends.mps.is_available():
        print("MPS가 사용 불가합니다. CPU만 측정합니다.\n")
    else:
        # On M1, sizes between 4096 and 8192 usually work well; adjust to the
        # amount of free (unified) memory.
        mps_run = dict(
            device=torch.device("mps"),
            seconds=10.0,
            size=4096,
            dtype=torch.float16,
            use_relu=True,
        )
        gemm_bench_exact(**mps_run)

    # 2) CPU measurement — float32 is the usual choice on CPU
    cpu_run = dict(
        device=torch.device("cpu"),
        seconds=10.0,
        size=2048,
        dtype=torch.float32,
        use_relu=True,
    )
    gemm_bench_exact(**cpu_run)


# Entry point: run the benchmarks when this code executes as the main module.
if __name__ == "__main__":
    main()

Platform       : macOS-26.2-arm64-arm-64bit
Python         : 3.12.12
PyTorch        : 2.10.0
MPS built      : True
MPS available  : True
CPU threads    : 4 (intra-op)
Interop threads: 8

--- Benchmark start: device=mps, size=4096, dtype=torch.float16, seconds=10.0 ---
[  1.04s] iters/sec:    14.48 | est TFLOPS:   1.99
[  2.00s] iters/sec:    14.45 | est TFLOPS:   1.99
[  3.04s] iters/sec:    14.47 | est TFLOPS:   1.99
[  4.01s] iters/sec:    14.42 | est TFLOPS:   1.98
[  5.05s] iters/sec:    14.45 | est TFLOPS:   1.99
[  6.02s] iters/sec:    14.45 | est TFLOPS:   1.99
[  7.06s] iters/sec:    14.45 | est TFLOPS:   1.99
[  8.03s] iters/sec:    14.45 | est TFLOPS:   1.99
[  9.06s] iters/sec:    14.46 | est TFLOPS:   1.99
[ 10.03s] iters/sec:    14.45 | est TFLOPS:   1.99

Device             : mps
Total time         : 10.032 sec
Total iters(matmul): 145
FLOPs per iter     : 0.137456 TFLOPs (matmul 0.137439 + relu 0.000017)
Total FLOPs        : 19.931 TFLOPs
Achieved TFLOPS    : 1.987

--- 

In [5]:
# Windows version (CUDA benchmark, followed by a CPU run)
import time
import platform
import torch


def print_system_info():
    """
    Print a short environment report to stdout.

    Reports the platform string, Python and PyTorch versions and whether CUDA
    is available. When CUDA is available it also prints the current device's
    index and name, compute capability, total memory in GB (bytes / 1024**3)
    and the CUDA version PyTorch reports. Finishes with the intra-op and
    inter-op thread counts. Returns nothing.
    """
    print("========== System Info ==========")

    # Host, interpreter and framework versions
    os_description = platform.platform()
    print(f"Platform       : {os_description}")
    python_version = platform.python_version()
    print(f"Python         : {python_version}")
    torch_version = torch.__version__
    print(f"PyTorch        : {torch_version}")

    has_cuda = torch.cuda.is_available()
    print(f"CUDA available : {has_cuda}")
    if has_cuda:
        # Details of the currently selected CUDA device
        gpu_index = torch.cuda.current_device()
        gpu = torch.cuda.get_device_properties(gpu_index)
        vram_gb = gpu.total_memory / (1024**3)
        print(f"GPU            : cuda:{gpu_index} ({gpu.name})")
        print(f"Compute cap    : {gpu.major}.{gpu.minor}")
        print(f"VRAM           : {vram_gb:.2f} GB")
        print(f"CUDA version   : {torch.version.cuda}")

    # CPU threading configuration
    intra_op_threads = torch.get_num_threads()
    print(f"CPU threads    : {intra_op_threads} (intra-op)")
    inter_op_threads = torch.get_num_interop_threads()
    print(f"Interop threads: {inter_op_threads}")

    print("=================================\n")


def device_sync(device: torch.device) -> None:
    """
    Block the host until work queued on ``device`` has finished.

    Args:
        device: Device to wait for. For ``cuda`` devices this calls
            ``torch.cuda.synchronize(device)``; the device is passed explicitly
            so that the device handed in is the one awaited, rather than
            whichever CUDA device happens to be current. For every other
            device type nothing is done.
    """
    if device.type == "cuda":
        # Without the argument, only the *current* CUDA device is synchronized,
        # which is wrong for e.g. torch.device("cuda:1") while cuda:0 is current.
        torch.cuda.synchronize(device)


@torch.no_grad()
def gemm_bench_exact(
    device: torch.device,
    seconds: float = 10.0,
    size: int = 8192,
    dtype: torch.dtype = torch.float16,
    use_relu: bool = True,
):
    """
    Time repeated ``size x size`` matmuls (optionally followed by in-place ReLU).

    - ``device_sync`` is called after the warm-up, at each periodic report
      (about once per second) and once after the loop — not on every
      iteration. The original author's intent: keep measurements stable on
      CUDA without letting the backlog grow excessively.
    - The loop ends on wall-clock time: it stops after the first iteration
      whose post-launch timestamp is at or beyond ``seconds`` (at least one
      iteration always runs). The summary is printed after a final sync.
    - The next report deadline is advanced relative to the timestamp taken
      before the report's sync.

    Args:
        device: Device on which the operand matrices are created.
        seconds: Wall-clock budget for the measurement loop.
        size: Edge length of the square operand matrices.
        dtype: Element type of the operand matrices.
        use_relu: Whether to apply an in-place ReLU to each matmul result.

    Returns:
        dict with keys ``device``, ``size``, ``dtype``, ``seconds``,
        ``total_time``, ``iters`` and ``achieved_tflops``.
    """
    extra_op = 'relu' if use_relu else 'none'
    print(f"--- Benchmark: device={device}, size={size}, dtype={dtype}, seconds={seconds} ---")
    print(f"Extra op: {extra_op}\n")

    matrix_shape = (size, size)
    lhs = torch.randn(matrix_shape, device=device, dtype=dtype)
    rhs = torch.randn(matrix_shape, device=device, dtype=dtype)

    # Warm-up pass (not counted in the measurements)
    product = torch.matmul(lhs, rhs)
    if use_relu:
        product.relu_()
    device_sync(device)

    # FLOPs model: GEMM = 2*N^3, ReLU ~ N^2
    flops_matmul = 2 * (size ** 3)
    flops_relu = (size ** 2) if use_relu else 0
    flops_per_iter = flops_matmul + flops_relu

    bench_start = time.perf_counter()
    deadline = bench_start + seconds

    report_deadline = bench_start + 1.0
    window_start = bench_start
    done_in_window = 0
    done_total = 0

    finished = False
    while not finished:
        product = torch.matmul(lhs, rhs)
        if use_relu:
            product.relu_()

        done_total += 1
        done_in_window += 1

        launched_at = time.perf_counter()

        # Periodic report: this is the only place inside the loop that syncs.
        if launched_at >= report_deadline:
            device_sync(device)
            synced_at = time.perf_counter()
            window_length = synced_at - window_start

            rate = done_in_window / window_length
            est_tflops = (flops_per_iter * rate) / 1e12
            elapsed = synced_at - bench_start

            print(f"[{elapsed:6.2f}s] iters/sec: {rate:8.2f} | est TFLOPS: {est_tflops:6.2f}")

            window_start = synced_at
            done_in_window = 0
            # Skip whole-second deadlines already passed as of the pre-sync timestamp.
            while report_deadline <= launched_at:
                report_deadline += 1.0

        finished = launched_at >= deadline

    # Wrap-up: wait for outstanding work before taking the final timestamp
    device_sync(device)
    total_time = time.perf_counter() - bench_start

    total_flops = flops_per_iter * done_total
    achieved_tflops = (total_flops / total_time) / 1e12
    total_tera = total_flops / 1e12

    print("\n========== Summary ==========")
    print(f"Device          : {device}")
    print(f"Total time      : {total_time:.3f} sec")
    print(f"Total iters     : {done_total}")
    print(f"Total FLOPs     : {total_tera:.3f} TFLOPs")
    print(f"Achieved TFLOPS : {achieved_tflops:.3f}")
    print("=============================\n")

    summary = {
        "device": str(device),
        "size": size,
        "dtype": str(dtype),
        "seconds": seconds,
        "total_time": total_time,
        "iters": done_total,
        "achieved_tflops": achieved_tflops,
    }
    return summary


def main():
    """
    Print system info, then benchmark CUDA (when available) followed by the CPU.

    The CUDA run uses float16 8192x8192 matrices; the CPU run uses float32
    2048x2048 matrices. Both run for 10 seconds with ReLU enabled.
    """
    print_system_info()

    # CUDA GPU benchmark (when available)
    if not torch.cuda.is_available():
        print("CUDA GPU가 없어 GPU 벤치를 건너뜁니다.\n")
    else:
        # With roughly 8 GB of VRAM, 8192 in FP16 can be tight; starting from
        # 6144 is recommended in that case.
        gpu_run = dict(
            device=torch.device("cuda"),
            seconds=10.0,
            size=8192,  # on OOM, drop to 6144 or 4096
            dtype=torch.float16,
            use_relu=True,
        )
        gemm_bench_exact(**gpu_run)

    # CPU benchmark
    cpu_run = dict(
        device=torch.device("cpu"),
        seconds=10.0,
        size=2048,
        dtype=torch.float32,
        use_relu=True,
    )
    gemm_bench_exact(**cpu_run)


# Entry point: run the benchmarks when this code executes as the main module.
if __name__ == "__main__":
    main()

Platform       : Windows-10-10.0.19045-SP0
Python         : 3.12.12
PyTorch        : 2.10.0+cpu
CUDA available : False
CPU threads    : 4 (intra-op)
Interop threads: 4

CUDA GPU가 없어 GPU 벤치를 건너뜁니다.

--- Benchmark: device=cpu, size=2048, dtype=torch.float32, seconds=10.0 ---
Extra op: relu

[  1.13s] iters/sec:     3.55 | est TFLOPS:   0.06
[  2.06s] iters/sec:     3.20 | est TFLOPS:   0.06
[  3.20s] iters/sec:     3.52 | est TFLOPS:   0.06
[  4.03s] iters/sec:     3.63 | est TFLOPS:   0.06
[  5.11s] iters/sec:     3.67 | est TFLOPS:   0.06
[  6.20s] iters/sec:     2.75 | est TFLOPS:   0.05
[  7.28s] iters/sec:     2.78 | est TFLOPS:   0.05
[  8.04s] iters/sec:     2.64 | est TFLOPS:   0.05
[  9.15s] iters/sec:     2.70 | est TFLOPS:   0.05
[ 10.12s] iters/sec:     3.09 | est TFLOPS:   0.05

Device          : cpu
Total time      : 10.124 sec
Total iters     : 32
Total FLOPs     : 0.550 TFLOPs
Achieved TFLOPS : 0.054

