In [1]:
# CUDA 12.1 runfile，toolkit
!set -x \
 && cd $(mktemp -d) \
 && wget https://developer.download.nvidia.com/compute/cuda/12.1.0/local_installers/cuda_12.1.0_530.30.02_linux.run \
 && sudo sh cuda_12.1.0_530.30.02_linux.run --silent --toolkit \
 && rm cuda_12.1.0_530.30.02_linux.run

import os
os.environ["PATH"] = os.environ["PATH"] + ":/usr/local/cuda/bin"

!ncu --version


++ mktemp -d
+ cd /tmp/tmp.dZc5Uq5yxS
+ wget https://developer.download.nvidia.com/compute/cuda/12.1.0/local_installers/cuda_12.1.0_530.30.02_linux.run
--2025-12-08 22:28:22--  https://developer.download.nvidia.com/compute/cuda/12.1.0/local_installers/cuda_12.1.0_530.30.02_linux.run
Resolving developer.download.nvidia.com (developer.download.nvidia.com)... 23.213.43.212, 23.213.43.205
Connecting to developer.download.nvidia.com (developer.download.nvidia.com)|23.213.43.212|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4245586997 (4.0G) [application/octet-stream]
Saving to: ‘cuda_12.1.0_530.30.02_linux.run’


2025-12-08 22:28:38 (265 MB/s) - ‘cuda_12.1.0_530.30.02_linux.run’ saved [4245586997/4245586997]

+ sudo sh cuda_12.1.0_530.30.02_linux.run --silent --toolkit
+ rm cuda_12.1.0_530.30.02_linux.run
NVIDIA (R) Nsight Compute Command Line Profiler
Copyright (c) 2018-2024 NVIDIA Corporation
Version 2024.2.1.0 (build 34372528) (public-release)


In [8]:
!which nv-nsight-cu-cli || which ncu || echo "nv-nsight-cu-cli / ncu not found"
!nv-nsight-cu-cli --version || ncu --version || echo "nv-nsight-cu-cli / ncu"


/usr/local/cuda/bin/ncu
/bin/bash: line 1: nv-nsight-cu-cli: command not found
NVIDIA (R) Nsight Compute Command Line Profiler
Copyright (c) 2018-2024 NVIDIA Corporation
Version 2024.2.1.0 (build 34372528) (public-release)


In [3]:
!nvidia-smi

!pip install -q torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
!pip install -q transformers

Mon Dec  8 22:31:28 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA A100-SXM4-80GB          Off |   00000000:00:05.0 Off |                    0 |
| N/A   33C    P0             53W /  400W |       0MiB /  81920MiB |      0%      Default |
|                                         |                        |             Disabled |
+-----------------------------------------+------------------------+----------------------+
                                                

In [11]:
!pip -q install "evaluate>=0.4.2"

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/84.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m
[?25h

In [20]:
%%writefile train.py
import os, time, json
from dataclasses import dataclass

import torch
from datasets import load_dataset
import evaluate
from transformers import (
    AutoTokenizer, AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    Trainer, TrainingArguments, set_seed,
)
from torch.utils.data import DataLoader

# ------------ Basic device / GPU info ------------
device = "cuda" if torch.cuda.is_available() else "cpu"
if device == "cuda":
    gpu_name = torch.cuda.get_device_name(0)
    cc_major, cc_minor = torch.cuda.get_device_capability(0)
else:
    gpu_name = "CPU"
    cc_major, cc_minor = (0, 0)
print("device:", device)

bf16_supported = torch.cuda.is_available() and torch.cuda.is_bf16_supported()
print(f"Device: {gpu_name}, CC: {cc_major}.{cc_minor}, bf16_supported={bf16_supported}")

# ------------ Config (keep original hyper-parameters; just add max_train_steps) ------------
@dataclass
class Config:
    dataset_id: str = "dair-ai/emotion"
    model_name: str = "bert-base-uncased"
    lr: float = 5e-5
    per_device_batch_size: int = 8
    num_epochs: int = 3
    use_bf16: bool = bf16_supported
    use_fp16: bool = (not bf16_supported) and torch.cuda.is_available()
    weight_decay: float = 0.0
    warmup_ratio: float = 0.0
    grad_accum_steps: int = 1
    seed: int = 42
    output_dir: str = "/content/bert_emotion_gpu"
    max_train_steps: int = 10  # run only 10 training steps for ncu profiling

cfg = Config()
os.makedirs(cfg.output_dir, exist_ok=True)
set_seed(cfg.seed)

# ------------ Dataset loading and tokenization ------------
raw_ds = load_dataset(cfg.dataset_id)

print(raw_ds)
print("Train size:", len(raw_ds["train"]))
print("Test size: ", len(raw_ds["test"]))

tokenizer = AutoTokenizer.from_pretrained(cfg.model_name, use_fast=True)

def tokenize(batch):
    return tokenizer(
        batch["text"],
        padding="max_length",
        truncation=True,
    )

tokenized = raw_ds.map(tokenize, batched=True, remove_columns=["text"])

train_ds = tokenized["train"]
eval_ds  = tokenized["test"]

num_labels = len(raw_ds["train"].features["label"].names)
print("num_labels:", num_labels)

data_collator = DataCollatorWithPadding(
    tokenizer=tokenizer,
    pad_to_multiple_of=None
)

metric_acc = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = logits.argmax(-1)
    return metric_acc.compute(predictions=preds, references=labels)

# ------------ Model ------------
model = AutoModelForSequenceClassification.from_pretrained(
    cfg.model_name, num_labels=num_labels
).to(device)

bf16 = cfg.use_bf16
fp16 = cfg.use_fp16 and (not bf16)

print(f"Using bf16={bf16}, fp16={fp16}")

# ------------ Trainer / TrainingArguments (limit to 10 steps) ------------
training_args = TrainingArguments(
    output_dir=os.path.join(cfg.output_dir, "trainer_baseline"),
    per_device_train_batch_size=cfg.per_device_batch_size,
    per_device_eval_batch_size=cfg.per_device_batch_size,
    gradient_accumulation_steps=cfg.grad_accum_steps,
    num_train_epochs=cfg.num_epochs,            # keep original epochs
    max_steps=cfg.max_train_steps,              # explicitly limit to 10 steps
    learning_rate=cfg.lr,
    weight_decay=cfg.weight_decay,
    warmup_ratio=cfg.warmup_ratio,
    logging_steps=10,
    eval_strategy="no",
    save_strategy="no",
    report_to="none",
    fp16=fp16,
    bf16=bf16,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=eval_ds,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# ------------ Training + wall time measurement ------------
t0 = time.perf_counter()
train_result = trainer.train()
wall_time = time.perf_counter() - t0  # measure total training wall time

trainer.save_model()
train_metrics = train_result.metrics
print("train metrics:", train_metrics)

# # run one evaluation pass
# eval_metrics = trainer.evaluate()
# print("eval metrics:", eval_metrics)

# ------------ Throughput: samples/sec and tokens/sec ------------
loader = DataLoader(train_ds, batch_size=cfg.per_device_batch_size, collate_fn=data_collator)

total_tokens = 0
total_samples = 0
for batch in loader:
    total_tokens += batch["attention_mask"].sum().item()
    total_samples += batch["input_ids"].size(0)

samples_per_sec = total_samples / wall_time
tokens_per_sec  = total_tokens / wall_time

print(f"wall_time (s)   = {wall_time:.2f}")
print(f"samples/sec     = {samples_per_sec:.2f}")
print(f"tokens/sec      = {tokens_per_sec:.2f}")

summary = {
    "gpu_name": gpu_name,
    "device_cc": f"{cc_major}.{cc_minor}",
    "dtype": "bf16" if bf16 else ("fp16" if fp16 else "fp32"),
    "train_samples": total_samples,
    "train_tokens": total_tokens,
    "wall_time_s": wall_time,
    "samples_per_sec": samples_per_sec,
    "tokens_per_sec": tokens_per_sec,
}

print(json.dumps(summary, indent=2))

out_path = os.path.join(cfg.output_dir, "gpu_emotion_baseline_results.json")
with open(out_path, "w") as f:
    json.dump(summary, f, indent=2)

print("Saved to", out_path)


Overwriting train.py


In [21]:
!python train.py

2025-12-09 00:26:25.912241: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-12-09 00:26:25.930204: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1765239985.952699  327755 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1765239985.959283  327755 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1765239985.976925  327755 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking 

In [None]:
!ncu --metrics sm__throughput.avg.pct_of_peak_sustained_elapsed,gpu__time_duration.sum --log-file ncu_sm_1steps.csv --target-processes all python train.py


2025-12-09 01:29:24.274734: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-12-09 01:29:24.294514: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1765243764.318042  374473 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1765243764.324806  374473 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1765243764.342064  374473 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking 

In [28]:
import re
import pandas as pd

log_path = "/content/ncu_sm_1steps.csv"


records = []

with open(log_path, "r") as f:
    lines = f.readlines()

n = len(lines)
i = 0

while i < n:
    line = lines[i]

    # Look for the "Command line profiler metrics" section
    if "Section: Command line profiler metrics" in line:
        # ---- 1.1 Find the kernel header line above this section ----
        j = i - 1
        kernel_header = None

        while j >= 0:
            prev = lines[j].rstrip("\n")
            stripped = prev.strip()

            # Skip empty lines, separators, and other section headers
            if (stripped == "" or
                stripped.startswith("Section:") or
                stripped.startswith("Metric Name") or
                stripped.startswith("-") or
                stripped.startswith("==PROF==")):
                j -= 1
                continue

            # The first non-empty, non-==PROF==, non-table line
            # just above the section is the kernel header.
            kernel_header = stripped
            break

        if kernel_header is None:
            # Could not find a header, skip this section
            i += 1
            continue

        # Short kernel name: everything before the first '('
        kernel_name = kernel_header.split("(")[0].strip()

        # ---- 1.2 Parse the metric table rows under this section ----
        metrics = {}
        k = i + 1

        while k < n:
            l = lines[k].rstrip("\n")
            stripped = l.strip()

            # Empty line: end of this kernel's metric section
            if stripped == "":
                break

            # New section or profiling header: also stop
            if stripped.startswith("Section:") or stripped.startswith("==PROF=="):
                break

            # Skip table headers / separators
            if stripped.startswith("-") or stripped.startswith("Metric Name"):
                k += 1
                continue

            # Metric row example:
            #   gpu__time_duration.sum                                    us        10.62
            parts = stripped.split()
            if len(parts) >= 3:
                metric_name = parts[0]      # e.g. gpu__time_duration.sum
                metric_unit = parts[-2]     # e.g. us
                metric_value = parts[-1]    # e.g. 10.62
                metrics[metric_name] = (metric_unit, metric_value)

            k += 1

        # ---- 1.3 Store record if both metrics are present ----
        t_key = "gpu__time_duration.sum"
        s_key = "sm__throughput.avg.pct_of_peak_sustained_elapsed"

        if t_key in metrics and s_key in metrics:
            t_unit, t_val_str = metrics[t_key]
            s_unit, s_val_str = metrics[s_key]

            try:
                t_val = float(t_val_str)
                s_val = float(s_val_str)
            except ValueError:
                i = k
                continue

            records.append({
                "kernel_header": kernel_header,
                "kernel_name": kernel_name,
                "time_value": t_val,   # numerical time value (unit below)
                "time_unit": t_unit,   # e.g. 'us'
                "sm_pct": s_val,       # SM utilization %
            })

        # Continue after this section
        i = k
    else:
        i += 1

print(f"Parsed {len(records)} metric records")

# --------------------------------------------------------
# 2. Build a DataFrame and normalize time units
# --------------------------------------------------------

df = pd.DataFrame(records)
if df.empty:
    raise RuntimeError("No records parsed. Check log_path or metric names.")

# Map time unit to seconds
unit_to_sec = {"ns": 1e-9, "us": 1e-6, "ms": 1e-3, "s": 1.0}
df["time_sec"] = df.apply(
    lambda row: row["time_value"] * unit_to_sec.get(row["time_unit"], 1.0),
    axis=1
)
df["time_ms"] = df["time_sec"] * 1e3

print("Time units present:", df["time_unit"].value_counts().to_dict())
print(f"Total kernel time: {df['time_ms'].sum():.3f} ms")

# --------------------------------------------------------
# 3. Global time-weighted average SM utilization
# --------------------------------------------------------

def time_weighted_avg_sm(df_in: pd.DataFrame) -> float:
    """Compute time-weighted average SM utilization (%) for a DataFrame subset."""
    t = df_in["time_sec"]
    s = df_in["sm_pct"]
    if (t > 0).sum() == 0:
        return 0.0
    return float((s * t).sum() / t.sum())

global_sm = time_weighted_avg_sm(df)
print(f"\nGlobal time-weighted SM utilization: {global_sm:.2f}%")

# --------------------------------------------------------
# 4. Per-kernel aggregated stats (calls, total time, weighted SM)
# --------------------------------------------------------

def agg_kernel(group: pd.DataFrame) -> pd.Series:
    """Aggregate per-kernel statistics."""
    t = group["time_sec"]
    s = group["sm_pct"]
    return pd.Series({
        "calls": len(group),
        "total_time_ms": t.sum() * 1e3,
        "time_weighted_sm_pct": float((s * t).sum() / t.sum()) if t.sum() > 0 else 0.0,
    })

kernel_stats = df.groupby("kernel_name").apply(agg_kernel).reset_index()
kernel_stats_sorted = kernel_stats.sort_values("total_time_ms", ascending=False)

print("\nTop 20 kernels by total time:")
print(kernel_stats_sorted.head(20).to_string(index=False))

# --------------------------------------------------------
# 5. Classify kernels into high-level op types
# --------------------------------------------------------

def classify_kernel(name: str) -> str:
    """Heuristically classify kernel type based on its name."""
    lower = name.lower()
    if "gemm" in lower:
        return "gemm"
    if "fmha" in lower or "attention" in lower:
        return "attention"
    if ("layer_norm" in lower or "layernorm" in lower or
        "gammabeta" in lower or "grad_input_kernel" in lower):
        return "layernorm"
    if "gelu" in lower:
        return "gelu"
    if "softmax" in lower:
        return "softmax"
    if "dropout" in lower:
        return "dropout"
    if "reduce_kernel" in lower or "reduceop" in lower:
        return "reduce"
    if ("copy_kernel" in lower or "direct_copy" in lower or
        "bfloat16_copy" in lower):
        return "copy"
    if "elementwise_kernel" in lower:
        return "elementwise"
    return "other"

df["op_type"] = df["kernel_name"].apply(classify_kernel)

# --------------------------------------------------------
# 6. Per-op-type time-weighted SM statistics
# --------------------------------------------------------

type_stats = []
total_time_ms_all = df["time_ms"].sum()

for op_type, group in df.groupby("op_type"):
    t = group["time_sec"]
    s = group["sm_pct"]
    total_time_ms = float(t.sum() * 1e3)
    if t.sum() > 0:
        w_sm = float((s * t).sum() / t.sum())
    else:
        w_sm = 0.0
    time_share = 100.0 * total_time_ms / total_time_ms_all

    type_stats.append({
        "op_type": op_type,
        "total_time_ms": total_time_ms,
        "time_share_percent": time_share,
        "time_weighted_sm_pct": w_sm,
        "kernel_records": len(group),
    })

type_stats_df = pd.DataFrame(type_stats).sort_values("total_time_ms", ascending=False)

print("\nPer-op-type time-weighted SM stats:")
print(type_stats_df.to_string(index=False))


Parsed 1700 metric records
Time units present: {'us': 1700}
Total kernel time: 41.091 ms

Global time-weighted SM utilization: 39.87%

Top 20 kernels by total time:
                                                                                                                                                                                                                                                  kernel_name  calls  total_time_ms  time_weighted_sm_pct
                                                                                                                                                                                                            fmha_cutlassB_bf16_aligned_64x64_k64_dropout_sm80   12.0        7.68727             30.781145
                                                                                                                                                                                                                     fmha_cutlassF_bf16_aligned

  kernel_stats = df.groupby("kernel_name").apply(agg_kernel).reset_index()
