In [1]:
!pip -q install "evaluate>=0.4.2"

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/84.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
import os, time, math, json, random
from dataclasses import dataclass
from typing import Optional, Dict, Any
import torch
from torch import nn
from torch.utils.data import DataLoader
from torch.cuda.amp import autocast, GradScaler

from datasets import load_dataset
import evaluate
from transformers import (
    AutoTokenizer, AutoModelForSequenceClassification,
    DataCollatorWithPadding, default_data_collator,
    get_linear_schedule_with_warmup, Trainer, TrainingArguments, set_seed
)

In [3]:
import os, time, math, json, random
from dataclasses import dataclass

import torch
from datasets import load_dataset
import evaluate
from transformers import (
    AutoTokenizer, AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    Trainer, TrainingArguments, set_seed,
)

device = "cuda" if torch.cuda.is_available() else "cpu"
if device == "cuda":
    gpu_name = torch.cuda.get_device_name(0)
    cc_major, cc_minor = torch.cuda.get_device_capability(0)
else:
    gpu_name = "CPU"
    cc_major, cc_minor = (0, 0)
print(device)
bf16_supported = torch.cuda.is_available() and torch.cuda.is_bf16_supported()
dtype_for_amp = torch.bfloat16 if bf16_supported else torch.float16

print(f"Device: {gpu_name}, CC: {cc_major}.{cc_minor}, bf16_supported={bf16_supported}")

@dataclass
class Config:
    dataset_id: str = "dair-ai/emotion"
    model_name: str = "bert-base-uncased"
    lr: float = 5e-5
    per_device_batch_size: int = 8
    num_epochs: int = 3
    use_bf16: bool = bf16_supported
    use_fp16: bool = (not bf16_supported) and torch.cuda.is_available()
    weight_decay: float = 0.0
    warmup_ratio: float = 0.0
    grad_accum_steps: int = 1
    seed: int = 42
    output_dir: str = "/content/bert_emotion_gpu"

cfg = Config()
os.makedirs(cfg.output_dir, exist_ok=True)
set_seed(cfg.seed)


cuda
Device: NVIDIA L4, CC: 8.9, bf16_supported=True


In [4]:

raw_ds = load_dataset(cfg.dataset_id)

print(raw_ds)
print("Train size:", len(raw_ds["train"]))
print("Test size: ", len(raw_ds["test"]))


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

split/train-00000-of-00001.parquet:   0%|          | 0.00/1.03M [00:00<?, ?B/s]

split/validation-00000-of-00001.parquet:   0%|          | 0.00/127k [00:00<?, ?B/s]

split/test-00000-of-00001.parquet:   0%|          | 0.00/129k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/16000 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/2000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 16000
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 2000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 2000
    })
})
Train size: 16000
Test size:  2000


In [5]:
tokenizer = AutoTokenizer.from_pretrained(cfg.model_name, use_fast=True)

def tokenize(batch):
    return tokenizer(
        batch["text"],
        padding="max_length",
        truncation=True,
    )

tokenized = raw_ds.map(tokenize, batched=True, remove_columns=["text"])

train_ds = tokenized["train"]
eval_ds  = tokenized["test"]

num_labels = len(raw_ds["train"].features["label"].names)
print("num_labels:", num_labels)


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/16000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

num_labels: 6


In [6]:
data_collator = DataCollatorWithPadding(
    tokenizer=tokenizer,
    pad_to_multiple_of=None
)

In [7]:
metric_acc = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = logits.argmax(-1)
    return metric_acc.compute(predictions=preds, references=labels)


Downloading builder script: 0.00B [00:00, ?B/s]

In [8]:
model = AutoModelForSequenceClassification.from_pretrained(
    cfg.model_name, num_labels=num_labels
).to(device)

bf16 = cfg.use_bf16
fp16 = cfg.use_fp16 and (not bf16)

print(f"Using bf16={bf16}, fp16={fp16}")

training_args = TrainingArguments(
    output_dir=os.path.join(cfg.output_dir, "trainer_baseline"),
    per_device_train_batch_size=cfg.per_device_batch_size,
    per_device_eval_batch_size=cfg.per_device_batch_size,
    gradient_accumulation_steps=cfg.grad_accum_steps,
    num_train_epochs=cfg.num_epochs,
    learning_rate=cfg.lr,
    weight_decay=cfg.weight_decay,
    warmup_ratio=cfg.warmup_ratio,
    logging_steps=50,
    eval_strategy="epoch",
    save_strategy="no",
    report_to="none",
    fp16=fp16,
    bf16=bf16,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=eval_ds,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)




model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Using bf16=True, fp16=False


  trainer = Trainer(


In [None]:
from torch.profiler import profile, ProfilerActivity, tensorboard_trace_handler

# 训练参数里加一个小的 max_steps，专门用于这次 profile 跑几步就停
training_args = TrainingArguments(
    ...,
    max_steps=20,          # 只训练 20 个 step，用来 profile
    logging_steps=1,
)

trainer = Trainer(
    args=training_args,
    ...
)

logdir = "/content/bert_prof"

with profile(
    activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
    record_shapes=True,
    profile_memory=True,
    with_stack=True,
    on_trace_ready=tensorboard_trace_handler(logdir),
) as prof:
    train_result = trainer.train()   # 这里就只跑前 20 步
    # 如果想要更细粒度 timeline，可以在 Trainer 里加回调，每个 step 调一次 prof.step()
    # 但只看 operator 排行的话，这样已经够用了

print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=30))


In [None]:

t0 = time.perf_counter()
train_result = trainer.train()

trainer.save_model()
train_metrics = train_result.metrics
trainer.log_metrics("train", train_metrics)
trainer.save_metrics("train", train_metrics)
trainer.save_state()

eval_metrics = trainer.evaluate()
trainer.log_metrics("eval", eval_metrics)
trainer.save_metrics("eval", eval_metrics)



Epoch,Training Loss,Validation Loss,Accuracy
1,0.2721,0.20942,0.9275
2,0.1228,0.198421,0.928


In [None]:
from torch.utils.data import DataLoader

loader = DataLoader(train_ds, batch_size=cfg.per_device_batch_size, collate_fn=data_collator)

total_tokens = 0
total_samples = 0
for batch in loader:
    total_tokens += batch["attention_mask"].sum().item()
    total_samples += batch["input_ids"].size(0)

samples_per_sec = total_samples / wall_time
tokens_per_sec  = total_tokens / wall_time

print(f"samples/sec = {samples_per_sec:.2f}")
print(f"tokens/sec  = {tokens_per_sec:.2f}")


NameError: name 'wall_time' is not defined

In [None]:
summary = {
    "gpu_name": gpu_name,
    "device_cc": f"{cc_major}.{cc_minor}",
    "dtype": "bf16" if bf16 else ("fp16" if fp16 else "fp32"),
    "train_samples": total_samples,
    "train_tokens": total_tokens,
    "wall_time_s": wall_time,
    "samples_per_sec": samples_per_sec,
    "tokens_per_sec": tokens_per_sec,
    "eval": {k: float(v) for k, v in eval_metrics.items()},
}

print(json.dumps(summary, indent=2))

os.makedirs(cfg.output_dir, exist_ok=True)
with open(os.path.join(cfg.output_dir, "gpu_emotion_baseline_results.json"), "w") as f:
    json.dump(summary, f, indent=2)

print("Saved to", os.path.join(cfg.output_dir, "gpu_emotion_baseline_results.json"))
