In [7]:
# ======= server_min_io_finetune.py =======

import os, pathlib

# --- 1) Force a single GPU and clear distributed-launch variables so the run
#        cannot accidentally take a multi-GPU / P2P code path ---
# Expose only the CUDA device with index 1; this is set before torch is imported below.
os.environ["CUDA_VISIBLE_DEVICES"] = "1"
# Remove launcher variables that may be left over in the environment.
for k in ("RANK","LOCAL_RANK","WORLD_SIZE","MASTER_ADDR","MASTER_PORT"):
    os.environ.pop(k, None)
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# --- 2) Point the Hugging Face caches at a RAM disk / fast local disk to cut slow-disk I/O ---
# Use /dev/shm when it exists, otherwise fall back to the current directory.
ramdisk = "/dev/shm" if os.path.isdir("/dev/shm") else "."
hf_home = os.path.join(ramdisk, ".hf_cache")
os.makedirs(hf_home, exist_ok=True)
# All cache locations live under the same root directory.
os.environ["HF_HOME"] = hf_home
os.environ["HF_HUB_CACHE"] = os.path.join(hf_home, "hub")
os.environ["TRANSFORMERS_CACHE"] = os.path.join(hf_home, "transformers")
os.environ["HF_DATASETS_CACHE"] = os.path.join(hf_home, "datasets")
os.environ["HF_HUB_DISABLE_TELEMETRY"] = "1"   # local-only run: fewer network round-trips

# --------------------- 下面才开始 import ---------------------
import json, time, random
import numpy as np, pandas as pd
import torch
from sklearn.model_selection import train_test_split
from datasets import Dataset, disable_caching
from transformers import (
    AutoTokenizer, AutoModelForSequenceClassification,
    Trainer, TrainingArguments, DataCollatorWithPadding
)
import evaluate

# --- 3) CUDA device & TF32 (speed-up on Ampere GPUs) ---
# Fail early when no CUDA device is usable (the message points at driver/container/permissions).
assert torch.cuda.is_available(), "CUDA 不可用：确认驱动/容器/权限"
device = torch.device("cuda")
torch.set_float32_matmul_precision("high")      # TF32 fast path for float32 matmuls (PyTorch 2.0+)
torch.backends.cuda.matmul.allow_tf32 = True    # allow TF32 tensor-core matmuls on Ampere
torch.backends.cudnn.allow_tf32 = True

# --- 4) Random seeds ---
# The same seed is applied to Python's `random`, NumPy and torch (CPU and all CUDA devices).
SEED = 999
random.seed(SEED); np.random.seed(SEED); torch.manual_seed(SEED); torch.cuda.manual_seed_all(SEED)

# --- 5) Load the local training data (see DATA_PATH below) and auto-detect the text/label column names ---
def read_json_flexible(path: str) -> pd.DataFrame:
    """Load a JSON Lines file or a regular JSON document into a DataFrame.

    Both layouts are attempted; the first non-blank byte of the file decides
    which one is tried first. A document that starts with ``[`` is parsed as
    regular JSON first, because a JSON array written on a single line can be
    accepted by the JSON Lines parser without raising (each line is treated as
    one record), yielding a frame of nested objects instead of one row per
    record. Every other input keeps the JSON-Lines-first order.

    Args:
        path: Location of the JSON / JSON Lines file.

    Returns:
        The parsed DataFrame.

    Raises:
        ValueError: If neither layout can be parsed.
    """
    try:
        with open(path, "rb") as fh:
            head = fh.read(4096)
    except (OSError, TypeError, ValueError):
        # Not a plain local file (URL, buffer, ...): keep the JSON-Lines-first order
        # and let pandas report any real problem with the input.
        head = b""

    if head.startswith(b"\xef\xbb\xbf"):  # skip a UTF-8 byte-order mark
        head = head[3:]
    starts_with_array = head.lstrip().startswith(b"[")

    # `lines` values to try, in order.
    first, second = (False, True) if starts_with_array else (True, False)
    try:
        return pd.read_json(path, lines=first)
    except ValueError:
        return pd.read_json(path, lines=second)

# Training data location, relative to the working directory.
DATA_PATH = "../train.json"
df = read_json_flexible(DATA_PATH)
# Candidate column names, checked in order; the first one present in the frame is used.
text_key_candidates  = ["text","review","reviews","sentence","content"]
label_key_candidates = ["label","labels","sentiment","sentiments","target","y"]
text_col  = next((c for c in text_key_candidates  if c in df.columns), None)
label_col = next((c for c in label_key_candidates if c in df.columns), None)
assert text_col and label_col, f"列名识别失败，请检查字段；现有列：{df.columns.tolist()}"

# Keep only the two detected columns, under the canonical names "text" and "label".
df = df[[text_col, label_col]].rename(columns={text_col:"text", label_col:"label"}).copy()
# Normalize binary labels to {0, 1}
mapping = {"neg":0,"negative":0,"0":0,"pos":1,"positive":1,"1":1}
# Integer-typed label columns are left as they are; anything else is stringified,
# lower-cased and looked up in `mapping`.
if not pd.api.types.is_integer_dtype(df["label"]):
    df["label"] = df["label"].astype(str).str.lower().map(mapping)
# Rows whose label is NaN (including values not found in `mapping`) are dropped here.
# NOTE(review): presumably a float label column would stringify as "1.0"/"0.0", which are
# not keys of `mapping`, so every row would be dropped — confirm against the real data.
df = df.dropna(subset=["label"]).astype({"label":"int64"})

# Train/validation split (stratified on the label, 20% held out for validation)
train_df, valid_df = train_test_split(
    df, test_size=0.2, random_state=SEED, stratify=df["label"]
)
test_path = "../test.json"
test_df = read_json_flexible(test_path)
# NOTE: `text_col` is rebound here and from now on names the text column of the TEST frame.
text_col = next((c for c in ["text","review","reviews","sentence","content"] if c in test_df.columns), None)
assert text_col, f"测试集未找到文本列，现有列：{test_df.columns.tolist()}"
# --- 6) HF Datasets: disable on-disk caching and tokenize purely in memory ---
disable_caching()  # globally disable the datasets cache (transform results are not written to disk)
def to_hf(pdf: pd.DataFrame) -> Dataset:
    """Convert a pandas frame to a Hugging Face ``Dataset``.

    Only the "text" and "label" columns are carried over, and the pandas
    index is not stored as an extra column.
    """
    selected = pdf[["text", "label"]]
    return Dataset.from_pandas(selected, preserve_index=False)

# Checkpoint name and maximum sequence length; both can be overridden through
# environment variables of the same name.
MODEL_NAME = os.getenv("MODEL_NAME", "bert-base-uncased")
MAX_LEN = int(os.getenv("MAX_LEN", "128"))

# Request the fast tokenizer implementation for the chosen checkpoint.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)
def tok_fn(batch):
    """Tokenize the "text" entries of a batch, truncating to ``MAX_LEN`` tokens.

    No padding is applied here; padding is left to the data collator.
    """
    texts = batch["text"]
    return tokenizer(texts, truncation=True, max_length=MAX_LEN)

def _tokenize_in_memory(ds):
    """Run ``tok_fn`` over *ds* in batches of 2048, dropping the raw "text" column.

    The result is kept in memory and no cache file is read.
    """
    return ds.map(
        tok_fn,
        batched=True,
        batch_size=2048,
        remove_columns=["text"],
        keep_in_memory=True,
        load_from_cache_file=False,
    )


# Train / validation: tokenize, then expose the target column as "labels" in torch format.
hf_train = _tokenize_in_memory(to_hf(train_df)).rename_column("label", "labels").with_format("torch")
hf_valid = _tokenize_in_memory(to_hf(valid_df)).rename_column("label", "labels").with_format("torch")

# Test: there is no label column, so only the detected text column is converted and tokenized.
test_text_only = test_df[[text_col]].rename(columns={text_col: "text"})
hf_test = _tokenize_in_memory(Dataset.from_pandas(test_text_only, preserve_index=False))


# --- 7) Model ---
# Two-class sequence-classification head; the label names are stored in the model config.
id2label = {0:"NEG", 1:"POS"}; label2id = {"NEG":0, "POS":1}
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME, num_labels=2, id2label=id2label, label2id=label2id, cache_dir=os.environ["TRANSFORMERS_CACHE"]
).to(device)

# --- 8) Collator: pad each batch to a multiple of 8, which suits tensor cores ---
collator = DataCollatorWithPadding(tokenizer=tokenizer, pad_to_multiple_of=8)  # padding happens per batch, at collation time

# --- 9) Minimal-I/O TrainingArguments (no checkpoints, no reporting, no intermediate evaluation) ---
from dataclasses import fields as dataclass_fields
# Field names declared by the installed TrainingArguments; used below to drop unsupported keys.
allowed = {f.name for f in dataclass_fields(TrainingArguments)}

args_dict = dict(
    output_dir=os.path.join(ramdisk, "_out"),  # on the RAM disk when /dev/shm exists; otherwise the current directory
    num_train_epochs=3,
    per_device_train_batch_size=128,
    per_device_eval_batch_size=128,
    learning_rate=2e-5,
    weight_decay=0.01,

    # -- no checkpoints, no reporting integrations, no step logging --
    save_strategy="no",
    load_best_model_at_end=False,
    report_to="none",
    logging_strategy="no",

    # No evaluation during training; evaluation is run manually once before and once
    # after training instead. The field is named `eval_strategy` in some versions and
    # `evaluation_strategy` in others: the one that applies gets "no", the other None.
    eval_strategy="no" if "eval_strategy" in allowed else None,
    evaluation_strategy=None if "eval_strategy" in allowed else "no",

    # DataLoader settings intended to keep the GPU fed (pin_memory / num_workers)
    dataloader_pin_memory=True,   # pinned host memory for batches
    dataloader_num_workers=4,     # typical range 2-8; tune against the machine's core count
    group_by_length=True,

    # Optimizer and mixed precision
    optim="adamw_torch_fused",    # fused AdamW, available in recent PyTorch/Transformers
    # Use bf16 when the GPU supports it, otherwise fall back to fp16 (exactly one is enabled).
    bf16=torch.cuda.is_bf16_supported(),
    fp16=(not torch.cuda.is_bf16_supported()),
    seed=SEED,
)

# Drop the keys that the installed version does not declare
train_args = TrainingArguments(**{k:v for k,v in args_dict.items() if k in allowed})

# Accuracy metric object obtained through the `evaluate` library.
# NOTE(review): presumably the first load fetches the metric script over the network —
# confirm this is acceptable for an otherwise local-only run.
acc = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    """Compute classification accuracy for one evaluation pass.

    Accuracy is computed directly with NumPy, so this function needs no
    externally loaded metric object and performs no I/O.

    Args:
        eval_pred: A ``(logits, labels)`` pair. ``logits`` may also be wrapped
            in a tuple/list, in which case its first element is used.

    Returns:
        ``{"accuracy": value}`` where ``value`` is the fraction of samples
        whose arg-max prediction equals the label, as a Python ``float``
        (``nan`` when there are no samples).

    Raises:
        ValueError: If the number of predictions and labels differ.
    """
    logits, labels = eval_pred
    if isinstance(logits, (tuple, list)):  # some versions wrap the logits
        logits = logits[0]
    preds = np.argmax(np.asarray(logits), axis=-1).ravel()
    labels = np.asarray(labels).ravel()
    if preds.size != labels.size:
        raise ValueError(
            f"Mismatch between predictions ({preds.size}) and labels ({labels.size})"
        )
    if labels.size == 0:
        # Accuracy is undefined without samples; avoid NumPy's empty-mean warning.
        return {"accuracy": float("nan")}
    return {"accuracy": float(np.mean(preds == labels))}


# Newer Transformers releases deprecate the `tokenizer=` argument of Trainer in favour of
# `processing_class=`; older releases only know `tokenizer=`. Pick whichever name the
# installed Trainer declares so the script runs warning-free on both.
import inspect

_trainer_init_params = inspect.signature(Trainer.__init__).parameters
_tokenizer_kwarg = "processing_class" if "processing_class" in _trainer_init_params else "tokenizer"

trainer = Trainer(
    model=model,
    args=train_args,
    train_dataset=hf_train,
    eval_dataset=hf_valid,   # only used by the manual evaluate() calls
    data_collator=collator,
    compute_metrics=compute_metrics,
    **{_tokenizer_kwarg: tokenizer},
)

# --- 10) Evaluate once before and once after training (printed to the console only) ---
print(f"[device] cuda:{torch.cuda.get_device_name(0)} | TF32={torch.backends.cuda.matmul.allow_tf32}")
print("=== Eval BEFORE fine-tune ===")
# Baseline metrics of the model on the validation split before any training step.
pre_metrics = trainer.evaluate()
print(pre_metrics)

print("=== Training (no checkpoints / no mid-eval) ===")
# Wall-clock timing of the whole training call.
t0 = time.time()
train_out = trainer.train()
print(f"train_time = {time.time() - t0:.1f}s ; steps = {train_out.global_step}")

print("=== Eval AFTER fine-tune ===")
post_metrics = trainer.evaluate()
print(post_metrics)

# （如需保存最终权重可手动解开，但会产生一次 I/O）
#trainer.save_model(os.path.join(ramdisk, "_out", "final"))

# Predict on the unlabeled test set; the predictions may come wrapped in a tuple/list,
# in which case the first element holds the logits.
pred = trainer.predict(hf_test)
logits = pred.predictions[0] if isinstance(pred.predictions, (tuple, list)) else pred.predictions
labels = logits.argmax(axis=-1)


out_csv = "./outputs/submission_bert2.csv"
# Create the target directory first: to_csv does not create missing parent directories,
# and failing here would throw away the finished training run.
os.makedirs(os.path.dirname(out_csv), exist_ok=True)
pd.DataFrame({"sentiments": labels}).to_csv(out_csv, index=False)
print("wrote:", out_csv)


Map:   0%|          | 0/5920 [00:00<?, ? examples/s]

Map:   0%|          | 0/1481 [00:00<?, ? examples/s]

Map:   0%|          | 0/1851 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


[device] cuda:NVIDIA RTX A6000 | TF32=True
=== Eval BEFORE fine-tune ===


{'eval_loss': 0.8999132513999939, 'eval_model_preparation_time': 0.0058, 'eval_accuracy': 0.14652261985145174, 'eval_runtime': 0.942, 'eval_samples_per_second': 1572.154, 'eval_steps_per_second': 12.739}
=== Training (no checkpoints / no mid-eval) ===


Step,Training Loss


train_time = 30.6s ; steps = 141
=== Eval AFTER fine-tune ===
{'eval_loss': 0.13750220835208893, 'eval_model_preparation_time': 0.0058, 'eval_accuracy': 0.949358541525996, 'eval_runtime': 0.991, 'eval_samples_per_second': 1494.463, 'eval_steps_per_second': 12.109, 'epoch': 3.0}
wrote: ./outputs/submission_bert2.csv
