In [5]:
import os
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"


In [6]:
# -------------------------
# 1️⃣ 掛載 Google Drive
# -------------------------
from google.colab import drive
drive.mount('/content/drive')

# -------------------------
# 2️⃣ 指定 JSONL 路徑
# -------------------------
import os
from datasets import load_dataset

jsonl_path = "/content/drive/MyDrive/dataset.jsonl"  # ✅ 改這裡成你的實際路徑

# 檢查檔案是否存在
assert os.path.exists(jsonl_path), f"❌ 找不到檔案：{jsonl_path}"

# -------------------------
# 3️⃣ 載入資料集
# -------------------------
ds = load_dataset("json", data_files=jsonl_path, split="train")

# -------------------------
# 4️⃣ 統一欄位名稱 sentence / label
# -------------------------
def unify(example):
    # sentence 欄位名稱統一
    sentence = (
        example.get("sentence")
        or example.get("prompt")
        or example.get("text")
        or ""
    )

    # label 欄位名稱統一
    if "completion" in example and example["completion"] is not None:
        label = int(example["completion"])
    elif "level" in example and example["level"] is not None:
        label = int(example["level"])
    else:
        # 若資料沒有 label，可設為 -1 以便過濾
        label = -1

    return {"sentence": sentence, "label": label}

ds = ds.map(unify)

# 過濾掉 label 無效的資料
ds = ds.filter(lambda x: x["label"] != -1)

# -------------------------
# 5️⃣ 檢查結果
# -------------------------
print("✅ 成功載入 dataset.jsonl")
print("📊 總筆數:", len(ds))
print("📝 範例資料：")
print(ds[0])


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/165 [00:00<?, ? examples/s]

Filter:   0%|          | 0/165 [00:00<?, ? examples/s]

✅ 成功載入 dataset.jsonl
📊 總筆數: 165
📝 範例資料：
{'prompt': '這句話的討人厭程度 (1-5)：好像還行吧。', 'completion': '1', 'sentence': '這句話的討人厭程度 (1-5)：好像還行吧。', 'label': 1}


In [7]:
# 移除無效或缺 label 筆
ds = ds.filter(lambda x: x["sentence"] is not None and x["sentence"].strip() != "" and x["label"] is not None)

# 轉 label 範圍到 0..4（若原本就是 1..5 ）
def normalize_label(x):
    if x["label"] > 5:  # 保險檢查
        x["label"] = 5
    if x["label"] < 1:
        x["label"] = 1
    x["label"] = int(x["label"]) - 1
    return x

ds = ds.map(normalize_label)

# 切 train / val（80/20）
ds = ds.train_test_split(test_size=0.2, seed=42)
print("train:", len(ds["train"]), "val:", len(ds["test"]))


# 確認 label 範圍
for split in ['train', 'test']:
    labels = [x['label'] for x in ds[split]]
    print(split, "min:", min(labels), "max:", max(labels))

Filter:   0%|          | 0/165 [00:00<?, ? examples/s]

Map:   0%|          | 0/165 [00:00<?, ? examples/s]

train: 132 val: 33
train min: 0 max: 4
test min: 0 max: 4


In [8]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

model_name = "hfl/chinese-roberta-wwm-ext"
num_labels = 5

# 1️⃣ 載入 tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)

# 2️⃣ 新增 pad token
tokenizer.add_special_tokens({'pad_token': '[PAD]'})

# 3️⃣ 載入模型
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)

# 4️⃣ 讓模型 embedding 對應新 pad token
model.resize_token_embeddings(len(tokenizer))

# 5️⃣ 設定 pad_token_id
model.config.pad_token_id = tokenizer.pad_token_id

# 6️⃣ 設定 device 關掉
#device = "cuda" if torch.cuda.is_available() else "cpu"
#model = model.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at hfl/chinese-roberta-wwm-ext and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
max_len = 128
def tok_fn(batch):
    return tokenizer(batch["prompt"], truncation=True, padding="max_length", max_length=max_len)

ds_encoded = ds.map(lambda x: tok_fn(x), batched=True)
ds_encoded.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])

Map:   0%|          | 0/132 [00:00<?, ? examples/s]

Map:   0%|          | 0/33 [00:00<?, ? examples/s]

In [10]:
from peft import LoraConfig, get_peft_model, TaskType

lora_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,
    inference_mode=False,
    r=8,
    lora_alpha=16,
    lora_dropout=0.1,
    target_modules=["query", "value"]  # encoder-based 可試 ["dense"] 或 ["dense_1"]
)

model = get_peft_model(model, lora_config)
model.print_trainable_parameters()


trainable params: 298,757 || all params: 102,570,250 || trainable%: 0.2913


In [11]:
#for n, p in model.named_parameters():
    #print(n, p.dtype)


base_model.model.bert.embeddings.word_embeddings.weight torch.float32
base_model.model.bert.embeddings.position_embeddings.weight torch.float32
base_model.model.bert.embeddings.token_type_embeddings.weight torch.float32
base_model.model.bert.embeddings.LayerNorm.weight torch.float32
base_model.model.bert.embeddings.LayerNorm.bias torch.float32
base_model.model.bert.encoder.layer.0.attention.self.query.base_layer.weight torch.float32
base_model.model.bert.encoder.layer.0.attention.self.query.base_layer.bias torch.float32
base_model.model.bert.encoder.layer.0.attention.self.query.lora_A.default.weight torch.float32
base_model.model.bert.encoder.layer.0.attention.self.query.lora_B.default.weight torch.float32
base_model.model.bert.encoder.layer.0.attention.self.key.weight torch.float32
base_model.model.bert.encoder.layer.0.attention.self.key.bias torch.float32
base_model.model.bert.encoder.layer.0.attention.self.value.base_layer.weight torch.float32
base_model.model.bert.encoder.layer.0.a

In [16]:
from transformers import TrainingArguments, Trainer
import numpy as np
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

training_args = TrainingArguments(
    output_dir="./lora_model",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=10,
    learning_rate=5e-4,
    logging_strategy="steps",  # 只 log 本地，不用 wandb
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="eval_accuracy",
    fp16=False,
    report_to="none"
)

def compute_metrics(pred):
    labels = pred.label_ids
    preds = np.argmax(pred.predictions, axis=1)
    return {
        "accuracy": accuracy_score(labels, preds),
        "f1_macro": f1_score(labels, preds, average="macro"),
        "precision": precision_score(labels, preds, average="macro"),
        "recall": recall_score(labels, preds, average="macro")
    }

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=ds_encoded["train"],
    eval_dataset=ds_encoded["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)


  trainer = Trainer(


In [17]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro,Precision,Recall
1,No log,1.597517,0.30303,0.158508,0.114,0.28
2,No log,1.42507,0.393939,0.267273,0.226923,0.35119
3,No log,1.176634,0.454545,0.329293,0.271861,0.422857
4,No log,1.160174,0.333333,0.281569,0.294545,0.347857
5,No log,1.057967,0.606061,0.549394,0.585,0.570476
6,No log,1.16425,0.484848,0.481775,0.555714,0.468095
7,No log,1.106468,0.636364,0.620932,0.651429,0.610952
8,No log,1.100968,0.666667,0.643791,0.698413,0.632381
9,No log,1.089251,0.69697,0.665329,0.711111,0.660952
10,No log,1.067051,0.666667,0.643205,0.675,0.635952


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


TrainOutput(global_step=330, training_loss=0.8030994762073863, metrics={'train_runtime': 42.6437, 'train_samples_per_second': 30.954, 'train_steps_per_second': 7.739, 'total_flos': 87131854909440.0, 'train_loss': 0.8030994762073863, 'epoch': 10.0})

In [18]:
# 儲存 PEFT adapter
model.save_pretrained("./lora_model_adapter")
tokenizer.save_pretrained("./lora_model_adapter")


('./lora_model_adapter/tokenizer_config.json',
 './lora_model_adapter/special_tokens_map.json',
 './lora_model_adapter/vocab.txt',
 './lora_model_adapter/added_tokens.json',
 './lora_model_adapter/tokenizer.json')

In [19]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from peft import PeftModel
import torch

base_model_name = "hfl/chinese-roberta-wwm-ext"
tokenizer = AutoTokenizer.from_pretrained(base_model_name)
base_model = AutoModelForSequenceClassification.from_pretrained(base_model_name, num_labels=5)

# 載入 LoRA adapter
model = PeftModel.from_pretrained(base_model, "./lora_model_adapter")

# 推論函數
def predict_sentence(sent):
    inputs = tokenizer(sent, return_tensors="pt", truncation=True, padding=True, max_length=128).to(model.device)
    model.eval()
    with torch.no_grad():
        logits = model(**inputs).logits
    pred = torch.argmax(logits, dim=-1).item()
    return pred + 1

print(predict_sentence("你又拿我筆？"))
print(predict_sentence("哇，你真會把事情弄得這麼亂！"))


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at hfl/chinese-roberta-wwm-ext and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


2
3
