<a href="https://colab.research.google.com/github/vannis422/trainsmall/blob/main/LLMpytorch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# 安装必要依赖（使用transformers最新版）
!pip install -q git+https://github.com/huggingface/transformers.git
!pip install -q accelerate bitsandbytes

# 检查GPU资源
import torch
print(f"GPU可用: {torch.cuda.is_available()}")
print(f"GPU型号: {torch.cuda.get_device_name(0)}")
print(f"显存: {torch.cuda.get_device_properties(0).total_memory/1024**3:.2f}GB")

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
GPU可用: True
GPU型号: Tesla T4
显存: 14.74GB


In [None]:
from huggingface_hub import snapshot_download

# 下载模型（需要先登录HuggingFace）
from huggingface_hub import login
login("")

# 下载Llama3-2.1B模型
model_path = snapshot_download(
    "meta-llama/Llama-3.2-1B",
    revision="main",
    ignore_patterns=["*.bin", "*.gguf"],  # 不下载原始权重，后面会量化
    local_dir="/content/Llama3-2.1B"
)
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import torch

# 4-bit量化配置
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16
)

# 加载并量化模型
model = AutoModelForCausalLM.from_pretrained(
    "/content/Llama3-2.1B",
    quantization_config=bnb_config,
    device_map="auto",
    torch_dtype=torch.bfloat16
)

tokenizer = AutoTokenizer.from_pretrained("/content/Llama3-2.1B")

# 保存量化后的模型
quant_path = "/content/Llama3-2.1B-4bit"
model.save_pretrained(quant_path)
tokenizer.save_pretrained(quant_path)

Fetching 13 files:   0%|          | 0/13 [00:00<?, ?it/s]

('/content/Llama3-2.1B-4bit/tokenizer_config.json',
 '/content/Llama3-2.1B-4bit/special_tokens_map.json',
 '/content/Llama3-2.1B-4bit/tokenizer.json')

In [None]:
!pip install -q peft transformers accelerate bitsandbytes datasets evaluate
!pip install -U datasets

from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    DataCollatorWithPadding
)
import torch
from torch.utils.data import DataLoader
from tqdm import tqdm
import numpy as np
from sklearn.metrics import accuracy_score
from datasets import load_dataset

# 1. 載入數據集
dataset = load_dataset("pubmed_qa", "pqa_labeled")["train"]
dataset = dataset.train_test_split(test_size=0.2, seed=42)
train_val_dataset = dataset["train"]
test_dataset = dataset["test"]
train_val_split = train_val_dataset.train_test_split(test_size=0.125, seed=42)
train_dataset = train_val_split["train"]
val_dataset = train_val_split["test"]

# 2. 設置標籤映射
label2id = {"yes": 0, "no": 1, "maybe": 2}
id2label = {v: k for k, v in label2id.items()}

# 3. 配置 4-bit 量化
#bnb_config = BitsAndBytesConfig(
#    load_in_4bit=True,
#    bnb_4bit_quant_type="nf4",
#    bnb_4bit_compute_dtype=torch.float16,
#    bnb_4bit_use_double_quant=True,
#)

# 4. 載入 tokenizer
tokenizer = AutoTokenizer.from_pretrained("/content/Llama3-2.1B-4bit")
tokenizer.pad_token = tokenizer.eos_token  # 強制指定 pad_token

model = AutoModelForSequenceClassification.from_pretrained(
    "/content/Llama3-2.1B-4bit",
    num_labels=3,
    id2label=id2label,
    label2id=label2id,
    device_map="auto"
)
tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = tokenizer.pad_token_id


# 6. 準備模型進行 k-bit 訓練
model = prepare_model_for_kbit_training(model)

# 7. 配置 LoRA 參數
peft_config = LoraConfig(
    r=8,  # LoRA 的秩
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type="SEQ_CLS",
    target_modules=["q_proj", "v_proj"],  # 針對這些模組添加適配器
)

# 8. 添加 LoRA 適配器
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

# 9. 數據預處理
def preprocess(examples):
    prompts = [
        f"Question: {q}\nContext: {c}\nAnswer:"
        for q, c in zip(examples["question"], examples["context"])
    ]
    encodings = tokenizer(
        prompts,
        truncation=True,
        padding=False,
        max_length=512,
        return_tensors=None
    )
    encodings["labels"] = [label2id[label] for label in examples["final_decision"]]
    return encodings

# 應用預處理
train_dataset = train_dataset.map(preprocess, batched=True, remove_columns=train_dataset.column_names)
val_dataset = val_dataset.map(preprocess, batched=True, remove_columns=val_dataset.column_names)
test_dataset = test_dataset.map(preprocess, batched=True, remove_columns=test_dataset.column_names)

# 10. 創建數據收集器
data_collator = DataCollatorWithPadding(
    tokenizer=tokenizer,
    padding="longest",
    max_length=512,
    pad_to_multiple_of=8,
    return_tensors="pt"
)

# 11. 創建 DataLoader
train_dataloader = DataLoader(
    train_dataset,
    batch_size=4,
    collate_fn=data_collator,
    shuffle=True
)

val_dataloader = DataLoader(
    val_dataset,
    batch_size=4,
    collate_fn=data_collator,
    shuffle=False
)

test_dataloader = DataLoader(
    test_dataset,
    batch_size=4,
    collate_fn=data_collator,
    shuffle=False
)

# 12. 設置優化器和學習率調度器
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4, weight_decay=0.01)
num_training_steps = len(train_dataloader) * 5  # epochs=5
lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=num_training_steps)

# 13. 啟用梯度檢查點 (節省記憶體)
model.gradient_checkpointing_enable()

# 14. 訓練循環
model.train()
for epoch in range(5):
    total_loss = 0
    progress_bar = tqdm(train_dataloader, desc=f"Epoch {epoch + 1}")

    for step, batch in enumerate(progress_bar):
        batch = {k: v.to(model.device) for k, v in batch.items()}

        # 使用自動混合精度
        with torch.cuda.amp.autocast():
            outputs = model(**batch)
            loss = outputs.loss

        # 反向傳播
        loss.backward()

        # 梯度裁剪
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

        # 更新參數
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()

        total_loss += loss.item()
        progress_bar.set_postfix(loss=total_loss/(step+1))

    # 每個epoch結束後評估
    model.eval()
    val_preds, val_labels = [], []

    for batch in tqdm(val_dataloader, desc="Validating"):
        batch = {k: v.to(model.device) for k, v in batch.items()}

        with torch.no_grad():
            outputs = model(**batch)

        logits = outputs.logits
        preds = torch.argmax(logits, dim=-1)
        val_preds.extend(preds.cpu().numpy())
        val_labels.extend(batch["labels"].cpu().numpy())

    val_accuracy = accuracy_score(val_labels, val_preds)
    print(f"Epoch {epoch + 1} - Validation Accuracy: {val_accuracy:.2%}")

    # 保存模型檢查點
    checkpoint_dir = f"./results/checkpoint-{epoch}"
    model.save_pretrained(checkpoint_dir)
    tokenizer.save_pretrained(checkpoint_dir)

    model.train()

# 15. 最終測試評估
model.eval()
test_preds, test_labels = [], []

for batch in tqdm(test_dataloader, desc="Testing"):
    batch = {k: v.to(model.device) for k, v in batch.items()}

    with torch.no_grad():
        outputs = model(**batch)

    logits = outputs.logits
    preds = torch.argmax(logits, dim=-1)
    test_preds.extend(preds.cpu().numpy())
    test_labels.extend(batch["labels"].cpu().numpy())

test_accuracy = accuracy_score(test_labels, test_preds)
print(f"\nFinal Test Accuracy: {test_accuracy:.2%}")
model.save_pretrained("checkpoint-4")
tokenizer.save_pretrained("checkpoint-4")

# 再上傳整個目錄
!huggingface-cli upload checkpoint-4 --repo-id vannishh/llama3-2.1B-4bit-finetuned --include "*"




Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at /content/Llama3-2.1B-4bit and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 858,112 || all params: 1,236,678,656 || trainable%: 0.0694


Map:   0%|          | 0/100 [00:00<?, ? examples/s]

  with torch.cuda.amp.autocast():
Epoch 1: 100%|██████████| 175/175 [03:20<00:00,  1.15s/it, loss=0.78]
Validating: 100%|██████████| 25/25 [00:30<00:00,  1.24s/it]


Epoch 1 - Validation Accuracy: 81.00%


  with torch.cuda.amp.autocast():
Epoch 2: 100%|██████████| 175/175 [03:19<00:00,  1.14s/it, loss=0.464]
Validating: 100%|██████████| 25/25 [00:30<00:00,  1.24s/it]


Epoch 2 - Validation Accuracy: 80.00%


  with torch.cuda.amp.autocast():
Epoch 3: 100%|██████████| 175/175 [03:18<00:00,  1.14s/it, loss=0.324]
Validating: 100%|██████████| 25/25 [00:30<00:00,  1.24s/it]


Epoch 3 - Validation Accuracy: 81.00%


  with torch.cuda.amp.autocast():
Epoch 4: 100%|██████████| 175/175 [03:19<00:00,  1.14s/it, loss=0.24]
Validating: 100%|██████████| 25/25 [00:30<00:00,  1.23s/it]


Epoch 4 - Validation Accuracy: 81.00%


  with torch.cuda.amp.autocast():
Epoch 5: 100%|██████████| 175/175 [03:17<00:00,  1.13s/it, loss=0.213]
Validating: 100%|██████████| 25/25 [00:30<00:00,  1.23s/it]


Epoch 5 - Validation Accuracy: 82.00%


Testing: 100%|██████████| 50/50 [01:00<00:00,  1.22s/it]



Final Test Accuracy: 79.50%
usage: huggingface-cli <command> [<args>]
huggingface-cli: error: unrecognized arguments: --repo-id vannishh/llama3-2.1B-4bit-finetuned


In [None]:
from huggingface_hub import login
login("")  # 用 read/write 權限
from huggingface_hub import HfApi

api = HfApi()
api.create_repo(repo_id="vannishh/llama3-2.1B-4bit-finetuned", repo_type="model")
from huggingface_hub import upload_folder

upload_folder(
    repo_id="your_username/llama3-2.1B-4bit-finetuned",
    folder_path="./results/checkpoint-4",  # 你選擇最後一個 checkpoint 上傳
    repo_type="model",
    commit_message="upload finetuned model"
)



HfHubHTTPError: 409 Client Error: Conflict for url: https://huggingface.co/api/repos/create (Request ID: Root=1-6825dc19-250ec95800f0aab37b90aa86;bcf0d4d9-ba33-47cd-ae2a-79a8115f2916)

You already created this model repo

In [None]:
from huggingface_hub import HfApi

api = HfApi()

api.upload_folder(
    folder_path="./results/checkpoint-4",  # 本機模型目錄
    repo_id="vannishh/llama3-2.1B-4bit-finetuned",  # 你已建立的 repo
    repo_type="model",
    commit_message="Upload checkpoint-4"
)


ValueError: Invalid metadata in README.md.
- "base_model" with value "/content/Llama3-2.1B-4bit" is not valid. Use a model id from https://hf.co/models.

In [None]:
readme = '''---
license: apache-2.0
base_model: meta-llama/Llama-3.2-1B
tags:
  - pubmedqa
  - llama3
  - qlora
  - sequence-classification
  - 4bit
  - peft
---

# LLaMA3-2.1B QLoRA fine-tuned on PubMedQA

This model is a 4-bit quantized, QLoRA fine-tuned version of `meta-llama/Llama-3.2-1B`, trained on the PubMedQA dataset for medical question classification (`yes`, `no`, `maybe`). It was optimized using PEFT with LoRA adapters, and is designed for efficient inference on resource-constrained hardware.

## Training Details
- **Base model**: `meta-llama/Llama-3.2-1B`
- **Dataset**: `pubmed_qa/pqa_labeled`
- **Method**: QLoRA (4-bit NF4)
- **LoRA target modules**: `q_proj`, `v_proj`
- **Epochs**: 5
- **Batch size**: 4
'''

with open("/content/results/checkpoint-4/README.md", "w") as f:
    f.write(readme)


In [None]:
from huggingface_hub import HfApi

api = HfApi()

api.upload_folder(
    folder_path="./results/checkpoint-4",
    repo_id="vannishh/llama3-2.1B-4bit-finetuned",
    repo_type="model",
    commit_message="Upload QLoRA fine-tuned model"
)


Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/3.44M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/vannishh/llama3-2.1B-4bit-finetuned/commit/7f91d8cc29d2bafe11d3f9a581d21dea2f84eadf', commit_message='Upload QLoRA fine-tuned model', commit_description='', oid='7f91d8cc29d2bafe11d3f9a581d21dea2f84eadf', pr_url=None, repo_url=RepoUrl('https://huggingface.co/vannishh/llama3-2.1B-4bit-finetuned', endpoint='https://huggingface.co', repo_type='model', repo_id='vannishh/llama3-2.1B-4bit-finetuned'), pr_revision=None, pr_num=None)

In [None]:
# ✅ 1. 安裝必要套件
!pip install -q transformers datasets accelerate

# ✅ 2. 載入 tokenizer 與 model
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
from torch.nn.functional import softmax

model_id = "vannishh/llama3-2.1B-4bit-finetuned"

# 必須指定 label 對應，否則會報錯（3分類）
label2id = {"yes": 0, "no": 1, "maybe": 2}
id2label = {v: k for k, v in label2id.items()}

tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=False)
model = AutoModelForSequenceClassification.from_pretrained(
    model_id,
    num_labels=3,
    id2label=id2label,
    label2id=label2id,
    device_map="auto"
)
model.eval()
print("模型與 tokenizer 載入成功")



Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at /content/Llama3-2.1B-4bit and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


模型與 tokenizer 載入成功
