In [4]:
!nvidia-smi
!pip install datasets torch accelerate tqdm

Wed Sep 17 13:09:48 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   45C    P8              9W /   70W |       2MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [5]:
import json
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from transformers import AutoTokenizer
from tqdm import tqdm
import datasets
from accelerate import Accelerator

# ==== Paths ====
train_file = "/content/train.json"
validation_file = "/content/valid.json"
context_file = "/content/context.json"

# ==== Reproducibility ====
# Weight initialisation, DataLoader shuffling and dropout all draw from torch's
# random generators; seeding them makes a fresh Restart & Run All repeatable
# (up to any non-deterministic GPU kernels).
seed = 42
torch.manual_seed(seed)

# ==== Accelerator ====
accelerator = Accelerator()

# ==== Hyperparameters ====
hidden_size = 128   # transformer width (d_model)
num_layers = 2      # number of encoder layers
num_heads = 4       # attention heads per layer
dropout = 0.1
num_choices = 4     # candidate paragraphs per question
max_len = 128       # tokenized sequence length, also the size of the position table
lr = 3e-4
num_epochs = 3
batch_size = 16
device = accelerator.device

# ==== 小型 Transformer 模型 (for Multiple Choice) ====
class SmallTransformerForMC(nn.Module):
    """Small Transformer encoder that scores every choice of a multiple-choice example.

    Each choice is encoded independently; the hidden state at position 0 of the
    sequence (the [CLS] token when inputs come from a BERT-style tokenizer) is
    projected to a single score, and the scores of one example are returned
    together as a row of logits.

    Args:
        vocab_size: Number of rows in the token embedding table.
        hidden_size: Model width (d_model); the feed-forward width is 4x this value.
        num_layers: Number of stacked encoder layers.
        num_heads: Attention heads per encoder layer.
        num_choices: Accepted for interface compatibility; the number of choices
            is actually read from the shape of ``input_ids`` in ``forward``.
        max_len: Size of the learned position table, i.e. the longest sequence
            ``forward`` accepts.
        dropout: Dropout probability used inside the encoder layers.
    """

    def __init__(self, vocab_size, hidden_size, num_layers, num_heads, num_choices, max_len, dropout=0.1):
        super().__init__()
        # Remembered so forward() can reject sequences the position table cannot index.
        self.max_len = max_len
        self.embedding = nn.Embedding(vocab_size, hidden_size)
        self.pos_embedding = nn.Embedding(max_len, hidden_size)

        encoder_layer = nn.TransformerEncoderLayer(
            d_model=hidden_size,
            nhead=num_heads,
            dim_feedforward=hidden_size * 4,
            dropout=dropout,
            activation="gelu",
            batch_first=True
        )
        self.encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
        self.classifier = nn.Linear(hidden_size, 1)  # one score per choice

    def forward(self, input_ids, attention_mask=None):
        """Score every choice of every example.

        Args:
            input_ids: Integer tensor of shape (batch, num_choices, seq_len).
            attention_mask: Optional tensor of the same shape as ``input_ids``;
                positions equal to 0 are treated as padding and ignored by attention.

        Returns:
            Tensor of shape (batch, num_choices) holding one logit per choice.

        Raises:
            ValueError: If ``input_ids`` is not 3-D, if ``seq_len`` exceeds
                ``max_len``, or if ``attention_mask`` has a different shape.
        """
        if input_ids.dim() != 3:
            raise ValueError(
                f"input_ids must have shape (batch, num_choices, seq_len), got {tuple(input_ids.shape)}"
            )
        bsz, num_choices, seq_len = input_ids.shape
        if seq_len > self.max_len:
            # Fail early with a readable message instead of an embedding index error
            # (which surfaces as a device-side assert on CUDA).
            raise ValueError(f"seq_len {seq_len} exceeds max_len {self.max_len}")
        if attention_mask is not None and attention_mask.shape != input_ids.shape:
            raise ValueError(
                f"attention_mask shape {tuple(attention_mask.shape)} does not match "
                f"input_ids shape {tuple(input_ids.shape)}"
            )

        # Flatten to (batch * num_choices, seq_len); reshape (unlike view) also
        # accepts non-contiguous inputs.
        flat_ids = input_ids.reshape(-1, seq_len)
        padding_mask = None
        if attention_mask is not None:
            # The encoder expects True at positions that must be ignored.
            padding_mask = attention_mask.reshape(-1, seq_len) == 0

        # Token embeddings plus learned position embeddings (broadcast over rows).
        positions = torch.arange(seq_len, device=flat_ids.device)
        x = self.embedding(flat_ids) + self.pos_embedding(positions)

        x = self.encoder(x, src_key_padding_mask=padding_mask)

        # Hidden state of the first token summarises the sequence.
        cls_repr = x[:, 0, :]  # (batch * num_choices, hidden_size)

        logits = self.classifier(cls_repr)  # (batch * num_choices, 1)
        return logits.reshape(bsz, num_choices)  # (batch, num_choices)

# ==== Tokenizer setup ====
# Padding strategy handed to the tokenizer calls made later in the notebook.
padding = "max_length"
tokenizer = AutoTokenizer.from_pretrained("bert-base-chinese")

In [6]:
# --- Preprocessing ---
def load_paragraph_selection(file_path, contexts):
    """Read a paragraph-selection JSON file into a `datasets.Dataset`.

    Every record in the file supplies an "id", a "question", a "paragraphs"
    list of keys into `contexts`, and a "relevant" key naming the correct
    paragraph.

    Args:
        file_path: Path of the JSON file to read (opened as UTF-8).
        contexts: Container indexed by paragraph id that yields the paragraph text.

    Returns:
        A Dataset with the columns "id", "question", "paragraphs" (the
        looked-up paragraph texts) and "label" (position of the relevant
        paragraph inside the record's "paragraphs" list).
    """
    with open(file_path, "r", encoding="utf-8") as handle:
        records = json.load(handle)

    ids, questions, paragraph_texts, labels = [], [], [], []
    for record in records:
        record_id = record["id"]
        question_text = record["question"]
        candidate_ids = record["paragraphs"]
        answer_id = record["relevant"]

        # Resolve paragraph ids to text and find where the correct one sits.
        texts = [contexts[candidate] for candidate in candidate_ids]
        answer_position = candidate_ids.index(answer_id)

        ids.append(record_id)
        questions.append(question_text)
        paragraph_texts.append(texts)
        labels.append(answer_position)

    return datasets.Dataset.from_dict(
        {
            "id": ids,
            "question": questions,
            "paragraphs": paragraph_texts,
            "label": labels,
        }
    )

# ==== Load data ====
with open(context_file, "r", encoding="utf-8") as f:
    contexts = json.load(f)

# Build only the splits whose file path is configured.
split_files = {"train": train_file, "validation": validation_file}
dataset_splits = {
    split_name: load_paragraph_selection(split_path, contexts)
    for split_name, split_path in split_files.items()
    if split_path is not None
}

raw_datasets = datasets.DatasetDict(dataset_splits)

print(raw_datasets)
print(raw_datasets["train"][0])

# ==== Preprocess ====
def preprocess_function(examples):
    """Tokenize a batch of (question, candidate paragraphs) examples.

    Each question is paired with each of its `num_choices` paragraphs, all
    pairs are tokenized in one call, and the flat tokenizer output is regrouped
    so every example owns `num_choices` consecutive sequences.

    Args:
        examples: Batched mapping with the columns "question", "paragraphs"
            and "label".

    Returns:
        Dict with one entry per tokenizer output field, each a list of
        per-example groups of `num_choices` sequences, plus "labels".

    Raises:
        ValueError: If an example does not have exactly `num_choices`
            paragraphs; regrouping would otherwise silently misalign every
            following example.
    """
    questions = examples["question"]
    paragraphs_list = examples["paragraphs"]
    labels = examples["label"]

    first_sentences = []
    second_sentences = []
    for q, paras in zip(questions, paragraphs_list):
        if len(paras) != num_choices:
            raise ValueError(
                f"expected {num_choices} paragraphs per question, got {len(paras)}"
            )
        first_sentences.extend([q] * num_choices)  # question repeated once per choice
        second_sentences.extend(paras)             # the candidate paragraphs

    # Tokenize all (question, paragraph) pairs at once.
    tokenized_examples = tokenizer(
        first_sentences,
        second_sentences,
        max_length=max_len,
        padding=padding,
        truncation=True,
    )

    # Un-flatten -> [batch_size, num_choices, seq_len]
    tokenized_inputs = {
        k: [v[i:i + num_choices] for i in range(0, len(v), num_choices)]
        for k, v in tokenized_examples.items()
    }
    tokenized_inputs["labels"] = list(labels)
    return tokenized_inputs

# Drop the raw text columns so only the tokenized fields and labels remain.
raw_column_names = raw_datasets["train"].column_names

# Run the batched tokenization inside accelerator.main_process_first().
with accelerator.main_process_first():
    processed_datasets = raw_datasets.map(
        preprocess_function,
        batched=True,
        remove_columns=raw_column_names,
    )

# ==== DataLoader ====
def collator(features):
    """Stack a list of per-example feature dicts into a batch of tensors.

    Args:
        features: Sequence of dicts, each with "input_ids", "attention_mask"
            and "labels" entries.

    Returns:
        Dict with the same three keys, each mapped to a tensor built from the
        corresponding values of all examples (example index first).
    """
    batch = {}
    for field in ("input_ids", "attention_mask", "labels"):
        batch[field] = torch.tensor([example[field] for example in features])
    return batch

# Training batches are reshuffled every epoch; validation keeps dataset order.
train_loader = DataLoader(
    processed_datasets["train"],
    batch_size=batch_size,
    shuffle=True,
    collate_fn=collator,
)
val_loader = DataLoader(
    processed_datasets["validation"],
    batch_size=batch_size,
    shuffle=False,
    collate_fn=collator,
)


DatasetDict({
    train: Dataset({
        features: ['id', 'question', 'paragraphs', 'label'],
        num_rows: 21714
    })
    validation: Dataset({
        features: ['id', 'question', 'paragraphs', 'label'],
        num_rows: 3009
    })
})
{'id': '593f14f960d971e294af884f0194b3a7', 'question': '舍本和誰的數據能推算出連星的恆星的質量？', 'paragraphs': ['1930年，印度物理學家蘇布拉馬尼揚·錢德拉塞卡根據廣義相對論計算出質量大於1.4倍太陽質量的非轉動星體會因重力塌縮成為電子簡併態。愛丁頓雖然在理論上支持黑洞存在的可能性，但同時認為錢德拉塞卡的觀點事實上不能成立，他認為「應當有某種自然定律阻止恆星出現這種荒唐的行為」。當時的物理學家如波耳、亨利·羅素等人都贊同錢德拉塞卡的理論，但出於愛丁頓聲望的原因，他們並沒有公開對錢德拉塞卡表示支持。不過從某種意義上說，愛丁頓也是正確的，當恆星質量大於錢德拉塞卡極限後，確實仍然會有中子簡併壓力阻止恆星繼續塌縮。到了1939年，美國物理學家羅伯特·歐本海默等人推算了這種情形的恆星質量上限，這個極限被稱作托爾曼-歐本海默-沃爾科夫極限。當今的天體物理學家普遍認為，除非有如未知的夸克簡併壓力一類因素的存在，質量大於托爾曼-歐本海默-沃爾科夫極限的恆星將最終會塌縮為錢德拉塞卡所預言的黑洞。即使如此，史瓦西解作為當時能夠描述黑洞行為的唯一精確解，由於具有一些讓人看似不大優美的性質以及難以與實驗觀測相聯繫，一直沒有進入主流物理學研究的視野，關於黑洞的理論乃至整個廣義相對論領域的研究由此擱置了二十年之久。', '心理學是否為自然科學的範圍，目前也尚存爭議，一般較廣為接受的說法是心理學同時包含在自然科學與社會科學的範疇之中。自然科學的根本目的在於尋找隱藏在自然現象背後的規律，但是自然科學的工作尚不包括研究為什麼會存在這些規律。自然科學認為超自然的、隨意的和自相矛盾的現象是不存在的。自然科學的最重要的兩個支柱是觀察和邏輯推理。

Map:   0%|          | 0/21714 [00:00<?, ? examples/s]

Map:   0%|          | 0/3009 [00:00<?, ? examples/s]

In [7]:
# ==== Model, loss and optimizer ====
# The embedding table is sized from the tokenizer's vocabulary.
model = SmallTransformerForMC(
    vocab_size=len(tokenizer),
    hidden_size=hidden_size,
    num_layers=num_layers,
    num_heads=num_heads,
    num_choices=num_choices,
    max_len=max_len,
    dropout=dropout,
)
model = model.to(device)

# Cross-entropy over the per-choice logits; AdamW updates every model parameter.
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=lr)

# ==== Training Loop ====
# Per-epoch metrics; each list gains one entry per epoch.
history = {"train_loss": [], "val_loss": [], "val_acc": []}

for epoch in range(num_epochs):
    model.train()
    train_loss = 0.0
    train_total = 0
    for batch in tqdm(train_loader, desc=f"Training Epoch {epoch+1}"):
        # Move each tensor to the device once per batch.
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        optimizer.zero_grad()
        logits = model(input_ids, attention_mask)
        loss = criterion(logits, labels)
        loss.backward()
        optimizer.step()

        # criterion returns the batch mean; weight it by the batch size so the
        # short final batch does not count as much as a full one.
        n_examples = labels.size(0)
        train_loss += loss.item() * n_examples
        train_total += n_examples

    avg_train_loss = train_loss / max(train_total, 1)

    # ==== Validation ====
    model.eval()
    val_loss = 0.0
    correct = 0
    total = 0
    with torch.no_grad():
        for batch in tqdm(val_loader, desc=f"Validation Epoch {epoch+1}"):
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            logits = model(input_ids, attention_mask)
            loss = criterion(logits, labels)

            n_examples = labels.size(0)
            val_loss += loss.item() * n_examples

            # Highest-scoring choice is the prediction.
            preds = logits.argmax(dim=-1)
            correct += (preds == labels).sum().item()
            total += n_examples

    # Example-weighted averages over the whole validation set.
    avg_val_loss = val_loss / max(total, 1)
    val_acc = correct / max(total, 1)

    history["train_loss"].append(avg_train_loss)
    history["val_loss"].append(avg_val_loss)
    history["val_acc"].append(val_acc)

    print(f"Epoch {epoch+1} | Train Loss: {avg_train_loss:.4f} | "
          f"Val Loss: {avg_val_loss:.4f} | Val Acc: {val_acc:.4f}")

Training Epoch 1: 100%|██████████| 1358/1358 [00:51<00:00, 26.40it/s]
  output = torch._nested_tensor_from_mask(
Validation Epoch 1: 100%|██████████| 189/189 [00:04<00:00, 45.28it/s]


Epoch 1 | Train Loss: 1.0481 | Val Loss: 0.9895 | Val Acc: 0.4879


Training Epoch 2: 100%|██████████| 1358/1358 [00:43<00:00, 31.44it/s]
Validation Epoch 2: 100%|██████████| 189/189 [00:03<00:00, 53.76it/s]


Epoch 2 | Train Loss: 0.9292 | Val Loss: 0.9650 | Val Acc: 0.5045


Training Epoch 3: 100%|██████████| 1358/1358 [00:43<00:00, 31.13it/s]
Validation Epoch 3: 100%|██████████| 189/189 [00:03<00:00, 52.76it/s]

Epoch 3 | Train Loss: 0.8653 | Val Loss: 0.9819 | Val Acc: 0.5002



