In [10]:
!nvidia-smi
!pip install datasets evaluate torch accelerate tqdm

Mon Sep 15 08:07:24 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 560.35.03              Driver Version: 560.35.03      CUDA Version: 12.6     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla P100-PCIE-16GB           Off |   00000000:00:04.0 Off |                    0 |
| N/A   36C    P0             27W /  250W |       3MiB /  16384MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [11]:
#!/usr/bin/env python
# Copyright The HuggingFace Team and The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
Fine-tuning a 🤗 Transformers model on multiple choice relying on the accelerate library without using a Trainer.
"""
# You can also adapt this script on your own multiple choice task. Pointers for this are left as comments.

import argparse
import json
import math
import os
import random
from itertools import chain
from pathlib import Path
from types import SimpleNamespace

import datasets
import evaluate
import torch
from accelerate import Accelerator
from accelerate.utils import set_seed
from datasets import load_dataset
from torch.utils.data import DataLoader
from tqdm.auto import tqdm

import transformers
from transformers import (
    CONFIG_MAPPING,
    MODEL_MAPPING,
    AutoConfig,
    AutoModelForMultipleChoice,
    AutoTokenizer,
    DataCollatorForMultipleChoice,
    SchedulerType,
    default_data_collator,
    get_scheduler,
)
from transformers.utils import check_min_version, send_example_telemetry

In [12]:
# Run configuration: data locations, tokenization limits, model checkpoint and
# optimisation hyper-parameters. Kept in one mapping and exposed as attributes.
run_config = {
    "train_file": "/kaggle/input/ntu-adl-2025-hw-1/train.json",
    "validation_file": "/kaggle/input/ntu-adl-2025-hw-1/valid.json",
    "context_file": "/kaggle/input/ntu-adl-2025-hw-1/context.json",
    "max_seq_length": 512,
    "pad_to_max_length": False,
    "model_name_or_path": "bert-base-chinese",
    "per_device_train_batch_size": 1,
    "per_device_eval_batch_size": 1,
    "learning_rate": 3e-5,
    "num_train_epochs": 1,
    "max_train_steps": None,
    "gradient_accumulation_steps": 2,
    # choices=["linear", "cosine", "cosine_with_restarts", "polynomial", "constant", "constant_with_warmup"]
    "lr_scheduler_type": SchedulerType.LINEAR,
    "output_dir": "/kaggle/working/",
    "seed": 1234,
}
args = SimpleNamespace(**run_config)

print(args)

namespace(train_file='/kaggle/input/ntu-adl-2025-hw-1/train.json', validation_file='/kaggle/input/ntu-adl-2025-hw-1/valid.json', test_file='/kaggle/input/ntu-adl-2025-hw-1/test.json', context_file='/kaggle/input/ntu-adl-2025-hw-1/context.json', max_seq_length=512, pad_to_max_length=False, model_name_or_path='bert-base-chinese', per_device_train_batch_size=1, per_device_eval_batch_size=1, learning_rate=3e-05, num_train_epochs=1, max_train_steps=None, gradient_accumulation_steps=2, lr_scheduler_type=<SchedulerType.LINEAR: 'linear'>, output_dir='/kaggle/working/', seed=1234)


In [13]:
# Create the Accelerator, telling it how many batches to accumulate per update.
accelerator = Accelerator(gradient_accumulation_steps=args.gradient_accumulation_steps)

# Apply the configured seed, if one was given.
if args.seed is not None:
    set_seed(args.seed)

# Only the main process creates the output directory; all processes then
# wait on each other before continuing.
if accelerator.is_main_process and args.output_dir is not None:
    os.makedirs(args.output_dir, exist_ok=True)
accelerator.wait_for_everyone()

In [14]:
# Read the context file named in the config. Its entries are indexed by
# paragraph id when the question files are loaded.
contexts = json.loads(Path(args.context_file).read_text(encoding="utf-8"))

def load_paragraph_selection(file_path, contexts):
    """Load a paragraph-selection JSON file into a ``datasets.Dataset``.

    Every record in the file is read for its ``"id"``, ``"question"``,
    ``"paragraphs"`` (ids used to index ``contexts``) and ``"relevant"`` fields.

    Args:
        file_path: Path of the UTF-8 JSON file holding the records.
        contexts: Indexable collection giving the paragraph text for each
            paragraph id.

    Returns:
        A dataset with the columns ``id``, ``question``, ``paragraphs`` (the
        candidate paragraph texts) and ``label`` (position of the relevant
        paragraph within that record's candidate list).

    Raises:
        ValueError: If a record's ``"relevant"`` id is not among its
            ``"paragraphs"`` ids (raised by ``list.index``).
    """
    with open(file_path, "r", encoding="utf-8") as f:
        records = json.load(f)

    ids, questions, paragraph_texts, labels = [], [], [], []
    for record in records:
        qid, question = record["id"], record["question"]
        candidate_ids, relevant_id = record["paragraphs"], record["relevant"]

        texts = [contexts[pid] for pid in candidate_ids]
        # Index of the correct paragraph inside this record's candidate list.
        position = candidate_ids.index(relevant_id)

        ids.append(qid)
        questions.append(question)
        paragraph_texts.append(texts)
        labels.append(position)

    columns = {
        "id": ids,
        "question": questions,
        "paragraphs": paragraph_texts,
        "label": labels,
    }
    return datasets.Dataset.from_dict(columns)

# Build one Dataset per configured split file; a split whose path is None is skipped.
split_files = {"train": args.train_file, "validation": args.validation_file}
dataset_splits = {
    split: load_paragraph_selection(path, contexts)
    for split, path in split_files.items()
    if path is not None
}

raw_datasets = datasets.DatasetDict(dataset_splits)

# Print the split summary and the first training example as a sanity check.
print(raw_datasets)
print(raw_datasets["train"][0])


DatasetDict({
    train: Dataset({
        features: ['id', 'question', 'paragraphs', 'label'],
        num_rows: 21714
    })
    validation: Dataset({
        features: ['id', 'question', 'paragraphs', 'label'],
        num_rows: 3009
    })
})
{'id': '593f14f960d971e294af884f0194b3a7', 'question': 'ËàçÊú¨ÂíåË™∞ÁöÑÊï∏ÊìöËÉΩÊé®ÁÆóÂá∫ÈÄ£ÊòüÁöÑÊÅÜÊòüÁöÑË≥™ÈáèÔºü', 'paragraphs': ['1930Âπ¥ÔºåÂç∞Â∫¶Áâ©ÁêÜÂ≠∏ÂÆ∂ËòáÂ∏ÉÊãâÈ¶¨Â∞ºÊèö¬∑Èå¢Âæ∑ÊãâÂ°ûÂç°Ê†πÊìöÂª£Áæ©Áõ∏Â∞çË´ñË®àÁÆóÂá∫Ë≥™ÈáèÂ§ßÊñº1.4ÂÄçÂ§™ÈôΩË≥™ÈáèÁöÑÈùûËΩâÂãïÊòüÈ´îÊúÉÂõ†ÈáçÂäõÂ°åÁ∏ÆÊàêÁÇ∫ÈõªÂ≠êÁ∞°‰ΩµÊÖã„ÄÇÊÑõ‰∏ÅÈ†ìÈõñÁÑ∂Âú®ÁêÜË´ñ‰∏äÊîØÊåÅÈªëÊ¥ûÂ≠òÂú®ÁöÑÂèØËÉΩÊÄßÔºå‰ΩÜÂêåÊôÇË™çÁÇ∫Èå¢Âæ∑ÊãâÂ°ûÂç°ÁöÑËßÄÈªû‰∫ãÂØ¶‰∏ä‰∏çËÉΩÊàêÁ´ãÔºå‰ªñË™çÁÇ∫„ÄåÊáâÁï∂ÊúâÊüêÁ®ÆËá™ÁÑ∂ÂÆöÂæãÈòªÊ≠¢ÊÅÜÊòüÂá∫ÁèæÈÄôÁ®ÆËçíÂîêÁöÑË°åÁÇ∫„Äç„ÄÇÁï∂ÊôÇÁöÑÁâ©ÁêÜÂ≠∏ÂÆ∂Â¶ÇÊ≥¢ËÄ≥„ÄÅ‰∫®Âà©¬∑ÁæÖÁ¥†Á≠â‰∫∫ÈÉΩË¥äÂêåÈå¢Âæ∑ÊãâÂ°ûÂç°ÁöÑÁêÜË´ñÔºå‰ΩÜÂá∫ÊñºÊÑõ‰∏ÅÈ†ìËÅ≤ÊúõÁöÑÂéüÂõ†Ôºå‰ªñÂÄë‰∏¶Ê≤íÊúâÂÖ¨ÈñãÂ∞çÈå¢Âæ∑ÊãâÂ°ûÂç°Ë°®Á§∫ÊîØÊåÅ„ÄÇ‰∏çÈÅéÂæûÊüêÁ®ÆÊÑèÁæ©‰∏äË™™ÔºåÊÑõ‰∏ÅÈ†ì‰πüÊòØÊ≠£Á

In [16]:
# 1. Load the config
config = AutoConfig.from_pretrained(args.model_name_or_path)

# 2. Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path)

# 3. Load the model; `from_tf` is set only when ".ckpt" appears in the name/path
model = AutoModelForMultipleChoice.from_pretrained(
    args.model_name_or_path,
    from_tf=".ckpt" in args.model_name_or_path,
)

# 4. Resize the embeddings when the tokenizer has more entries than the model
#    has embedding rows (avoids index errors from newly added vocabulary)
embedding_size = model.get_input_embeddings().weight.shape[0]
if embedding_size < len(tokenizer):
    model.resize_token_embeddings(len(tokenizer))

# 5. Padding mode passed to the tokenizer during preprocessing
if args.pad_to_max_length:
    padding = "max_length"
else:
    padding = False

config.json:   0%|          | 0.00/624 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/110k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/269k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/412M [00:00<?, ?B/s]

Some weights of BertForMultipleChoice were not initialized from the model checkpoint at bert-base-chinese and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [17]:
# --- Preprocessing ---
def preprocess_function(examples):
    """Tokenize a batch of paragraph-selection examples for multiple choice.

    Each question is paired with every one of its candidate paragraphs, the
    flat list of pairs is tokenized in one call, and the tokenizer outputs are
    regrouped so that every example owns one entry per candidate.

    The number of candidates is taken from each example's ``paragraphs`` list
    rather than assumed to be four, so question/paragraph pairs stay aligned
    even if an example carries a different number of candidates.

    Relies on the notebook-level ``tokenizer``, ``args.max_seq_length`` and
    ``padding``.

    Args:
        examples: Batch mapping with the columns ``"question"`` (one string
            per example), ``"paragraphs"`` (one list of candidate texts per
            example) and ``"label"`` (index of the correct candidate).

    Returns:
        dict: Every key produced by the tokenizer, regrouped as
        ``[example][candidate]``, plus ``"labels"`` holding the label of each
        example.
    """
    questions = examples["question"]
    paragraphs_list = examples["paragraphs"]
    labels = examples["label"]

    first_sentences = []
    second_sentences = []
    new_labels = []
    choice_counts = []

    for q, paras, label in zip(questions, paragraphs_list, labels):
        num_choices = len(paras)
        # Repeat the question once per candidate paragraph.
        first_sentences.extend([q] * num_choices)
        # Candidate paragraph texts, in their original order.
        second_sentences.extend(paras)
        # The label is kept as-is (an index into this example's candidates).
        new_labels.append(label)
        choice_counts.append(num_choices)

    # Tokenize all (question, paragraph) pairs at once.
    tokenized_examples = tokenizer(
        first_sentences,
        second_sentences,
        max_length=args.max_seq_length,
        padding=padding,
        truncation=True,
    )

    # Start/end offsets of each example's slice inside the flat pair list.
    offsets = [0]
    for count in choice_counts:
        offsets.append(offsets[-1] + count)

    # Un-flatten -> [batch_size, num_choices, seq_len]
    tokenized_inputs = {
        k: [v[start:end] for start, end in zip(offsets, offsets[1:])]
        for k, v in tokenized_examples.items()
    }
    tokenized_inputs["labels"] = new_labels
    return tokenized_inputs


# Apply the preprocessing to every split in batches, dropping the raw columns
# so only the tokenized fields and labels remain.
raw_column_names = raw_datasets["train"].column_names
with accelerator.main_process_first():
    processed_datasets = raw_datasets.map(
        preprocess_function,
        batched=True,
        remove_columns=raw_column_names,
    )

train_dataset, eval_dataset = processed_datasets["train"], processed_datasets["validation"]

# --- DataLoaders ---
if args.pad_to_max_length:
    # Inputs were already padded to max length during tokenization.
    data_collator = default_data_collator
else:
    # Pad per batch. The padding multiple depends on the mixed-precision mode:
    # 16 for "fp8", none for "no", 8 for anything else.
    pad_to_multiple_of = {"fp8": 16, "no": None}.get(accelerator.mixed_precision, 8)
    data_collator = DataCollatorForMultipleChoice(
        tokenizer,
        pad_to_multiple_of=pad_to_multiple_of,
        return_tensors="pt",
    )

# Training batches are shuffled; evaluation batches keep dataset order.
train_dataloader = DataLoader(
    train_dataset,
    batch_size=args.per_device_train_batch_size,
    shuffle=True,
    collate_fn=data_collator,
)
eval_dataloader = DataLoader(
    eval_dataset,
    batch_size=args.per_device_eval_batch_size,
    collate_fn=data_collator,
)

# --- Optimizer ---
# Parameters whose name contains one of these substrings are exempt from weight decay.
no_decay = ["bias", "LayerNorm.weight"]
# Decay applied to all other parameters. Previously both groups hard-coded 0.0,
# which made the split below a no-op. `args` may now carry an optional
# `weight_decay`; when it is absent this falls back to 0.0, i.e. the old behaviour.
weight_decay = getattr(args, "weight_decay", 0.0)
optimizer_grouped_parameters = [
    {
        # Parameters subject to weight decay.
        "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
        "weight_decay": weight_decay,
    },
    {
        # Biases and LayerNorm weights: never decayed.
        "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
        "weight_decay": 0.0,
    },
]
optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=args.learning_rate)

# --- Accelerator ---
# Move the model onto the accelerator's device.
device = accelerator.device
model.to(device)

# Scheduler
# Optimizer updates per epoch: one per `gradient_accumulation_steps` batches, rounded up.
num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
# When no explicit step budget was configured, derive it from the epoch count
# and remember that it was derived.
overrode_max_train_steps = args.max_train_steps is None
if overrode_max_train_steps:
    args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch

# A derived budget is used as-is; a user-supplied one is scaled by the process count.
if overrode_max_train_steps:
    scheduler_total_steps = args.max_train_steps
else:
    scheduler_total_steps = args.max_train_steps * accelerator.num_processes

lr_scheduler = get_scheduler(
    name=args.lr_scheduler_type,
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=scheduler_total_steps,
)

# Accelerator prepare: hand every training object to the accelerator and keep
# the returned versions under the same names.
prepared = accelerator.prepare(model, optimizer, train_dataloader, eval_dataloader, lr_scheduler)
model, optimizer, train_dataloader, eval_dataloader, lr_scheduler = prepared

# Recompute the step and epoch counts, since the prepared dataloader's length is re-read here.
num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
if overrode_max_train_steps:
    args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)

# --- Metric ---
metric = evaluate.load("accuracy")

# Total batch size: per-device batch x number of processes x accumulation steps.
total_batch_size = (
    args.per_device_train_batch_size
    * accelerator.num_processes
    * args.gradient_accumulation_steps
)


Map:   0%|          | 0/21714 [00:00<?, ? examples/s]

Map:   0%|          | 0/3009 [00:00<?, ? examples/s]

Downloading builder script: 0.00B [00:00, ?B/s]

In [18]:
# ===== Training Setup =====
# Summary of the run, printed one line at a time through the accelerator.
training_summary = [
    "***** Running training *****",
    f"  Num examples = {len(train_dataset)}",
    f"  Num Epochs = {args.num_train_epochs}",
    f"  Instantaneous batch size per device = {args.per_device_train_batch_size}",
    f"  Total train batch size (parallel/distributed/accumulation) = {total_batch_size}",
    f"  Gradient Accumulation steps = {args.gradient_accumulation_steps}",
    f"  Total optimization steps = {args.max_train_steps}",
]
for summary_line in training_summary:
    accelerator.print(summary_line)

# The progress bar is shown on the local main process only.
progress_bar = tqdm(range(args.max_train_steps), disable=not accelerator.is_local_main_process)
completed_steps = 0
starting_epoch = 0
# Advance the bar by the steps already completed (relevant when resuming from a checkpoint).
progress_bar.update(completed_steps)

# ===== Training Loop =====
for epoch in range(starting_epoch, args.num_train_epochs):
    model.train()
    for batch in train_dataloader:
        # Forward/backward and the optimizer/scheduler steps all run inside the
        # accelerator's accumulation context.
        with accelerator.accumulate(model):
            loss = model(**batch).loss
            accelerator.backward(loss)
            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()

        # Count an optimization step only when the accelerator reports that
        # gradients were synchronized for this batch.
        if accelerator.sync_gradients:
            completed_steps += 1
            progress_bar.update(1)

        # Stop the epoch early once the step budget is used up.
        if completed_steps >= args.max_train_steps:
            break

    # ===== Evaluation =====
    model.eval()
    for batch in eval_dataloader:
        with torch.no_grad():
            logits = model(**batch).logits
        # Predicted choice = arg-max over the last logits dimension.
        predictions, references = accelerator.gather_for_metrics(
            (logits.argmax(dim=-1), batch["labels"])
        )
        metric.add_batch(predictions=predictions, references=references)

    eval_metric = metric.compute()
    accelerator.print(f"Epoch {epoch}: {eval_metric}")

# ===== Save Model & Results =====
if args.output_dir is not None:
    # Let every process finish before the model is written out.
    accelerator.wait_for_everyone()
    unwrapped_model = accelerator.unwrap_model(model)
    unwrapped_model.save_pretrained(
        args.output_dir,
        is_main_process=accelerator.is_main_process,
        save_function=accelerator.save,
    )
    if accelerator.is_main_process:
        # The tokenizer and the metrics from the last evaluation are written
        # by the main process only.
        tokenizer.save_pretrained(args.output_dir)
        results_path = os.path.join(args.output_dir, "all_results.json")
        with open(results_path, "w") as f:
            prefixed_metrics = {f"eval_{k}": v for k, v in eval_metric.items()}
            json.dump(prefixed_metrics, f)

accelerator.wait_for_everyone()
accelerator.end_training()

***** Running training *****
  Num examples = 21714
  Num Epochs = 1
  Instantaneous batch size per device = 1
  Total train batch size (parallel/distributed/accumulation) = 2
  Gradient Accumulation steps = 2
  Total optimization steps = 10857


  0%|          | 0/10857 [00:00<?, ?it/s]

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch 0: {'accuracy': 0.9521435692921236}


In [19]:
# Recursively zip everything matched by /kaggle/working/* into /kaggle/working/working.zip.
!zip -r /kaggle/working/working.zip /kaggle/working/*

  adding: kaggle/working/all_results.json (stored 0%)
  adding: kaggle/working/config.json (deflated 54%)
  adding: kaggle/working/model.safetensors

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


 (deflated 7%)
  adding: kaggle/working/special_tokens_map.json (deflated 42%)
  adding: kaggle/working/tokenizer_config.json (deflated 75%)
  adding: kaggle/working/tokenizer.json (deflated 75%)
  adding: kaggle/working/vocab.txt (deflated 48%)
