In [1]:
import json

with open("/content/drive/MyDrive/GAI_HW2_V2/train.json", "r") as fin, open("/content/drive/MyDrive/GAI_HW2_V2/sft_data_500words.jsonl", "w") as fout:
    for line in fin:
        data = json.loads(line)
        prompt = (
            "The following is the introduction of a scientific paper. "
            "Please generate an informative and self-contained abstract that accurately reflects the main contributions and findings. "
            "The abstract should be written in formal academic style and must not exceed 500 words.\n\n"
            "[Introduction]\n" + data["introduction"] + "\n\n[Abstract]"
        )
        response = data["abstract"]
        fout.write(json.dumps({"prompt": prompt, "response": response}) + "\n")


In [1]:
!pip install peft bitsandbytes accelerate transformers datasets

Collecting bitsandbytes
  Downloading bitsandbytes-0.45.4-py3-none-manylinux_2_24_x86_64.whl.metadata (5.0 kB)
Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.13.0->peft)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.13.0->peft)
  Downloading nvidia_cuda_runtime_cu12-12.4.1

In [3]:
# QLoRA fine-tuning script for JungZoona/T3Q-qwen2.5-14b-v1.0-e3
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments
from datasets import load_dataset
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from transformers import Trainer, DataCollatorForLanguageModeling
import torch
import os

# Paths
model_name = "JungZoona/T3Q-qwen2.5-14b-v1.0-e3"
dataset_path = "/content/drive/MyDrive/GAI_HW2_V2/sft_data_500words.jsonl"

# Load dataset
dataset = load_dataset("json", data_files={"train": dataset_path}, split="train")

# Load model and tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    load_in_4bit=True,
    device_map="auto",
    torch_dtype=torch.float16,
    trust_remote_code=True
)

# Apply QLoRA
model = prepare_model_for_kbit_training(model)
lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.1,
    bias="none",
    task_type="CAUSAL_LM"
)
model = get_peft_model(model, lora_config)

# Preprocess
def tokenize_function(example):
    prompt = example["prompt"]
    response = example["response"]
    full_text = prompt + "\n" + response
    return tokenizer(full_text, truncation=True, max_length=1024)

tokenized_dataset = dataset.map(tokenize_function, remove_columns=dataset.column_names)

data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Training args
training_args = TrainingArguments(
    output_dir="qlora-finetuned_500words",
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    num_train_epochs=3,
    learning_rate=2e-4,
    fp16=True,
    logging_steps=10,
    save_strategy="steps",              # ✅ 每 N step 儲存 checkpoint
    save_steps=100,
    save_total_limit=3,                 # ✅ 最多保留 3 個 checkpoint
    # load_best_model_at_end=True,       # ✅ 訓練結束時載入最佳模型
    report_to="none"
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=data_collator
)

# Train
trainer.train()

# Save adapter model
model.save_pretrained("/content/drive/MyDrive/GAI_HW2_V2/qlora-finetuned_500words")
tokenizer.save_pretrained("/content/drive/MyDrive/GAI_HW2_V2/qlora-finetuned_500words")


Generating train split: 0 examples [00:00, ? examples/s]

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/7.26k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/605 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/613 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/879 [00:00<?, ?B/s]

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


model.safetensors.index.json:   0%|          | 0.00/47.5k [00:00<?, ?B/s]

Fetching 6 files:   0%|          | 0/6 [00:00<?, ?it/s]

model-00004-of-00006.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00001-of-00006.safetensors:   0%|          | 0.00/4.99G [00:00<?, ?B/s]

model-00006-of-00006.safetensors:   0%|          | 0.00/4.73G [00:00<?, ?B/s]

model-00005-of-00006.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00002-of-00006.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00003-of-00006.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.


Loading checkpoint shards:   0%|          | 0/6 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/243 [00:00<?, ?B/s]

Map:   0%|          | 0/408 [00:00<?, ? examples/s]

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*ar

Step,Training Loss
10,1.9836
20,1.8988
30,1.8171
40,1.9356
50,1.8362
60,1.8369
70,1.86
80,1.8401
90,1.7959
100,1.8083


  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)


('/content/drive/MyDrive/GAI_HW2_V2/qlora-finetuned_500words/tokenizer_config.json',
 '/content/drive/MyDrive/GAI_HW2_V2/qlora-finetuned_500words/special_tokens_map.json',
 '/content/drive/MyDrive/GAI_HW2_V2/qlora-finetuned_500words/vocab.json',
 '/content/drive/MyDrive/GAI_HW2_V2/qlora-finetuned_500words/merges.txt',
 '/content/drive/MyDrive/GAI_HW2_V2/qlora-finetuned_500words/added_tokens.json',
 '/content/drive/MyDrive/GAI_HW2_V2/qlora-finetuned_500words/tokenizer.json')

In [5]:
import torch
import gc
torch.cuda.empty_cache()
gc.collect()

0

In [2]:
# Generate abstracts using the fine-tuned QLoRA model
import json
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import gc
from peft import PeftModel, PeftConfig
# from transformers import AutoTokenizer, AutoModelForCausalLM
# 先載入 adapter 的設定，確認 base model 是誰
peft_config = PeftConfig.from_pretrained("/content/drive/MyDrive/GAI_HW2_V2/qlora-finetuned_500words")
base_model = AutoModelForCausalLM.from_pretrained(
    peft_config.base_model_name_or_path,   # ✅ 確保使用正確 base
    device_map="auto",
    torch_dtype=torch.float16,
    trust_remote_code=True,
    offload_folder="./offload",
    max_memory={0: "35GiB", "cpu": "30GiB"}
)
model = PeftModel.from_pretrained(base_model, "/content/drive/MyDrive/GAI_HW2_V2/qlora-finetuned_500words")

# base_model = AutoModelForCausalLM.from_pretrained(
#     "JungZoona/T3Q-qwen2.5-14b-v1.0-e3",
#     device_map="auto",
#     torch_dtype=torch.float16,
#     trust_remote_code=True,
#     offload_folder="./offload"
# )
# model = PeftModel.from_pretrained(base_model, "/content/drive/MyDrive/GAI_HW2_V2/qlora-finetuned")
tokenizer = AutoTokenizer.from_pretrained("/content/drive/MyDrive/GAI_HW2_V2/qlora-finetuned_500words", trust_remote_code=True)

# # Load fine-tuned model and tokenizer
# model_path = "/content/drive/MyDrive/GAI_HW2_V2/qlora-finetuned"
# model = AutoModelForCausalLM.from_pretrained(model_path, device_map="auto", torch_dtype=torch.float16)
# tokenizer = AutoTokenizer.from_pretrained(model_path)

# Load test set
test_path = "/content/drive/MyDrive/GAI_HW2_V2/test.json"
with open(test_path, "r") as f:
    test_data = [json.loads(line) for line in f]

# Generate abstracts
results = []
for entry in test_data:
    paper_id = entry["paper_id"]
    intro = entry["introduction"]

    prompt = (
        "The following is the introduction of a scientific paper. "
        "Please generate a concise and informative abstract.\n\n"
        f"[Introduction]\n{intro}\n\n[Abstract]"
    )

    messages = [{"role": "user", "content": prompt}]
    input_text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = tokenizer(input_text, return_tensors="pt").to(model.device)
    outputs = model.generate(
      **inputs,
      max_new_tokens=512,               # ✅ 給模型足夠長度完成摘要
      do_sample=True,                   # ✅ 啟用隨機取樣（非貪婪）
      temperature=0.7,                  # ✅ 溫度降低 → 生成更穩定有邏輯
      top_p=0.9,                        # ✅ nucleus sampling → 控制隨機性
      eos_token_id=tokenizer.eos_token_id  # ✅ 給模型一個結束的信號
    )
    generated = tokenizer.decode(outputs[0][inputs.input_ids.shape[-1]:], skip_special_tokens=True)

    results.append({"paper_id": paper_id, "abstract": generated.strip()})
    print(f"Generated abstract for paper {paper_id}")
    torch.cuda.empty_cache()
    gc.collect()

# Save output
output_path = "/content/drive/MyDrive/GAI_HW2_V2/sample_submission_512tokens_500words.json"
with open(output_path, "w") as f:
    json.dump(results, f, indent=2)

print(f"✅ Done. Output saved to {output_path}")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/879 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/47.5k [00:00<?, ?B/s]

Fetching 6 files:   0%|          | 0/6 [00:00<?, ?it/s]

model-00003-of-00006.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00006-of-00006.safetensors:   0%|          | 0.00/4.73G [00:00<?, ?B/s]

model-00004-of-00006.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00005-of-00006.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00001-of-00006.safetensors:   0%|          | 0.00/4.99G [00:00<?, ?B/s]

model-00002-of-00006.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.


Loading checkpoint shards:   0%|          | 0/6 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/243 [00:00<?, ?B/s]

Generated abstract for paper 408
Generated abstract for paper 409
Generated abstract for paper 410
Generated abstract for paper 411
Generated abstract for paper 412
Generated abstract for paper 413
Generated abstract for paper 414
Generated abstract for paper 415
Generated abstract for paper 416
Generated abstract for paper 417
Generated abstract for paper 418
Generated abstract for paper 419
Generated abstract for paper 420
Generated abstract for paper 421
Generated abstract for paper 422
Generated abstract for paper 423
Generated abstract for paper 424
Generated abstract for paper 425
Generated abstract for paper 426
Generated abstract for paper 427
Generated abstract for paper 428
Generated abstract for paper 429
Generated abstract for paper 430
Generated abstract for paper 431
Generated abstract for paper 432
Generated abstract for paper 433
Generated abstract for paper 434
Generated abstract for paper 435
Generated abstract for paper 436
Generated abstract for paper 437
Generated 

In [None]:
import json

# 輸入檔案路徑（你原始的 submission 檔案）
input_path = "sample_submission_512tokens.json"

# 輸出檔案路徑（轉換後的檔案）
output_path = "313511022_3.json"

# 讀取 JSON 陣列
with open(input_path, "r", encoding="utf-8") as f:
    data = json.load(f)  # 是一個 list，每個元素是 dict

# 每個 dict 寫成獨立的一行 JSON（和 sample_submission.json 相同格式）
with open(output_path, "w", encoding="utf-8") as f:
    for item in data:
        json_line = json.dumps(item, ensure_ascii=False)
        f.write(json_line + "\n")

print(f"✅ 格式已轉換完成，輸出檔案：{output_path}")
