In [None]:
# Google Drive mount
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
%cd '/content/drive/MyDrive/LLM/few-shot'

/content/drive/MyDrive/LLM/few-shot


In [None]:
!pip install torch transformers datasets sacremoses

Collecting sacremoses
  Downloading sacremoses-0.1.1-py3-none-any.whl.metadata (8.3 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collec

In [None]:
import random
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from datasets import load_from_disk
from sklearn.metrics import accuracy_score
import json

# 설정
device = "cuda" if torch.cuda.is_available() else "cpu"
k_values = [0, 1, 2, 4, 8, 16]

# 데이터 로딩
train_set = load_from_disk('/content/drive/MyDrive/LLM/few-shot/data/train')
dev_set = load_from_disk('/content/drive/MyDrive/LLM/few-shot/data/dev')
test_set = load_from_disk('/content/drive/MyDrive/LLM/few-shot/data/test')

In [None]:
train_set

Dataset({
    features: ['pubid', 'question', 'context', 'long_answer', 'final_decision'],
    num_rows: 800
})

In [None]:
dev_set

Dataset({
    features: ['pubid', 'question', 'context', 'long_answer', 'final_decision'],
    num_rows: 100
})

In [None]:
test_set

Dataset({
    features: ['pubid', 'question', 'context', 'long_answer', 'final_decision'],
    num_rows: 100
})

In [None]:
train_list = list(train_set)

support_examples_dict = {}
for k in k_values:
  if k > 0:
    random.seed(42 + k)
    support_examples_dict[k] = random.sample(train_list, k)
  else:
    support_examples_dict[k] = []

In [None]:
for k in k_values:
  print(support_examples_dict[k])

[]
[{'pubid': 14627582, 'question': 'Double reading of barium enemas: is it necessary?', 'context': {'contexts': ['The purpose of our study was to determine the effectiveness, clinical impact, and feasibility of double reading barium enemas.', "Independent double readings of 1,003 consecutive barium enemas (822 double- and 181 single-contrast examinations) were prospectively performed. From this pool of 1,003 examinations, 994 were included in our study. Examinations showing at least one polyp or carcinoma 5 mm or larger were considered to have positive results. For combined readings, results were considered positive if either of the two interpreters reported finding a polyp or carcinoma. A McNemar test was used to compare the first reader's results with the combined results of the first and second readers. Results were retrospectively correlated with endoscopic or surgical results in 360 patients, and agreement between first and combined readings and endoscopic results was determined.

In [None]:
# 텍스트 생성 함수
def generate(model, tokenizer, prompt, max_new_tokens=64):
  inputs = tokenizer(prompt, return_tensors='pt', padding=True).to(device)
  outputs = model.generate(**inputs, max_new_tokens=max_new_tokens)
  decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)
  return decoded[len(prompt):].strip()

# 평가 함수
def evaluate(model, tokenizer, support_examples, test_set, result_file):
  preds, labels, results = [], [], []

  for example in test_set:
    prompt = "Examples:"
    for support in support_examples:
      prompt += f"Question: {support['question']}\nAnswer: {support['final_decision']}\n"

    prompt += "Answer based on the above examples"
    prompt += f"Question: {example['question']}\nAnswer: "
    prompt += "Answer only one word from the following options: yes, no, or maybe\nDo not provide any explanations.\nAnswer:"

    answer = generate(model, tokenizer, prompt)
    answer_lower = answer.lower().strip()

    # 정확한 매칭만 허용
    if answer_lower.startswith("yes"):
        pred = "yes"
    elif answer_lower.startswith("no"):
        pred = "no"
    elif answer_lower.startswith("maybe"):
        pred = "maybe"
    else:
        pred = 'error'  # fallback

    preds.append(pred)
    labels.append(example['final_decision'])
    results.append({
        'question': example['question'],
        'answer': answer,
        'prediction': pred,
        'label': example['final_decision']
    })

  acc = accuracy_score(labels, preds)

  with open(result_file, "w", encoding="utf-8") as f:
      json.dump(results, f, indent=2, ensure_ascii=False)

  return acc, preds, labels

In [None]:
huggingface_model_id = "Qwen/Qwen3-1.7B"

print("\n==== Evaluating Original Hugging Face Qwen Model ====")
model = AutoModelForCausalLM.from_pretrained(huggingface_model_id, device_map="auto", trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(huggingface_model_id, trust_remote_code=True)
model.eval()

for k in k_values:
	support_examples = support_examples_dict[k]
	result_file = f"results_qwen_k{k}.json"
	print(f"Evaluating model {k} shots...")
	acc, preds, labels = evaluate(model, tokenizer, support_examples, test_set, result_file)
	print(f"[Original Qwen (HF)] Accuracy: {acc:.4f}")


==== Evaluating Original Hugging Face Qwen Model ====


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Evaluating model 0 shots...
[Original Qwen (HF)] Accuracy: 0.5600
Evaluating model 1 shots...
[Original Qwen (HF)] Accuracy: 0.3600
Evaluating model 2 shots...
[Original Qwen (HF)] Accuracy: 0.4200
Evaluating model 4 shots...
[Original Qwen (HF)] Accuracy: 0.4700
Evaluating model 8 shots...
[Original Qwen (HF)] Accuracy: 0.4100
Evaluating model 16 shots...
[Original Qwen (HF)] Accuracy: 0.4600


In [None]:
# IA3 SFT 모델 평가
ia3_model_path = "/content/drive/MyDrive/LLM/few-shot/models/qwen_pubmedqa_ia3_sft/final"

print("\n==== Evaluating IA3 SFT Model ====")
model = AutoModelForCausalLM.from_pretrained(ia3_model_path, trust_remote_code=True).to(device)
tokenizer = AutoTokenizer.from_pretrained(ia3_model_path, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
model.eval()

for k in k_values:
	support_examples = support_examples_dict[k]
	result_file = f"results_ia3_k{k}.json"
	print(f"Evaluating model {k} shots...")
	acc, preds, labels = evaluate(model, tokenizer, support_examples, test_set, result_file)
	print(f"[AI3 Qwen] Accuracy: {acc:.4f}")


==== Evaluating IA3 SFT Model ====


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Evaluating model 0 shots...
[AI3 Qwen] Accuracy: 0.5100
Evaluating model 1 shots...
[AI3 Qwen] Accuracy: 0.3600
Evaluating model 2 shots...
[AI3 Qwen] Accuracy: 0.3800
Evaluating model 4 shots...
[AI3 Qwen] Accuracy: 0.4400
Evaluating model 8 shots...
[AI3 Qwen] Accuracy: 0.4200
Evaluating model 16 shots...
[AI3 Qwen] Accuracy: 0.5000


In [None]:
metaicl_base_path = "/content/drive/MyDrive/LLM/few-shot/models/metaicl_qwen_meta"

for model_k in k_values:
    print(f"\n==== Loading MetaICL Model trained with k={model_k} shots ====")
    model_path = f"{metaicl_base_path}/k_{model_k}/final"
    model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True).to(device)
    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
    tokenizer.pad_token = tokenizer.eos_token
    model.eval()

    for eval_k in k_values:
        support_examples = support_examples_dict[eval_k]
        result_file = f"results_metaicl_model_k{model_k}_eval_k{eval_k}.json"
        print(f"Evaluating model trained on k={model_k} shots with support set k={eval_k}...")
        acc, preds, labels = evaluate(model, tokenizer, support_examples, test_set, result_file)
        print(f"[MetaICL Model k={model_k}] Accuracy with support k={eval_k}: {acc:.4f}")


==== Loading MetaICL Model trained with k=0 shots ====


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/726 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/25.6k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model-00002-of-00002.safetensors:   0%|          | 0.00/622M [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/3.44G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Evaluating model trained on k=0 shots with support set k=0...
[MetaICL Model k=0] Accuracy with support k=0: 0.5000
Evaluating model trained on k=0 shots with support set k=1...
[MetaICL Model k=0] Accuracy with support k=1: 0.3600
Evaluating model trained on k=0 shots with support set k=2...
[MetaICL Model k=0] Accuracy with support k=2: 0.4400
Evaluating model trained on k=0 shots with support set k=4...
[MetaICL Model k=0] Accuracy with support k=4: 0.4500
Evaluating model trained on k=0 shots with support set k=8...
[MetaICL Model k=0] Accuracy with support k=8: 0.4400
Evaluating model trained on k=0 shots with support set k=16...
[MetaICL Model k=0] Accuracy with support k=16: 0.5100

==== Loading MetaICL Model trained with k=1 shots ====


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Evaluating model trained on k=1 shots with support set k=0...
[MetaICL Model k=1] Accuracy with support k=0: 0.4800
Evaluating model trained on k=1 shots with support set k=1...
[MetaICL Model k=1] Accuracy with support k=1: 0.3400
Evaluating model trained on k=1 shots with support set k=2...
[MetaICL Model k=1] Accuracy with support k=2: 0.5000
Evaluating model trained on k=1 shots with support set k=4...
[MetaICL Model k=1] Accuracy with support k=4: 0.4900
Evaluating model trained on k=1 shots with support set k=8...
[MetaICL Model k=1] Accuracy with support k=8: 0.4100
Evaluating model trained on k=1 shots with support set k=16...
[MetaICL Model k=1] Accuracy with support k=16: 0.5200

==== Loading MetaICL Model trained with k=2 shots ====


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Evaluating model trained on k=2 shots with support set k=0...
[MetaICL Model k=2] Accuracy with support k=0: 0.5300
Evaluating model trained on k=2 shots with support set k=1...
[MetaICL Model k=2] Accuracy with support k=1: 0.3500
Evaluating model trained on k=2 shots with support set k=2...
[MetaICL Model k=2] Accuracy with support k=2: 0.3900
Evaluating model trained on k=2 shots with support set k=4...
[MetaICL Model k=2] Accuracy with support k=4: 0.4800
Evaluating model trained on k=2 shots with support set k=8...
[MetaICL Model k=2] Accuracy with support k=8: 0.4100
Evaluating model trained on k=2 shots with support set k=16...
[MetaICL Model k=2] Accuracy with support k=16: 0.5300

==== Loading MetaICL Model trained with k=4 shots ====


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Evaluating model trained on k=4 shots with support set k=0...
[MetaICL Model k=4] Accuracy with support k=0: 0.6000
Evaluating model trained on k=4 shots with support set k=1...
[MetaICL Model k=4] Accuracy with support k=1: 0.3600
Evaluating model trained on k=4 shots with support set k=2...
[MetaICL Model k=4] Accuracy with support k=2: 0.4300
Evaluating model trained on k=4 shots with support set k=4...
[MetaICL Model k=4] Accuracy with support k=4: 0.5100
Evaluating model trained on k=4 shots with support set k=8...
[MetaICL Model k=4] Accuracy with support k=8: 0.4200
Evaluating model trained on k=4 shots with support set k=16...
[MetaICL Model k=4] Accuracy with support k=16: 0.4300

==== Loading MetaICL Model trained with k=8 shots ====


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Evaluating model trained on k=8 shots with support set k=0...
[MetaICL Model k=8] Accuracy with support k=0: 0.5000
Evaluating model trained on k=8 shots with support set k=1...
[MetaICL Model k=8] Accuracy with support k=1: 0.3700
Evaluating model trained on k=8 shots with support set k=2...
[MetaICL Model k=8] Accuracy with support k=2: 0.4100
Evaluating model trained on k=8 shots with support set k=4...
[MetaICL Model k=8] Accuracy with support k=4: 0.5400
Evaluating model trained on k=8 shots with support set k=8...
[MetaICL Model k=8] Accuracy with support k=8: 0.3800
Evaluating model trained on k=8 shots with support set k=16...
[MetaICL Model k=8] Accuracy with support k=16: 0.4800

==== Loading MetaICL Model trained with k=16 shots ====


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Evaluating model trained on k=16 shots with support set k=0...
[MetaICL Model k=16] Accuracy with support k=0: 0.5000
Evaluating model trained on k=16 shots with support set k=1...
[MetaICL Model k=16] Accuracy with support k=1: 0.3400
Evaluating model trained on k=16 shots with support set k=2...
[MetaICL Model k=16] Accuracy with support k=2: 0.3900
Evaluating model trained on k=16 shots with support set k=4...
[MetaICL Model k=16] Accuracy with support k=4: 0.4500
Evaluating model trained on k=16 shots with support set k=8...
[MetaICL Model k=16] Accuracy with support k=8: 0.3900
Evaluating model trained on k=16 shots with support set k=16...
[MetaICL Model k=16] Accuracy with support k=16: 0.4700
