In [None]:
# Mount Google Drive at /content/drive (Colab only). Later cells read the
# dataset splits and model checkpoints from paths under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Switch to the project folder on Drive so that relative paths (for example the
# result JSON files written by the evaluation cells) resolve inside it.
%cd '/content/drive/MyDrive/LLM/few-shot'

/content/drive/MyDrive/LLM/few-shot


In [None]:
# Install the Hugging Face libraries imported in the next cell.
# Versions are not pinned here, so results may vary between runs.
!pip install transformers datasets



In [None]:
import random
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from datasets import load_from_disk
from sklearn.metrics import accuracy_score
import json

In [None]:
# Configuration: compute device and the shot counts (k) to evaluate.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# 0 = zero-shot, followed by the powers of two 1, 2, 4, 8, 16.
k_values = [0] + [2 ** exponent for exponent in range(5)]

In [None]:
# Load the train / dev / test splits that were saved to Drive with
# `Dataset.save_to_disk`, in that order.
train_set, dev_set, test_set = (
    load_from_disk(split_dir)
    for split_dir in (
        '/content/drive/MyDrive/LLM/few-shot/data/train',
        '/content/drive/MyDrive/LLM/few-shot/data/dev',
        '/content/drive/MyDrive/LLM/few-shot/data/test',
    )
)

In [None]:
train_set

Dataset({
    features: ['pubid', 'question', 'context', 'long_answer', 'final_decision'],
    num_rows: 800
})

In [None]:
dev_set

Dataset({
    features: ['pubid', 'question', 'context', 'long_answer', 'final_decision'],
    num_rows: 100
})

In [None]:
test_set

Dataset({
    features: ['pubid', 'question', 'context', 'long_answer', 'final_decision'],
    num_rows: 100
})

In [None]:
# Text generation helper
def generate(model, tokenizer, prompt, max_new_tokens=64):
    """Greedily generate a continuation of ``prompt`` and return only the new text.

    Args:
        model: Causal language model exposing ``generate``, ``device`` and
            (optionally) ``config.max_position_embeddings``.
        tokenizer: Tokenizer that matches ``model``.
        prompt: The full prompt as a single string.
        max_new_tokens: Maximum number of tokens to generate.

    Returns:
        The decoded continuation, without special tokens and with surrounding
        whitespace stripped.
    """
    encoded = tokenizer(prompt, return_tensors="pt")

    # Keep prompt + generated tokens inside the model's position range. Cutting
    # from the left preserves the end of the prompt, i.e. the question being
    # asked and the final "Answer:" cue; only the oldest demonstrations are lost.
    config = getattr(model, "config", None)
    max_positions = getattr(config, "max_position_embeddings", None)
    if max_positions is not None:
        budget = max(int(max_positions) - max_new_tokens, 1)
        encoded = {name: tensor[:, -budget:] for name, tensor in encoded.items()}

    # Send the inputs to wherever the model actually lives instead of relying
    # on a notebook-level global that may not match (e.g. device_map="auto").
    inputs = {name: tensor.to(model.device) for name, tensor in encoded.items()}
    prompt_length = inputs["input_ids"].shape[1]

    outputs = model.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        do_sample=False,
        pad_token_id=tokenizer.pad_token_id,
        eos_token_id=tokenizer.eos_token_id,
    )

    # Slice by token count rather than by len(prompt): decoding is not
    # guaranteed to reproduce the prompt text character for character.
    new_tokens = outputs[0][prompt_length:]
    return tokenizer.decode(new_tokens, skip_special_tokens=True).strip()

In [None]:
def _parse_decision(answer):
    """Map a raw model answer to ``"yes"``, ``"no"``, ``"maybe"`` or ``"error"``.

    Only the first word (the leading run of letters) is compared, so answers
    such as "Not enough evidence" or "None of the above" are not counted as "no".
    """
    text = answer.strip().lower()
    end = 0
    while end < len(text) and text[end].isalpha():
        end += 1
    first_word = text[:end]
    return first_word if first_word in ("yes", "no", "maybe") else "error"


def evaluate(model, tokenizer, support_examples, test_set, result_file):
    """Run few-shot evaluation and write per-example results to ``result_file``.

    Args:
        model: Model passed through to ``generate``.
        tokenizer: Tokenizer passed through to ``generate``.
        support_examples: Demonstrations placed before every test question;
            each needs 'question', 'context' and 'final_decision'. May be empty.
        test_set: Iterable of examples with the same three keys.
        result_file: Path of the JSON file that receives the per-example records.

    Returns:
        Tuple ``(accuracy, preds, labels)``. A prediction is 'error' when the
        answer does not start with the word yes, no or maybe.
    """
    # The demonstration prefix is the same for every test example: build it once.
    # NOTE(review): if 'context' is a nested structure rather than plain text,
    # the f-string below inserts its repr into the prompt -- confirm intended.
    few_shot_prefix = "".join(
        f"Q: {support['question']}\n"
        f"C: {support['context']}\n"
        f"A: {support['final_decision']}\n\n"
        for support in support_examples
    )

    preds, labels, results = [], [], []

    for example in test_set:
        prompt = few_shot_prefix + (
            f"Q: {example['question']}\n"
            f"C: {example['context']}\n"
            "A: Answer only one word from the following options: yes, no, or maybe.\n"
            "Do not provide any explanations.\nAnswer:"
        )

        answer = generate(model, tokenizer, prompt)

        # Exact first-word match only; anything else is recorded as 'error'.
        pred = _parse_decision(answer)

        preds.append(pred)
        labels.append(example["final_decision"])
        results.append({
            "question": example["question"],
            "context": example["context"],
            "prediction": pred,
            "raw_answer": answer,
            "label": example["final_decision"]
        })

    acc = accuracy_score(labels, preds)

    with open(result_file, "w", encoding="utf-8") as f:
        json.dump(results, f, indent=2, ensure_ascii=False)

    return acc, preds, labels

In [None]:
# Materialize the training split so random.sample can draw from a plain list.
train_list = list(train_set)

# One fixed support set per shot count; k == 0 means zero-shot (no demonstrations).
support_examples_dict = {}
for k in k_values:
    if k <= 0:
        support_examples_dict[k] = []
        continue
    random.seed(42 + k)  # distinct, reproducible seed for each k
    support_examples_dict[k] = random.sample(train_list, k)

In [None]:
metaicl_base_path = "/content/drive/MyDrive/LLM/few-shot/models/metaicl_qwen_meta"

# Accuracy for every (model_k, eval_k) pair, kept so the grid can be inspected
# or plotted afterwards instead of being available only as printed text.
metaicl_accuracies = {}

# NOTE(review): in a previous run of this cell the accuracies were identical for
# every model_k at a given eval_k, and loading the first checkpoint downloaded
# model shards from the Hub. Verify that the fine-tuned weights under each
# k_{model_k}/final directory are actually being applied.
for model_k in k_values:
    print(f"\n==== Loading MetaICL Model trained with k={model_k} shots ====")
    model_path = f"{metaicl_base_path}/k_{model_k}/final"

    # Release the previous checkpoint before loading the next one so two
    # models are never resident on the device at the same time.
    model = None
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True).to(device)
    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
    # Only fall back to EOS when the tokenizer has no pad token of its own.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    model.eval()

    for eval_k in k_values:
        support_examples = support_examples_dict[eval_k]
        result_file = f"results_metaicl_model_k{model_k}_eval_k{eval_k}.json"
        print(f"Evaluating model trained on k={model_k} shots with support set k={eval_k}...")
        acc, preds, labels = evaluate(model, tokenizer, support_examples, test_set, result_file)
        metaicl_accuracies[(model_k, eval_k)] = acc
        print(f"[MetaICL Model k={model_k}] Accuracy with support k={eval_k}: {acc:.4f}")


==== Loading MetaICL Model trained with k=0 shots ====


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/726 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/25.6k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model-00001-of-00002.safetensors:   0%|          | 0.00/3.44G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/622M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Evaluating model trained on k=0 shots with support set k=0...




[MetaICL Model k=0] Accuracy with support k=0: 0.7900
Evaluating model trained on k=0 shots with support set k=1...




[MetaICL Model k=0] Accuracy with support k=1: 0.6900
Evaluating model trained on k=0 shots with support set k=2...




[MetaICL Model k=0] Accuracy with support k=2: 0.6500
Evaluating model trained on k=0 shots with support set k=4...




[MetaICL Model k=0] Accuracy with support k=4: 0.6900
Evaluating model trained on k=0 shots with support set k=8...




[MetaICL Model k=0] Accuracy with support k=8: 0.6800
Evaluating model trained on k=0 shots with support set k=16...




[MetaICL Model k=0] Accuracy with support k=16: 0.6600

==== Loading MetaICL Model trained with k=1 shots ====


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Evaluating model trained on k=1 shots with support set k=0...




[MetaICL Model k=1] Accuracy with support k=0: 0.7900
Evaluating model trained on k=1 shots with support set k=1...




[MetaICL Model k=1] Accuracy with support k=1: 0.6900
Evaluating model trained on k=1 shots with support set k=2...




[MetaICL Model k=1] Accuracy with support k=2: 0.6500
Evaluating model trained on k=1 shots with support set k=4...




[MetaICL Model k=1] Accuracy with support k=4: 0.6900
Evaluating model trained on k=1 shots with support set k=8...




[MetaICL Model k=1] Accuracy with support k=8: 0.6800
Evaluating model trained on k=1 shots with support set k=16...




[MetaICL Model k=1] Accuracy with support k=16: 0.6600

==== Loading MetaICL Model trained with k=2 shots ====


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Evaluating model trained on k=2 shots with support set k=0...




[MetaICL Model k=2] Accuracy with support k=0: 0.7900
Evaluating model trained on k=2 shots with support set k=1...




[MetaICL Model k=2] Accuracy with support k=1: 0.6900
Evaluating model trained on k=2 shots with support set k=2...




[MetaICL Model k=2] Accuracy with support k=2: 0.6500
Evaluating model trained on k=2 shots with support set k=4...




[MetaICL Model k=2] Accuracy with support k=4: 0.6900
Evaluating model trained on k=2 shots with support set k=8...




[MetaICL Model k=2] Accuracy with support k=8: 0.6800
Evaluating model trained on k=2 shots with support set k=16...




[MetaICL Model k=2] Accuracy with support k=16: 0.6600

==== Loading MetaICL Model trained with k=4 shots ====


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Evaluating model trained on k=4 shots with support set k=0...




[MetaICL Model k=4] Accuracy with support k=0: 0.7900
Evaluating model trained on k=4 shots with support set k=1...




[MetaICL Model k=4] Accuracy with support k=1: 0.6900
Evaluating model trained on k=4 shots with support set k=2...




[MetaICL Model k=4] Accuracy with support k=2: 0.6500
Evaluating model trained on k=4 shots with support set k=4...




[MetaICL Model k=4] Accuracy with support k=4: 0.6900
Evaluating model trained on k=4 shots with support set k=8...




[MetaICL Model k=4] Accuracy with support k=8: 0.6800
Evaluating model trained on k=4 shots with support set k=16...




[MetaICL Model k=4] Accuracy with support k=16: 0.6600

==== Loading MetaICL Model trained with k=8 shots ====


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Evaluating model trained on k=8 shots with support set k=0...




[MetaICL Model k=8] Accuracy with support k=0: 0.7900
Evaluating model trained on k=8 shots with support set k=1...




[MetaICL Model k=8] Accuracy with support k=1: 0.6900
Evaluating model trained on k=8 shots with support set k=2...




[MetaICL Model k=8] Accuracy with support k=2: 0.6500
Evaluating model trained on k=8 shots with support set k=4...




[MetaICL Model k=8] Accuracy with support k=4: 0.6900
Evaluating model trained on k=8 shots with support set k=8...




[MetaICL Model k=8] Accuracy with support k=8: 0.6800
Evaluating model trained on k=8 shots with support set k=16...




[MetaICL Model k=8] Accuracy with support k=16: 0.6600

==== Loading MetaICL Model trained with k=16 shots ====


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Evaluating model trained on k=16 shots with support set k=0...




[MetaICL Model k=16] Accuracy with support k=0: 0.7900
Evaluating model trained on k=16 shots with support set k=1...




[MetaICL Model k=16] Accuracy with support k=1: 0.6900
Evaluating model trained on k=16 shots with support set k=2...




[MetaICL Model k=16] Accuracy with support k=2: 0.6500
Evaluating model trained on k=16 shots with support set k=4...




[MetaICL Model k=16] Accuracy with support k=4: 0.6900
Evaluating model trained on k=16 shots with support set k=8...




[MetaICL Model k=16] Accuracy with support k=8: 0.6800
Evaluating model trained on k=16 shots with support set k=16...




[MetaICL Model k=16] Accuracy with support k=16: 0.6600


In [None]:
# Evaluate the IA3 SFT model at every shot count in k_values.
ia3_model_path = "/content/drive/MyDrive/LLM/few-shot/models/qwen_pubmedqa_ia3_sft/final"

print("\n==== Evaluating IA3 SFT Model ====")
model = AutoModelForCausalLM.from_pretrained(ia3_model_path, trust_remote_code=True).to(device)
tokenizer = AutoTokenizer.from_pretrained(ia3_model_path, trust_remote_code=True)
# Only fall back to EOS when the tokenizer has no pad token of its own.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
model.eval()

for k in k_values:
    support_examples = support_examples_dict[k]
    result_file = f"results_ia3_k{k}.json"
    print(f"Evaluating model {k} shots...")
    acc, preds, labels = evaluate(model, tokenizer, support_examples, test_set, result_file)
    # Label corrected from "AI3" to "IA3" to match the model being evaluated.
    print(f"[IA3 Qwen] Accuracy: {acc:.4f}")


==== Evaluating IA3 SFT Model ====


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Evaluating model 0 shots...




[AI3 Qwen] Accuracy: 0.7800
Evaluating model 1 shots...




[AI3 Qwen] Accuracy: 0.6700
Evaluating model 2 shots...




[AI3 Qwen] Accuracy: 0.6500
Evaluating model 4 shots...




[AI3 Qwen] Accuracy: 0.7000
Evaluating model 8 shots...




[AI3 Qwen] Accuracy: 0.7200
Evaluating model 16 shots...




[AI3 Qwen] Accuracy: 0.6800


In [None]:
# Install sacremoses before loading microsoft/biogpt in the next cell.
# NOTE(review): presumably required by the BioGPT tokenizer -- confirm.
!pip install sacremoses

Collecting sacremoses
  Downloading sacremoses-0.1.1-py3-none-any.whl.metadata (8.3 kB)
Downloading sacremoses-0.1.1-py3-none-any.whl (897 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/897.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m890.9/897.5 kB[0m [31m35.7 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m897.5/897.5 kB[0m [31m20.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sacremoses
Successfully installed sacremoses-0.1.1


In [None]:
# Evaluate the original Hugging Face BioGPT checkpoint (no fine-tuning) at
# every shot count in k_values.
# NOTE(review): a previous run of this cell stopped at k=1 with
# "IndexError: index out of range in self"; possibly the few-shot prompt
# exceeded what the model can index -- confirm.
huggingface_model_id = "microsoft/biogpt"

print("\n==== Evaluating Original Hugging Face BioGPT Model ====")
tokenizer = AutoTokenizer.from_pretrained(huggingface_model_id, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
model = AutoModelForCausalLM.from_pretrained(huggingface_model_id, device_map="auto", trust_remote_code=True)
model.eval()

for k in k_values:
    support_examples = support_examples_dict[k]
    result_file = f"results_biogpt_k{k}.json"
    print(f"Evaluating model {k} shots...")
    acc, preds, labels = evaluate(model, tokenizer, support_examples, test_set, result_file)
    print(f"[Original biogpt (HF)] Accuracy: {acc:.4f}")


==== Evaluating Original Hugging Face BioGPT Model ====
Evaluating model 0 shots...
[Original biogpt (HF)] Accuracy: 0.0000
Evaluating model 1 shots...


IndexError: index out of range in self

In [None]:
# Evaluate the original Hugging Face Qwen checkpoint (no fine-tuning) at every
# shot count in k_values.
huggingface_model_id = "Qwen/Qwen3-1.7B"

print("\n==== Evaluating Original Hugging Face Qwen Model ====")
tokenizer = AutoTokenizer.from_pretrained(huggingface_model_id, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
model = AutoModelForCausalLM.from_pretrained(huggingface_model_id, device_map="auto", trust_remote_code=True)
model.eval()

for k in k_values:
    support_examples = support_examples_dict[k]
    result_file = f"results_qwen_hf_k{k}.json"
    print(f"Evaluating model {k} shots...")
    acc, preds, labels = evaluate(model, tokenizer, support_examples, test_set, result_file)
    print(f"[Original Qwen (HF)] Accuracy: {acc:.4f}")


==== Evaluating Original Hugging Face Qwen Model ====


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

tokenizer_config.json:   0%|          | 0.00/9.73k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

Evaluating model 0 shots...


RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [None]:
huggingface_model_id = "Qwen/Qwen3-1.7B"

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the model and tokenizer. The model is placed explicitly with .to(device)
# below, the same way the fine-tuned checkpoints are loaded earlier in this
# notebook, instead of combining device_map="auto" with a later .to() call.
model = AutoModelForCausalLM.from_pretrained(huggingface_model_id, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(huggingface_model_id, trust_remote_code=True)

# Use EOS as the pad token only when the tokenizer does not define one.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = tokenizer.pad_token_id

model.to(device)
model.eval()

def generate(model, tokenizer, prompt, max_new_tokens=64):
    """Greedily generate a continuation of ``prompt`` and return only the new text.

    Args:
        model: Causal language model exposing ``generate``, ``device`` and
            (optionally) ``config.max_position_embeddings``.
        tokenizer: Tokenizer that matches ``model``.
        prompt: The full prompt as a single string.
        max_new_tokens: Maximum number of tokens to generate.

    Returns:
        The decoded continuation, without special tokens and with surrounding
        whitespace stripped.
    """
    encoded = tokenizer(prompt, return_tensors="pt")

    # Keep prompt + generated tokens inside the model's position range. Cutting
    # from the left (rather than the tokenizer's default right-side truncation)
    # preserves the end of the prompt, i.e. the question and the "Answer:" cue.
    config = getattr(model, "config", None)
    max_positions = getattr(config, "max_position_embeddings", None)
    if max_positions is not None:
        budget = max(int(max_positions) - max_new_tokens, 1)
        encoded = {name: tensor[:, -budget:] for name, tensor in encoded.items()}

    # Send the inputs to wherever the model actually lives.
    inputs = {name: tensor.to(model.device) for name, tensor in encoded.items()}
    prompt_length = inputs["input_ids"].shape[1]

    outputs = model.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        do_sample=False,
        pad_token_id=tokenizer.pad_token_id,
        eos_token_id=tokenizer.eos_token_id,
    )

    # Slice by token count rather than by len(prompt): decoding is not
    # guaranteed to reproduce the prompt text character for character.
    new_tokens = outputs[0][prompt_length:]
    return tokenizer.decode(new_tokens, skip_special_tokens=True).strip()

# The evaluate function is reused unchanged.

# Evaluation loop for the Qwen/Qwen3-1.7B checkpoint loaded in this cell.
# File names and the printed label refer to Qwen so that this run does not
# overwrite, or get mistaken for, the BioGPT results.
for k in k_values:
    support_examples = support_examples_dict[k]
    result_file = f"results_qwen_hf_k{k}.json"
    print(f"Evaluating model {k} shots...")
    acc, preds, labels = evaluate(model, tokenizer, support_examples, test_set, result_file)
    print(f"[Original Qwen (HF)] Accuracy: {acc:.4f}")


config.json:   0%|          | 0.00/726 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/25.6k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model-00001-of-00002.safetensors:   0%|          | 0.00/3.44G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/622M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/9.73k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

Evaluating model 0 shots...




KeyboardInterrupt: 