### Execute on colab

In [None]:
# Google Drive mount
# from google.colab import drive
# drive.mount('/content/drive')

In [None]:
# !pip install huggingface_hub transformers bitsandbytes datasets
# !pip install -U transformers accelerate bitsandbytes torch torchvision torchaudio
# !pip install --upgrade datasets

In [None]:
# from datasets import load_dataset, DatasetDict

# # Load dataset
# dataset = load_dataset("pubmed_qa", "pqa_labeled")

# # Split
# train_valid_test = dataset["train"].train_test_split(test_size=0.2, seed=42)
# dev_test = train_valid_test["test"].train_test_split(test_size=0.5, seed=42)

# # Group into DatasetDict
# dataset_splits = DatasetDict({
#     "train": train_valid_test["train"],
#     "dev": dev_test["train"],
#     "test": dev_test["test"]
# })

# # Save to disk
# dataset_splits.save_to_disk("data")

# # Create the MetaCLI dataset (the `metacli` DatasetDict saved below)
# biosses = load_dataset("biosses")
# glue_sst2 = load_dataset("glue", "sst2")
# ag_news = load_dataset("ag_news")
# trec = load_dataset("trec")

# metacli = DatasetDict({
#     "biosses": biosses["train"] if "train" in biosses else biosses,
#     "glue_sst2": glue_sst2["train"],
#     "ag_news": ag_news["train"],
#     "trec": trec["train"]
# })

# # Save to disk
# metacli.save_to_disk("data/metacli")

### MetaICL

In [None]:
import os
import random
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
from datasets import load_from_disk, load_dataset
from peft import get_peft_model, IA3Config, TaskType
from tqdm import tqdm
from utils import format_metaicl_prompt
from torch.utils.data import Dataset
from transformers import default_data_collator

In [None]:
# --- Configuration ---
# Identifier passed to `from_pretrained` for both the tokenizer and the base model.
model_id = "Qwen/Qwen3-1.7B"

# Run on CUDA when PyTorch can see a GPU, otherwise on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Sequence-length setting; not referenced by the other cells visible in this notebook.
max_seq_length = 1024

# List of k values (number of in-context demonstrations) to experiment with.
k_values = [0, 1, 2, 4, 8, 16]

In [None]:
# --- Load data ---
# Both dataset collections are read from local disk.
pubmedqa_data = load_from_disk("data")
metacli_data = load_from_disk("data/metacli")

# Meta-task registry: display name -> dataset used for that task.
# PubMedQA uses its train split; the remaining entries are looked up by key in `metacli_data`.
meta_tasks = {
    "PubMedQA": pubmedqa_data["train"],
    **{
        display_name: metacli_data[split_key]
        for display_name, split_key in (
            ("Biosses", "biosses"),
            ("SST2", "glue_sst2"),
            ("AGNews", "ag_news"),
            ("TREC", "trec"),
        )
    },
}

# Tasks to include in meta-training.
target_task_names = ["PubMedQA", "Biosses", "SST2", "AGNews", "TREC"]

In [None]:
# Example output, one item per line:
#   1. the PubMedQA dataset object,
#   2. its train split,
#   3. the first sample of the train split.
print(
    pubmedqa_data,
    pubmedqa_data["train"],
    pubmedqa_data["train"][0],
    sep="\n",
)

In [None]:
# Report the number of examples in each meta-training source, one "<label> <count>" line each.
print(
    "\n".join(
        f"{label} {len(split)}"
        for label, split in (
            ("PubMedQA Train Size:", pubmedqa_data["train"]),
            ("Biosses Size: ", metacli_data["biosses"]),
            ("SST2 Size: ", metacli_data["glue_sst2"]),
            ("News Size: ", metacli_data["ag_news"]),
            ("Trec Size: ", metacli_data["trec"]),
        )
    )
)

In [None]:
import torch
import random
import os
from tqdm import tqdm
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    Trainer,
    TrainingArguments,
    default_data_collator,
)
from peft import get_peft_model, IA3Config
from torch.utils.data import Dataset, DataLoader

# Device selection: CUDA when PyTorch reports an available GPU, otherwise CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Meta-prompt construction.
# NOTE: this definition shadows the `format_metaicl_prompt` imported from `utils`
# in the notebook's first import cell.
def format_metaicl_prompt(task_name, query_example, k_examples, tokenizer, max_length=768):
    """Turn k demonstrations plus one query into tensors for causal-LM training.

    The text is laid out as ``demo_1 \\n demo_2 \\n ... \\n query_input`` followed by
    the query's answer. Only the answer tokens are supervised: prompt and padding
    positions in ``labels`` are set to -100.

    Args:
        task_name: Selects the field mapping used to render an example. Names
            that are not in the mapping fall back to the ``question`` /
            ``answer`` keys.
        query_example: Mapping (anything with ``.get``) holding the query.
        k_examples: Iterable of demonstration mappings, rendered with answers.
        tokenizer: Callable such that ``tokenizer(text)["input_ids"]`` is a
            sequence of token ids; must expose ``pad_token_id``.
        max_length: Fixed length of every returned tensor.

    Returns:
        ``(input_ids, attention_mask, labels)``, each a 1-D int64 tensor of
        length ``max_length``. ``input_ids`` holds prompt + answer,
        right-padded with ``pad_token_id``.

    Raises:
        ValueError: If the tokenizer has no ``pad_token_id``.
    """
    # NOTE(review): the notebook's `meta_tasks` also uses the names "Biosses",
    # "SST2", "AGNews" and "TREC". They are not listed here, so they take the
    # question/answer fallback below — confirm those records really carry
    # 'question' and 'answer' keys, otherwise `.get()` yields empty strings.
    task_map = {
        "MedNLI": ("premise", "hypothesis", "label"),
        "PubMedQA": ("context", "question", "final_decision"),
    }

    def split_example(example):
        """Return the (input text, answer text) pair for one example."""
        if task_name in task_map:
            first, second, target = task_map[task_name]
            input_str = f"{first.capitalize()}: {example.get(first, '')}\n"
            input_str += f"{second.capitalize()}: {example.get(second, '')}\nAnswer:"
            output_str = f" {example.get(target, '')}"
        else:
            input_str = f"Question: {example.get('question', '')}\nAnswer:"
            output_str = f" {example.get('answer', '')}"
        return input_str, output_str

    # Demonstrations are rendered with their answers; the query without.
    prompt_parts = ["".join(split_example(demo)) for demo in k_examples]
    query_input, query_output = split_example(query_example)
    prompt_parts.append(query_input)
    prompt_str = "\n".join(prompt_parts)
    full_text = prompt_str + query_output

    pad_id = tokenizer.pad_token_id
    if pad_id is None:
        raise ValueError("tokenizer.pad_token_id must be set before formatting prompts")

    # Tokenize without padding/truncation so the real lengths are known.
    # The prompt length is measured by tokenizing the prompt on its own, which
    # assumes its tokens form a prefix of the full text's tokens — TODO confirm
    # for the tokenizer in use. The clamp keeps the count within bounds.
    prompt_ids = list(tokenizer(prompt_str)["input_ids"])
    full_ids = list(tokenizer(full_text)["input_ids"])
    prompt_len = min(len(prompt_ids), len(full_ids))

    # Truncate from the LEFT: the answer sits at the end of the sequence, so
    # cutting on the right would remove the only supervised tokens.
    overflow = len(full_ids) - max_length
    if overflow > 0:
        full_ids = full_ids[overflow:]
        prompt_len = max(prompt_len - overflow, 0)

    n_real = len(full_ids)
    n_pad = max_length - n_real
    input_ids = torch.tensor(full_ids + [pad_id] * n_pad, dtype=torch.long)
    attention_mask = torch.tensor([1] * n_real + [0] * n_pad, dtype=torch.long)

    # Labels mirror the inputs; -100 marks positions excluded from the loss.
    # Padding is masked by position rather than by token id, so a genuine EOS
    # token is not hidden when pad_token == eos_token.
    labels = input_ids.clone()
    labels[:prompt_len] = -100
    labels[n_real:] = -100

    return input_ids, attention_mask, labels


# Dataset class definition
class MetaICLDataset(Dataset):
    """Map-style dataset over a list of already-formatted examples.

    Each element of ``data_list`` must be indexable by the keys
    ``input_ids``, ``attention_mask`` and ``labels``; any other keys are
    dropped when an item is fetched.
    """

    # Keys copied into every returned item, in this order.
    _FIELDS = ("input_ids", "attention_mask", "labels")

    def __init__(self, data_list):
        """Keep a reference to ``data_list`` (no copy is made)."""
        self.data = data_list

    def __len__(self):
        """Number of stored examples."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return a new dict holding only the three model fields of example ``idx``."""
        record = self.data[idx]
        return {field: record[field] for field in self._FIELDS}


# --- Training loop: one meta-training run per value of k ---
import gc  # used to release the previous run's model before loading the next one

# The tokenizer does not depend on k, so it is loaded once for all runs.
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.pad_token = tokenizer.eos_token  # pad with the EOS token

os.makedirs("./cache", exist_ok=True)

for k_shots_meta_train in k_values:
    print(f"\n=== Starting training for k = {k_shots_meta_train} ===\n")

    # Drop references to the previous run's objects so its memory can be
    # reclaimed before another copy of the base model is loaded.
    model = trainer = base_model = None
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    # Cache file path for the formatted examples of this k
    cache_path = f"./cache/formatted_data_k{k_shots_meta_train}.pt"

    if os.path.exists(cache_path):
        print(f"Loading cached data from {cache_path}")
        formatted_data = torch.load(cache_path)
    else:
        print("Generating formatted data...")
        formatted_data = []
        # Dedicated, seeded generator: sampling is repeatable per k and the
        # global `random` state is left untouched.
        rng = random.Random(42 + k_shots_meta_train)

        for task_name, dataset in meta_tasks.items():
            print(f"Formatting meta-training data for {task_name} (k={k_shots_meta_train})...")
            dataset_list = list(dataset)

            # k demonstrations plus one query are needed per example; the
            # check does not depend on the sample, so it is done once per task.
            if len(dataset_list) < k_shots_meta_train + 1:
                print(f"Skipping {task_name}: fewer than {k_shots_meta_train + 1} examples")
                continue

            # At most 1000 formatted examples per task.
            num_samples_per_task = min(len(dataset_list), 1000)

            for _ in tqdm(range(num_samples_per_task), desc=f"Processing {task_name}"):
                try:
                    sampled_examples = rng.sample(dataset_list, k_shots_meta_train + 1)
                    support_examples = sampled_examples[:k_shots_meta_train]
                    query_example = sampled_examples[k_shots_meta_train]

                    input_ids, attention_mask, labels = format_metaicl_prompt(
                        task_name, query_example, support_examples, tokenizer, max_length=768
                    )

                    formatted_data.append({
                        "input_ids": input_ids,
                        "attention_mask": attention_mask,
                        "labels": labels,
                    })
                except Exception as e:
                    # Best effort: report the failing sample and keep going.
                    print(f"Error processing sample from {task_name}: {e}")
                    continue

        torch.save(formatted_data, cache_path)
        print(f"Saved formatted data to {cache_path}")

    # Sanity check: positions labelled -100 are excluded from the loss, so data
    # in which every label is -100 cannot train anything.
    if formatted_data and all(bool((item["labels"] == -100).all()) for item in formatted_data):
        print(
            f"WARNING: every label for k={k_shots_meta_train} is -100, so no token contributes "
            f"to the loss. If {cache_path} is stale, delete it and rerun this cell."
        )

    # Load the base model
    base_model = AutoModelForCausalLM.from_pretrained(model_id).to(device)

    # PEFT configuration
    # NOTE(review): q_proj / v_proj are attention projections, yet they are also
    # listed as feedforward_modules; per the PEFT docs that makes (IA)^3 rescale
    # their inputs instead of their outputs — confirm this is intended.
    peft_config = IA3Config(
        target_modules=["q_proj", "v_proj"],
        feedforward_modules=["q_proj", "v_proj"],
        task_type="CAUSAL_LM"
    )
    model = get_peft_model(base_model, peft_config).to(device)
    model.print_trainable_parameters()

    # Wrap the formatted examples
    train_dataset = MetaICLDataset(formatted_data)

    # Training configuration
    output_dir = f"./results/metaicl_qwen_meta/k_{k_shots_meta_train}"
    training_args = TrainingArguments(
        output_dir=output_dir,
        per_device_train_batch_size=2,
        gradient_accumulation_steps=2,
        num_train_epochs=1,
        learning_rate=5e-5,
        # fp16 mixed precision needs a GPU; `device` falls back to CPU, so
        # enable it only when CUDA is actually available.
        fp16=torch.cuda.is_available(),
        save_total_limit=2,
        logging_dir=f"./logs/metaicl/k_{k_shots_meta_train}",
        logging_steps=50,
        save_strategy="epoch",
        report_to="none",
        torch_compile=True,
    )

    # Trainer builds its own shuffled DataLoader from `train_dataset` and
    # `data_collator` (get_train_dataloader); assigning `trainer.train_dataloader`
    # has no effect. Use TrainingArguments(dataloader_num_workers=...) if
    # loader worker processes are wanted.
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        data_collator=default_data_collator,
    )

    # Start training
    trainer.train()

    # Save the trained model and the tokenizer side by side
    trainer.save_model(f"{output_dir}/final")
    tokenizer.save_pretrained(f"{output_dir}/final")

    print(f"\n=== Finished training for k = {k_shots_meta_train} ===\n")

In [None]:
from transformers import AutoTokenizer
from peft import PeftModel
from huggingface_hub import HfApi, create_repo, upload_folder, login
from dotenv import load_dotenv
import os

# Load variables from a local .env file (if present) into the process environment.
load_dotenv()

# Hugging Face login.
# Fail early with a clear message when the token is missing, instead of the
# TypeError that assigning None into os.environ would raise.
hf_token = os.getenv("HUGGINGFACE_TOKEN")
if not hf_token:
    raise RuntimeError(
        "HUGGINGFACE_TOKEN is not set; define it in the environment or in a .env file."
    )
login(token=hf_token)


# User settings
k_values = [0, 1, 2, 4, 8, 16]
hf_username = "yerim00"

# Upload one repository per k.
for k in k_values:
    # Directory the training loop wrote with trainer.save_model / tokenizer.save_pretrained.
    model_path = f"./results/metaicl_qwen_meta/k_{k}/final"
    repo_name = f"metaicl-peft-k{k}"
    repo_id = f"{hf_username}/{repo_name}"

    # exist_ok=True makes re-running this cell safe when the repo already exists.
    create_repo(repo_id, exist_ok=True)

    # NOTE: AutoModelForCausalLM is not imported in this cell; it comes from the
    # notebook's earlier import cell.
    model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True)
    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

    model.push_to_hub(repo_id)
    tokenizer.push_to_hub(repo_id)

    print(f"Uploaded k={k} model to: https://huggingface.co/{repo_id}")