In [None]:
import sys
from torch.nn.utils.rnn import pad_sequence
# Add the project folder to the module search path (this setup works on Google Colab).
sys.path.append("./Incontext-learning")

In [None]:
# Project-local helper modules (presumably found via the sys.path entry added earlier -- confirm).
import utility
import metric
import importlib
# Reload both modules so edits to their source are picked up without restarting the kernel.
importlib.reload(utility)
importlib.reload(metric)
# Import the names only after reloading, so they bind to the freshly reloaded definitions.
from utility import data_selection
from metric import Rec2FTP

In [None]:
import transformers
import torch

In [None]:
# Run on the GPU when one is visible to torch, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# check point

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from datasets import load_dataset

import os

# Load SST-2 dataset
dataset = load_dataset("glue", "sst2")

# Load the Qwen3 tokenizer and model.
# By default the checkpoint is fetched by its Hugging Face Hub id. To load a
# local checkpoint directory instead, set the QWEN3_MODEL_PATH environment
# variable rather than hardcoding a machine-specific absolute path here.
model_name = "Qwen/Qwen3-0.6B"
# `or` (not a .get default) so that an empty variable also falls back to the Hub id.
model_path = os.environ.get("QWEN3_MODEL_PATH") or model_name
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(model_path).to(device)

# Make sure the tokenizer has a pad token; reuse EOS when none is defined.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token


In [None]:
# First 300 training rows (per the author's note, the original paper uses this size).
train_dataset = dataset["train"].select(range(0, 300))
# Rows 300-699 of the validation split are used as the test set.
test_dataset = dataset["validation"].select(range(300, 700))
# Format examples as causal LM inputs
def preprocess_function(examples):
    """Turn a batch of SST-2 rows into causal-LM training features.

    Each row is rendered as ``"Sentence: <sentence> Label: <negative|positive>"``
    and tokenized with the module-level ``tokenizer``, truncated to 256 tokens
    and left unpadded.

    Args:
        examples: A batch mapping with parallel ``"sentence"`` and ``"label"``
            sequences; every label must be 0 (negative) or 1 (positive).

    Returns:
        The tokenizer output with an added ``"labels"`` entry that holds an
        independent copy of every ``"input_ids"`` row.

    Raises:
        KeyError: If a label is neither 0 nor 1.
    """
    label_map = {0: "negative", 1: "positive"}
    inputs = [
        f"Sentence: {sentence} Label: {label_map[label]}"
        for sentence, label in zip(examples["sentence"], examples["label"])
    ]
    # Tokenize with truncation only; return plain Python lists (no tensors).
    tokenized = tokenizer(
        inputs,
        truncation=True,
        padding=False,
        max_length=256,
        return_tensors=None
    )
    # Labels equal the input ids for the causal LM loss. Copy each row: a
    # shallow copy of the outer list would leave every labels row aliased to
    # its input_ids row, so an in-place edit of one would corrupt the other.
    tokenized["labels"] = [list(ids) for ids in tokenized["input_ids"]]
    return tokenized

# Tokenize both splits in batches; the raw columns are removed so only the
# fields produced by preprocess_function remain.
train_tokenized_datasets = train_dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=train_dataset.column_names,
)
test_tokenized_datasets = test_dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=test_dataset.column_names,
)

# Causal-LM data collator (mlm=False); handles padding dynamically per batch.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)


In [None]:
# Check how the LLM behaves before fine-tuning.
prompt = "Sentence: The movie was good. Label:"
inputs = tokenizer(prompt, return_tensors="pt")
inputs = inputs.to(device)

# Greedy decoding of a single new token (raise max_new_tokens for longer labels).
output_ids = model.generate(**inputs, max_new_tokens=1, do_sample=False)

# Decode the full sequence (prompt plus the generated token) and show it.
generated_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
print(generated_text)

In [None]:
# data_selection comes from utility.py (not shown here); it returns three
# values, the last of which is used below as the demonstration set.
outcome, candidate, demonstrations = data_selection(
    model,
    tokenizer,
    train_dataset,
    test_dataset,
    num_data_points=2,
    seed_max=1,
)

In [None]:
# NOTE(review): the same model object is passed for both of the first two
# arguments -- presumably a pre-fine-tuning baseline; confirm against metric.py.
Rec2FTP(
    model,
    model,
    tokenizer,
    demonstrations,
    test_dataset,
)


In [None]:
# fine tune the model

def custom_collator(features):
    """Collate variable-length tokenized examples into one right-padded batch.

    Args:
        features: A sequence of mappings, each with ``"input_ids"``,
            ``"attention_mask"`` and ``"labels"`` integer sequences.

    Returns:
        A dict with the same three keys, each a ``torch.long`` tensor of shape
        (batch, longest sequence). ``input_ids`` is padded with the tokenizer's
        pad token id, ``attention_mask`` with 0 and ``labels`` with -100.
    """
    # Fill value used for each field when padding to the longest example.
    fill_values = {
        "input_ids": tokenizer.pad_token_id,
        "attention_mask": 0,
        "labels": -100,
    }

    batch = {}
    for field, fill in fill_values.items():
        rows = [torch.tensor(example[field], dtype=torch.long) for example in features]
        batch[field] = pad_sequence(rows, batch_first=True, padding_value=fill)
    return batch

# Replace the default collator with the custom padding collator.
data_collator = custom_collator

# Training arguments: a single epoch, logging and evaluating after every step,
# no checkpoints saved and no external reporting.
training_args = TrainingArguments(
    output_dir="./qwen3_sst2_lm",
    num_train_epochs=1,
    learning_rate=5e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=2,
    eval_strategy="steps",
    eval_steps=1,
    logging_steps=1,
    save_strategy="no",
    report_to="none",
)

# Tokenize the selected demonstrations with the same prompt format as the other splits.
demonstrations_tokenized = demonstrations.map(
    preprocess_function,
    batched=True,
    remove_columns=demonstrations.column_names,
)

# TODO: fine-tuning here only needs to update the Key and Value weights.
# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=demonstrations_tokenized,
    eval_dataset=test_tokenized_datasets,
    data_collator=data_collator,
)

# Start training
trainer.train()


In [None]:
# Check the fine-tuned model on one prompt in the training format.
prompt = "Sentence: macdowell , whose wifty southern charm has anchored lighter affairs ... brings an absolutely riveting conviction to her role .  Label:"
inputs = tokenizer(prompt, return_tensors="pt")
inputs = inputs.to(device)

# Greedy decoding of a single new token (raise max_new_tokens for longer labels).
output_ids = model.generate(**inputs, max_new_tokens=1, do_sample=False)

# Decode the full sequence (prompt plus the generated token) and show it.
generated_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
print(generated_text)


Choose the 32 demonstration examples that achieve the best validation performance.

To make the problem easier, I split the validation dataset into a train set and a test set, and then look for the 32 examples that achieve the best performance on the test data.