<a href="https://colab.research.google.com/github/weiguo-li/Incontext-learning/blob/main/test.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [15]:
import sys
from torch.nn.utils.rnn import pad_sequence
sys.path.append("./Incontext-learning") # make the repository checkout importable; this relative path works on Google Colab

In [2]:
import transformers
import torch

In [3]:
# Run on the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Checkpoint

In [4]:
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from datasets import load_dataset

# Load the validation split of SST-2 from the GLUE benchmark.
dataset = load_dataset("glue", "sst2", split="validation")

# Load the Qwen3 tokenizer and model from the Hugging Face Hub.
# `model_path` is kept as a separate name so it can be pointed at a local
# checkpoint directory instead of the Hub identifier when needed.
model_name = "Qwen/Qwen3-0.6B"
model_path = model_name
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(model_path).to(device)

# Make sure the tokenizer has a pad token: batches are padded with
# tokenizer.pad_token_id later on, so fall back to the EOS token if none is set.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token


In [78]:
# The first 300 validation examples form the training pool (the size used in the
# original paper); the following 40 examples are held out for testing.
train_dataset, test_dataset = (
    dataset.select(index_range) for index_range in (range(300), range(300, 340))
)
# Format examples as causal LM inputs
def preprocess_function(examples):
    """Convert a batch of SST-2 rows into tokenized causal-LM examples.

    Every row is rendered as ``"Sentence: <sentence> Label: <negative|positive>"``
    and tokenized without padding, truncated to at most 256 tokens. The token
    ids are then duplicated under ``"labels"`` so the whole text is used as the
    language-modelling target.

    Args:
        examples: Batch mapping with parallel ``"sentence"`` and ``"label"``
            sequences; a label must be 0 (negative) or 1 (positive), any other
            value raises ``KeyError``.

    Returns:
        The tokenizer output for the batch with an added ``"labels"`` entry.
    """
    id_to_word = {0: "negative", 1: "positive"}

    texts = []
    for sentence, label in zip(examples["sentence"], examples["label"]):
        texts.append(f"Sentence: {sentence} Label: {id_to_word[label]}")

    # Padding is deferred to the data collator; plain Python lists are returned.
    encoded = tokenizer(
        texts,
        max_length=256,
        truncation=True,
        padding=False,
        return_tensors=None,
    )

    # Causal LM loss: the targets are a copy of the inputs.
    encoded["labels"] = list(encoded["input_ids"])
    return encoded

# Tokenize both splits batch-wise; every original column of each split is
# dropped so that only the fields produced by preprocess_function remain.
train_tokenized_datasets = train_dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=train_dataset.column_names,
)
test_tokenized_datasets = test_dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=test_dataset.column_names,
)

# Default language-modelling collator with masked-LM disabled (pads each batch
# dynamically).
data_collator = DataCollatorForLanguageModeling(mlm=False, tokenizer=tokenizer)


Map:   0%|          | 0/300 [00:00<?, ? examples/s]

Map:   0%|          | 0/40 [00:00<?, ? examples/s]

In [11]:
# Sanity check: look at the model's prediction before any fine-tuning.
prompt = "Sentence: The movie was good. Label:"
inputs = tokenizer(prompt, return_tensors="pt").to(device)

# Greedy decoding of exactly one new token; raise max_new_tokens if the
# expected label needs more than one token.
output_ids = model.generate(**inputs, do_sample=False, max_new_tokens=1)

# Convert the prompt plus the generated token back to text.
generated_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
print(generated_text)

Sentence: The movie was good. Label: positive


In [7]:
def custom_collator(features):
    """Collate variable-length tokenized examples into one right-padded batch.

    Args:
        features: Sequence of mappings, each holding ``"input_ids"``,
            ``"attention_mask"`` and ``"labels"`` as sequences of ints.

    Returns:
        Dict with ``"input_ids"``, ``"attention_mask"`` and ``"labels"`` as
        ``torch.long`` tensors laid out batch-first. Padding values are
        ``tokenizer.pad_token_id`` for the ids, 0 for the mask and -100 for
        the labels (presumably so padded positions are excluded from the loss).
    """

    def _pad_field(key, fill_value):
        # One long tensor per example, then pad all of them to the longest one.
        rows = [torch.tensor(example[key], dtype=torch.long) for example in features]
        return pad_sequence(rows, batch_first=True, padding_value=fill_value)

    return {
        "input_ids": _pad_field("input_ids", tokenizer.pad_token_id),
        "attention_mask": _pad_field("attention_mask", 0),
        "labels": _pad_field("labels", -100),
    }

# Batch with the custom padding collator instead of the default LM collator.
data_collator = custom_collator

# Training configuration.
# Options left disabled in this setup: save_steps=500,
# gradient_accumulation_steps, save_total_limit=1, fp16=True,
# push_to_hub=False, and passing tokenizer=tokenizer to the Trainer.
training_args = TrainingArguments(
    output_dir="./qwen3_sst2_lm",
    report_to="none",
    save_strategy="no",
    # Optimisation settings.
    num_train_epochs=1,
    learning_rate=5e-5,
    weight_decay=0.01,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    # Evaluate and log after every single step.
    eval_strategy="steps",
    eval_steps=1,
    logging_steps=1,
)

# Wire model, data and configuration together.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_tokenized_datasets,
    eval_dataset=test_tokenized_datasets,
)

# Fine-tune; the returned TrainOutput is displayed as the cell result.
trainer.train()


Step,Training Loss,Validation Loss
1,5.3175,4.292704
2,4.6292,5.005636
3,4.3233,4.390951
4,3.7313,4.082888
5,2.9977,3.953163
6,3.6601,3.890997


TrainOutput(global_step=6, training_loss=4.109845280647278, metrics={'train_runtime': 3.5935, 'train_samples_per_second': 3.339, 'train_steps_per_second': 1.67, 'total_flos': 1030693847040.0, 'train_loss': 4.109845280647278, 'epoch': 1.0})

In [81]:
# Query the model again after fine-tuning, using the same
# "Sentence: ... Label:" prompt layout as the training texts.
prompt = "Sentence: macdowell , whose wifty southern charm has anchored lighter affairs ... brings an absolutely riveting conviction to her role .  Label:"
inputs = tokenizer(prompt, return_tensors="pt").to(device)

# Greedy decoding of exactly one new token; raise max_new_tokens if the
# expected label needs more than one token.
output_ids = model.generate(**inputs, do_sample=False, max_new_tokens=1)

# Convert the prompt plus the generated token back to text.
generated_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
print(generated_text)


Sentence: macdowell , whose wifty southern charm has anchored lighter affairs ... brings an absolutely riveting conviction to her role .  Label: positive


In [9]:
# Display the most recently tokenized prompt (input ids and attention mask).
inputs

{'input_ids': tensor([[84564,    25,   576,  5700,   572, 34531,    13,  9402, 22035]],
       device='cuda:0'), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1]], device='cuda:0')}

In [10]:
# Forward pass on the tokenized prompt with gradient tracking disabled.
# The model output is not stored or displayed.
with torch.no_grad():
  model(**inputs)

Choose the 32 demonstration examples that achieve the best validation performance.

To make the problem easier, I split the validation dataset into a train set and a test set. The goal is to find the 32 training examples that achieve the best performance on the test set.

In [106]:
import importlib.util
import sys
from pathlib import Path

# Locate utility.py. The absolute path is where the repository lives on Google
# Colab; the relative path is the checkout directory that the first cell adds
# to sys.path, so the notebook also works when run outside Colab.
_utility_candidates = (
    Path("/content/Incontext-learning/utility.py"),
    Path("./Incontext-learning/utility.py"),
)
utility_file_path = next(
    (str(candidate) for candidate in _utility_candidates if candidate.is_file()),
    None,
)
if utility_file_path is None:
    raise FileNotFoundError(
        "utility.py not found; looked in: "
        + ", ".join(str(candidate) for candidate in _utility_candidates)
    )

# Create a module spec from the file path.
spec = importlib.util.spec_from_file_location("utility_module", utility_file_path)
if spec is None or spec.loader is None:
    raise ImportError(f"Cannot build an import spec for {utility_file_path}")

# Create a new module from the spec and register it before executing it.
utility_module = importlib.util.module_from_spec(spec)
sys.modules["utility_module"] = utility_module

# Execute the module; do not leave a half-initialised module registered if
# its top-level code fails.
try:
    spec.loader.exec_module(utility_module)
except BaseException:
    sys.modules.pop("utility_module", None)
    raise

# Expose the data_selection function under a short name.
data_selection = utility_module.data_selection

In [107]:
# Call data_selection (loaded from utility.py) on the train/test splits; it
# prints an accuracy for each evaluation it performs.
# NOTE(review): the notebook text mentions 32 demonstration examples, but
# num_data_points=3 is passed here — confirm which value is intended.
outcome = data_selection(model, tokenizer, train_dataset,test_dataset, num_data_points=3)

Accuracy is 0.65
Accuracy is 0.625
Accuracy is 0.775
Accuracy is 0.7
Accuracy is 0.4
Accuracy is 0.875
Accuracy is 0.8
Accuracy is 0.675
Accuracy is 0.375
Accuracy is 0.675
Accuracy is 0.8
Accuracy is 0.425
Accuracy is 0.675
Accuracy is 0.775
Accuracy is 0.6
Accuracy is 0.725
Accuracy is 0.85
Accuracy is 0.675
Accuracy is 0.65
Accuracy is 0.65
Accuracy is 0.6
Accuracy is 0.85
Accuracy is 0.475
Accuracy is 0.875
Accuracy is 0.375
Accuracy is 0.875
Accuracy is 0.575
Accuracy is 0.55
Accuracy is 0.9
Accuracy is 0.45
Accuracy is 0.85
Accuracy is 0.65
Accuracy is 0.45
Accuracy is 0.65
Accuracy is 0.875
Accuracy is 0.45
Accuracy is 0.75
Accuracy is 0.7
Accuracy is 0.775
Accuracy is 0.75
Accuracy is 0.625
Accuracy is 0.525
Accuracy is 0.6
Accuracy is 0.9
Accuracy is 0.825
Accuracy is 0.725
Accuracy is 0.35
Accuracy is 0.775
Accuracy is 0.7
Accuracy is 0.55
Accuracy is 0.85
Accuracy is 0.35
Accuracy is 0.65
Accuracy is 0.7
Accuracy is 0.725
Accuracy is 0.7
Accuracy is 0.45
Accuracy is 0.4
Accu

In [77]:
# Inspect the held-out split.
# NOTE(review): the saved output reports 4 rows, while the split cell selects
# range(300, 340) (40 rows) — re-run this cell to refresh the output.
test_dataset

Dataset({
    features: ['sentence', 'label', 'idx'],
    num_rows: 4
})