In [1]:
import sys
from torch.nn.utils.rnn import pad_sequence
sys.path.append("./Incontext-learning") # this part works for goole colab

In [2]:
import transformers
import torch

In [3]:
device = "cuda" if torch.cuda.is_available() else "cpu"

# check point

In [4]:
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from datasets import load_dataset

# Load SST-2 dataset
dataset = load_dataset("glue", "sst2", split="validation")

# Load Qwen3 tokenizer and model
model_name = "Qwen/Qwen3-0.6B"
model_path = "/home/students/wli/UniHeidelberg/semster2/final_projects/models/Qwen3-0.6B-Base"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(model_path).to(device)

# Make sure tokenizer has pad token
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token


In [5]:
train_dataset = dataset.select(range(12))
test_dataset = dataset.select(range(12,16))
# Format examples as causal LM inputs
def preprocess_function(examples):
    label_map = {0: "negative", 1: "positive"}
    inputs = [
        f"Sentence: {sentence} Label: {label_map[label]}"
        for sentence, label in zip(examples["sentence"], examples["label"])
    ]
    # Tokenize with padding/truncation
    tokenized = tokenizer(
        inputs,
        truncation=True,
        padding=False,
        max_length=256,
        # return_tensors="pt"
        return_tensors=None
    )
    # Set labels equal to input_ids for causal LM loss
    tokenized["labels"] = tokenized["input_ids"].copy()
    return tokenized

# Tokenize the dataset
# tokenized_datasets = dataset.map(preprocess_function, batched=True, remove_columns=dataset["train"].column_names)
train_tokenized_datasets = train_dataset.map(preprocess_function, batched = True, remove_columns = train_dataset.column_names)
test_tokenized_datasets = test_dataset.map(preprocess_function, batched=True, remove_columns=test_dataset.column_names)
# Data collator (handles padding dynamically in batch)
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)


In [6]:
def custom_collator(features):
    input_ids = [torch.tensor(f["input_ids"], dtype=torch.long) for f in features]
    attention_mask = [torch.tensor(f["attention_mask"], dtype=torch.long) for f in features]
    labels = [torch.tensor(f["labels"], dtype=torch.long) for f in features]

    input_ids = pad_sequence(input_ids, batch_first=True, padding_value=tokenizer.pad_token_id)
    attention_mask = pad_sequence(attention_mask, batch_first=True, padding_value=0)
    labels = pad_sequence(labels, batch_first=True, padding_value=-100)

    return {"input_ids": input_ids, "attention_mask": attention_mask, "labels": labels}

data_collator = custom_collator

# Training arguments
training_args = TrainingArguments(
    output_dir="./qwen3_sst2_lm",
    eval_strategy="steps",
    eval_steps=1,
    save_steps=500,
    logging_steps=1,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    # gradient_accumulation_steps=,
    num_train_epochs=2,
    learning_rate=5e-5,
    weight_decay=0.01,
    save_total_limit=1,
    fp16=True,
    push_to_hub=False,
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tokenized_datasets,
    eval_dataset=test_tokenized_datasets,
    tokenizer=tokenizer,
    data_collator=data_collator
)

# Start training
trainer.train()


  trainer = Trainer(
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': None, 'pad_token_id': 151643}.


Step,Training Loss,Validation Loss
1,4.951,5.154828
2,5.9838,5.154828
3,5.8723,5.154828
4,4.9032,5.154828
5,5.0716,5.154828
6,4.7336,5.154828
7,6.1805,5.154828


OutOfMemoryError: CUDA out of memory. Tried to allocate 12.00 MiB. GPU 0 has a total capacity of 10.90 GiB of which 13.06 MiB is free. Including non-PyTorch memory, this process has 10.89 GiB memory in use. Of the allocated memory 10.69 GiB is allocated by PyTorch, and 33.24 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
from torch.nn.utils.rnn import pad_sequence
import torch

def custom_collator(features):
    input_ids = [torch.tensor(f["input_ids"], dtype=torch.long) for f in features]
    attention_mask = [torch.tensor(f["attention_mask"], dtype=torch.long) for f in features]
    labels = [torch.tensor(f["labels"], dtype=torch.long) for f in features]

    input_ids = pad_sequence(input_ids, batch_first=True, padding_value=tokenizer.pad_token_id)
    attention_mask = pad_sequence(attention_mask, batch_first=True, padding_value=0)
    labels = pad_sequence(labels, batch_first=True, padding_value=-100)

    return {"input_ids": input_ids, "attention_mask": attention_mask, "labels": labels}

# Test again
loader = DataLoader(train_tokenized_datasets, batch_size=4, collate_fn=custom_collator)
batch = next(iter(loader))
print("✅ Custom collator worked")
print("input_ids shape:", batch["input_ids"].shape)
print("labels shape:", batch["labels"].shape)
