In [1]:
# Quietly (-q) install or upgrade (-U) the training stack for this runtime.
# NOTE(review): no versions are pinned, so each fresh run installs whatever is
# latest at that time; pin versions if the results must be reproducible.
!pip install -q -U torch datasets transformers peft accelerate

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m899.7/899.7 MB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m594.3/594.3 MB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.2/10.2 MB[0m [31m139.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m88.0/88.0 MB[0m [31m28.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m954.8/954.8 kB[0m [31m63.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.1/193.1 MB[0m [31m13.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m80.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m63.6/63.6 MB[0m [31m39.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
import torch
import os
import shutil
import gc
from google.colab import drive
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer, EarlyStoppingCallback, DataCollatorForSeq2Seq
from peft import LoraConfig, get_peft_model, TaskType
from datasets import load_dataset

# Free Python garbage and cached CUDA blocks left over from any earlier run in
# this kernel before the model is loaded.
gc.collect()
torch.cuda.empty_cache()

# Seed torch up front so any random initialisation performed while building the
# model and its adapters is repeatable from run to run.
torch.manual_seed(42)

# Google Drive is mounted so the trained adapter archive can be written to it.
drive.mount('/content/drive')

# --- Configuration -----------------------------------------------------------
MODEL_NAME = "Qwen/Qwen2.5-Coder-1.5B-Instruct"
DATASET_ID = "Naholav/CodeGen-Deep-5K"
OUTPUT_DIR = "./qwen-lora-deep"
DRIVE_SAVE_PATH = "/content/drive/MyDrive/qwen-lora-deep"
SYSTEM_PROMPT = "You are an expert Python programmer. Please read the problem carefully before writing any Python code."

# --- Base model --------------------------------------------------------------
# `dtype` replaces the deprecated `torch_dtype` keyword (the run log prints
# "`torch_dtype` is deprecated! Use `dtype` instead!"). The install cell upgrades
# transformers to the latest release, which accepts `dtype`.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    dtype=torch.bfloat16,
    device_map="auto",
    use_cache=False
)

# Gradient checkpointing is switched on (non-reentrant variant), and input
# embeddings are made to require grad before the model is wrapped with adapters.
model.gradient_checkpointing_enable(gradient_checkpointing_kwargs={"use_reentrant": False})
model.enable_input_require_grads()

# --- Tokenizer ---------------------------------------------------------------
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# Fall back to EOS as the padding token only when the tokenizer does not define
# one; overwriting an existing pad token would make padding indistinguishable
# from end-of-sequence in the tokenizer that is saved next to the adapter.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

# --- LoRA adapters -----------------------------------------------------------
# Rank-16 adapters (alpha 32, dropout 0.05) on the listed projection modules;
# bias terms are left untouched.
peft_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.CAUSAL_LM
)

model = get_peft_model(model, peft_config)

# --- Dataset -----------------------------------------------------------------
dataset = load_dataset(DATASET_ID)

def process_data_with_masking(example, max_length=2048):
    """Tokenise one example as a chat and mask the prompt out of the loss.

    The example is rendered twice with the tokenizer's chat template: once as
    the full system/user/assistant conversation, and once as the system/user
    prompt followed by the template's generation prompt. The token count of the
    second rendering decides how many leading label positions are set to -100,
    so only the assistant's answer contributes to the loss.

    Relies on the module-level ``tokenizer`` and ``SYSTEM_PROMPT``.

    Args:
        example: Mapping with an ``'input'`` (problem statement) and a
            ``'solution'`` (reference answer) entry.
        max_length: Maximum number of tokens kept; longer sequences are
            truncated. Defaults to 2048.

    Returns:
        A dict with ``input_ids``, ``attention_mask`` and ``labels``, where
        ``labels`` equals ``input_ids`` except that the prompt positions are
        replaced by -100. If the prompt alone fills the truncated sequence,
        every label is -100.
    """
    prompt_messages = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": example['input']},
    ]
    conversation = prompt_messages + [
        {"role": "assistant", "content": example['solution']},
    ]

    full_text = tokenizer.apply_chat_template(conversation, tokenize=False)
    # Let the template append its own assistant header instead of hard-coding
    # a model-specific marker string.
    prompt_text = tokenizer.apply_chat_template(
        prompt_messages, tokenize=False, add_generation_prompt=True
    )

    # The chat template already inserted the special tokens, so none are added here.
    tokenized_full = tokenizer(full_text, truncation=True, max_length=max_length, add_special_tokens=False)
    tokenized_prompt = tokenizer(prompt_text, truncation=True, max_length=max_length, add_special_tokens=False)

    input_ids = tokenized_full["input_ids"]

    # NOTE(review): this assumes the prompt tokenised on its own matches the
    # leading tokens of the full sequence - confirm for the tokenizer in use.
    prompt_len = min(len(tokenized_prompt["input_ids"]), len(input_ids))
    labels = [-100] * prompt_len + list(input_ids[prompt_len:])

    return {
        "input_ids": input_ids,
        "attention_mask": tokenized_full["attention_mask"],
        "labels": labels
    }

# Tokenise every example and drop the original columns so that only
# input_ids / attention_mask / labels remain.
tokenized_dataset = dataset.map(process_data_with_masking, remove_columns=dataset['train'].column_names)

# An example whose labels are all -100 (its prompt filled the whole truncated
# window) has no supervised tokens, so it is dropped before splitting.
supervised_train = tokenized_dataset["train"].filter(
    lambda ex: any(label != -100 for label in ex["labels"])
)

# A fixed seed keeps the 95/5 train/validation split, and therefore the reported
# validation losses, comparable between runs.
SPLIT_SEED = 42
split_dataset = supervised_train.train_test_split(test_size=0.05, seed=SPLIT_SEED)

# Pads each batch to the length of its longest sequence.
collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, padding=True)

# 4 sequences per device x 8 accumulation steps = 32 sequences per optimizer step.
# Evaluation and checkpointing share the same 50-step cadence, which is required
# for load_best_model_at_end to pick the checkpoint with the lowest eval_loss.
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=8,
    num_train_epochs=3,
    learning_rate=2e-4,
    logging_steps=10,
    bf16=True,
    optim="adamw_torch",
    save_strategy="steps",
    save_steps=50,
    eval_strategy="steps",
    eval_steps=50,
    gradient_checkpointing=True,
    gradient_checkpointing_kwargs={'use_reentrant': False},
    report_to="none",
    ddp_find_unused_parameters=False,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    save_total_limit=2
)

# Training stops early once eval_loss has failed to improve for 3 consecutive
# evaluations.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=split_dataset["train"],
    eval_dataset=split_dataset["test"],
    data_collator=collator,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)

trainer.train()

# Write the adapter and tokenizer into OUTPUT_DIR next to the checkpoints.
model.save_pretrained(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)

# Zip the whole OUTPUT_DIR to Drive; the returned archive path is shown as the
# cell's output.
shutil.make_archive(DRIVE_SAVE_PATH, 'zip', OUTPUT_DIR)

Mounted at /content/drive


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/660 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors:   0%|          | 0.00/3.09G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/242 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

README.md: 0.00B [00:00, ?B/s]

CodeGen-Deep-5K.jsonl:   0%|          | 0.00/55.2M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/5000 [00:00<?, ? examples/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

The model is already on multiple devices. Skipping the move to device specified in `args`.


Step,Training Loss,Validation Loss
50,0.4148,0.365321
100,0.3666,0.339809
150,0.3457,0.320655
200,0.276,0.313036
250,0.2556,0.303731
300,0.2617,0.294177
350,0.2148,0.300902
400,0.2068,0.29475


'/content/drive/MyDrive/qwen-lora-deep.zip'