# Install dependencies

In [1]:
!pip install -qqq bitsandbytes
!pip install -qqq torch
!pip install -qqq transformers
!pip install -qqq peft
!pip install -qqq accelerate
!pip install -qqq datasets
!pip install -qqq einops

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.1/76.1 MB[0m [31m21.1 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25h

In [2]:
import os
import random
import numpy as np
import pandas as pd
import torch
import bitsandbytes as bnb

from sklearn.model_selection import train_test_split
from datasets import Dataset
from peft import (
    LoraConfig,
    get_peft_model,
    prepare_model_for_kbit_training
)
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    Trainer,
    TrainingArguments,
    DataCollatorForLanguageModeling,
    EarlyStoppingCallback
)

In [3]:
import warnings
warnings.filterwarnings(
    "ignore",
    message=".*torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly.*"
)

In [4]:
SEED = 42

# Feed the same seed to every random number generator this notebook touches:
# Python's `random`, NumPy, PyTorch (CPU) and PyTorch on all CUDA devices.
for _seed_rng in (
    random.seed,
    np.random.seed,
    torch.manual_seed,
    torch.cuda.manual_seed_all,
):
    _seed_rng(SEED)

## Prepare data and training scripts

In [5]:
# Base checkpoint to load and location of the CSV with the training data.
MODEL_NAME = "Qwen/Qwen2.5-0.5B-Instruct"
DATA_PATH = "/kaggle/input/vim-data/final.csv"

# 4-bit quantization config: NF4 quantization with double quantization
# enabled and bfloat16 as the compute dtype.
# NOTE(review): the Trainer in this notebook is configured with fp16=True
# while the compute dtype here is bfloat16 — confirm the mix is intended.
bnb_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    load_in_4bit=True,
)

In [6]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)

# Fall back to EOS for padding only when the tokenizer ships without a pad
# token. Aliasing pad to EOS unconditionally makes
# DataCollatorForLanguageModeling treat every EOS label as padding and mask
# it out of the loss.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

tokenizer_config.json:   0%|          | 0.00/7.30k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.03M [00:00<?, ?B/s]

In [7]:
# Read the CSV and keep only rows that have both a description and a key.
df = pd.read_csv(DATA_PATH)
df = df.dropna(subset=["description", "key"])

# Hold out 10% of the rows for validation; the split is seeded with SEED.
train_df, val_df = train_test_split(df, random_state=SEED, test_size=0.1)

for _split_name, _split_df in (("Train", train_df), ("Validation", val_df)):
    print(f"{_split_name} size:", len(_split_df))

Train size: 2368
Validation size: 264


In [8]:
def generate_prompt(example):
    """Build the (prompt, answer) pair for one example.

    Args:
        example: Mapping with a "description" entry (interpolated into the
            question) and a "key" entry (returned unchanged as the answer).

    Returns:
        A tuple ``(prompt, answer)``.
    """
    question = (
        f"How to {example['description']} using vim motions? "
        "Write only the symbol sequence representing the vim motion."
    )
    answer = example["key"]
    return question, answer

def tokenize_example(example):
    """Tokenize one example as "<prompt>\\n<answer>".

    The text is truncated or padded to exactly 64 tokens using the
    module-level ``tokenizer``.

    Args:
        example: Mapping accepted by ``generate_prompt``.

    Returns:
        The tokenizer's output for the combined text.
    """
    question, answer = generate_prompt(example)
    # Prompt and answer are separated by a single newline.
    combined = "\n".join((question, answer))
    return tokenizer(
        combined,
        padding="max_length",
        truncation=True,
        max_length=64,
    )

# Convert each split to a `datasets.Dataset` and tokenize every row
# (training split first, then validation).
train_dataset, val_dataset = (
    Dataset.from_pandas(split_frame).map(tokenize_example)
    for split_frame in (train_df, val_df)
)

Map:   0%|          | 0/2368 [00:00<?, ? examples/s]

Map:   0%|          | 0/264 [00:00<?, ? examples/s]

In [9]:
def train_with_lora(
    rank_value: int = 8,
    lora_dropout: float = 0.05,
    learning_rate: float = 1e-4,
    weight_decay: float = 0,
    epochs: int = 10,
    output_dir_prefix: str = "finetune_qwen_vim",
    verbose=True,
    save_model=True,
    scheduler='cosine_with_restarts',
    scheduler_kwargs=None,
    early_stopping_patience=None,
):
    """
    Fine-tune the 4-bit quantized model named by ``MODEL_NAME`` with LoRA.

    The module-level ``MODEL_NAME``, ``bnb_config``, ``tokenizer``,
    ``train_dataset``, ``val_dataset`` and ``SEED`` are read at call time, so
    reassigning ``MODEL_NAME`` before a call selects a different base model.

    Args:
        rank_value: LoRA rank ``r``.
        lora_dropout: Dropout value passed to ``LoraConfig``.
        learning_rate: Learning rate forwarded to ``TrainingArguments``.
        weight_decay: Weight decay forwarded to ``TrainingArguments``.
        epochs: Maximum number of training epochs.
        output_dir_prefix: Prefix of the checkpoint directory; the
            hyperparameter values are appended to it.
        verbose: When true, print a banner, show progress bars and log once
            per epoch; otherwise progress bars and logging are disabled.
        save_model: When true, save the trained model to
            ``/kaggle/working/trained_{MODEL_NAME}`` after training.
        scheduler: Value for ``TrainingArguments.lr_scheduler_type``.
        scheduler_kwargs: Optional dict of extra scheduler arguments. It is
            copied, never mutated. For ``'cosine_with_restarts'`` a default
            of ``num_cycles=10`` is added unless the caller supplies one.
        early_stopping_patience: Patience for ``EarlyStoppingCallback``.
            ``None`` (or 0) disables early stopping.

    Returns:
        The ``eval_loss`` from a final evaluation on the validation dataset.
    """
    if verbose:
        print(f"\n=== Training with LoRA rank={rank_value}, dropout={lora_dropout}, lr={learning_rate}, wd={weight_decay} ===")

    # Work on a copy so neither the caller's dict nor a shared default is
    # ever mutated, and so caller-supplied values are not silently dropped.
    lr_scheduler_kwargs = dict(scheduler_kwargs) if scheduler_kwargs else {}
    if scheduler == 'cosine_with_restarts':
        # Keep the historical default of 10 restart cycles unless overridden.
        lr_scheduler_kwargs.setdefault('num_cycles', 10)

    # Load base model fresh
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME,
        device_map="auto",
        trust_remote_code=True,
        quantization_config=bnb_config
    )
    model = prepare_model_for_kbit_training(model)

    # Create LoRA config
    lora_config = LoraConfig(
        r=rank_value,
        lora_alpha=32,
        # target_modules=["q_proj", "v_proj"],
        # target_modules is left unset, so PEFT picks the modules to adapt
        # (presumably its per-architecture defaults — TODO confirm). This
        # does not make every model parameter trainable.
        lora_dropout=lora_dropout,
        bias="none",
        task_type="CAUSAL_LM"
    )

    # Wrap model with PEFT
    model = get_peft_model(model, lora_config)
    model.config.use_cache = False

    # Define training arguments
    training_args = TrainingArguments(
        eval_strategy='epoch',
        save_strategy='epoch',
        save_total_limit=1,
        output_dir=f"{output_dir_prefix}_rank_{rank_value}_dropout_{lora_dropout}_lr_{learning_rate}_wd_{weight_decay}",
        load_best_model_at_end=True,
        metric_for_best_model='eval_loss',
        greater_is_better=False,
        overwrite_output_dir=True,
        num_train_epochs=epochs,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        fp16=True,
        optim="paged_adamw_8bit",
        lr_scheduler_type=scheduler,
        lr_scheduler_kwargs=lr_scheduler_kwargs,
        report_to="none",
        seed=SEED,
        learning_rate=learning_rate,
        weight_decay=weight_decay,
        disable_tqdm=not verbose,
        logging_strategy="epoch" if verbose else "no"
    )

    # Build the callback list up front so the Trainer call below is valid
    # whether or not early stopping was requested.
    callbacks = []
    if early_stopping_patience:
        callbacks.append(
            EarlyStoppingCallback(
                early_stopping_patience=early_stopping_patience,
                early_stopping_threshold=0.0
            )
        )

    # Create Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False),
        callbacks=callbacks
    )

    # Training
    trainer.train()

    # Save LoRA adapter
    if save_model:
        # MODEL_NAME contains a "/", so this resolves to a nested directory
        # such as /kaggle/working/trained_Qwen/<model>.
        lora_dir = f"/kaggle/working/trained_{MODEL_NAME}"
        trainer.save_model(lora_dir)
        if verbose:
            print(f"Best model saved to '{lora_dir}'.")

    # Evaluate on validation dataset to get final loss
    eval_metrics = trainer.evaluate()
    final_val_loss = eval_metrics["eval_loss"]

    return final_val_loss

## Train 0.5B model

In [10]:
# Select the 0.5B checkpoint; train_with_lora reads MODEL_NAME when called.
MODEL_NAME = "Qwen/Qwen2.5-0.5B-Instruct"

# Up to 30 epochs on a cosine schedule with early stopping (patience 5).
# The returned validation loss is displayed as the cell output.
train_with_lora(
    epochs=30,
    scheduler='cosine',
    early_stopping_patience=5,
    rank_value=32,
    lora_dropout=0.05,
    learning_rate=1e-4,
    weight_decay=0.001,
)


=== Training with LoRA rank=32, dropout=0.05, lr=0.0001, wd=0.001 ===


config.json:   0%|          | 0.00/659 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/988M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/242 [00:00<?, ?B/s]

Epoch,Training Loss,Validation Loss
1,1.8569,1.302172
2,1.1456,1.126271
3,1.001,1.045073
4,0.9145,0.980919
5,0.8503,0.943711
6,0.7978,0.919739
7,0.7552,0.896619
8,0.7163,0.881569
9,0.6846,0.872577
10,0.655,0.867802


Best model saved to '/kaggle/working/trained_Qwen/Qwen2.5-0.5B-Instruct'.


0.8562170267105103

## Train 7B model

In [11]:
# Select the 7B checkpoint; train_with_lora reads MODEL_NAME when called.
MODEL_NAME = "Qwen/Qwen2.5-7B-Instruct"

# Same hyperparameters as the 0.5B run, but with a shorter early-stopping
# patience of 3. The returned validation loss is displayed as the cell output.
train_with_lora(
    epochs=30,
    scheduler='cosine',
    early_stopping_patience=3,
    rank_value=32,
    lora_dropout=0.05,
    learning_rate=1e-4,
    weight_decay=0.001,
)


=== Training with LoRA rank=32, dropout=0.05, lr=0.0001, wd=0.001 ===


config.json:   0%|          | 0.00/663 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/27.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/3.95G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/3.86G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/3.86G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/3.56G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/243 [00:00<?, ?B/s]

Epoch,Training Loss,Validation Loss
1,1.3179,0.942748
2,0.8189,0.839399
3,0.7228,0.787783
4,0.6621,0.768428
5,0.6153,0.766744
6,0.5763,0.76901
7,0.5416,0.782582
8,0.5111,0.789177


Best model saved to '/kaggle/working/trained_Qwen/Qwen2.5-7B-Instruct'.


0.7667436003684998

In [12]:
# Recursively archive everything under /kaggle/working/ into folder.zip.
# Note that the archive is written inside the directory being archived.
!zip -r /kaggle/working/folder.zip /kaggle/working/


  adding: kaggle/working/ (stored 0%)
  adding: kaggle/working/trained_Qwen/ (stored 0%)
  adding: kaggle/working/trained_Qwen/Qwen2.5-0.5B-Instruct/ (stored 0%)
  adding: kaggle/working/trained_Qwen/Qwen2.5-0.5B-Instruct/README.md (deflated 66%)
  adding: kaggle/working/trained_Qwen/Qwen2.5-0.5B-Instruct/adapter_model.safetensors (deflated 7%)
  adding: kaggle/working/trained_Qwen/Qwen2.5-0.5B-Instruct/adapter_config.json (deflated 53%)
  adding: kaggle/working/trained_Qwen/Qwen2.5-0.5B-Instruct/training_args.bin (deflated 51%)
  adding: kaggle/working/trained_Qwen/Qwen2.5-7B-Instruct/ (stored 0%)
  adding: kaggle/working/trained_Qwen/Qwen2.5-7B-Instruct/README.md (deflated 66%)
  adding: kaggle/working/trained_Qwen/Qwen2.5-7B-Instruct/adapter_model.safetensors (deflated 8%)
  adding: kaggle/working/trained_Qwen/Qwen2.5-7B-Instruct/adapter_config.json (deflated 53%)
  adding: kaggle/working/trained_Qwen/Qwen2.5-7B-Instruct/training_args.bin (deflated 51%)
  adding: kaggle/working/.vir