# Tiny A11y Model Fine-tune Notebook
This notebook fine-tunes DeepSeek-Coder on WCAG + MDN accessibility datasets.

Requirements: Free GPU runtime in Colab (T4 recommended).

In [5]:
# Install required packages
!pip install --upgrade pip
!pip install datasets transformers peft huggingface_hub accelerate

Collecting pip
  Downloading pip-25.2-py3-none-any.whl.metadata (4.7 kB)
Downloading pip-25.2-py3-none-any.whl (1.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m79.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 24.1.2
    Uninstalling pip-24.1.2:
      Successfully uninstalled pip-24.1.2
Successfully installed pip-25.2


In [6]:
# Login to Hugging Face Hub
from huggingface_hub import notebook_login
notebook_login()  # Paste your token when prompted

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [7]:
# Fine-tune DeepSeek-Coder with LoRA for accessibility prompts
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, DataCollatorForSeq2Seq
from peft import LoraConfig, get_peft_model, PeftModelForCausalLM
from datasets import load_dataset
import torch


In [8]:
# -----------------------------
# 1️⃣ Load dataset
# -----------------------------
dataset = load_dataset('younglim/a11y-dataset', split='train')


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/300 [00:00<?, ?B/s]

data/train-00000-of-00001.parquet:   0%|          | 0.00/154k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/26 [00:00<?, ? examples/s]

In [9]:
# -----------------------------
# 2️⃣ Load base model and tokenizer
# -----------------------------
base_model_name = 'deepseek-ai/deepseek-coder-1.3b-instruct'
tokenizer = AutoTokenizer.from_pretrained(base_model_name)
tokenizer.pad_token = tokenizer.eos_token  # Ensure padding token is set

base_model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    device_map="auto",
    torch_dtype=torch.float16
)


tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/631 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors:   0%|          | 0.00/2.69G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/119 [00:00<?, ?B/s]

In [10]:
# -----------------------------
# 3️⃣ Add LoRA adapters
# -----------------------------
peft_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=['q_proj','v_proj'],
    bias="none",
    task_type="CAUSAL_LM"
)

model = get_peft_model(base_model, peft_config)
model.train()  # Ensure in train mode


PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(32256, 2048)
        (layers): ModuleList(
          (0-23): 24 x LlamaDecoderLayer(
            (self_attn): LlamaAttention(
              (q_proj): lora.Linear(
                (base_layer): Linear(in_features=2048, out_features=2048, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Identity()
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=2048, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=2048, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_proj): Linear(in_features=2048, out_featu

In [19]:
# -----------------------------
# 4️⃣ Tokenize dataset
# -----------------------------
def tokenize_fn(example):
    # Tokenize text
    tokenized = tokenizer(
        example['text'],
        truncation=True,
        padding='max_length',
        max_length=512,
    )
    # Set labels for causal LM
    tokenized['labels'] = tokenized['input_ids'].copy()
    return tokenized

# Apply tokenization
tokenized_dataset = dataset.map(tokenize_fn, batched=True)

# Disable shuffling for very small dataset to get consistent gradients
tokenized_dataset.set_format(type='torch')


Map:   0%|          | 0/26 [00:00<?, ? examples/s]

In [22]:
# -----------------------------
# 5️⃣ Training arguments
# -----------------------------
training_args = TrainingArguments(
    output_dir='./results',
    per_device_train_batch_size=2,       # keep small if GPU memory limited
    gradient_accumulation_steps=4,       # simulate batch size of 8
    num_train_epochs=5,                  # more epochs for tiny dataset
    learning_rate=5e-5,                  # lower LR for stability
    fp16=True,                            # keep FP16 if GPU
    save_strategy="epoch",               # save after each epoch
    logging_strategy="steps",
    logging_steps=5,
    push_to_hub=True,
    hub_model_id='younglim/tiny-a11y-model'
)

In [23]:
# -----------------------------
# 6 Trainer setup
# -----------------------------
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset ,
    # eval_dataset=eval_dataset,   # optional if you have validation data
)

The model is already on multiple devices. Skipping the move to device specified in `args`.


In [24]:
# -----------------------------
# 7 Train
# -----------------------------
trainer.train()


Step,Training Loss
5,3.4876
10,3.4627
15,4.3022
20,4.5241


TrainOutput(global_step=20, training_loss=3.9441463470458986, metrics={'train_runtime': 28.237, 'train_samples_per_second': 4.604, 'train_steps_per_second': 0.708, 'total_flos': 511973334712320.0, 'train_loss': 3.9441463470458986, 'epoch': 5.0})

In [25]:
# -----------------------------
# 9️⃣ Save LoRA adapter only
# -----------------------------
model.save_pretrained("tiny-a11y-model")  # LoRA adapter weights

In [26]:
# -----------------------------
# 🔟 Push the fine-tuned LoRA adapter to Hugging Face Hub
# -----------------------------
trainer.push_to_hub()

Processing Files (0 / 0)      : |          |  0.00B /  0.00B            

New Data Upload               : |          |  0.00B /  0.00B            

  ...702434.7b3e51fab01b.298.0: 100%|##########| 5.15kB / 5.15kB            

  ...results/training_args.bin: 100%|##########| 5.84kB / 5.84kB            

  ...702623.7b3e51fab01b.298.1: 100%|##########| 6.32kB / 6.32kB            

  ...adapter_model.safetensors: 100%|##########| 6.30MB / 6.30MB            

CommitInfo(commit_url='https://huggingface.co/younglim/tiny-a11y-model/commit/1baf7bf4c114e91563c9dcb6d70926af666d90e7', commit_message='End of training', commit_description='', oid='1baf7bf4c114e91563c9dcb6d70926af666d90e7', pr_url=None, repo_url=RepoUrl('https://huggingface.co/younglim/tiny-a11y-model', endpoint='https://huggingface.co', repo_type='model', repo_id='younglim/tiny-a11y-model'), pr_revision=None, pr_num=None)

🎉 After running all cells, your fine-tuned Tiny A11y Model will be available at:
[https://huggingface.co/younglim/tiny-a11y-model](https://huggingface.co/younglim/tiny-a11y-model)