# Tiny A11y LoRA Chatbot
This notebook loads your LoRA fine-tuned `younglim/tiny-a11y-model` on DeepSeek-Coder and lets you chat interactively.

In [1]:
# Install required packages
!pip install --upgrade pip
!pip install transformers accelerate peft gradio

Collecting pip
  Downloading pip-25.2-py3-none-any.whl.metadata (4.7 kB)
Downloading pip-25.2-py3-none-any.whl (1.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m63.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 24.1.2
    Uninstalling pip-24.1.2:
      Successfully uninstalled pip-24.1.2
Successfully installed pip-25.2


In [27]:
# Load base model and tokenizer (FP16 + device_map for GPU)
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModelForCausalLM
import torch

base_model_name = 'deepseek-ai/deepseek-coder-1.3b-instruct'
lora_model_name = 'younglim/tiny-a11y-model'

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(base_model_name)

# Load base model in FP16 on GPU
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    device_map="auto",        # automatically place layers on GPU
    torch_dtype=torch.float16 # use half-precision for speed
)

# Freeze base model parameters to maximize LoRA adapter influence
for param in base_model.parameters():
    param.requires_grad = False

# Load LoRA adapter
model = PeftModelForCausalLM.from_pretrained(base_model, lora_model_name)

# Ensure model is in eval mode
model.eval()

# Optional: move model to GPU if not already via device_map
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)


PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(32256, 2048)
        (layers): ModuleList(
          (0-23): 24 x LlamaDecoderLayer(
            (self_attn): LlamaAttention(
              (q_proj): lora.Linear(
                (base_layer): Linear(in_features=2048, out_features=2048, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Identity()
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=2048, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=2048, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_proj): Linear(in_features=2048, out_featu

In [30]:
# Chat function using PyTorch generate with LoRA maximized
def chat(prompt, max_new_tokens=300):
    """
    Generates text using LoRA-adapted model with greedy search (no sampling)
    to reduce hallucinations.
    """
    # Tokenize input and move to same device as model
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    # Generate output
    with torch.no_grad():  # ensure no gradients for faster inference
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=False,       # deterministic generation
            num_beams=3,           # small beam search for better accuracy
            early_stopping=True,   # stop when all beams finish
            use_cache=True
        )

    # Decode output
    return tokenizer.decode(outputs[0], skip_special_tokens=True)


In [31]:
# Launch Gradio interface
import gradio as gr

iface = gr.Interface(
    fn=chat,
    inputs=gr.Textbox(lines=5, label='Enter your prompt'),
    outputs=gr.Textbox(lines=15, label='Model response'),
    title='Tiny A11y Chatbot by younglim',
    description='Ask the Tiny A11y Model anything about accessibility.'
)

# Set share=True to get a public link
iface.launch(share=True)

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://9c6c3fe7931c3e0b9a.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




In [5]:
prompt = "Explain why alt text is important for images."
response = chat(prompt)
print(response)


Setting `pad_token_id` to `eos_token_id`:32021 for open-end generation.


KeyboardInterrupt: 