In [1]:
import torch
from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model
from transformers import (
    AutoProcessor,
    BitsAndBytesConfig,
    Idefics3ForConditionalGeneration,
)

# Run-mode switches: which adapter strategy to use and which base checkpoint to load.
USE_LORA = True
USE_QLORA = False
SMOL = True

# SMOL selects the SmolVLM base checkpoint; otherwise the Idefics3 8B checkpoint is used.
if SMOL:
    model_id = "HuggingFaceTB/SmolVLM-Base"
else:
    model_id = "HuggingFaceM4/Idefics3-8B-Llama3"

# Processor loaded from the same checkpoint id as the model.
processor = AutoProcessor.from_pretrained(model_id)

# Device for the full fine-tuning branch below. It was previously referenced without
# being defined, which raised NameError whenever both USE_LORA and USE_QLORA were False.
# The adapter branch does not use it; it relies on device_map="auto".
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

if USE_QLORA or USE_LORA:
    # Adapter configuration shared by the LoRA and QLoRA paths.
    lora_config = LoraConfig(
        r=8,
        lora_alpha=8,
        lora_dropout=0.1,
        target_modules=[
            "down_proj",
            "o_proj",
            "k_proj",
            "q_proj",
            "gate_proj",
            "up_proj",
            "v_proj",
        ],
        # DoRA is enabled only for plain LoRA runs, not together with 4-bit quantization.
        use_dora=not USE_QLORA,
        init_lora_weights="gaussian",
    )
    lora_config.inference_mode = False

    # 4-bit NF4 quantization settings are only built for QLoRA; otherwise no
    # quantization config is passed to from_pretrained.
    bnb_config = None
    if USE_QLORA:
        bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.bfloat16,
        )

    print("Loading model...")
    model = Idefics3ForConditionalGeneration.from_pretrained(
        model_id,
        quantization_config=bnb_config,
        # _attn_implementation="flash_attention_2",
        device_map="auto",
    )
    # NOTE(review): lora_config is applied both through add_adapter() and through
    # get_peft_model(). This looks redundant, but adapter checkpoints loaded later in
    # the notebook were presumably saved with this exact module layout, so the sequence
    # is left as-is -- confirm before simplifying.
    model.add_adapter(lora_config)
    model.enable_adapters()
    model = prepare_model_for_kbit_training(model)
    model = get_peft_model(model, lora_config)
    # Prints a (trainable parameters, total parameters) tuple.
    print(model.get_nb_trainable_parameters())
else:
    # Full fine-tuning: load in bfloat16 and move the whole model to DEVICE.
    model = Idefics3ForConditionalGeneration.from_pretrained(
        model_id,
        torch_dtype=torch.bfloat16,
        # _attn_implementation="flash_attention_2",
    ).to(DEVICE)

    # if you'd like to only fine-tune LLM
    for param in model.model.vision_model.parameters():
        param.requires_grad = False


Loading model...




(11269248, 2257542128)


In [2]:
# Optionally attach previously saved adapter weights from a local checkpoint directory,
# registering them under the adapter name "checkpoint-1500".
adapter_checkpoint_dir = "./SmolVLM-Base-ocr-isl/checkpoint-1500"
model.load_adapter(adapter_checkpoint_dir, adapter_name="checkpoint-1500")

<All keys matched successfully>

In [None]:
# Merge the "checkpoint-1500" adapter weights into the model and switch to eval mode.
model.merge_adapter(["checkpoint-1500"])
model.eval()

# NOTE(review): the recorded upload log for this cell lists adapter_model.safetensors
# files, which suggests the push uploads adapter weights rather than a standalone merged
# model -- confirm, and consider merge_and_unload() if a merged checkpoint is intended.
#
# Push to the Hub. `token=True` uses the locally stored Hugging Face login token and
# replaces the deprecated `use_auth_token=True` keyword.
model.push_to_hub("SmolVLM-Base-ocr-isl-checkpoint-1500", token=True)
# processor.push_to_hub("SmolVLM-Base-ocr-isl-checkpoint-1500", token=True)



Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  ...oint-1500/adapter_model.safetensors:   1%|1         |  558kB / 45.2MB            

  ...phnv42jdb/adapter_model.safetensors:   3%|2         | 1.15MB / 45.2MB            

CommitInfo(commit_url='https://huggingface.co/Sigurdur/SmolVLM-Base-ocr-isl-checkpoint-1500/commit/0f07d0561750399354e4a54ac317f1c0c05b29c5', commit_message='Upload model', commit_description='', oid='0f07d0561750399354e4a54ac317f1c0c05b29c5', pr_url=None, repo_url=RepoUrl('https://huggingface.co/Sigurdur/SmolVLM-Base-ocr-isl-checkpoint-1500', endpoint='https://huggingface.co', repo_type='model', repo_id='Sigurdur/SmolVLM-Base-ocr-isl-checkpoint-1500'), pr_revision=None, pr_num=None)

: 

In [None]:
# inference example
from io import BytesIO

import requests
from PIL import Image

image_url = "https://upload.wikimedia.org/wikipedia/commons/4/4b/Example.jpg"

# Download with a timeout so a stalled connection cannot hang the kernel, and fail
# loudly on HTTP errors instead of handing an error page to PIL (which would surface
# as an opaque image-decoding failure).
response = requests.get(image_url, timeout=30)
response.raise_for_status()

# Decode from the fully downloaded body and normalise to 3-channel RGB.
image = Image.open(BytesIO(response.content)).convert("RGB")

