In [2]:
!pip install -U transformers



## Local Inference on GPU
Model page: https://huggingface.co/ds4sd/SmolDocling-256M-preview

In [21]:
# Smoke test of SmolDocling through the high-level `pipeline` helper,
# using a sample image referenced by URL.
from transformers import pipeline

pipe = pipeline(task="image-text-to-text", model="ds4sd/SmolDocling-256M-preview")

# Single-turn chat: one image part followed by one text part.
image_part = {
    "type": "image",
    "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/p-blog/candy.JPG",
}
question_part = {"type": "text", "text": "how many fingers are there?"}
messages = [{"role": "user", "content": [image_part, question_part]}]

# Keep the call as the last expression so the notebook displays the full result.
pipe(text=messages)

Fetching 2 files: 100%|██████████| 2/2 [00:00<00:00, 49932.19it/s]
Device set to use mps:0
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


[{'input_text': [{'role': 'user',
    'content': [{'type': 'image',
      'url': 'https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/p-blog/candy.JPG'},
     {'type': 'text', 'text': 'how many fingers are there?'}]}],
  'generated_text': [{'role': 'user',
    'content': [{'type': 'image',
      'url': 'https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/p-blog/candy.JPG'},
     {'type': 'text', 'text': 'how many fingers are there?'}]},
   {'role': 'assistant', 'content': ' 2.'}]}]

In [22]:
# Load the processor and model directly (no pipeline helper) and run one chat turn.
from transformers import AutoProcessor, AutoModelForVision2Seq

processor = AutoProcessor.from_pretrained("ds4sd/SmolDocling-256M-preview")
model = AutoModelForVision2Seq.from_pretrained("ds4sd/SmolDocling-256M-preview")
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/p-blog/candy.JPG"},
            {"type": "text", "text": "how many fingers are in the picture?"}
        ]
    },
]

# Render the chat template into model-ready tensors and move them to the model's device.
inputs = processor.apply_chat_template(
    messages,
    add_generation_prompt=True,
    tokenize=True,
    return_dict=True,
    return_tensors="pt",
).to(model.device)

outputs = model.generate(**inputs, max_new_tokens=40)

# Decode only the tokens produced after the prompt, and drop special tokens
# (otherwise the printed answer ends with "<end_of_utterance>").
prompt_length = inputs["input_ids"].shape[-1]
print(processor.decode(outputs[0][prompt_length:], skip_special_tokens=True))

Fetching 2 files: 100%|██████████| 2/2 [00:00<00:00, 23497.50it/s]
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


 2.<end_of_utterance>


## Test with clerk's receipt

In [2]:
# Ask SmolDocling for the total on a receipt image stored on disk.
from transformers import pipeline
from PIL import Image

pipe = pipeline(task="image-text-to-text", model="ds4sd/SmolDocling-256M-preview")

# The opened PIL image is passed to the chat message under the "url" key.
image = Image.open("images/clerk_receipt.png")

question = 'what is the total amount of the charge for this receipt?  You will see it next to a text that says "TOTAL"'
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "url": image},
            {"type": "text", "text": question},
        ],
    },
]
result = pipe(text=messages)

# generated_text holds the whole conversation; entry 1 is the assistant's reply.
assistant_turn = result[0]["generated_text"][1]
print(assistant_turn["content"])

Fetching 2 files: 100%|██████████| 2/2 [00:00<00:00, 38304.15it/s]
Device set to use mps:0
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


 $26.65.


In [17]:
# Re-run the same prompt on the same image and display the full pipeline result
# (the input messages plus the assistant's reply).
pipe(text=messages)

[{'input_text': [{'role': 'user',
    'content': [{'type': 'image',
      'url': <PIL.PngImagePlugin.PngImageFile image mode=RGBA size=1248x850>},
     {'type': 'text',
      'text': 'what is the total amount of the charge for this receipt?  You will see it next to a text that says "TOTAL"'}]}],
  'generated_text': [{'role': 'user',
    'content': [{'type': 'image',
      'url': <PIL.PngImagePlugin.PngImageFile image mode=RGBA size=1248x850>},
     {'type': 'text',
      'text': 'what is the total amount of the charge for this receipt?  You will see it next to a text that says "TOTAL"'}]},
   {'role': 'assistant', 'content': ' $26.65.'}]}]

## Test with Everlane Receipt

In [18]:
# Ask SmolDocling for the total on the Everlane receipt image stored on disk.
from transformers import pipeline
from PIL import Image

pipe = pipeline(task="image-text-to-text", model="ds4sd/SmolDocling-256M-preview")

# The opened PIL image is passed to the chat message under the "url" key.
image = Image.open("images/everlane_receipt.png")

question = 'what is the total amount of the charge for this receipt?  You will see it next to a text that says "TOTAL"'
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "url": image},
            {"type": "text", "text": question},
        ],
    },
]

# Keep the call as the last expression so the notebook displays the full result.
pipe(text=messages)

Fetching 2 files: 100%|██████████| 2/2 [00:00<00:00, 12748.64it/s]
Device set to use mps:0
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


[{'input_text': [{'role': 'user',
    'content': [{'type': 'image',
      'url': <PIL.PngImagePlugin.PngImageFile image mode=RGBA size=1120x1396>},
     {'type': 'text',
      'text': 'what is the total amount of the charge for this receipt?  You will see it next to a text that says "TOTAL"'}]}],
  'generated_text': [{'role': 'user',
    'content': [{'type': 'image',
      'url': <PIL.PngImagePlugin.PngImageFile image mode=RGBA size=1120x1396>},
     {'type': 'text',
      'text': 'what is the total amount of the charge for this receipt?  You will see it next to a text that says "TOTAL"'}]},
   {'role': 'assistant', 'content': ' $70.32'}]}]

In [19]:
# Re-run the same prompt on the same image and display the full pipeline result
# (the input messages plus the assistant's reply).
pipe(text=messages)

[{'input_text': [{'role': 'user',
    'content': [{'type': 'image',
      'url': <PIL.PngImagePlugin.PngImageFile image mode=RGBA size=1120x1396>},
     {'type': 'text',
      'text': 'what is the total amount of the charge for this receipt?  You will see it next to a text that says "TOTAL"'}]}],
  'generated_text': [{'role': 'user',
    'content': [{'type': 'image',
      'url': <PIL.PngImagePlugin.PngImageFile image mode=RGBA size=1120x1396>},
     {'type': 'text',
      'text': 'what is the total amount of the charge for this receipt?  You will see it next to a text that says "TOTAL"'}]},
   {'role': 'assistant', 'content': ' $70.32'}]}]

## Test with Yard House Receipt

In [20]:
# Ask SmolDocling for the total on the Yard House receipt, without any hint
# about where the total appears.
from transformers import pipeline
from PIL import Image

pipe = pipeline(task="image-text-to-text", model="ds4sd/SmolDocling-256M-preview")

# The opened PIL image is passed to the chat message under the "url" key.
image = Image.open("images/yard_house_receipt.jpg")

question = "what is the total amount of the charge for this receipt?"
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "url": image},
            {"type": "text", "text": question},
        ],
    },
]

# Keep the call as the last expression so the notebook displays the full result.
pipe(text=messages)

Fetching 2 files: 100%|██████████| 2/2 [00:00<00:00, 40524.68it/s]
Device set to use mps:0
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


[{'input_text': [{'role': 'user',
    'content': [{'type': 'image',
      'url': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=750x1000>},
     {'type': 'text',
      'text': 'what is the total amount of the charge for this receipt?'}]}],
  'generated_text': [{'role': 'user',
    'content': [{'type': 'image',
      'url': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=750x1000>},
     {'type': 'text',
      'text': 'what is the total amount of the charge for this receipt?'}]},
   {'role': 'assistant', 'content': ' $27.35.'}]}]

In [21]:
# Re-run the same prompt on the same image and display the full pipeline result
# (the input messages plus the assistant's reply).
pipe(text=messages)

[{'input_text': [{'role': 'user',
    'content': [{'type': 'image',
      'url': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=750x1000>},
     {'type': 'text',
      'text': 'what is the total amount of the charge for this receipt?'}]}],
  'generated_text': [{'role': 'user',
    'content': [{'type': 'image',
      'url': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=750x1000>},
     {'type': 'text',
      'text': 'what is the total amount of the charge for this receipt?'}]},
   {'role': 'assistant', 'content': ' $27.35.'}]}]

In [26]:
# Ask again for the total on the Yard House receipt, this time telling the model
# to look next to the "TOTAL" label.
from transformers import pipeline
from PIL import Image

pipe = pipeline(task="image-text-to-text", model="ds4sd/SmolDocling-256M-preview")

# The opened PIL image is passed to the chat message under the "url" key.
image = Image.open("images/yard_house_receipt.jpg")

question = 'what is the total amount of the charge for this receipt? You will see it next to a text that says "TOTAL"'
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "url": image},
            {"type": "text", "text": question},
        ],
    },
]

# Keep the call as the last expression so the notebook displays the full result.
pipe(text=messages)

Fetching 2 files: 100%|██████████| 2/2 [00:00<00:00, 16644.06it/s]
Device set to use mps:0
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


[{'input_text': [{'role': 'user',
    'content': [{'type': 'image',
      'url': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=750x1000>},
     {'type': 'text',
      'text': 'what is the total amount of the charge for this receipt? You will see it next to a text that says "TOTAL"'}]}],
  'generated_text': [{'role': 'user',
    'content': [{'type': 'image',
      'url': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=750x1000>},
     {'type': 'text',
      'text': 'what is the total amount of the charge for this receipt? You will see it next to a text that says "TOTAL"'}]},
   {'role': 'assistant', 'content': ' $136.77.'}]}]

## Test with receipt from Hana Market

In [22]:
# Ask SmolDocling for the total on the Hana Market receipt image stored on disk.
from transformers import pipeline
from PIL import Image

pipe = pipeline(task="image-text-to-text", model="ds4sd/SmolDocling-256M-preview")

# The opened PIL image is passed to the chat message under the "url" key.
image = Image.open("images/hana_market_receipt.jpeg")

question = 'what is the total amount of the charge for this receipt? You will see it next to a text that says "TOTAL"'
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "url": image},
            {"type": "text", "text": question},
        ],
    },
]

# Keep the call as the last expression so the notebook displays the full result.
pipe(text=messages)

Fetching 2 files: 100%|██████████| 2/2 [00:00<00:00, 28149.69it/s]
Device set to use mps:0
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


[{'input_text': [{'role': 'user',
    'content': [{'type': 'image',
      'url': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=4032x3024>},
     {'type': 'text',
      'text': 'what is the total amount of the charge for this receipt? You will see it next to a text that says "TOTAL"'}]}],
  'generated_text': [{'role': 'user',
    'content': [{'type': 'image',
      'url': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=4032x3024>},
     {'type': 'text',
      'text': 'what is the total amount of the charge for this receipt? You will see it next to a text that says "TOTAL"'}]},
   {'role': 'assistant', 'content': ' $23.90'}]}]

In [23]:
# Re-run the same prompt on the same image and display the full pipeline result
# (the input messages plus the assistant's reply).
pipe(text=messages)

[{'input_text': [{'role': 'user',
    'content': [{'type': 'image',
      'url': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=4032x3024>},
     {'type': 'text',
      'text': 'what is the total amount of the charge for this receipt? You will see it next to a text that says "TOTAL"'}]}],
  'generated_text': [{'role': 'user',
    'content': [{'type': 'image',
      'url': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=4032x3024>},
     {'type': 'text',
      'text': 'what is the total amount of the charge for this receipt? You will see it next to a text that says "TOTAL"'}]},
   {'role': 'assistant', 'content': ' $23.90'}]}]

## Test with HEB receipt

In [24]:
# Ask SmolDocling for the total on the HEB receipt image stored on disk.
from transformers import pipeline
from PIL import Image

pipe = pipeline(task="image-text-to-text", model="ds4sd/SmolDocling-256M-preview")

# The opened PIL image is passed to the chat message under the "url" key.
image = Image.open("images/heb_receipt.jpeg")

# Prompt experiments recorded for this receipt:
#
# Attempt 1
#   {"type": "text", "text": "what is the total amount of the charge for this receipt?"}
#   Response:
#     The total charge for this receipt is $125.
#
# Attempt 2
#   {"type": "text", "text": "what is the total amount of the charge for this receipt? You will see it next to a text that says \"TOTAL\" or TOTAL SALE."}
#   Response:
#      $120.00.
#      $120.00.
#
# Attempt 3 (the prompt used below)
#   {"type": "text", "text": "what is the total amount of the charge for this receipt? You will see it next to a text that says **Total Sale**"}
#   Response:
#      $12,000.

question = "what is the total amount of the charge for this receipt? You will see it next to a text that says **Total Sale**"
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "url": image},
            {"type": "text", "text": question},
        ],
    },
]

# Keep the call as the last expression so the notebook displays the full result.
pipe(text=messages)

Fetching 2 files: 100%|██████████| 2/2 [00:00<00:00, 24966.10it/s]
Device set to use mps:0
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


[{'input_text': [{'role': 'user',
    'content': [{'type': 'image',
      'url': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=4032x3024>},
     {'type': 'text',
      'text': 'what is the total amount of the charge for this receipt? You will see it next to a text that says **Total Sale**'}]}],
  'generated_text': [{'role': 'user',
    'content': [{'type': 'image',
      'url': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=4032x3024>},
     {'type': 'text',
      'text': 'what is the total amount of the charge for this receipt? You will see it next to a text that says **Total Sale**'}]},
   {'role': 'assistant', 'content': ' $12,000.'}]}]

In [25]:
# Re-run the same prompt on the same image and display the full pipeline result
# (the input messages plus the assistant's reply).
pipe(text=messages)

[{'input_text': [{'role': 'user',
    'content': [{'type': 'image',
      'url': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=4032x3024>},
     {'type': 'text',
      'text': 'what is the total amount of the charge for this receipt? You will see it next to a text that says **Total Sale**'}]}],
  'generated_text': [{'role': 'user',
    'content': [{'type': 'image',
      'url': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=4032x3024>},
     {'type': 'text',
      'text': 'what is the total amount of the charge for this receipt? You will see it next to a text that says **Total Sale**'}]},
   {'role': 'assistant', 'content': ' $12,000.'}]}]

## Testing with HuggingFaceTB/SmolLM2-135M

In [None]:
# Minimal text-generation check with the SmolLM2-135M checkpoint.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

checkpoint = "HuggingFaceTB/SmolLM2-135M"

# Pick the best available backend instead of hard-coding "cuda", so the cell also
# runs on Apple Silicon (MPS) and on CPU-only machines.
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"

tokenizer = AutoTokenizer.from_pretrained(checkpoint)
# for multiple GPUs install accelerate and do `model = AutoModelForCausalLM.from_pretrained(checkpoint, device_map="auto")`
model = AutoModelForCausalLM.from_pretrained(checkpoint).to(device)

# Encode the prompt, generate a continuation, and print the decoded sequence.
inputs = tokenizer.encode("Gravity is", return_tensors="pt").to(device)
outputs = model.generate(inputs)
print(tokenizer.decode(outputs[0]))