## Install dependencies

In [None]:
#Using current pre-installed packages
!pip install addict transformers==4.46.3 tokenizers==0.20.3 PyMuPDF img2pdf einops easydict Pillow numpy

#If there is problem use:
# pip install addict transformers==4.46.3 tokenizers==0.20.3 PyMuPDF img2pdf einops easydict Pillow numpy torch==2.3.1 torchvision==0.18.1 torchaudio==2.3.1 --extra-index-url https://download.pytorch.org/whl/


## Clone the DeepSeek-OCR repo

In [None]:
# Set up Git LFS before cloning, so that files tracked with LFS are downloaded by the clone.
!git lfs install

In [None]:
# Clone the model repository only when no copy exists yet (neither the fresh clone nor the
# renamed DeepSeek-OCR-local directory), so re-running the notebook does not download it again.
![ -d DeepSeek-OCR-local ] || [ -d DeepSeek-OCR ] || git clone https://huggingface.co/deepseek-ai/DeepSeek-OCR

In [None]:
# Rename the clone to the directory name the model-loading cell expects.
# Skipped when the target already exists: a plain `mv` would otherwise move the
# clone *inside* the existing DeepSeek-OCR-local/ directory on a second run.
![ -d DeepSeek-OCR-local ] || mv DeepSeek-OCR DeepSeek-OCR-local


## Downloading images

In [None]:
# Download the sample images used by the examples below.
# Every URL is quoted: an unquoted `&` ends the command and sends wget to the background
# (cutting off the rest of the query string), and `?` is a shell glob character.
!wget -O dogs.jpg "https://images.wagwalkingweb.com/media/daily_wag/blog_articles/hero/1723114015.705158/popular-dogs-hero-1.jpg"
!wget -O polish_ocr.jpg "https://upload.wikimedia.org/wikipedia/commons/thumb/a/a4/J%C3%B3zef_Ignacy_Kraszewski_-_Poezye_i_urywki_proz%C4%85.djvu/page35-1024px-J%C3%B3zef_Ignacy_Kraszewski_-_Poezye_i_urywki_proz%C4%85.djvu.jpg"
!wget -O table_markdown.png "https://ksiegujesie.pl/wp-content/uploads/2023/12/cennik-2024.png"
!wget -O lion.jpg "https://i.natgeofe.com/k/1d33938b-3d02-4773-91e3-70b113c3b8c7/lion-male-roar.jpg?wp=1&w=1084.125&h=609"
!wget -O figure.png "https://clauswilke.com/dataviz/visualizing_distributions_I_files/figure-html/titanic-age-stacked-hist-1.png"

## Import Libraries

In [None]:
from transformers import AutoModel, AutoTokenizer
import torch
import os
import time
from IPython.display import Image, display
from PIL import Image, ImageDraw
import sys
from io import StringIO
from typing import Optional
import re


## Output Utils

In [None]:
################################
############UTILS###############
################################

# Bounding box constants

# Captures the text between each <|det|> ... <|/det|> pair (may span several lines).
BOUNDING_BOX_CONTAINER_PATTERN = re.compile(r"<\|det\|>(.*?)<\|/det\|>", flags=re.DOTALL)

# Captures the four integers of a single [x1, y1, x2, y2] box.
BOX_COORD_PATTERN = re.compile(r"\[(\d+),\s*(\d+),\s*(\d+),\s*(\d+)\]")

# Outline style used when drawing boxes.
BOUNDING_BOX_COLOR, BOUNDING_BOX_WIDTH = "red", 3

# Divisor used to scale box coordinates to the pixel size of the image.
NORMALIZATION_FACTOR = 1000


def parse_ocr_output(raw_output: str) -> str:
    """Remove debug/metadata lines from raw OCR output and format reference tags.

    Each line of ``raw_output`` is handled as follows:

    * blank lines and lines containing one of the debug markers are dropped;
    * for a line with complete ``<|ref|>...<|/ref|>`` tags, every tag becomes one
      output line: ``• **text** → `[coords]` `` when it is directly followed by a
      ``<|det|>[[...]]<|/det|>`` block, otherwise just the stripped reference text
      (any other text on that line is discarded);
    * a line that contains ``<|ref|>`` but no complete tag is kept as regular
      content instead of being silently dropped;
    * every other line is kept with surrounding whitespace stripped.

    Args:
        raw_output: Text to clean up.

    Returns:
        The cleaned lines joined with newlines, or ``raw_output`` unchanged when
        nothing is left after filtering.
    """
    # Substrings that mark debug/metadata lines. '===' also covers longer runs of '='.
    skip_patterns = (
        'BASE:', 'PATCHES:', 'NO PATCHES', 'directly resize',
        'image size:', 'valid image tokens:', 'output texts tokens',
        'compression ratio:', 'save results:', '===',
    )
    # A reference tag, optionally followed by its detection coordinates.
    ref_det_pattern = r'<\|ref\|>(.*?)<\|/ref\|>(?:<\|det\|>\[\[(.*?)\]\]<\|/det\|>)?'

    parsed_lines = []
    for line in raw_output.split('\n'):
        stripped = line.strip()

        # Skip empty lines and debug patterns
        if not stripped or any(marker in line for marker in skip_patterns):
            continue

        # Handle ref/det structured data
        if '<|ref|>' in line:
            matches = re.findall(ref_det_pattern, line)
            if matches:
                for ref_text, coords in matches:
                    if coords:
                        # Format with coordinates
                        parsed_lines.append(f"• **{ref_text}** → `[{coords}]`")
                    else:
                        # Just the reference text
                        parsed_lines.append(ref_text.strip())
                continue
            # No complete tag on this line: fall through and keep it as regular content.

        # Regular content - add as is
        parsed_lines.append(stripped)

    result = '\n'.join(parsed_lines)
    return result if result.strip() else raw_output




# -----------------------
# Main function
# -----------------------
def extract_and_draw_bounding_boxes(text_result: str, original_image: Image.Image) -> Optional[Image.Image]:
    """
    Find bounding boxes in text_result and outline them on a copy of the image.

    Boxes are the [x1, y1, x2, y2] integer groups found inside <|det|>...<|/det|>
    spans. Each coordinate is multiplied by (image size / NORMALIZATION_FACTOR)
    and truncated to an int before drawing.

    Returns the annotated copy, or None (after printing a warning) when the text
    contains no boxes. The image passed in is not modified.
    """
    # Gather every [x1, y1, x2, y2] group from every <|det|>...<|/det|> span.
    all_boxes = [
        tuple(int(value) for value in coord_group)
        for det_span in BOUNDING_BOX_CONTAINER_PATTERN.findall(text_result)
        for coord_group in BOX_COORD_PATTERN.findall(det_span)
    ]

    if len(all_boxes) == 0:
        print("⚠️ No bounding boxes found.")
        return None

    print(f"✅ Found {len(all_boxes)} bounding boxes: {all_boxes}")

    # Scale factors that map box coordinates to pixel positions.
    width, height = original_image.size
    x_factor = width / NORMALIZATION_FACTOR
    y_factor = height / NORMALIZATION_FACTOR

    # Draw on a copy so the caller's image stays untouched.
    annotated = original_image.copy()
    canvas = ImageDraw.Draw(annotated)
    for left_n, top_n, right_n, bottom_n in all_boxes:
        corners = [
            int(left_n * x_factor),
            int(top_n * y_factor),
            int(right_n * x_factor),
            int(bottom_n * y_factor),
        ]
        canvas.rectangle(corners, outline=BOUNDING_BOX_COLOR, width=BOUNDING_BOX_WIDTH)

    return annotated


def load_img(path):
    """Print ``path``, then open it with ``Image.open`` and return the result."""
    print(path)
    opened = Image.open(path)
    return opened


## Loading Model

In [None]:
# LOADING MODEL
# Loads the model and tokenizer from the local clone and reports how long it took.
model_name = './DeepSeek-OCR-local/'

start = time.time()

use_gpu = torch.cuda.is_available()

# Arguments shared by the GPU and CPU paths.
# Attention implementation: 'eager' is used here. 'flash_attention_2' is the alternative,
# but FlashAttention requires modern hardware (Ampere or newer):
#   Works on: A100, RTX 3090, RTX 4080/4090, H100, L40
#   Doesn't work on: T4, V100, RTX 2080, GTX 1080
load_kwargs = dict(
    _attn_implementation='eager',
    trust_remote_code=True,
    use_safetensors=True,
)

if use_gpu:
    print(" Using CUDA GPU")
    load_kwargs.update(torch_dtype=torch.bfloat16, device_map="auto")
else:
    # The default Hugging Face implementation has CUDA hardcoded and doesn't support CPU.
    # To use the CPU, the model's modeling file has to be modified as in this discussion:
    # https://huggingface.co/deepseek-ai/DeepSeek-OCR/discussions/21/files#d2h-465181
    print("Using CPU (no GPU detected)")

model = AutoModel.from_pretrained(model_name, **load_kwargs)
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

# NOTE(review): on GPU the model is loaded with device_map="auto" and then also moved
# with .cuda() — confirm that both are needed.
model = model.eval()
model = model.cuda() if use_gpu else model.to(torch.bfloat16)
end = time.time()

print(f"Model and tokenizer loaded in {end - start:.2f} seconds.")


## Model Inference
Here are the prompt examples that you can play with:


document: `<image>\n<|grounding|>Convert the document to markdown.`

other image: `<image>\n<|grounding|>OCR this image.`

without layouts: `<image>\nFree OCR.`

figures in document: `<image>\nParse the figure.`

general: `<image>\nDescribe this image in detail.`

rec: `<image>\nLocate <|ref|>xxxx<|/ref|> in the image.`


In [None]:

def perform_vllm(model, prompt, img_path, output_path, draw_bbox=True):
    """Run inference on one image, then display the image and print the raw result.

    Prompt examples:
        document:            <image>\\n<|grounding|>Convert the document to markdown.
        other image:         <image>\\n<|grounding|>OCR this image.
        without layouts:     <image>\\nFree OCR.
        figures in document: <image>\\nParse the figure.
        general:             <image>\\nDescribe this image in detail.
        rec:                 <image>\\nLocate <|ref|>xxxx<|/ref|> in the image.
                             (example target text: '先天下之忧而忧')

    Args:
        model: Object exposing ``infer(tokenizer, prompt=..., image_file=..., ...)``.
        prompt: Prompt passed to ``model.infer``.
        img_path: Path of the input image.
        output_path: Passed to ``model.infer`` as ``output_path``.
        draw_bbox: When True, bounding boxes found in the output are drawn on the
            displayed image. If no boxes are found, the original image is displayed.

    Returns:
        None. The image is shown with ``display`` and the raw text is printed.

    Note:
        Uses the module-level ``tokenizer`` created in the model-loading cell.
    """
    image = load_img(img_path)

    # Different model sizes
    # Tiny:   base_size = 512,  image_size = 512,  crop_mode = False
    # Small:  base_size = 640,  image_size = 640,  crop_mode = False
    # Base:   base_size = 1024, image_size = 1024, crop_mode = False
    # Large:  base_size = 1280, image_size = 1280, crop_mode = False
    # Gundam: base_size = 1024, image_size = 640,  crop_mode = True   (used below)
    start = time.time()

    # Locally the results are saved to disk, but that doesn't work in Colab,
    # so the text is taken from what model.infer prints to stdout.
    captured_output = StringIO()
    old_stdout = sys.stdout
    sys.stdout = captured_output
    try:
        result = model.infer(
            tokenizer,
            prompt=prompt,
            image_file=img_path,
            output_path=output_path,
            base_size=1024,
            image_size=640,
            crop_mode=True,
            save_results=True,
            test_compress=True,
        )
    finally:
        # Always restore stdout, even when inference raises.
        sys.stdout = old_stdout

    # Prefer the captured console text; fall back to the returned value.
    console_output = captured_output.getvalue()
    text_result = console_output if console_output else str(result)
    end = time.time()
    print(f"Inference completed in {end - start:.2f} seconds.")

    image_to_show = image
    if draw_bbox:
        # extract_and_draw_bounding_boxes returns None when the output has no boxes;
        # in that case keep the original image instead of displaying None.
        annotated = extract_and_draw_bounding_boxes(text_result, image)
        if annotated is not None:
            image_to_show = annotated
    display(image_to_show)

    print(f"Raw vllm result:{text_result}\n\n")
    # For a cleaned-up version of the text, use parse_ocr_output(text_result).

## OCR

In [None]:
# Plain-text OCR of the downloaded book page.
perform_vllm(
    model,
    prompt="<image>\nFree OCR.",
    img_path="polish_ocr.jpg",
    output_path="output/polish_ocr",
    draw_bbox=True,
)

## Markdown

In [None]:
# Convert the downloaded table image to markdown, drawing the detected boxes.
perform_vllm(
    model,
    prompt="<image>\n<|grounding|>Convert the document to markdown.",
    img_path="table_markdown.png",
    output_path="output/markdown",
    draw_bbox=True,
)

## Object Detection

In [None]:
# Locate the referenced object ("dog") in the image and draw its boxes.
perform_vllm(
    model,
    prompt="<image>\nLocate <|ref|>dog<|/ref|> in the image.",
    img_path="dogs.jpg",
    output_path="output/dogs",
    draw_bbox=True,
)

## Image Understanding

In [None]:
# Free-form description of the image; no boxes are drawn.
perform_vllm(
    model,
    prompt="<image>\nDescribe this image in detail.",
    img_path="lion.jpg",
    output_path="output/lion",
    draw_bbox=False,
)

## Parse Figure

In [None]:
# Parse the downloaded chart image; no boxes are drawn.
perform_vllm(
    model,
    prompt="<image>\nParse the figure.",
    img_path="figure.png",
    output_path="output/figure",
    draw_bbox=False,
)

## Your examples

In [None]:
# Template cell: replace the three placeholder strings with your own prompt,
# input image path and output directory before running.
perform_vllm(
    model,
    prompt="CHOOSE PROMPT FOR YOU NEED",
    img_path="INPUT_IMAGE_PATH",
    output_path="OUTPUT_PATH",
    draw_bbox=False,
)