In [2]:
!pip install transformers pdf2image pytesseract docx2txt fitz torch


Collecting pdf2image
  Downloading pdf2image-1.17.0-py3-none-any.whl.metadata (6.2 kB)
Collecting pytesseract
  Downloading pytesseract-0.3.13-py3-none-any.whl.metadata (11 kB)
Collecting docx2txt
  Downloading docx2txt-0.8.tar.gz (2.8 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting fitz
  Downloading fitz-0.0.1.dev2-py2.py3-none-any.whl.metadata (816 bytes)
Collecting configobj (from fitz)
  Downloading configobj-5.0.9-py2.py3-none-any.whl.metadata (3.2 kB)
Collecting configparser (from fitz)
  Downloading configparser-7.2.0-py3-none-any.whl.metadata (5.5 kB)
Collecting nipype (from fitz)
  Downloading nipype-1.9.2-py3-none-any.whl.metadata (6.8 kB)
Collecting pyxnat (from fitz)
  Downloading pyxnat-1.6.3-py3-none-any.whl.metadata (5.4 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading n

In [3]:
!pip uninstall pymupdf -y
!pip install --no-cache-dir pymupdf


[0mCollecting pymupdf
  Downloading pymupdf-1.25.4-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (3.4 kB)
Downloading pymupdf-1.25.4-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (20.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m20.0/20.0 MB[0m [31m145.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pymupdf
Successfully installed pymupdf-1.25.4


In [5]:
!apt-get install -y poppler-utils


Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following NEW packages will be installed:
  poppler-utils
0 upgraded, 1 newly installed, 0 to remove and 29 not upgraded.
Need to get 186 kB of archives.
After this operation, 696 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy-updates/main amd64 poppler-utils amd64 22.02.0-2ubuntu0.6 [186 kB]
Fetched 186 kB in 1s (312 kB/s)
Selecting previously unselected package poppler-utils.
(Reading database ... 125044 files and directories currently installed.)
Preparing to unpack .../poppler-utils_22.02.0-2ubuntu0.6_amd64.deb ...
Unpacking poppler-utils (22.02.0-2ubuntu0.6) ...
Setting up poppler-utils (22.02.0-2ubuntu0.6) ...
Processing triggers for man-db (2.10.2-1) ...


In [6]:
!apt-get install -y tesseract-ocr


Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following additional packages will be installed:
  tesseract-ocr-eng tesseract-ocr-osd
The following NEW packages will be installed:
  tesseract-ocr tesseract-ocr-eng tesseract-ocr-osd
0 upgraded, 3 newly installed, 0 to remove and 29 not upgraded.
Need to get 4,816 kB of archives.
After this operation, 15.6 MB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tesseract-ocr-eng all 1:4.00~git30-7274cfa-1.1 [1,591 kB]
Get:2 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tesseract-ocr-osd all 1:4.00~git30-7274cfa-1.1 [2,990 kB]
Get:3 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tesseract-ocr amd64 4.1.1-2.1build1 [236 kB]
Fetched 4,816 kB in 1s (5,554 kB/s)
Selecting previously unselected package tesseract-ocr-eng.
(Reading database ... 125074 files and directories currently installed.)
Preparing to unpack .../tesseract-ocr-

In [13]:
import os
import torch
import pytesseract
import re
from PIL import Image
from pdf2image import convert_from_path
import fitz  # PyMuPDF for PDFs
import docx2txt
from google.colab import files
from transformers import VisionEncoderDecoderModel, DonutProcessor

# Load Donut Model for document understanding
model_name = "naver-clova-ix/donut-base-finetuned-docvqa"
processor = DonutProcessor.from_pretrained(model_name)
model = VisionEncoderDecoderModel.from_pretrained(model_name)

# Function to extract text from a text-based PDF
def extract_text_from_pdf(pdf_path):
    """Extract text from a digital (non-scanned) PDF file."""
    doc = fitz.open(pdf_path)
    text = ""
    for page in doc:
        text += page.get_text("text") + "\n"
    return text.strip()

# Function to convert PDF to images (for scanned PDFs)
def convert_pdf_to_images(pdf_path):
    """Convert a scanned PDF to images (one per page)."""
    return convert_from_path(pdf_path)

# Function to extract text from DOCX
def extract_text_from_docx(docx_path):
    """Extract text from a DOCX file."""
    return docx2txt.process(docx_path)

# Function to extract text from images using OCR
def extract_text_from_image(image):
    """Extract text from an image using OCR."""
    text = pytesseract.image_to_string(image)
    text = re.sub(r'\s+', ' ', text).strip()  # Clean extracted text
    return text

# Function to process different file types
def process_document(file_path):
    """Detect file type and process appropriately (PDF, DOCX, Image)."""
    ext = file_path.lower().split('.')[-1]

    if ext in ["png", "jpg", "jpeg"]:
        image = Image.open(file_path).convert("RGB")
        return None, image

    elif ext == "pdf":
        images = convert_pdf_to_images(file_path)  # Convert PDF to images
        if len(images) > 0:
            return None, images[0]  # Use the first page as an image
        else:
            text = extract_text_from_pdf(file_path)
            return text, None

    elif ext == "docx":
        text = extract_text_from_docx(file_path)
        return text, None

    else:
        raise ValueError("Unsupported file format! Only PDFs, DOCX, and images are supported.")

# Function to ask a question using Donut
def ask_question(document_text, document_image, question):
    """Answer questions using Donut model for images or text-based documents."""
    if document_image:
        print("🔍 Processing Image-Based Document...")
        inputs = processor(images=document_image, text=question, return_tensors="pt", legacy=False)
    else:
        print("📝 Processing Text-Based Document...")
        inputs = processor(text=document_text, text_pair=question, return_tensors="pt", legacy=False)

    outputs = model.generate(**inputs)
    answer = processor.batch_decode(outputs, skip_special_tokens=True)[0]
    return answer if answer else "Answer not found."

# Upload file in Google Colab
uploaded = files.upload()
file_name = list(uploaded.keys())[0]
file_path = file_name

# Process document (extract text or convert to image)
document_text, document_image = process_document(file_path)

# Ask a question
question = input("Enter your question: ")
answer = ask_question(document_text, document_image, question)

# Display Answer
print(f"\n📌 Answer: {answer}")


Config of the encoder: <class 'transformers.models.donut.modeling_donut_swin.DonutSwinModel'> is overwritten by shared encoder config: DonutSwinConfig {
  "attention_probs_dropout_prob": 0.0,
  "depths": [
    2,
    2,
    14,
    2
  ],
  "drop_path_rate": 0.1,
  "embed_dim": 128,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.0,
  "hidden_size": 1024,
  "image_size": [
    2560,
    1920
  ],
  "initializer_range": 0.02,
  "layer_norm_eps": 1e-05,
  "mlp_ratio": 4.0,
  "model_type": "donut-swin",
  "num_channels": 3,
  "num_heads": [
    4,
    8,
    16,
    32
  ],
  "num_layers": 4,
  "patch_size": 4,
  "path_norm": true,
  "qkv_bias": true,
  "transformers_version": "4.48.3",
  "use_absolute_embeddings": false,
  "window_size": 10
}

Config of the decoder: <class 'transformers.models.mbart.modeling_mbart.MBartForCausalLM'> is overwritten by shared decoder config: MBartConfig {
  "activation_dropout": 0.0,
  "activation_function": "gelu",
  "add_cross_attention": true,
  "add

Saving thumbnail23.png to thumbnail23.png
Enter your question: what is the diagonsis of the patient?
🔍 Processing Image-Based Document...

📌 Answer: what is the diagonsis of the patient? after thorough examination, no specific medicine conditions or acute illnesses wereidentifi
