In [4]:
!pip install llama_parse mistralai

Collecting llama_parse
  Downloading llama_parse-0.6.4.post1-py3-none-any.whl.metadata (6.9 kB)
Collecting mistralai
  Downloading mistralai-1.6.0-py3-none-any.whl.metadata (30 kB)
Collecting llama-cloud-services>=0.6.4 (from llama_parse)
  Downloading llama_cloud_services-0.6.9-py3-none-any.whl.metadata (2.9 kB)
Collecting eval-type-backport>=0.2.0 (from mistralai)
  Downloading eval_type_backport-0.2.2-py3-none-any.whl.metadata (2.2 kB)
Collecting llama-cloud<0.2.0,>=0.1.17 (from llama-cloud-services>=0.6.4->llama_parse)
  Downloading llama_cloud-0.1.17-py3-none-any.whl.metadata (902 bytes)
Collecting llama-index-core>=0.11.0 (from llama-cloud-services>=0.6.4->llama_parse)
  Downloading llama_index_core-0.12.27-py3-none-any.whl.metadata (2.6 kB)
Collecting python-dotenv<2.0.0,>=1.0.1 (from llama-cloud-services>=0.6.4->llama_parse)
  Downloading python_dotenv-1.1.0-py3-none-any.whl.metadata (24 kB)
Collecting banks<3.0.0,>=2.0.0 (from llama-index-core>=0.11.0->llama-cloud-services>=0.

In [5]:
from llama_parse import LlamaParse
import nest_asyncio
from dotenv import load_dotenv
import os
from pathlib import Path

load_dotenv()

nest_asyncio.apply()

In [6]:
from google.colab import userdata

LLAMA_API_KEY = userdata.get("LLAMA_API_KEY")
OPENAI_API_KEY = userdata.get("OPENAI_API_KEY")
GEMINI_API_KEY = userdata.get("GEMINI_API_KEY")
MISTRAL_API_KEY = userdata.get("MISTRAL_API_KEY")
ANTHROPIC_API_KEY = userdata.get("ANTHROPIC_API_KEY")

In [38]:
tiff_dir = Path("./data/tiff")
pdf_dir = Path("./data/pdf")
jpeg_dir = Path("./data/jpeg")

In [36]:
from PIL import Image, ImageSequence
from tqdm import tqdm


def tiff_to_pdf(tiff_path: Path) -> None:
    pdf_path = pdf_dir / Path(tiff_path.stem).with_suffix(".pdf")
    print(f"{tiff_path.name} -> {pdf_path.name}")

    if not os.path.exists(tiff_path):
        raise Exception(f"{tiff_path} does not find.")
    image = Image.open(tiff_path)

    images = []
    for i, page in enumerate(ImageSequence.Iterator(image)):
        page = page.convert("RGB")
        images.append(page)
    if len(images) == 1:
        images[0].save(pdf_path)
    else:
        images[0].save(pdf_path, save_all=True, append_images=images[1:])
    print("OK")


def tiff_to_jpeg(tiff_path: Path) -> None:
    jpeg_path = jpeg_dir / Path(tiff_path.stem).with_suffix(".jpeg")

    print(f"{tiff_path.name} -> {jpeg_path.name}")

    tiff_image = Image.open(tiff_path)
    # Convert the image to JPEG format
    jpeg_image = tiff_image.convert("RGB")

    # Save the JPEG image
    jpeg_image.save(str(jpeg_path))
    print("OK")

In [40]:
# Convert tiff to both jpeg and pdf formats
for tiff_fp in tqdm(tiff_dir.iterdir()):
    if not tiff_fp.is_file():
        continue
    tiff_to_pdf(tiff_fp)
    tiff_to_jpeg(tiff_fp)

2it [00:00, 17.40it/s]

UB04 Matlock.tif -> UB04 Matlock.pdf
OK
UB04 Matlock.tif -> UB04 Matlock.jpeg
OK
HCFA.3.tif -> HCFA.pdf
OK
HCFA.3.tif -> HCFA.jpeg
OK
HCFA.5.tif -> HCFA.pdf
OK
HCFA.5.tif -> HCFA.jpeg
OK
GoodUB04.3_Page_1.tiff -> GoodUB04.pdf


4it [00:00,  5.65it/s]

OK
GoodUB04.3_Page_1.tiff -> GoodUB04.jpeg
OK
Perfect UB04.tiff -> Perfect UB04.pdf


6it [00:00,  7.58it/s]

OK
Perfect UB04.tiff -> Perfect UB04.jpeg
OK
Wilson 1500 form.tif -> Wilson 1500 form.pdf
OK
Wilson 1500 form.tif -> Wilson 1500 form.jpeg


8it [00:01,  4.22it/s]

OK
UB04 Morrison.tif -> UB04 Morrison.pdf
OK
UB04 Morrison.tif -> UB04 Morrison.jpeg
OK
GoodUB04.3_Page_3.tiff -> GoodUB04.pdf


9it [00:01,  3.95it/s]

OK
GoodUB04.3_Page_3.tiff -> GoodUB04.jpeg
OK
HCFA.2.tiff -> HCFA.pdf




OK
HCFA.2.tiff -> HCFA.jpeg
OK
itemized billing1.tif -> itemized billing1.pdf


11it [00:02,  4.31it/s]

OK
itemized billing1.tif -> itemized billing1.jpeg
OK
HCFA.4.tif -> HCFA.pdf
OK
HCFA.4.tif -> HCFA.jpeg


12it [00:02,  4.77it/s]

OK
GoodUB04.2_Page_3.tiff -> GoodUB04.pdf
OK
GoodUB04.2_Page_3.tiff -> GoodUB04.jpeg


13it [00:02,  4.63it/s]

OK
GoodUB04.2_Page_1.tiff -> GoodUB04.pdf
OK
GoodUB04.2_Page_1.tiff -> GoodUB04.jpeg


14it [00:02,  4.39it/s]

OK
UB04 Forconi 2023.tif -> UB04 Forconi 2023.pdf


15it [00:03,  3.30it/s]

OK
UB04 Forconi 2023.tif -> UB04 Forconi 2023.jpeg
OK
itemized billing2.tif -> itemized billing2.pdf


16it [00:03,  2.95it/s]

OK
itemized billing2.tif -> itemized billing2.jpeg
OK
GoodUB04.2_Page_4.tiff -> GoodUB04.pdf
OK
GoodUB04.2_Page_4.tiff -> GoodUB04.jpeg


17it [00:04,  3.45it/s]

OK
GoodUB04.1_Page_1.tiff -> GoodUB04.pdf
OK
GoodUB04.1_Page_1.tiff -> GoodUB04.jpeg
OK
GoodUB04.2_Page_2.tiff -> GoodUB04.pdf
OK
GoodUB04.2_Page_2.tiff -> GoodUB04.jpeg


19it [00:04,  4.34it/s]

OK
GoodUB04.3_Page_2.tiff -> GoodUB04.pdf
OK
GoodUB04.3_Page_2.tiff -> GoodUB04.jpeg


20it [00:04,  4.29it/s]

OK
itemized billing3.tif -> itemized billing3.pdf


21it [00:06,  1.86it/s]

OK
itemized billing3.tif -> itemized billing3.jpeg
OK
HCFA.1.tiff -> HCFA.pdf
OK
HCFA.1.tiff -> HCFA.jpeg
OK
UB04 Hill not great quality.tif -> UB04 Hill not great quality.pdf


23it [00:06,  2.27it/s]

OK
UB04 Hill not great quality.tif -> UB04 Hill not great quality.jpeg
OK
GoodUB04.1_Page_3.tiff -> GoodUB04.pdf
OK
GoodUB04.1_Page_3.tiff -> GoodUB04.jpeg
OK
Burris 1500 forms.tiff -> Burris 1500 forms.pdf


25it [00:06,  3.64it/s]

OK
Burris 1500 forms.tiff -> Burris 1500 forms.jpeg
OK





In [44]:
import json
from mistralai import Mistral
from mistralai import DocumentURLChunk

api_key = MISTRAL_API_KEY
client = Mistral(api_key=api_key)
MISTRAL_MODEL = "mistral-ocr-latest"


def mistral_ocr(pdf_file):
    uploaded_pdf = client.files.upload(
        file={
            "file_name": str(pdf_file),
            "content": open(pdf_file, "rb"),
        },
        purpose="ocr",
    )

    signed_url = client.files.get_signed_url(file_id=uploaded_pdf.id)

    ocr_response = client.ocr.process(
        model=MISTRAL_MODEL,
        document=DocumentURLChunk(document_url=signed_url.url),
        include_image_base64=True,
    )

    # Convert response to JSON format
    response_dict = json.loads(ocr_response.model_dump_json())
    return response_dict

In [45]:
ocr_results = {}

In [46]:
mistral_res = {}

for pdf_file in tqdm(pdf_dir.iterdir()):
    if not pdf_file.is_file():
        continue
    try:
        ocr_result = mistral_ocr(pdf_file)
    except Exception as e:
        ocr_result = str(e)
    mistral_res[str(pdf_file)] = ocr_result


for file in mistral_res:
    text_content = []
    for page in mistral_res[file]["pages"]:
        text_content.append(page["markdown"])

mistral_res = "\n".join(text_content)

ocr_results[MISTRAL_MODEL] = mistral_res

12it [00:26,  2.25s/it]


In [47]:
# Gemini 2.0 Flash
GEMINI_MODEL = "gemini-2.0-flash-001"
parser = LlamaParse(
    api_key=LLAMA_API_KEY,
    result_type="markdown",
    verbose=True,
    language="en",
    take_screenshot=True,
    premium_mode=True,
    auto_mode_trigger_on_table_in_page=True,
    use_vendor_multimodal_model=True,
    vendor_multimodal_api_key=GEMINI_API_KEY,
    vendor_multimodal_model_name=GEMINI_MODEL,
)

gemini_res = {}

for file in tqdm(pdf_dir.iterdir()):
    result = parser.load_data(file)
    gemini_res[str(file)] = result[0].get_content()

ocr_results[GEMINI_MODEL] = gemini_res

0it [00:00, ?it/s]

Started parsing the file under job_id b52ddb2a-a992-4c70-b977-704af66f7e32


1it [00:28, 28.26s/it]

Started parsing the file under job_id b0f3eb5f-c24c-4852-a6b5-481bdecc02c5


2it [00:55, 27.42s/it]

Started parsing the file under job_id 9eccfaf6-21dd-4a5b-a53e-d0ee33ea3e81


3it [01:14, 23.88s/it]

Started parsing the file under job_id 29198af8-77d5-4d18-ba7f-45f8edae5245


4it [01:28, 19.92s/it]

Started parsing the file under job_id eae035b0-f3a4-4489-aa22-368b249d1d12


5it [01:43, 18.03s/it]

Started parsing the file under job_id c23d73cb-33e1-4e48-95e4-a73e169b67e3
.

6it [02:56, 36.87s/it]

Started parsing the file under job_id 4801d698-8e36-49c4-8ab6-ac4f277686c4


7it [03:23, 33.47s/it]

Started parsing the file under job_id 87170992-c218-4a5a-bb85-65ccc9cd21d5


8it [03:49, 31.31s/it]

Started parsing the file under job_id 0256d37e-ef75-47be-8ea0-d04886fbd2f2


9it [04:16, 29.71s/it]

Started parsing the file under job_id 9704c9bb-2479-47ac-9369-f48376044eac
.

10it [05:30, 43.45s/it]

Started parsing the file under job_id ef61a4d7-af15-4491-b89f-f3b4deb65a9d


11it [05:57, 38.38s/it]

Started parsing the file under job_id e42e787a-a33d-4b93-8e08-d2da03c7e911
.

12it [06:52, 34.36s/it]


In [48]:
# Sonnet 3.7
ANTHROPIC_MODEL = "anthropic-sonnet-3.7"
parser = LlamaParse(
    api_key=LLAMA_API_KEY,
    result_type="markdown",
    verbose=True,
    language="en",
    take_screenshot=True,
    premium_mode=True,
    auto_mode_trigger_on_table_in_page=True,
    use_vendor_multimodal_model=True,
    vendor_multimodal_api_key=ANTHROPIC_API_KEY,
    vendor_multimodal_model_name=ANTHROPIC_MODEL,
)

anthropic_res = {}

for file in tqdm(pdf_dir.iterdir()):
    result = parser.load_data(file)
    anthropic_res[str(file)] = result[0].get_content()

0it [00:00, ?it/s]

Started parsing the file under job_id b39417ec-b4d2-4235-81cb-783676286f79
.

1it [01:30, 90.75s/it]

Started parsing the file under job_id 30ccfb77-e1d9-4044-bc48-a80cc0d714df
.

2it [02:31, 73.23s/it]

Started parsing the file under job_id 2926f636-1194-4105-8553-ebd7d8990d69


3it [03:09, 57.01s/it]

Started parsing the file under job_id 1679f920-bfef-4bc1-8afe-d99b58570191


4it [03:52, 51.61s/it]

Started parsing the file under job_id 191df752-a77f-40c2-be72-83e561887649


5it [04:29, 46.42s/it]

Started parsing the file under job_id 1fcc2a7d-4d2f-4e5a-a066-b03987785d90


6it [04:41, 34.46s/it]

Started parsing the file under job_id 4eefbf3d-caec-4d70-bb52-3ca11fb47a1f
.

7it [05:35, 40.92s/it]

Started parsing the file under job_id 6defda87-8273-46b2-8f6f-8d420d349023
..

8it [07:33, 65.48s/it]

Started parsing the file under job_id a3121672-334c-4af3-9b70-4f3b462352bd
.

9it [09:09, 75.02s/it]

Started parsing the file under job_id f505ad56-444b-4081-af67-afd461a36fa8
...

9it [12:35, 83.93s/it]

Error while parsing the file '<bytes/buffer>': Job ID: f505ad56-444b-4081-af67-afd461a36fa8 failed with status: ERROR, Error code: RATE_LIMIT_EXCEEDED, Error message: Rate limit on vendor model exceeded. Please check your token consumption limits.





IndexError: list index out of range

In [50]:
ocr_results[ANTHROPIC_MODEL] = anthropic_res

In [51]:
# GPT-4o
OPENAI_MODEL = "openai-gpt4o"

parser = LlamaParse(
    api_key=LLAMA_API_KEY,
    result_type="markdown",
    verbose=True,
    language="en",
    premium_mode=True,
    take_screenshot=True,
    auto_mode_trigger_on_table_in_page=True,
    use_vendor_multimodal_model=True,
    vendor_multimodal_api_key=OPENAI_API_KEY,
    vendor_multimodal_model_name=OPENAI_MODEL,
)

openai_res = {}

for file in tqdm(pdf_dir.iterdir()):
    result = parser.load_data(file)
    openai_res[str(file)] = result[0].get_content()

ocr_results[OPENAI_MODEL] = openai_res

0it [00:00, ?it/s]

Started parsing the file under job_id 36313c9c-29a6-488c-903e-ef1bb4a166bf
.

1it [01:09, 69.51s/it]

Started parsing the file under job_id 0d8d77e8-2102-4ba6-aa6d-6debb77ef4d1


2it [01:53, 54.27s/it]

Started parsing the file under job_id 287a437b-2746-44b0-b2b4-aba0b9b33863


3it [01:59, 32.21s/it]

Started parsing the file under job_id 06651a20-cfca-463b-aae0-8f838e6b318c
..

4it [04:25, 77.27s/it]

Started parsing the file under job_id 4e37f2a5-bc3a-4ef0-97a7-11633e4e6b81
..

5it [06:47, 100.71s/it]

Started parsing the file under job_id cad80a29-33b1-445a-9b43-10bd31a37a92


6it [07:03, 71.99s/it] 

Started parsing the file under job_id 9224ac2b-9c21-49e2-8f86-ba3b5c7b9c36
.

7it [08:17, 72.34s/it]

Started parsing the file under job_id aa203f95-7c9b-4860-b131-0eba65242709


8it [08:48, 59.41s/it]

Started parsing the file under job_id fd175c6a-ace5-44e6-bbce-28a81065bae9
.

9it [09:46, 58.90s/it]

Started parsing the file under job_id b6fa033e-61d5-4f1a-b28a-b669ab622357
.

10it [11:23, 70.78s/it]

Started parsing the file under job_id 460476d2-01ec-483c-9386-e1a327d7ee39
...

11it [14:15, 101.79s/it]

Started parsing the file under job_id 695e7e05-d772-4c72-8170-930fc26b9556


12it [14:42, 73.55s/it]


In [53]:
with open("/content/results.json", "w") as fp:
    json.dump(ocr_results, fp)