In [1]:
# imports and helper functions

import glob
import os
import random
import subprocess
import sys
import tempfile
from itertools import islice

import braceexpand
import webdataset as wds

def summarize(sample):
    """Print one line per field of `sample`.

    Each line shows the key followed by the first 100 characters of the
    value's `repr`, so large binary payloads stay readable.
    """
    for key in sample:
        preview = repr(sample[key])[:100]
        print(key, preview)

def read_binary(fname):
    """Return the full contents of the file at `fname` as bytes."""
    stream = open(fname, "rb")
    try:
        return stream.read()
    finally:
        # Always release the file handle, even if the read fails.
        stream.close()

# Parallel Processing of Shards: Large Scale OCR

This notebook illustrates how to take a large collection of shards consisting of PDFs and process them using `pdftoppm` and `tesseract` into a new dataset consisting of page images and corresponding OCR output.

The general approach is to process each shard sequentially and to process multiple shards in parallel. The basic structure of such a job looks like:

```Python
with WebDataset(srcname) as src:
    with TarWriter(dstname) as dst:
        for sample in src:
            ... do something with sample ...
            dst.write(sample)
upload(dstname)
```

# The Arxiv Dataset of PDFs

In [2]:
# The dataset is tar files containing PDFs, each using the Arxiv naming convention.

!gsutil cat gs://webdataset/testdata/arxiv-pdfs-{000000..000001}.tar | tar tf - | sed 5q

1808.00020v6.pdf


1511.05082v1.pdf


1610.08000v1.pdf
1506.03736v2.pdf
1909.03824v1.pdf


tar: stdout: write error


In [3]:
# Arxiv naming conventions are incompatible with WebDataset, but we can add
# a file renaming function to the WebDataset to fix this.

def arxiv_rename(name):
    """Rewrite an Arxiv file name so its only dot is the one before `pdf`.

    Every occurrence of ".pdf" is removed, the remaining dots are turned
    into underscores, and a single ".pdf" suffix is appended, e.g.
    "1808.00020v6.pdf" becomes "1808_00020v6.pdf".
    """
    stem = name.replace(".pdf", "")
    return "{}.pdf".format(stem.replace(".", "_"))

# For this example, we just use two shards, but usually, you would have hundreds
# or thousands of shards.

# Shard URLs written in brace notation; expanded into a list of URLs below.
dataset = "gs://webdataset/testdata/arxiv-pdfs-{000000..000001}.tar"

# Let's open the dataset and read the first sample.

# One URL per shard; `shardurls` is reused by the processing cells further down.
shardurls = list(braceexpand.braceexpand(dataset))
# arxiv_rename is passed as the file renaming function for the dataset.
ds = wds.WebDataset(shardurls, rename_files=arxiv_rename)
# Pull only the first sample; `sample` is reused when testing process_sample.
sample = next(iter(ds))
summarize(sample)

GOPEN gs://webdataset/testdata/arxiv-pdfs-000000.tar {}


__key__ '1808_00020v6'
__url__ 'gs://webdataset/testdata/arxiv-pdfs-000000.tar'
pdf b'%PDF-1.5\n%\x8f\n18 0 obj\n<< /Filter /FlateDecode /Length 5428 >>\nstream\nx\xda\xad[]\xb3\xe3\xb


# Running Tesseract on a Single PDF

In [4]:
def process_sample(sample, maxpages=9999, shuffle=True):
    """Process a sample from the Arxiv dataset.

    This function converts the PDF file to a sequence of JPEG images
    (one per page, rendered by `pdftoppm` at 300 dpi) and then invokes
    Tesseract to recognize the text in the images. It is a generator that
    yields one sample per page, each containing the JPEG image and the
    hOCR output from Tesseract.

    Args:
        sample: Mapping with a "__key__" string and the raw PDF bytes under "pdf".
        maxpages: Maximum number of pages to OCR and yield.
        shuffle: If true, the page order is randomized before `maxpages` is
            applied, so a random subset of pages is selected.

    Yields:
        dict with "__key__" (the input key plus "/<page name>"), "jpg" (bytes
        of the page image) and "hocr" (bytes of the Tesseract hOCR output).

    Raises:
        subprocess.CalledProcessError: If `pdftoppm` or `tesseract` exits with
            a non-zero status.
    """

    # We work in a temporary directory; most operations are command line tools.
    # The directory exists only while the generator is active.

    with tempfile.TemporaryDirectory() as dirname:

        # Write the PDF file to disk and convert it to a sequence of JPEGs using pdftoppm.
        pdfpath = os.path.join(dirname, "sample.pdf")
        with open(pdfpath, "wb") as stream:
            stream.write(sample["pdf"])

        # Run the tool outside of any `assert` (asserts are stripped under
        # `python -O`) and without a shell, so paths need no quoting.
        subprocess.run(
            ["pdftoppm", "-forcenum", "-jpeg", "-r", "300", "-l", "9999", "sample.pdf", "page"],
            cwd=dirname,
            check=True,
        )

        # Next, we iterate over the page images and run Tesseract on each of them.
        # Sorting first gives a deterministic order when shuffle is disabled.
        pattern = os.path.join(glob.escape(dirname), "page-*.jpg")
        pages = sorted(glob.glob(pattern))
        if shuffle:
            random.shuffle(pages)

        for page in islice(pages, maxpages):
            page_without_suffix = os.path.splitext(page)[0]
            base = os.path.basename(page_without_suffix)

            # Invoke Tesseract to convert the page image to hOCR; the result is
            # read back from `<page_without_suffix>.hocr` below. A failing run
            # raises here instead of surfacing later as a missing file.
            subprocess.run(["tesseract", page, page_without_suffix, "hocr"], check=True)

            # Construct the output sample.
            nsample = {
                "__key__": sample["__key__"] + f"/{base}",
                "jpg": read_binary(page),
                "hocr": read_binary(page_without_suffix + ".hocr"),
            }

            # This function returns an iterator over the recognized pages.
            yield nsample

In [5]:
# Advance the generator just far enough to get one page. Because `shuffle`
# defaults to True, this is a randomly chosen page, not necessarily the first.
output = next(process_sample(sample))
summarize(output)

Tesseract Open Source OCR Engine v4.1.1 with Leptonica


__key__ '1808_00020v6/page-14'
jpg b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x01\x01,\x01,\x00\x00\xff\xdb\x00C\x00\x08\x06\x06\x07\x0
hocr b'<?xml version="1.0" encoding="UTF-8"?>\n<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional/


# Processing a Shard of PDF Files

In [6]:
def process_shard(src, dst, maxpdfs=999999, maxpages=9999):
    """Convert one shard of Arxiv PDFs into a shard of OCR'ed pages.

    Each PDF read from `src` is handed to `process_sample`; every page
    sample it yields (page image plus hOCR output) is written to the tar
    file `dst`, one sample per page. Progress is printed as keys are
    processed.

    `maxpdfs` limits how many PDFs are read from the shard and `maxpages`
    limits how many pages are taken from each PDF. This is useful for
    testing, as well as for limiting the number of pages selected from very
    long PDF documents.
    """
    with wds.TarWriter(dst) as writer:
        pdfs = wds.WebDataset(src, rename_files=arxiv_rename)
        for pdf in islice(pdfs, maxpdfs):
            print(pdf["__key__"], pdf.keys())
            page_samples = process_sample(pdf, maxpages=maxpages)
            for page_sample in page_samples:
                print("    ", page_sample["__key__"])
                writer.write(page_sample)

In [7]:
!rm -f output.tar
# Quick local test: process only the first shard, limited to two PDFs and two pages per PDF.
process_shard(shardurls[0], "output.tar", maxpdfs=2, maxpages=2)

GOPEN output.tar {}
GOPEN gs://webdataset/testdata/arxiv-pdfs-000000.tar {}


1808_00020v6 dict_keys(['__key__', '__url__', 'pdf'])


Tesseract Open Source OCR Engine v4.1.1 with Leptonica


     1808_00020v6/page-10


Tesseract Open Source OCR Engine v4.1.1 with Leptonica


     1808_00020v6/page-04
1511_05082v1 dict_keys(['__key__', '__url__', 'pdf'])


Tesseract Open Source OCR Engine v4.1.1 with Leptonica


     1511_05082v1/page-03


Tesseract Open Source OCR Engine v4.1.1 with Leptonica


     1511_05082v1/page-11


In [8]:
!tar tvf output.tar

-r--r--r-- bigdata/bigdata 78493 2023-12-18 00:10 1808_00020v6/page-10.hocr
-r--r--r-- bigdata/bigdata 915037 2023-12-18 00:10 1808_00020v6/page-10.jpg
-r--r--r-- bigdata/bigdata  90999 2023-12-18 00:10 1808_00020v6/page-04.hocr
-r--r--r-- bigdata/bigdata 993979 2023-12-18 00:10 1808_00020v6/page-04.jpg
-r--r--r-- bigdata/bigdata  61068 2023-12-18 00:10 1511_05082v1/page-03.hocr
-r--r--r-- bigdata/bigdata 782846 2023-12-18 00:10 1511_05082v1/page-03.jpg
-r--r--r-- bigdata/bigdata  31699 2023-12-18 00:10 1511_05082v1/page-11.hocr
-r--r--r-- bigdata/bigdata 428418 2023-12-18 00:10 1511_05082v1/page-11.jpg


# Parallelizing Processing with Ray

This illustrates how to use Ray to process many shards in parallel.

You don't need to use Ray for this; you can also invoke `process_shard` in parallel using a job queueing system or some other distributed computing framework.

Generally, it is easiest to process each shard sequentially, and to process multiple shards in parallel. However, you could use additional parallelization to perform processing of the samples in parallel.

In [9]:
maxpdfs = 2  # for testing, we just use two PDFs per shard
maxpages = 2  # for testing, we just use two pages per PDF
# Shell command template; {src} and {dst} are filled in by process_shard_parallel.
# The leading `echo` prints the gsutil command instead of running it.
upload_cmd = "echo gsutil cp {src} {dst}"  # for testing, we don't actually upload the completed shards

import ray
# Only start Ray when it is not already initialized, so this cell can be re-run.
if not ray.is_initialized():
    ray.init()

@ray.remote(num_cpus=4)
def process_shard_parallel(src, dstbucket, maxpdfs=999999, maxpages=9999):
    """Process a shard of the Arxiv dataset and upload the output shard to a bucket.

    This function reads a shard of the Arxiv dataset, processes each sample
    using the process_sample function, and writes the page images and corresponding
    hOCR output to a new shard, one sample per page. The output shard is then
    uploaded to the specified bucket using `upload_cmd`.

    Args:
        src: URL of the input shard; its base name is reused for the output shard.
        dstbucket: Destination prefix; the output goes to `<dstbucket>/<basename of src>`.
        maxpdfs: Maximum number of PDFs to read from the shard.
        maxpages: Maximum number of pages to process per PDF.

    Raises:
        RuntimeError: If the upload command exits with a non-zero status.
    """
    # Strip trailing slashes so the destination never contains "//" after the bucket.
    dst = dstbucket.rstrip("/") + "/" + os.path.basename(src)
    # The shard is written to a temporary file that is deleted once the upload is done.
    with tempfile.NamedTemporaryFile() as tmp:
        process_shard(src, tmp.name, maxpdfs=maxpdfs, maxpages=maxpages)
        # Run the upload outside of `assert`: asserts are stripped under
        # `python -O`, which would silently skip the upload altogether.
        command = upload_cmd.format(src=tmp.name, dst=dst)
        status = os.system(command)
        if status != 0:
            raise RuntimeError(f"upload command failed with status {status}: {command}")

!rm -f output.tar
# Submit one remote task per shard and block until all of them have finished.
ray.get([process_shard_parallel.remote(src, "gs://somebucket", maxpdfs=maxpdfs, maxpages=maxpages) for src in shardurls])


2023-12-18 00:10:51,643	INFO worker.py:1664 -- Started a local Ray instance. View the dashboard at [1m[32m127.0.0.1:8265 [39m[22m


[36m(process_shard_parallel pid=718187)[0m GOPEN /tmp/tmp8x64grix {}
[36m(process_shard_parallel pid=718187)[0m GOPEN gs://webdataset/testdata/arxiv-pdfs-000000.tar {}


[36m(process_shard_parallel pid=718192)[0m 1402_1973v2 dict_keys(['__key__', '__url__', 'pdf'])


[36m(process_shard_parallel pid=718192)[0m Tesseract Open Source OCR Engine v4.1.1 with Leptonica


[36m(process_shard_parallel pid=718192)[0m      1402_1973v2/page-16


[36m(process_shard_parallel pid=718192)[0m Tesseract Open Source OCR Engine v4.1.1 with Leptonica
[36m(process_shard_parallel pid=718192)[0m GOPEN gs://webdataset/testdata/arxiv-pdfs-000001.tar {}[32m [repeated 2x across cluster] (Ray deduplicates logs by default. Set RAY_DEDUP_LOGS=0 to disable log deduplication, or see https://docs.ray.io/en/master/ray-observability/ray-logging.html#log-deduplication for more options.)[0m


[36m(process_shard_parallel pid=718187)[0m Tesseract Open Source OCR Engine v4.1.1 with Leptonica


[36m(process_shard_parallel pid=718192)[0m      1402_1973v2/page-04
[36m(process_shard_parallel pid=718192)[0m 1612_01474v3 dict_keys(['__key__', '__url__', 'pdf'])[32m [repeated 2x across cluster][0m


[36m(process_shard_parallel pid=718187)[0m Detected 71 diacritics


[36m(process_shard_parallel pid=718192)[0m Tesseract Open Source OCR Engine v4.1.1 with Leptonica


[36m(process_shard_parallel pid=718187)[0m      1808_00020v6/page-03


[36m(process_shard_parallel pid=718187)[0m      1808_00020v6/page-15
[36m(process_shard_parallel pid=718187)[0m 1511_05082v1 dict_keys(['__key__', '__url__', 'pdf'])


[36m(process_shard_parallel pid=718187)[0m      1511_05082v1/page-16[32m [repeated 2x across cluster][0m


[36m(process_shard_parallel pid=718187)[0m Tesseract Open Source OCR Engine v4.1.1 with Leptonica[32m [repeated 4x across cluster][0m


[36m(process_shard_parallel pid=718192)[0m gsutil cp /tmp/tmpda6ohiy3 gs://somebucket/arxiv-pdfs-000001.tar


[None, None]