# Installs and Imports

In [None]:
# Install the notebook's third-party dependencies into the current runtime.
# NOTE(review): versions are not pinned, so a later re-run may resolve different releases — consider pinning.
!pip install python-doctr
!pip install tf2onnx
!pip install opencv-python



Collecting python-doctr
  Downloading python_doctr-0.10.0-py3-none-any.whl.metadata (33 kB)
Collecting pypdfium2<5.0.0,>=4.11.0 (from python-doctr)
  Downloading pypdfium2-4.30.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (48 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.5/48.5 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pyclipper<2.0.0,>=1.2.0 (from python-doctr)
  Downloading pyclipper-1.3.0.post6-cp310-cp310-manylinux_2_12_x86_64.manylinux2010_x86_64.whl.metadata (9.0 kB)
Collecting langdetect<2.0.0,>=1.0.9 (from python-doctr)
  Downloading langdetect-1.0.9.tar.gz (981 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m981.5/981.5 kB[0m [31m33.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting rapidfuzz<4.0.0,>=3.0.0 (from python-doctr)
  Downloading rapidfuzz-3.10.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Collect



In [None]:
#utilities
import os
import time
import json
import tqdm  # NOTE(review): this module binding is immediately rebound to the `tqdm` callable by the next line
from tqdm import tqdm

#deep learning
import torch
print(torch.__version__)  # Should print a compatible version
print(torch.cuda.is_available())  # reports whether PyTorch can see a CUDA device

#ocr model
import doctr
from doctr.models import ocr_predictor
from doctr.io import DocumentFile



2.5.1+cu121
True


In [None]:
# Unpack the uploaded archive; it extracts to TIF_Double_Directory/ with one sub-folder per document.
!unzip /content/TIF_Double_Directory.zip

Archive:  /content/TIF_Double_Directory.zip
   creating: TIF_Double_Directory/
  inflating: __MACOSX/._TIF_Double_Directory  
   creating: TIF_Double_Directory/4005813/
   creating: TIF_Double_Directory/4105565/
   creating: TIF_Double_Directory/4104649/
  inflating: TIF_Double_Directory/.DS_Store  
  inflating: __MACOSX/TIF_Double_Directory/._.DS_Store  
   creating: TIF_Double_Directory/4305776/
   creating: TIF_Double_Directory/4104165/
   creating: TIF_Double_Directory/48167_4000820/
   creating: TIF_Double_Directory/4205888/
   creating: TIF_Double_Directory/4302583/
   creating: TIF_Double_Directory/4005318/
   creating: TIF_Double_Directory/4307375/
   creating: TIF_Double_Directory/4005800/
   creating: TIF_Double_Directory/4306466/
   creating: TIF_Double_Directory/4306607/
   creating: TIF_Double_Directory/4302425/
   creating: TIF_Double_Directory/4307353/
   creating: TIF_Double_Directory/4004046/
   creating: TIF_Double_Directory/4104592/
   creating: TIF_Double_Directory/

# Initializing OCR Model

In [None]:
#initialize the docTR OCR predictor once; the helper functions below read this module-level `ocr` object
#the chosen detection/recognition pair is the one the authors found closest to Textract
  #det_arch - text detection model
  #reco_arch - text recognition model
ocr = ocr_predictor(det_arch='db_resnet50', reco_arch='crnn_vgg16_bn', pretrained=True)


Downloading https://doctr-static.mindee.com/models?id=v0.9.0/db_resnet50-649fa22b.weights.h5&src=0 to /root/.cache/doctr/models/db_resnet50-649fa22b.weights.h5


  0%|          | 0/101736536 [00:00<?, ?it/s]

DEBUG:tensorflow:Layer lstm will use cuDNN kernels when running on GPU.
DEBUG:tensorflow:Layer lstm will use cuDNN kernels when running on GPU.
DEBUG:tensorflow:Layer lstm will use cuDNN kernels when running on GPU.
DEBUG:tensorflow:Layer lstm_1 will use cuDNN kernels when running on GPU.
DEBUG:tensorflow:Layer lstm_1 will use cuDNN kernels when running on GPU.
DEBUG:tensorflow:Layer lstm_1 will use cuDNN kernels when running on GPU.


Downloading https://doctr-static.mindee.com/models?id=v0.9.0/crnn_vgg16_bn-9c188f45.weights.h5&src=0 to /root/.cache/doctr/models/crnn_vgg16_bn-9c188f45.weights.h5


  0%|          | 0/63386336 [00:00<?, ?it/s]



# Helper Functions - Collect Data, Run OCR, Process OCR Output, and Output final JSON

## Single Page Approach

In [None]:
def get_ocr_single(document):
  """
    OCR wrapper for the one-page approach: runs the model and flattens its output.

    The document is passed to the module-level docTR predictor `ocr`, then the
    result is walked page -> block -> line -> word and collapsed into one string.

    Parameters:
    - document (DocumentFile): A docTR document object (expected to hold one page).

    Returns:
    - text (str): The concatenated OCR text. Every word is followed by a single space,
      and every line is terminated by a newline followed by a space.
  """

  #OCR
  prediction = ocr(document) #module-level predictor does the actual recognition

  #Output Processing
  pieces = [] #string fragments, joined once at the end

  for page in prediction.pages:
    for block in page.blocks:
      for line in block.lines:
        pieces.extend(word.value + ' ' for word in line.words) #each word followed by a space
        pieces.append('\n' + ' ') #each line closed with a newline and a space

  return ''.join(pieces)



In [None]:
#Takes a directory, loads its TIFF files as docTR document objects, passes each one to the OCR model, and collects the output

def get_data_single(directory):


  """
    This function is for handling the one page approach.
    It finds the TIFF files directly inside `directory`, converts each one into a docTR document object,
    runs it through `get_ocr_single`, and returns the OCR text keyed by file name.

    Names and paths are built from a single, sorted directory listing so that the
    name / path / object / text lists are always index-aligned and the order is reproducible.

    Parameters:
    - directory : The directory containing the TIFF files to be processed. All files ending in '.tif'
      are expected to sit directly inside it (sub-directories are not searched).

    Returns:
    - output_json (dict): A dictionary where the keys are document names and the values are the corresponding OCR output texts.
    - document_names (list): A sorted list of the names of the documents processed (TIFF files). For Debugging.
    - document_paths (list): A list of the full file paths of the documents. For Debugging.
    - document_objects (list): A list of docTR document objects created from the TIFF files. For Debugging.
    - document_texts (list): A list of OCR text outputs corresponding to each document. For Debugging.
  """

  start_time = time.time()

  #Path Collection
  #one listing with one filter feeds both lists; filtering names and paths separately
  #with different suffixes could shift the name/text pairing in the final dictionary
  document_names = sorted(item for item in os.listdir(directory) if item.endswith('.tif'))
  document_paths = [os.path.join(directory, name) for name in document_names]

  #Doc Object Creation
  #creates one docTR document object per TIFF file
  document_objects = [DocumentFile.from_images(path) for path in document_paths]

  #OCR
  #stores a list of all OCR outputs, in the same order as document_names
  document_texts = []
  for doc in tqdm(document_objects, desc = 'Processing Document'):
    document_texts.append(get_ocr_single(doc)) #calling our ocr wrapper

  end_time = time.time()
  print(f'Run time for {len(document_names)} documents: {end_time - start_time}.')

  #Populating JSON
  #keys are document names, values are OCR outputs
  output_json = dict(zip(document_names, document_texts))

  #returns the JSON and other information for debugging.
  return output_json, document_names, document_paths, document_objects, document_texts

In [None]:
#Running
#Running the one-page approach over the .tif files that sit directly inside `directory`
directory = '/content/' #defining directory name
#the fourth returned value is the list of docTR document objects (bound here as `document_images`)
output_json, document_names, document_paths, document_images, document_texts = get_data_single(directory = directory) #collecting outputs

Processing Document: 100%|██████████| 25/25 [03:51<00:00,  9.25s/it]

Run time for 25 documents: 232.0867350101471.





In [None]:
#Saving the one-page results (document name -> OCR text) as pretty-printed JSON in the working directory
with open('output_json_25_newline.json', 'w') as file:
  json.dump(output_json, file, indent = 4)

In [None]:
#Sanity check: number of entries in the one-page output dictionary
len(output_json)

25

## Two-Page Approach

In [None]:
def get_ocr_double(document): #accepts docTR document
  """
    OCR wrapper for the two-page approach: runs the model and flattens its output.

    The document (expected to hold two pages) is passed to the module-level docTR
    predictor `ocr`; the pages of the result are then read in order and their
    words are concatenated into a single string.

    Parameters:
    - document (DocumentFile): A docTR document object

    Returns:
    - text (str): The concatenated OCR text for all pages. Every word is followed by a single
      space, and every line is terminated by a newline followed by a space.
  """

  #OCR
  recognised = ocr(document) #module-level predictor, handles every page in one call

  #Output Processing
  #one fragment per OCR line: its words (each with a trailing space), then newline + space
  line_fragments = (
      ''.join(word.value + ' ' for word in line.words) + '\n' + ' '
      for page in recognised.pages
      for block in page.blocks
      for line in block.lines
  )

  return ''.join(line_fragments)



In [None]:
def get_data_double(directory):


  """
    This function is for handling the two page approach.
    It walks the sub-folders of `directory`, builds one two-page docTR document object per folder
    from its first two '.tif' files (sorted by name), runs it through `get_ocr_double`,
    and returns the OCR text in a structured format.

    Parameters:
    - directory : The directory containing the TIFF files to be processed. Expects a nested directory structure as below:

      document_directory/
      │
      ├── document_1/
      │   ├── page1.tif
      │   └── page2.tif
      │
      ├── document_2/
      │   ├── page1.tif
      │   └── page2.tif
      │
      ├── document_3/
      │   ├── page1.tif
      │   └── page2.tif

      Top-level entries that are not directories (for example '.DS_Store') are ignored.
      Folders holding fewer than two '.tif' files are skipped with a printed message.
      Folders are processed in sorted order so results are reproducible.

    Returns:
    - output_json (dict): OCR text per document. The key is the document's first-page file name; when several
      documents share the same first-page name (as in the layout above) the key is '<folder>/<first page>'
      instead, so that no document silently overwrites another.
    - document_names (list): Names of every entry found inside the document folders ('.DS_Store' excluded). For Debugging.
    - first_page_names (list): The first-page file name of each processed document, in processing order. For Debugging.
    - document_objects (list): A list of docTR document objects, one (two-page) object per processed folder. For Debugging.
    - document_texts (list): A list of OCR text outputs corresponding to each document. For Debugging.
  """

  start_time = time.time()

  #Storage
  document_folders = [] #names of the folders that are actually processed (document_1, document_2, ...)
  document_names = [] #names of all entries found inside document folders
  first_page_names = [] #the name of the first page of each processed document

  document_objects = [] #document objects
  document_texts = [] #ocr outputs
  output_json = {} #final output JSON

  #Document Creation
  #converts every internal folder into a docTR document object - with 2 pages.
  for internal_directory in sorted(os.listdir(directory)):
    directory_path = os.path.join(directory, internal_directory)
    if not os.path.isdir(directory_path):
      continue #stray files such as .DS_Store cannot be listed as a folder, so skip them up front

    entries = sorted(entry for entry in os.listdir(directory_path) if entry != '.DS_Store')
    document_names.extend(entries)

    pages = [entry for entry in entries if entry.endswith('.tif')] #already sorted, important for OCR output order
    if len(pages) < 2:
      #an incomplete folder would otherwise abort the whole batch with an IndexError
      print(f'Skipping {directory_path}: expected 2 .tif pages, found {len(pages)}.')
      continue

    page1, page2 = pages[0], pages[1]
    document_folders.append(internal_directory)
    first_page_names.append(page1)
    page_paths = [os.path.join(directory_path, page1), os.path.join(directory_path, page2)] #full paths
    document_objects.append(DocumentFile.from_images(page_paths)) #docTR doc object created with two pages

  #OCR
  for i, doc in tqdm(enumerate(document_objects, start = 1), desc = 'Processing Document', total = len(document_objects)):
    print(f'Doing OCR on Document {i}')
    text = get_ocr_double(doc) #calling ocr wrapper function
    document_texts.append(text) #adding to a list of texts

  end_time = time.time()
  print(f'Run time for {len(first_page_names)} documents: {end_time - start_time}.')

  #Populating JSON
  #first-page names that occur in more than one folder need a folder prefix to stay unique
  seen_names = set()
  duplicated_names = set()
  for name in first_page_names:
    if name in seen_names:
      duplicated_names.add(name)
    seen_names.add(name)

  #keys are document names, values are OCR outputs
  for folder, name, text in zip(document_folders, first_page_names, document_texts):
    key = f'{folder}/{name}' if name in duplicated_names else name
    output_json[key] = text

  return output_json, document_names, first_page_names, document_objects, document_texts

# Main

In [None]:
#Running the two-page approach
#NOTE(review): this rebinds output_json / document_names / document_texts from the one-page run above
directory = '/content/TIF_Double_Directory' #giving directory name
output_json, document_names, first_page_names, document_objects, document_texts = get_data_double(directory = directory) #collecting output


#Saving the two-page results as pretty-printed JSON in the current working directory
with open('output_json__two_25.json', 'w') as file:
  json.dump(output_json, file, indent = 8)

Doing OCR on Document 1
Doing OCR on Document 2
Doing OCR on Document 3
Doing OCR on Document 4
Doing OCR on Document 5
Doing OCR on Document 6
Doing OCR on Document 7
Doing OCR on Document 8
Doing OCR on Document 9
Doing OCR on Document 10
Doing OCR on Document 11
Doing OCR on Document 12
Doing OCR on Document 13
Doing OCR on Document 14
Doing OCR on Document 15
Doing OCR on Document 16
Doing OCR on Document 17
Doing OCR on Document 18
Doing OCR on Document 19
Doing OCR on Document 20
Doing OCR on Document 21
Doing OCR on Document 22
Doing OCR on Document 23
Doing OCR on Document 24
Doing OCR on Document 25
Run time for 50 documents: 470.6102738380432.
