<a href="https://colab.research.google.com/github/upascal/Historical-Computational-Social-Science/blob/main/batch_OCR_to_TXT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Batch OCR PDF to TXT with Tesseract**

Code based on 
https://colab.research.google.com/drive/1UqCSFTFF4Pi3RxS-iGL7-H_sGIgccusO

Changes:
- Removed natas.
- Added batch processing.

# Install dependencies

In [None]:
%%shell
# NOTE(review): nothing else in this notebook imports torch; this nightly
# install looks like a leftover from the disabled natas step - confirm before
# removing it.
pip install --pre torch torchvision -f https://download.pytorch.org/whl/nightly/cu102/torch_nightly.html -U
# -y answers apt's confirmation prompt so the cell never waits for input
# (the other apt-get calls below already pass it).
sudo apt-get install -y tesseract-ocr tesseract-ocr-por
sudo apt-get install -y libsm6 libxext6 libxrender-dev
# poppler-utils provides the PDF rendering tools that pdf2image calls.
sudo apt-get install -y poppler-utils

pip install pytesseract
pip install pdf2image

In [None]:
#NOTE: Natas doesn't work in colab because it requires an older version of pytorch 

#%%shell
#pip3 install natas
#python3 -m natas.download
#python3 -m spacy download en_core_web_md

# Imports

In [None]:

#Libraries for PDF image conversion 
from pdf2image import convert_from_bytes, convert_from_path
import pytesseract
import numpy as np
import cv2
from PIL import Image
from google.colab.patches import cv2_imshow


#File Management
import re
import os
from google.colab import drive
from google.colab import files

# Mount PDF Folder from Drive

In [None]:
#Mount Google Drive
drive.mount('/content/drive') #(click, copy, press enter)

In [None]:
#Inspect Directory
!ls

In [None]:
#NOTE: Use the site menu to find file path

#Change Directory to data folder
%cd /content/drive/My Drive/Dissertation/Data/Annual Reports/SEC

In [None]:
#Make new directory for text ouptut 
!mkdir text_output

In [None]:
#Set directory variables
# Folder on the mounted Drive that holds the source PDFs.
directory = '/content/drive/My Drive/Dissertation/Data/Annual Reports/SEC'
# Output folder for the OCR text files. The trailing "/" is required: the
# batch loop builds output paths by plain string concatenation onto this value.
new_directory = directory+"/text_output/"

# Define OCR Functions

In [None]:
def binarization(image):
    """
    Binarize an image with an adaptive mean threshold.

    The image is converted to RGB, its red and blue channels are zeroed so
    that only the green channel feeds the grayscale conversion, and the
    grayscale result is thresholded with OpenCV.

    Parameters
    ----------
    image : PIL.Image.Image
        Image to be binarized.

    Returns
    -------
    PIL.Image.Image
        Binarized image.
    """
    pixels = np.array(image.convert('RGB'), dtype=np.uint8)
    # Blank the red (index 0) and blue (index 2) channels; green is kept.
    pixels[:, :, [0, 2]] = 0
    gray = cv2.cvtColor(pixels, cv2.COLOR_RGB2GRAY)
    # Mean-based adaptive threshold: block size 11, constant 2, max value 255.
    binary = cv2.adaptiveThreshold(
        gray, 255, cv2.ADAPTIVE_THRESH_MEAN_C, cv2.THRESH_BINARY, 11, 2
    )
    return Image.fromarray(binary)


def get_pages_as_images(file, dpi=250, margin=0.05):
    """
    Convert every page of a PDF into a cropped, binarized image.

    Each page is rendered in grayscale, a border of ``margin`` times the page
    size is cropped from all four sides, and the cropped page is passed
    through ``binarization``.

    Parameters
    ----------
    file : bytes
        Document in `bytes` containing the `PDF`.
    dpi : int, optional
        Resolution used to render the pages. Defaults to 250.
    margin : float, optional
        Fraction of the page width/height removed from each edge.
        Defaults to 0.05 (5% per side).

    Returns
    -------
    list of PIL.Image.Image
        One binarized image per page, in page order.

    Raises
    ------
    ValueError
        If `margin` is negative or would crop away the whole page.
    """
    if not 0 <= margin < 0.5:
        raise ValueError("margin must be in the range [0, 0.5)")

    pages = convert_from_bytes(file, dpi=dpi, grayscale=True)
    images_return = []
    for page in pages:
        width, height = page.size
        # Crop box is (left, top, right, bottom) in pixels.
        box = (
            width * margin,
            height * margin,
            width * (1 - margin),
            height * (1 - margin),
        )
        images_return.append(binarization(page.crop(box)))

    return images_return

def get_ocr_documents(images):
    """
    Run Tesseract over each page image and concatenate the recognised text.

    Parameters
    ----------
    images : iterable of PIL.Image.Image
        Images resulting from PDF conversion, in page order.

    Returns
    -------
    str
        The text of all pages joined together with no separator added.
    """
    # Config string format:
    # https://stackoverflow.com/questions/44619077/pytesseract-ocr-multiple-config-options
    return ''.join(
        pytesseract.image_to_string(page, config='--psm 4', lang='eng')
        for page in images
    )

def is_number(s):
    """
    Check whether a value can be converted to a float.

    Parameters
    ----------
    s : object
        Value to test, typically a string or a number.

    Returns
    -------
    bool
        True if ``float(s)`` succeeds. False if the conversion is rejected,
        either because the text is not numeric or because the type cannot
        be converted at all (e.g. ``None`` or a list).
    """
    try:
        float(s)
    except (TypeError, ValueError):
        # ValueError: non-numeric text; TypeError: unsupported type.
        return False
    return True

# Run OCR on documents


In [None]:
# Make sure the output folder exists so this cell can run on its own.
os.makedirs(new_directory, exist_ok=True)

# Sorted so that files are processed (and progress is printed) in a stable order.
for filename in sorted(os.listdir(directory)):

    # Compare the extension case-insensitively so "REPORT.PDF" is not skipped.
    if not filename.lower().endswith(".pdf"):
        print("{} is not a PDF".format(filename))
        continue

    print("Starting OCR on {}".format(filename))
    #Load PDF
    file_path = os.path.join(directory, filename)
    # Context manager closes the handle as soon as the bytes are read.
    with open(file_path, "rb") as pdf_handle:
        pdf_file = pdf_handle.read()

    #OCR PDF
    images = get_pages_as_images(pdf_file)
    pages_text = get_ocr_documents(images)
    print("Finished OCR")

    #Save to new Directory as TXT
    base_name = os.path.splitext(os.path.basename(filename))[0] #removes "PDF"
    output_file_name = "{}_OCR.txt".format(base_name) #creates output file name
    output_file_path = os.path.join(new_directory, output_file_name)
    # Explicit UTF-8 so non-ASCII characters in the OCR text are always writable.
    with open(output_file_path, 'w', encoding='utf-8') as f:
        f.write(pages_text)
    print("Saved text as {}".format(output_file_name))





Go to the text_output folder in Drive to view the results.

In [None]:
# %cd takes its argument as a literal path and does not evaluate Python
# expressions, so change into the output folder with os.chdir instead.
os.chdir(new_directory)