## PyTesseract

In [None]:
!pip install pytesseract
!sudo apt install tesseract-ocr
import pytesseract
from PIL import Image
import re
import json

def parse_document(image_path):
    # Open the image using PIL
    image = Image.open(image_path)

    # Use pytesseract to extract text
    text = pytesseract.image_to_string(image)

    # Define regular expressions to extract information
    data = {
        "filing_id": re.search(r"Filing ID\s*:\s*(\d+)", text),
        "document_id": re.search(r"Document ID\s*:\s*(\d+)", text),
        "bank_name": "FIRST COMMUNITY BANK" if "FIRST COMMUNITY BANK" in text else None,
        "debtor": {
            "name": re.search(r"1b\. INDIVIDUAL'S SURNAME FIRST PERSONAL NAME ADDITIONAL NAME\(S\)/INITIAL\(S\) SUFFIX\s+(.+?)\n", text),
            "address": re.search(r"1c\. MAILING ADDRESS\s+(.+?)\s+(LEAD HILL AR\s+\d+ USA)", text)
        },
        "secured_party": {
            "name": re.search(r"3a\. ORGANIZATION'S NAME\s+(.+?)\n", text),
            "address": re.search(r"3c\. MAILING ADDRESS:\s+(.+?)\s+(.+?)\s+(\d+ USA)", text)
        },
        "collateral": re.search(r"4\. COLLATERAL:\s+(.+?)\n\n", text, re.DOTALL)
    }

    # Extracted values need to be stripped and checked for None
    for key, value in data.items():
        if isinstance(value, re.Match):
            data[key] = value.group(1).strip()
        elif isinstance(value, dict):
            for sub_key, sub_value in value.items():
                if isinstance(sub_value, re.Match):
                    value[sub_key] = sub_value.group(1).strip()
                else:
                    value[sub_key] = None
        else:
            data[key] = None

    return data

# Path to the image file
image_path = '/content/4000028037756.tiff'

# Parse the document and print the result
parsed_data = parse_document(image_path)
print(json.dumps(parsed_data, indent=4))


Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
tesseract-ocr is already the newest version (4.1.1-2.1build1).
0 upgraded, 0 newly installed, 0 to remove and 45 not upgraded.
{
    "filing_id": "4000028037756",
    "document_id": "17503743001",
    "bank_name": null,
    "debtor": {
        "name": "HUEBNER RICHIE HICKS",
        "address": null
    },
    "secured_party": {
        "name": "FIRST COMMUNITY BANK",
        "address": "CITY STATE |POSTAL CODE COUNTRY"
    },
    "collateral": "This financing statement covers the following collateral:"
}


## NER

In [5]:
!pip install pytesseract opencv-python spacy
!sudo apt install tesseract-ocr
!python -m spacy download en_core_web_sm

import pytesseract
import cv2
import re
import json
import spacy
from PIL import Image

# Load the SpaCy model
nlp = spacy.load("en_core_web_sm")

def preprocess_image(image_path):
    # Load the image using OpenCV
    image = cv2.imread(image_path)

    # Convert to grayscale
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

    # Apply thresholding to get a binary image
    _, binary = cv2.threshold(gray, 150, 255, cv2.THRESH_BINARY_INV)

    return binary

def extract_entities(text):
    # Process the text with SpaCy
    doc = nlp(text)

    # Extract entities
    entities = {}
    for ent in doc.ents:
        entities[ent.label_] = ent.text  # Store the entity label and its text

    return entities

def parse_document(image_path):
    # Preprocess the image
    processed_image = preprocess_image(image_path)

    # Use pytesseract to extract text from the preprocessed image
    text = pytesseract.image_to_string(processed_image)

    # Extract entities using NER
    ner_data = extract_entities(text)

    # Define regular expressions to extract specific information
    data = {
        "filing_id": re.search(r"Filing ID\s*:\s*(\d+)", text),
        "document_id": re.search(r"Document ID\s*:\s*(\d+)", text),
        "bank_name": "FIRST COMMUNITY BANK" if "FIRST COMMUNITY BANK" in text else None,
        "debtor": {
            "name": re.search(r"1b\. INDIVIDUAL'S SURNAME FIRST PERSONAL NAME ADDITIONAL NAME\(S\)/INITIAL\(S\) SUFFIX\s+(.+?)\n", text),
            "address": re.search(r"1c\. MAILING ADDRESS\s+(.+?)\s+(LEAD HILL AR\s+\d+ USA)", text)
        },
        "secured_party": {
            "name": re.search(r"3a\. ORGANIZATION'S NAME\s+(.+?)\n", text),
            "address": re.search(r"3c\. MAILING ADDRESS:\s+(.+?)\s+(.+?)\s+(\d+ USA)", text)
        },
        "collateral": re.search(r"4\. COLLATERAL:\s+(.+?)\n\n", text, re.DOTALL)
    }

    # Extracted values need to be stripped and checked for None
    for key, value in data.items():
        if isinstance(value, re.Match):
            data[key] = value.group(1).strip()
        elif isinstance(value, dict):
            for sub_key, sub_value in value.items():
                if isinstance(sub_value, re.Match):
                    value[sub_key] = sub_value.group(1).strip()
                else:
                    value[sub_key] = None
        else:
            data[key] = None

    # Combine NER data with parsed data
    data.update(ner_data)

    return data

# Path to the image file
image_path = '/content/4000028037756.tiff'

# Parse the document and print the result
parsed_data = parse_document(image_path)
print(json.dumps(parsed_data, indent=4))


Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
tesseract-ocr is already the newest version (4.1.1-2.1build1).
0 upgraded, 0 newly installed, 0 to remove and 45 not upgraded.
Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m56.1 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.
{
    "filing_id": null,
    "document_id": "17503743001",
    "bank_name": null,
    "debtor": {
        "name": nul