In [18]:
def detect_document(path):
    """Detects document features in an image."""
    from google.cloud import vision

    client = vision.ImageAnnotatorClient()

    with open(path, "rb") as image_file:
        content = image_file.read()

    image = vision.Image(content=content)

    # Specify language as English
    image_context = vision.ImageContext(language_hints=["en"])

    response = client.document_text_detection(image=image, image_context=image_context)

    for page in response.full_text_annotation.pages:
        for block in page.blocks:
            print(f"\nBlock confidence: {block.confidence}\n")

            for paragraph in block.paragraphs:
                print("Paragraph confidence: {}".format(paragraph.confidence))

                for word in paragraph.words:
                    word_text = "".join([symbol.text for symbol in word.symbols])
                    print(
                        "Word text: {} (confidence: {})".format(
                            word_text, word.confidence
                        )
                    )

                    for symbol in word.symbols:
                        print(
                            "\tSymbol: {} (confidence: {})".format(
                                symbol.text, symbol.confidence
                            )
                        )

    if response.error.message:
        raise Exception(
            "{}\nFor more info on error messages, check: "
            "https://cloud.google.com/apis/design/errors".format(response.error.message)
        )
    
    return response.full_text_annotation.text


In [34]:
detect_document("data/form_info_001/apartment_no/02110.jpg")


Block confidence: 0.8273344039916992

Paragraph confidence: 0.8273344039916992
Word text: Furst (confidence: 0.8869720101356506)
	Symbol: F (confidence: 0.7302184104919434)
	Symbol: u (confidence: 0.798406183719635)
	Symbol: r (confidence: 0.9474553465843201)
	Symbol: s (confidence: 0.9712465405464172)
	Symbol: t (confidence: 0.9875335097312927)
Word text: Door (confidence: 0.7933574914932251)
	Symbol: D (confidence: 0.5096052289009094)
	Symbol: o (confidence: 0.903228759765625)
	Symbol: o (confidence: 0.8595099449157715)
	Symbol: r (confidence: 0.901086151599884)
Word text: Rite (confidence: 0.7867642641067505)
	Symbol: R (confidence: 0.6802905201911926)
	Symbol: i (confidence: 0.5332353115081787)
	Symbol: t (confidence: 0.9713659286499023)
	Symbol: e (confidence: 0.9621654152870178)


'Furst Door Rite'

In [36]:
detect_document("data/form_info_001/street_no/02112.jpg")


Block confidence: 0.9626752734184265

Paragraph confidence: 0.9626752734184265
Word text: 244-246 (confidence: 0.9626752734184265)
	Symbol: 2 (confidence: 0.954619824886322)
	Symbol: 4 (confidence: 0.9492464661598206)
	Symbol: 4 (confidence: 0.9696445465087891)
	Symbol: - (confidence: 0.9245619773864746)
	Symbol: 2 (confidence: 0.9778190851211548)
	Symbol: 4 (confidence: 0.9862685203552246)
	Symbol: 6 (confidence: 0.9765665531158447)


'244-246'

In [33]:
import difflib

def best_match(text, options):
    """
    Find the best match for a given text from a list of options.

    :param text: The text to match.
    :param options: A list of possible options to match against.
    :return: The option that best matches the text.
    """
    return difflib.get_close_matches(text, options, n=1, cutoff=0.0)[0]

# Example usage

text = detect_document("data/form_info_001/street/02112.jpg")
text = text.lower()
options = ["flower", "clay", "grand", "figueroa", "bunker hill ave", "cinnabar", "hope", "west"]
best_match_result = best_match(text, options)
best_match_result


Block confidence: 0.8680002093315125

Paragraph confidence: 0.8680002093315125
Word text: 2 (confidence: 0.8680002093315125)
	Symbol: 2 (confidence: 0.8680002093315125)

Block confidence: 0.6058118343353271

Paragraph confidence: 0.6058118343353271
Word text: trand (confidence: 0.6058118343353271)
	Symbol: t (confidence: 0.3441177010536194)
	Symbol: r (confidence: 0.44613581895828247)
	Symbol: a (confidence: 0.745262861251831)
	Symbol: n (confidence: 0.6531288623809814)
	Symbol: d (confidence: 0.8404139280319214)


'grand'

In [37]:

import os
import pandas as pd
import difflib
from google.cloud import vision

def detect_document(path):
    client = vision.ImageAnnotatorClient()
    with open(path, "rb") as image_file:
        content = image_file.read()
    image = vision.Image(content=content)
    image_context = vision.ImageContext(language_hints=["en"])
    response = client.document_text_detection(image=image, image_context=image_context)

    if response.error.message:
        raise Exception(
            "{}\nFor more info on error messages, check: "
            "https://cloud.google.com/apis/design/errors".format(response.error.message)
        )
    return response.full_text_annotation.text

def best_match(text, options):
    return difflib.get_close_matches(text, options, n=1, cutoff=0.0)[0]

def process_images_sync():
    base_path = "data/form_info_001"
    directories = ["street", "street_no", "apartment_no"]
    street_options = ["flower", "clay", "grand", "figueroa", "bunker hill ave", "cinnabar", "hope", "west"]

    data = []

    for index in os.listdir(os.path.join(base_path, directories[0]))[:10]:
        print(index)
        if not index.endswith(('.png', '.jpg', '.jpeg', '.tiff', '.bmp', '.gif')):
            continue
        record = {'index': index}
        for dir in directories:
            image_path = os.path.join(base_path, dir, index)
            text = detect_document(image_path).lower()
            if dir == "street":
                text = best_match(text, street_options)
            record[dir] = text
        data.append(record)

    return pd.DataFrame(data)

df = process_images_sync()
df.set_index('index', inplace=True)
df.to_csv("processed_text_data.csv")


02725.jpg
00132.jpg
00654.jpg
02043.jpg
02057.jpg
00640.jpg
00898.jpg
01238.jpg
00126.jpg
02731.jpg


In [38]:
def process_image(index, directories, street_options, base_path):
    record = {'index': index}
    for dir in directories:
        image_path = os.path.join(base_path, dir, index)
        text = detect_document(image_path).lower()
        if dir == "street":
            text = best_match(text, street_options)
        record[dir] = text
    return record

def process_images_concurrently():
    base_path = "data/form_info_001"
    directories = ["street", "street_no", "apartment_no"]
    street_options = ["flower", "clay", "grand", "figueroa", "bunker hill ave", "cinnabar", "hope", "west"]

    all_files = os.listdir(os.path.join(base_path, directories[0]))[:1000]
    assert len(all_files) < 11
    data = []

    with ThreadPoolExecutor(max_workers=32) as executor:
        future_to_file = {executor.submit(process_image, file, directories, street_options, base_path): file for file in all_files}
        for future in concurrent.futures.as_completed(future_to_file):
            data.append(future.result())

    return pd.DataFrame(data)

df = process_images_concurrently()
df.set_index('index', inplace=True)
df.to_csv("processed_text_data.csv")

RuntimeError: asyncio.run() cannot be called from a running event loop