# GCV OCR

Send images to Google Cloud Vision (GCV) for OCR.

GCV Documentation for:
* [client libraries](https://cloud.google.com/vision/docs/quickstart-client-libraries#client-libraries-install-python)
* [OCR](https://cloud.google.com/vision/docs/ocr)

TODO: take a look at whether any of these could be helpful:

https://cloud.google.com/vision/docs/reference/rpc/google.cloud.vision.v1#google.cloud.vision.v1.TextDetectionParams
https://cloud.google.com/vision/docs/reference/rpc/google.cloud.vision.v1#google.cloud.vision.v1.ImageContext

### bash commands I've been using:
- `ls -1 data/images/20210513 | grep -E '\.jpg$' | wc -l && ls -lah data/images/20210513/ | grep -E "_automl\.json" | wc -l && ls -lah data/images/20210513/ | grep -E "_ocr\.json" | wc -l`
- `rsync -av --exclude=".*" ./ /home/ariutta/Dropbox\ \(Gladstone\)/archive/pathway-figure-ocr/`

In [39]:
import hashlib
import io
import json
import os
import re
import subprocess
import sys
import warnings
from itertools import zip_longest
from pathlib import Path, PurePath

from wand.image import Image

In [40]:
# Publish the location of the credentials JSON file (kept under the user's
# home directory) through the GOOGLE_APPLICATION_CREDENTIALS environment
# variable.
google_application_credentials_path = (
    Path.home() / ".credentials/api-project-453052878726-f42cadc718aa.json"
)
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = os.fspath(
    google_application_credentials_path
)

In [41]:
# Imports the Google Cloud client library

from google.cloud import vision
from google.protobuf.json_format import MessageToDict


def get_text(input_path, language_hints=None, client=None):
    """Run Google Cloud Vision text detection on a single image file.

    Args:
        input_path: ``pathlib.Path`` of the image; its raw bytes are sent
            as the image content.
        language_hints: Optional iterable of language hints forwarded in the
            request's ``image_context``. ``None`` (the default) sends an
            empty list.
        client: Optional ``vision.ImageAnnotatorClient`` to reuse. When
            omitted, a new client is created for this call; pass one in when
            OCRing many images to avoid re-creating it each time.

    Returns:
        A list with one dict per entry of ``response.text_annotations``,
        each produced by ``MessageToDict``.

    Raises:
        Exception: If the response carries a non-empty error message.
    """
    if client is None:
        client = vision.ImageAnnotatorClient()

    # Copy the hints so the caller's sequence is never shared with the
    # request (and so the default is a fresh list on every call).
    hints = [] if language_hints is None else list(language_hints)

    image = vision.Image(content=input_path.read_bytes())

    response = client.text_detection(
        image=image,
        image_context={"language_hints": hints},
    )

    if response.error.message:
        raise Exception(
            "{}\nFor more info on error messages, check: "
            "https://cloud.google.com/apis/design/errors".format(
                response.error.message
            )
        )

    # MessageToDict is given each annotation's `_pb` attribute -- presumably
    # the raw protobuf message behind the client library's wrapper object.
    return [MessageToDict(x._pb) for x in response.text_annotations]

## OCR all images

In [76]:
# OCR every figure under the target date's image directory whose AutoML
# result labels it "pathway", writing `<stem>_ocr.json` next to each image.
target_date = "20210513"
images_dir = Path(f"../data/images/{target_date}")

# Progress log; overwritten after every successfully OCRed figure.
ocred_images_count_path = Path(
    f"../data/images/{target_date}/ocred_images_count.log"
)

# Collect all candidate images (recursively) for the supported extensions.
figure_paths = []
for ext in ("*.jpg", "*.jpeg", "*.png"):
    figure_paths.extend(images_dir.rglob(ext))
total_figure_path_count = len(figure_paths)
print(f"total_figure_path_count: {total_figure_path_count}")

language_hints = []

# Number of figures OCRed during this run (already-done figures are skipped
# and not counted).
i = 0
for figure_path in figure_paths:
    ocr_output_path = figure_path.with_name(f"{figure_path.stem}_ocr.json")

    # don't do the same figure more than once
    if ocr_output_path.exists():
        continue

    automl_output_path = figure_path.with_name(
        f"{figure_path.stem}_automl.json"
    )

    # GC AutoML rejected a small number of the JPGs as invalid
    if not automl_output_path.exists():
        continue

    with automl_output_path.open("r") as f:
        automl_output = json.load(f)
    # Only figures classified as pathways are sent for OCR.
    if automl_output["displayName"] != "pathway":
        continue

    try:
        text_annotations = get_text(figure_path, language_hints)

        # Serialize before opening the output file: if serialization fails,
        # no truncated *_ocr.json is left behind to be mistaken for a
        # finished figure on the next run.
        # 'ensure_ascii=False' together with utf8 ensures the output file
        # uses Greek characters, not escape encoding like "\u..."
        serialized = json.dumps(text_annotations, ensure_ascii=False)
        with ocr_output_path.open("w", encoding="utf8") as f:
            f.write(serialized)

        i += 1
        with open(ocred_images_count_path, "w") as f:
            f.write(f"{i} of {total_figure_path_count}\n")
    except Exception as e:
        # Best-effort: report the failure and move on to the next figure.
        # Catching Exception (not a bare except) lets KeyboardInterrupt and
        # SystemExit stop the run; repr(e) keeps the error message, not just
        # the exception class.
        print(f"failed for {str(figure_path)}")
        print("<p>Error: %s</p>" % repr(e))

print(f"Figures OCRed in last run: {i}")

total_figure_path_count: 124447
Figures OCRed in last run: 0


## OCR the sample images