In [1]:
# coding: utf-8
"""Convert all the pdf from a given to image and send image to Azure OCR
"""
import json
import requests
import os
import io
from pdf2image import convert_from_bytes, convert_from_path
from PIL import Image
import time
import pandas as pd
import urllib
from pdf2image import convert_from_bytes, convert_from_path
import os
import ntpath
import numpy as np

from boltons.setutils import IndexedSet
import re
import string

def pil_to_array(pil_image):
    """Serialize a PIL image into PNG-encoded bytes.

    Arguments:
        pil_image {PIL} -- Pillow image object (anything exposing
            ``save(fp, format=...)``)

    Returns:
        {bytes} -- the image written in PNG format, as a byte string
    """
    # Write into an in-memory buffer instead of a file on disk.
    with io.BytesIO() as buffer:
        pil_image.save(buffer, format='PNG')
        return buffer.getvalue()


def image_to_text(image_data):
    """Send one image to the Azure Computer Vision OCR API and return its lines.

    The subscription key is read from the ``COMPUTERVISION_SUBSCRIPTION_KEY``
    environment variable so that no secret lives in the source. The resource
    endpoint can be overridden with ``COMPUTERVISION_ENDPOINT``.

    Arguments:
        image_data {bytes} -- image byte array

    Returns:
        list -- lower-cased strings, one per recognized line, in the order
            the service returned its regions and lines

    Raises:
        RuntimeError -- the subscription key environment variable is not set
        requests.HTTPError -- the service answered with an error status
    """
    # Never hard-code the key: read it from the environment and fail loudly
    # (an ``assert`` would be stripped under ``python -O``).
    subscription_key = os.environ.get("COMPUTERVISION_SUBSCRIPTION_KEY")
    if not subscription_key:
        raise RuntimeError(
            "Set the COMPUTERVISION_SUBSCRIPTION_KEY environment variable "
            "to your Azure Computer Vision subscription key.")

    vision_base_url = os.environ.get(
        "COMPUTERVISION_ENDPOINT",
        "https://vishalocrtest.cognitiveservices.azure.com/")
    # The REST OCR operation lives under /vision/<version>/ocr, not directly
    # under the resource root.
    ocr_url = vision_base_url.rstrip("/") + "/vision/v3.2/ocr"

    headers = {
        'Ocp-Apim-Subscription-Key': subscription_key,
        'Content-Type': 'application/octet-stream',
    }
    params = {'language': 'unk', 'detectOrientation': 'true'}

    # A timeout keeps a stalled connection from hanging the whole run.
    response = requests.post(ocr_url, headers=headers, params=params,
                             data=image_data, timeout=60)
    response.raise_for_status()
    analysis = response.json()

    # Flatten regions -> lines -> words into one lower-cased string per line.
    full_text = []
    for region in analysis.get('regions', []):
        for element in region['lines']:
            line_text = ' '.join(word['text'] for word in element['words'])
            full_text.append(line_text.lower())
    return full_text


def get_information(input_path):
    """Run OCR over every page of a PDF and return all recognized lines.

    Arguments:
        input_path {str} -- path of the PDF file handed to ``convert_from_path``

    Returns:
        list -- the lines returned by ``image_to_text`` for each page,
            concatenated in page order
    """
    collected_lines = []
    # convert_from_path yields one PIL image object per page.
    for page in convert_from_path(input_path):
        collected_lines.extend(image_to_text(pil_to_array(page)))
    return collected_lines

# Location of the sample PDF that is pushed through the OCR pipeline.
PATH = "C:\\Users\\digidev\\workspace\\ocr\\ifs-digitization-conversion-service-java_poc\\src\\main\\resources\\Ex1.pdf"
# Both names refer to the same list of recognized lines.
items = poi = get_information(PATH)

PDFInfoNotInstalledError: Unable to get page count. Is poppler installed and in PATH?

In [6]:
import os.path

from azure.cognitiveservices.vision.computervision import ComputerVisionClient
from azure.cognitiveservices.vision.computervision.models import VisualFeatureTypes
from msrest.authentication import CognitiveServicesCredentials

# Name of the environment variable that holds the Computer Vision
# subscription key. This must be the variable's NAME, never the key itself:
# secrets do not belong in source control.
# NOTE(review): presumably `execute_samples` looks this name up in
# os.environ -- confirm against samples.tools.
SUBSCRIPTION_KEY_ENV_NAME = "COMPUTERVISION_SUBSCRIPTION_KEY"
# Azure region, overridable through the environment.
COMPUTERVISION_LOCATION = os.environ.get(
    "COMPUTERVISION_LOCATION", "westcentralus")

# "images" directory located next to the `ocr` workspace folder.
IMAGES_FOLDER = os.path.join(os.path.dirname(
    os.path.realpath(r'C:\Users\digidev\workspace\ocr')), "images")


def image_analysis_in_stream(subscription_key):
    """ImageAnalysisInStream.

    Analyze ``house.jpg`` from a file stream, requesting the image-type,
    faces, categories, color, tags and description features, then print the
    first caption, every tag with its confidence and the dominant colors.

    Arguments:
        subscription_key {str} -- key handed to CognitiveServicesCredentials
    """
    credentials = CognitiveServicesCredentials(subscription_key)
    client = ComputerVisionClient(
        endpoint="https://vishalocrtest.cognitiveservices.azure.com/",
        credentials=credentials,
    )

    # Each enum member could also be given as its plain string name.
    requested_features = [
        VisualFeatureTypes.image_type,
        VisualFeatureTypes.faces,
        VisualFeatureTypes.categories,
        VisualFeatureTypes.color,
        VisualFeatureTypes.tags,
        VisualFeatureTypes.description,
    ]

    image_path = os.path.join(IMAGES_FOLDER, "house.jpg")
    with open(image_path, "rb") as image_stream:
        image_analysis = client.analyze_image_in_stream(
            image=image_stream,
            visual_features=requested_features,
        )

    first_caption = image_analysis.description.captions[0].text
    print("This image can be described as: {}\n".format(first_caption))

    print("Tags associated with this image:\nTag\t\tConfidence")
    for tag in image_analysis.tags:
        print("{}\t\t{}".format(tag.name, tag.confidence))

    dominant_colors = image_analysis.color.dominant_colors
    print("\nThe primary colors of this image are: {}".format(dominant_colors))


def recognize_text(subscription_key):
    """RecognizeTextUsingRecognizeAPI.

    Submit ``make_things_happen.jpg`` to the recognizeText API in "Printed"
    mode, poll once per second until the operation leaves the
    'NotStarted'/'Running' states, then print the final status and the first
    word of each of the first three recognized lines.

    Arguments:
        subscription_key {str} -- key handed to CognitiveServicesCredentials
    """
    import time
    credentials = CognitiveServicesCredentials(subscription_key)
    client = ComputerVisionClient(
        endpoint="https://vishalocrtest.cognitiveservices.azure.com/",
        credentials=credentials,
    )

    image_path = os.path.join(IMAGES_FOLDER, "make_things_happen.jpg")
    with open(image_path, "rb") as image_stream:
        job = client.recognize_text_in_stream(
            image=image_stream, mode="Printed", raw=True)

    # The operation id is the last path segment of the Operation-Location header.
    operation_location = job.headers['Operation-Location']
    operation_id = operation_location.split('/')[-1]

    # Poll until the service reports something other than a pending state.
    while True:
        image_analysis = client.get_text_operation_result(operation_id)
        if image_analysis.status not in ['NotStarted', 'Running']:
            break
        time.sleep(1)

    print("Job completion is: {}\n".format(image_analysis.status))

    print("Recognized:\n")
    lines = image_analysis.recognition_result.lines
    # Expected first words, in order: "make", "things", "happen".
    for line_index in (0, 1, 2):
        print(lines[line_index].words[0].text)


def recognize_printed_text_in_stream(subscription_key):
    """RecognizedPrintedTextUsingOCR_API.

    Run an OCR analysis of ``computer_vision_ocr.png`` with language "en" and
    print every line of the first returned region, one line per row.

    Arguments:
        subscription_key {str} -- key handed to CognitiveServicesCredentials
    """
    credentials = CognitiveServicesCredentials(subscription_key)
    client = ComputerVisionClient(
        endpoint="https://vishalocrtest.cognitiveservices.azure.com/",
        credentials=credentials,
    )

    image_path = os.path.join(IMAGES_FOLDER, "computer_vision_ocr.png")
    with open(image_path, "rb") as image_stream:
        image_analysis = client.recognize_printed_text_in_stream(
            image=image_stream, language="en")

    # Only the first region is reported.
    first_region_lines = image_analysis.regions[0].lines
    print("Recognized:\n")
    for ocr_line in first_region_lines:
        words = [word.text for word in ocr_line.words]
        print(" ".join(words))


# Script entry point: run the sample functions defined in this module.
if __name__ == "__main__":
    import sys, os.path
    # Put the directory three levels above the `ocr` workspace on sys.path so
    # that the `samples` package can be imported from there.
    sys.path.append(os.path.abspath(os.path.join(r'C:\Users\digidev\workspace\ocr', "..", "..", "..")))
    from samples.tools import execute_samples
    # Hands this module's globals and SUBSCRIPTION_KEY_ENV_NAME to the runner.
    # NOTE(review): presumably the runner calls each sample function with the
    # subscription key -- confirm against samples.tools.
    execute_samples(globals(), SUBSCRIPTION_KEY_ENV_NAME)

ModuleNotFoundError: No module named 'samples'