# OCR(Optical Character Recognition)/READ
* 문서에 있는 텍스트를 모조리 읽어 오기

In [None]:
"""
This code sample shows Prebuilt Read operations with the Azure AI Document Intelligence client library.
The async versions of the samples require Python 3.8 or later.

To learn more, please visit the documentation - Quickstart: Document Intelligence (formerly Form Recognizer) SDKs
https://learn.microsoft.com/azure/ai-services/document-intelligence/quickstarts/get-started-sdks-rest-api?pivots=programming-language-python
"""

from azure.core.credentials import AzureKeyCredential
from azure.ai.documentintelligence import DocumentIntelligenceClient
from azure.ai.documentintelligence.models import AnalyzeDocumentRequest
import numpy as np

"""
Remember to remove the key from your code when you're done, and never post it publicly. For production, use
secure methods to store and access your credentials. For more information, see 
https://docs.microsoft.com/en-us/azure/cognitive-services/cognitive-services-security?tabs=command-line%2Ccsharp#environment-variables-and-application-configuration
"""
# Endpoint URL and API key handed to DocumentIntelligenceClient inside
# analyze_read(). Both are empty strings in this file and must be filled in
# before the cell can reach the service.
endpoint = ""
key = ""

def format_bounding_box(bounding_box):
    """Render a flat coordinate sequence as a string of ``[x, y]`` pairs.

    Args:
        bounding_box: Flat sequence ``[x0, y0, x1, y1, ...]``. A falsy value
            (``None`` or an empty list) means there is no polygon.

    Returns:
        ``"N/A"`` for a falsy input, otherwise the points joined as
        ``"[x0, y0], [x1, y1], ..."``.
    """
    if bounding_box:
        # Regroup the flat list into rows of (x, y).
        points = np.array(bounding_box).reshape(-1, 2)
        return ", ".join(f"[{px}, {py}]" for px, py in points)
    return "N/A"

def analyze_read(file_path):
    """Run the ``prebuilt-read`` model on a document and print what it found.

    Prints the full text content, then for every page its dimensions and each
    recognized line together with its bounding polygon. Nothing is returned.

    Args:
        file_path: Either an ``http://`` / ``https://`` URL of the document or
            the path of a local file.
    """
    document_intelligence_client = DocumentIntelligenceClient(
        endpoint=endpoint, credential=AzureKeyCredential(key)
    )

    # Decide by URL scheme prefix rather than by substring: a local path such
    # as "./http_docs/a.pdf" also contains "http" but must be opened as a file.
    if file_path.lower().startswith(("http://", "https://")):
        poller = document_intelligence_client.begin_analyze_document(
            "prebuilt-read", AnalyzeDocumentRequest(url_source=file_path)
        )
    else:
        with open(file_path, "rb") as f:
            poller = document_intelligence_client.begin_analyze_document(
                "prebuilt-read",
                body=f,
            )
    result = poller.result()

    print ("Document contains content: ", result.content)

    for page in result.pages:
        print("----Analyzing Read from page #{}----".format(page.page_number))
        print(
            "Page has width: {} and height: {}, measured with unit: {}".format(
                page.width, page.height, page.unit
            )
        )

        # `lines` is optional in the SDK model; treat a missing value as
        # "no lines" so a page without text does not abort the loop.
        for line_idx, line in enumerate(page.lines or []):
            print(
                "...Line # {} has text content '{}' within bounding box '{}'".format(
                    line_idx,
                    line.content,
                    format_bounding_box(line.polygon),
                )
            )

    print("----------------------------------------")

if __name__ == "__main__":
    # Entry point: analyze a local sample invoice. To analyze a remote
    # document instead, pass its URL, for example:
    # "https://raw.githubusercontent.com/Azure-Samples/cognitive-services-REST-api-samples/master/curl/form-recognizer/sample-layout.pdf"
    analyze_read(
        "/Users/laxdin24/Documents/GitHub/MS_AI_SCHOOL_6/Azure AI Language/Document Intelligence/Data/invoice-english.pdf"
    )



* 이미지 기준으로 바운딩 박스 그리기

In [None]:
# 연습

from azure.core.credentials import AzureKeyCredential
from azure.ai.documentintelligence import DocumentIntelligenceClient
from azure.ai.documentintelligence.models import AnalyzeDocumentRequest
import numpy as np
from PIL import Image, ImageDraw, ImageFont
import matplotlib.pyplot as plt
import json


# Endpoint URL and API key handed to DocumentIntelligenceClient inside
# analyze_read(). Both are empty strings in this file and must be filled in
# before the cell can reach the service.
endpoint = ""
key = ""

def format_bounding_box(bounding_box):
    """Format a flat list of coordinates as ``"[x, y], [x, y], ..."``.

    Args:
        bounding_box: Flat sequence of alternating x and y values, or a falsy
            value (``None`` / empty list) when no polygon is available.

    Returns:
        The string ``"N/A"`` when ``bounding_box`` is falsy, otherwise one
        ``[x, y]`` group per point, separated by ``", "``.
    """
    if not bounding_box:
        return "N/A"

    # Two columns per row: column 0 is x, column 1 is y.
    pairs = np.array(bounding_box).reshape(-1, 2)
    rendered = []
    for x, y in pairs:
        rendered.append(f"[{x}, {y}]")
    return ", ".join(rendered)

def analyze_read(file_path):
    """Outline every word recognized on the first page of a local image.

    The file is sent to the ``prebuilt-read`` model, then opened again with
    PIL so that a blue polygon can be drawn around each word of page 0. The
    annotated image is shown with ``display`` and nothing is returned.

    Args:
        file_path: Path of a local file; it is opened both as raw bytes for
            the service and as an image for drawing.
    """
    client = DocumentIntelligenceClient(
        endpoint=endpoint, credential=AzureKeyCredential(key)
    )

    with open(file_path, "rb") as document:
        poller = client.begin_analyze_document("prebuilt-read", body=document)
    result = poller.result()

    image = Image.open(file_path)
    canvas = ImageDraw.Draw(image)

    for word in result.pages[0]["words"]:
        # Pair the first eight numbers of the flat polygon into four (x, y)
        # corners. NOTE(review): assumes the coordinates are in image pixels
        # so they can be drawn directly - confirm against page.unit.
        corners = [
            (word["polygon"][k], word["polygon"][k + 1]) for k in range(0, 8, 2)
        ]
        canvas.polygon(corners, outline="blue", width=5)

    # `display` is not imported here; presumably supplied by the notebook
    # (IPython) environment.
    display(image)


if __name__ == "__main__":
    # Entry point: draw word outlines on a local sample image. The
    # analyze_read() in this cell opens its argument with open(), so only
    # local paths are usable here (not the remote sample-layout.pdf URL).
    analyze_read(
        "/Users/laxdin24/Documents/GitHub/MS_AI_SCHOOL_6/Azure AI Language/Document Intelligence/Data/read-resume.png"
    )

* Gradio로 구현해 보기

In [None]:
import gradio as gr
from azure.core.credentials import AzureKeyCredential
from azure.ai.documentintelligence import DocumentIntelligenceClient
from azure.ai.documentintelligence.models import AnalyzeDocumentRequest
import numpy as np

# Endpoint URL and API key handed to DocumentIntelligenceClient inside
# analyze_read(). Both are empty strings in this file and must be filled in
# before the cell can reach the service.
endpoint = ""
key = ""

def analyze_read(file_path):
    """Read the text of a local image and outline every recognized word.

    Used as the Gradio callback of the image input, which hands over a file
    path (or ``None`` when the image is cleared).

    Args:
        file_path: Path of the image file, or ``None`` / ``""`` when there is
            no image.

    Returns:
        tuple: ``(text, image)`` where ``text`` is the full content returned by
        the ``prebuilt-read`` model and ``image`` is the PIL image with a blue
        polygon around each word of the first page. ``("", None)`` is returned
        when no file path is given.
    """
    if not file_path:
        # The `change` event also fires when the input image is removed; there
        # is nothing to analyze then, so clear both outputs.
        return "", None

    # Imported locally because the import list of this cell does not bring in
    # PIL, so the function would otherwise depend on an earlier cell having
    # been executed.
    from PIL import Image, ImageDraw

    document_intelligence_client = DocumentIntelligenceClient(
        endpoint=endpoint, credential=AzureKeyCredential(key)
    )

    with open(file_path, "rb") as f:
        poller = document_intelligence_client.begin_analyze_document(
            "prebuilt-read",
            body=f,
        )
    result = poller.result()

    image = Image.open(file_path)
    draw = ImageDraw.Draw(image)

    # Only the first page is annotated; guard against an empty page list.
    pages = result.pages or []
    words = pages[0]["words"] if pages else []

    for word in words:
        polygon = word["polygon"]
        # The polygon is a flat [x0, y0, x1, y1, ...] list; pair it up.
        points = list(zip(polygon[0::2], polygon[1::2]))
        draw.polygon(points, outline="blue", width=5)

    return result.content, image

# Gradio UI: one tab with an image input, an annotated-image output and a
# text output. Components appear in the order they are created.
with gr.Blocks() as demo:
    with gr.Tab("documentintelligence"):
        # type='filepath' so the callback receives a path it can open().
        input_image_box_1 = gr.Image(type='filepath')
        output_image_box_1 = gr.Image(label="출력되는 이미지", interactive=False)
        output_box_1 = gr.Textbox()

        # analyze_read returns (text, image), hence the text box is listed
        # first in `outputs`. The callback runs whenever the input changes.
        input_image_box_1.change(fn=analyze_read, inputs=[input_image_box_1], outputs=[output_box_1, output_image_box_1])
    

# Start the web app.
demo.launch()

## AI Vision과 Document Intelligence를 Gradio 탭으로 함께 구성하기

In [None]:
import gradio as gr
from azure.core.credentials import AzureKeyCredential
from azure.ai.documentintelligence import DocumentIntelligenceClient
from azure.ai.documentintelligence.models import AnalyzeDocumentRequest
from azure.ai.vision.imageanalysis import ImageAnalysisClient
from azure.ai.vision.imageanalysis.models import VisualFeatures
from azure.core.credentials import AzureKeyCredential
from PIL import Image, ImageDraw, ImageFont
import matplotlib.pyplot as plt
import json
import numpy as np
import os

# Endpoint URL and API key used by analyze_read() to build the
# DocumentIntelligenceClient. The key is an empty string in this file and must
# be filled in before running.
endpoint_1 = "https://6a026-documentintelligence.cognitiveservices.azure.com/"
key_1 = ""

def analyze_read(file_path):
    """Extract text from a local image and draw a box around each word.

    Callback of the "documentintelligence" tab; Gradio passes the path of the
    uploaded image, or ``None`` once the image has been cleared.

    Args:
        file_path: Path of the image file, or ``None`` / ``""`` when there is
            no image.

    Returns:
        tuple: ``(text, image)`` - the full content reported by the
        ``prebuilt-read`` model and the PIL image with a blue polygon around
        every word of the first page. ``("", None)`` when no path is given.
    """
    if not file_path:
        # Clearing the input triggers `change` with no file; reset the outputs
        # instead of failing inside open().
        return "", None

    document_intelligence_client = DocumentIntelligenceClient(
        endpoint=endpoint_1, credential=AzureKeyCredential(key_1)
    )

    with open(file_path, "rb") as f:
        poller = document_intelligence_client.begin_analyze_document(
            "prebuilt-read",
            body=f,
        )
    result = poller.result()

    image = Image.open(file_path)
    draw = ImageDraw.Draw(image)

    # Only page 0 is drawn; an empty page list simply yields no boxes.
    pages = result.pages or []
    words = pages[0]["words"] if pages else []

    for word in words:
        polygon = word["polygon"]
        # Flat [x0, y0, x1, y1, ...] list -> [(x0, y0), (x1, y1), ...].
        points = list(zip(polygon[0::2], polygon[1::2]))
        draw.polygon(points, outline="blue", width=5)

    return result.content, image

# Endpoint URL and API key used by image_OCR() to build the
# ImageAnalysisClient. The key is an empty string in this file and must be
# filled in before running.
endpoint_2 = 'https://westeurope.api.cognitive.microsoft.com/'
key_2 = ''
# Region name of the resource; not referenced by the code visible in this cell.
Region = 'westeurope'

def image_OCR(input_image):
    """Run Azure AI Vision READ (OCR) on an image and outline each text line.

    Args:
        input_image: Local file path or ``http(s)`` URL of the image. ``None``
            or ``""`` (for example the button pressed with no image loaded) is
            accepted and yields empty outputs.

    Returns:
        tuple: ``(image, text)`` where ``image`` is the PIL image with a blue
        polygon drawn around every recognized line and ``text`` is the lines
        joined with newlines. When nothing is recognized the unmodified image
        and ``""`` are returned; with no input, ``(None, "")``.
    """
    if not input_image:
        return None, ""

    import io
    import urllib.request

    # Create an Image Analysis client
    client = ImageAnalysisClient(
        endpoint=endpoint_2,
        credential=AzureKeyCredential(key_2)
    )

    visual_features = [VisualFeatures.READ]

    # Detect URLs by scheme prefix. An `or` of two negated substring tests is
    # true for practically any string and would send URLs down the local-file
    # branch.
    if input_image.lower().startswith(("http://", "https://")):
        # Synchronous (blocking) call; the service fetches the URL itself.
        result = client.analyze_from_url(
            image_url=input_image,
            visual_features=visual_features,
            gender_neutral_caption=True,  # Optional (default is False)
        )
        # PIL cannot open a URL, so download the bytes for drawing.
        with urllib.request.urlopen(input_image, timeout=30) as response:
            image = Image.open(io.BytesIO(response.read()))
    else:
        # Load the image to analyze into a 'bytes' object.
        with open(input_image, "rb") as f:
            image_data = f.read()
        # Synchronous (blocking) call with the raw image bytes.
        result = client.analyze(
            image_data=image_data,
            visual_features=visual_features,
            gender_neutral_caption=True,  # Optional (default is False)
        )
        # Open the original image for drawing.
        image = Image.open(input_image)

    draw = ImageDraw.Draw(image)
    extracted_text = []

    if result.read is not None:
        # Walk every block; an image without text may have no blocks at all,
        # so indexing blocks[0] is not safe.
        for block in result.read.blocks or []:
            for line in block.lines:
                extracted_text.append(line.text)
                points = [(point['x'], point['y']) for point in line.bounding_polygon]
                # Draw the outline of the line.
                draw.polygon(points, outline="blue", width=5)

    # Always return both values so the two Gradio outputs are filled even when
    # no text was found.
    return image, "\n".join(extracted_text)


# Gradio UI with two tabs, one per OCR service. Components appear in the
# order they are created.
with gr.Blocks() as demo:
    # Tab 1: Document Intelligence; runs automatically when the image changes.
    with gr.Tab("documentintelligence"):
        # type='filepath' so the callback receives a path it can open().
        input_image_box_1 = gr.Image(type='filepath')
        output_image_box_1 = gr.Image(label="출력되는 이미지", interactive=False)
        output_box_1 = gr.Textbox()

        # analyze_read returns (text, image), hence the text box comes first.
        input_image_box_1.change(fn=analyze_read, inputs=[input_image_box_1], outputs=[output_box_1, output_image_box_1])

    # Tab 2: AI Vision OCR; runs when the button is clicked.
    with gr.Tab("AiVision OCR"):
        # Input and output images are placed in one row.
        with gr.Row():
            input_image = gr.Image(label="입력이미지", type="filepath")
            output_image = gr.Image(label="출력이미지" ,interactive=False)

        submit_button = gr.Button("OCR START")
        output_text = gr.Textbox(label="출력텍스트")

        # image_OCR returns (image, text), hence the image comes first here.
        submit_button.click(fn=image_OCR, inputs=[input_image], outputs=[output_image, output_text])
    

# Start the web app.
demo.launch()

# 여러가지 섞어서 조합해보기
* Document Intelligence로 추출한 텍스트를 번역한 뒤 음성으로 출력함

In [None]:
import requests, uuid, json
import os
import azure.cognitiveservices.speech as speechsdk

from azure.core.credentials import AzureKeyCredential
from azure.ai.documentintelligence import DocumentIntelligenceClient
from azure.ai.documentintelligence.models import AnalyzeDocumentRequest
import numpy as np

def analyze_read(file_path):
    """Return the text that the ``prebuilt-read`` model finds in a document.

    ``endpoint`` and ``key`` are module-level names that this cell does not
    assign; they are assumed to have been defined by an earlier cell.

    Args:
        file_path: Either an ``http://`` / ``https://`` URL of the document or
            the path of a local file.

    Returns:
        The ``content`` attribute of the analysis result (the full text).
    """
    document_intelligence_client = DocumentIntelligenceClient(
        endpoint=endpoint, credential=AzureKeyCredential(key)
    )

    # Decide by URL scheme prefix rather than by substring: a local path that
    # merely contains "http" must still be opened as a file.
    if file_path.lower().startswith(("http://", "https://")):
        poller = document_intelligence_client.begin_analyze_document(
            "prebuilt-read", AnalyzeDocumentRequest(url_source=file_path)
        )
    else:
        with open(file_path, "rb") as f:
            poller = document_intelligence_client.begin_analyze_document(
                "prebuilt-read",
                body=f,
            )

    result = poller.result()
    return result.content


def trans_text(text, source_lang="en", target_lang="ko"):
    """Translate ``text`` with the Azure Translator REST API (version 3.0).

    Credentials are read from environment variables so that no key has to be
    written into the notebook:

    * ``TRANSLATOR_KEY`` - subscription key (required).
    * ``TRANSLATOR_ENDPOINT`` - service endpoint; defaults to the global
      ``https://api.cognitive.microsofttranslator.com``.
    * ``TRANSLATOR_REGION`` - resource region; defaults to ``"eastus"``.

    Args:
        text: Text to translate. ``None`` or ``""`` is returned as ``""``
            without calling the service.
        source_lang: Language code of ``text`` (default ``"en"``).
        target_lang: Language code to translate into (default ``"ko"``).

    Returns:
        The translated text of the first (only) translation.

    Raises:
        ValueError: If ``TRANSLATOR_KEY`` is not set.
        requests.HTTPError: If the service answers with an error status.
    """
    if not text:
        return ""

    key = os.environ.get("TRANSLATOR_KEY", "")
    if not key:
        raise ValueError("Set the TRANSLATOR_KEY environment variable to your Translator key.")
    endpoint = os.environ.get(
        "TRANSLATOR_ENDPOINT", "https://api.cognitive.microsofttranslator.com"
    )

    # Location, also known as region. Required when using a multi-service or
    # regional (not global) resource; shown on the Keys and Endpoint page of
    # the Azure portal.
    location = os.environ.get("TRANSLATOR_REGION", "eastus")

    # Tolerate an endpoint configured with a trailing slash.
    constructed_url = endpoint.rstrip("/") + "/translate"

    params = {
        'api-version': '3.0',
        'from': source_lang,
        'to': [target_lang],
    }

    headers = {
        'Ocp-Apim-Subscription-Key': key,
        'Ocp-Apim-Subscription-Region': location,
        'Content-type': 'application/json',
        # Fresh ID per request so a call can be traced.
        'X-ClientTraceId': str(uuid.uuid4()),
    }

    # More than one object may be passed in the body; only one is sent here.
    body = [{'text': text}]

    response = requests.post(
        constructed_url, params=params, headers=headers, json=body, timeout=30
    )
    # An error payload is a dict, not a list; fail with the HTTP error rather
    # than an obscure KeyError on response[0].
    response.raise_for_status()

    return response.json()[0]["translations"][0]["text"]

# Speech service configuration; the subscription key is blank in this file.
speech_config = speechsdk.SpeechConfig(subscription="", region="eastus")
# The neural multilingual voice can speak different languages based on the input text.
speech_config.speech_synthesis_voice_name = 'ko-KR-HyunsuMultilingualNeural'

# Play the synthesized audio on the default speaker.
audio_config = speechsdk.audio.AudioOutputConfig(use_default_speaker=True)

speech_synthesizer = speechsdk.SpeechSynthesizer(
    speech_config=speech_config,
    audio_config=audio_config,
)

# No console input is read despite the prompt below: the text to speak is the
# document content returned by analyze_read(), passed through trans_text().
print("Enter some text that you want to speak >")
text = trans_text(
    analyze_read(
        file_path="../Document Intelligence/Document Intelligence실습파일/generaldoc-drillreport.pdf"
    )
)

# speak_text_async() returns a future; get() waits for the result.
speech_synthesis_result = speech_synthesizer.speak_text_async(text).get()

if speech_synthesis_result.reason == speechsdk.ResultReason.SynthesizingAudioCompleted:
    print(f"Speech synthesized for text [{text}]")
elif speech_synthesis_result.reason == speechsdk.ResultReason.Canceled:
    cancellation_details = speech_synthesis_result.cancellation_details
    print(f"Speech synthesis canceled: {cancellation_details.reason}")
    # Error details are printed only for an error-type cancellation that
    # actually carries details.
    if (
        cancellation_details.reason == speechsdk.CancellationReason.Error
        and cancellation_details.error_details
    ):
        print(f"Error details: {cancellation_details.error_details}")
        print("Did you set the speech resource key and region values?")