In [1]:
#!python -m pip install PyMuPDF

In [8]:
import json
import os
import time

import fitz  # PyMuPDF
from azure.cognitiveservices.vision.computervision import ComputerVisionClient
from azure.cognitiveservices.vision.computervision.models import OperationStatusCodes
from msrest.authentication import CognitiveServicesCredentials
# Load the service key from a settings file in the current working directory.
# A plain relative name avoids the invalid "\s" escape sequence and also
# resolves on non-Windows systems; UTF-8 is stated explicitly so decoding does
# not depend on the platform default.
with open("settings.json", "r", encoding="utf-8") as jsonfile:
    data = json.load(jsonfile)
    print("Read successful")

# The settings file must provide the key under 'ACCOUNT_KEY'.
subscription_key = data['ACCOUNT_KEY']
endpoint = "https://courtjudgementnew.cognitiveservices.azure.com/"


# Shared client used by extract_text_from_image() for all OCR requests.
client = ComputerVisionClient(endpoint, CognitiveServicesCredentials(subscription_key))

# Folder that receives the rendered page images.
output_dir = 'testImage'
# Base name (without extension) of the PDF to process; also used to name the
# page images and the output text file.
file_name = '2070_District_Court_Decision_II' 

def convert_pdf_to_images(pdf_path):
    """Render every page of a PDF to a PNG file and return the file paths.

    Each page is saved inside the module-level ``output_dir`` as
    ``<file_name>_<page number>.png``, with page numbers starting at 1.

    Args:
        pdf_path: Path of the PDF document to open with PyMuPDF.

    Returns:
        List of the written image paths, in page order.
    """
    # Create the target folder up front so saving the first page cannot fail
    # just because the folder is missing.
    os.makedirs(output_dir, exist_ok=True)
    image_paths = []
    # The context manager closes the document even if rendering raises.
    with fitz.open(pdf_path) as pdf_document:
        for page_num, page in enumerate(pdf_document, start=1):
            pix = page.get_pixmap()
            # os.path.join uses the platform separator instead of a
            # hard-coded backslash.
            output_image_path = os.path.join(
                output_dir, f"{file_name}_{page_num}.png"
            )
            pix.save(output_image_path)
            image_paths.append(output_image_path)
    return image_paths

def safe_extract_text(image_path, retries=3, delay=5):
    """Extract text from an image, retrying on failure.

    Calls ``extract_text_from_image`` up to ``retries`` times. Any exception
    is reported on stdout and triggers another attempt after ``delay``
    seconds.

    Args:
        image_path: Path of the image to run OCR on.
        retries: Maximum number of attempts.
        delay: Seconds to wait between two consecutive attempts.

    Returns:
        The extracted text, or an empty string when every attempt failed
        (or when ``retries`` is 0).
    """
    for attempt in range(retries):
        try:
            return extract_text_from_image(image_path)
        except Exception as e:  # deliberate best-effort: any failure means retry
            print(f"Attempt {attempt+1} failed: {e}")
            # Only wait when another attempt follows; sleeping after the
            # final failure would just delay the caller.
            if attempt + 1 < retries:
                time.sleep(delay)

    return ""


def extract_text_from_image(image_path, poll_interval=5):
    """Run the Read operation on one image and return the recognized text.

    The image is submitted through the module-level ``client``; the function
    then polls the operation until its status is no longer 'notStarted' or
    'running'.

    Args:
        image_path: Path of the image file to submit.
        poll_interval: Seconds to wait between two status polls.

    Returns:
        The recognized lines, each terminated by a newline, or an empty
        string when the operation did not end with a succeeded status.
    """
    with open(image_path, "rb") as image_stream:
        read_response = client.read_in_stream(image_stream, raw=True)
    # The operation ID is the last path segment of the Operation-Location URL.
    operation_location = read_response.headers["Operation-Location"]
    operation_id = operation_location.split("/")[-1]
    while True:
        result = client.get_read_result(operation_id)
        if result.status not in ['notStarted', 'running']:
            break
        time.sleep(poll_interval)
    if result.status != OperationStatusCodes.succeeded:
        return ""
    # Join once instead of growing a string with += inside nested loops.
    return "".join(
        line.text + "\n"
        for page in result.analyze_result.read_results
        for line in page.lines
    )

def main(pdf_path, output_txt_path):
    """Convert a PDF to page images, OCR each one and write the text to a file.

    Args:
        pdf_path: Path of the PDF document to process.
        output_txt_path: Path of the UTF-8 text file to (over)write; the text
            of each page is followed by a newline.
    """
    page_images = convert_pdf_to_images(pdf_path)
    with open(output_txt_path, "w", encoding="utf-8") as sink:
        for page_image in page_images:
            # Progress output: show which image is being processed.
            print(page_image)
            page_text = safe_extract_text(page_image)
            sink.write(f"{page_text}\n")
            # Pause between requests.
            time.sleep(2)


# Process the configured document: read "<file_name>.pdf" and write the
# recognized text to "<file_name>.txt".
main(f"{file_name}.pdf", f"{file_name}.txt")


Read successful
testImage\2070_District_Court_Decision_II_1.png
testImage\2070_District_Court_Decision_II_2.png
testImage\2070_District_Court_Decision_II_3.png
testImage\2070_District_Court_Decision_II_4.png
testImage\2070_District_Court_Decision_II_5.png
testImage\2070_District_Court_Decision_II_6.png
testImage\2070_District_Court_Decision_II_7.png
testImage\2070_District_Court_Decision_II_8.png
testImage\2070_District_Court_Decision_II_9.png
testImage\2070_District_Court_Decision_II_10.png
testImage\2070_District_Court_Decision_II_11.png
testImage\2070_District_Court_Decision_II_12.png
testImage\2070_District_Court_Decision_II_13.png
testImage\2070_District_Court_Decision_II_14.png
testImage\2070_District_Court_Decision_II_15.png
testImage\2070_District_Court_Decision_II_16.png
testImage\2070_District_Court_Decision_II_17.png
testImage\2070_District_Court_Decision_II_18.png
testImage\2070_District_Court_Decision_II_19.png
testImage\2070_District_Court_Decision_II_20.png
testImage\207