In [None]:
import base64
from typing import Union

def image_to_base64(file_path: str) -> Union[str, None]:
    """
    Load an image from disk and return its bytes as a base64 text string.

    Args:
        file_path (str): Location of the image file (e.g., png, jpg).

    Returns:
        Union[str, None]: The base64 representation of the file's bytes, or
        None when anything goes wrong while reading or encoding. In that case
        the error is printed rather than raised.
    """
    try:
        # Binary mode: the bytes must reach the encoder untouched.
        with open(file_path, "rb") as handle:
            raw_bytes = handle.read()

        # b64encode yields bytes; decode them so callers get a plain str
        # that can be embedded in a JSON payload.
        return base64.b64encode(raw_bytes).decode('utf-8')
    except Exception as e:
        # Best-effort contract: report the problem (missing file, permissions,
        # ...) on stdout and signal failure with None.
        print(f"An error occurred: {e}")
        return None

You'll need to upload an image file (`.png`, `.jpg`, or `.jpeg`) that contains text.

In [None]:
# Example usage: encode a full-screen screenshot.
# The file must already exist at this path (the /content prefix suggests a
# Colab runtime -- adjust the path when running elsewhere).
image_base64 = image_to_base64("/content/whole_screenshot.png")

# Print a short summary instead of the whole string: the encoded image is far
# too long to be useful as cell output. image_to_base64 returns None on failure.
if image_base64 is None:
    print("Encoding failed; no base64 data available.")
else:
    print(f"Encoded {len(image_base64)} base64 characters; preview: {image_base64[:60]}...")

iVBORw0KGgoAAAANSUhEUgAAB4AAAAQ4CAIAAABnsVYUAAEAAElEQVR4nOydB5wU5f3/n+lbb69xcJSjCYo0RYKAYlADWLAg2H7GlsQEk6i/qAmpGpP8EzWWn5pE/fkzosYoKhIVKRqViBQbAoIFpN0BBwdX9rbMTv+/nnl2Z2fr7e7t3e0d37cIu7OzT5/Zmc98n89DHTfuGygJw4j+m/xBbJvtgzQvbd9Ol0JvoJBy99a6AnagFwGgK6B6ugBAKfakebGReNaNvps7a0ZRcqAoKtOLLNspk7S7ZdkCAAAAAAAAAAAAZILNQXROrz5nkZ4TEsmW4jEt9x3Tle8eEkciAAA9hQFqXRfTXe1rdK36HL3KoAzDWPH2uuI0AJX6krJtjAnQmTdkzAZGNQAAAAAAAAAAQB4CdOHqc2bpuWsCoEtXUAStEwAAIBNwhuxiCmnfktRPyaWGkfeAMbJ+YtaUiu6E/yUb8ebotthH5gbbP6mJxbcaRmm2IQAAAAAAAAAAQKlGQHdAXH1ODnyO/pW0ORdFGySJLif/e3gAAACgyJSoXUPP/kCkiYA2N0c/zFODztzCVMLlCpVGho7qz6niM2jQAAAAAAAAAAAAxYHNIfw5SX3OKeo5rRtHbwF022OYvtL1faUe3QQoSPkB7ZUfx/JvSj7ie/xZtq7r+WWTuYVNkZmKK9GpMnT0r3godIcadHLWoEEDAAAAAAAAAADkEgHdofrcgedGkvSc5AJ9DN97F5uub8lS7atSLVfJAg2WDwboRyU3ukq3S6hjuvb5kvECIPMKhFgFzjcCOrv8beUVVaINfMjHQqOjWjQ5CZj6c4cadPIm0KABAAAAAAAAAAA6YcFh3q8ZiOUYnqFphmYZmqZsCwXZFg0ydEPVdUXVFU1TFc2aRYt6JUZJfqNk6UNV6TNAn3R1e4Hc1EeGMFWSJTO6Yzx20xjOJxvriiKHiwcKIZalOYZhzIuTtAowZWrZmm4ouq6quqpq

In [None]:
# Example usage: encode a different screenshot.
# Note that this reassigns image_base64, replacing the value produced by any
# example cell that was run before this one.
image_base64 = image_to_base64("/content/my_screenshot.png")

# Print a short summary instead of the whole string: the encoded image is far
# too long to be useful as cell output. image_to_base64 returns None on failure.
if image_base64 is None:
    print("Encoding failed; no base64 data available.")
else:
    print(f"Encoded {len(image_base64)} base64 characters; preview: {image_base64[:60]}...")

iVBORw0KGgoAAAANSUhEUgAAB4AAAAQ4CAIAAABnsVYUAAEAAElEQVR4nOydCZwU1bn2T229L7MxwwybwCCg4EII4oYLQ1yC+CVEhWvUq9EYohhDNGg0V02ixqsx5rplMag3GtEEvaBxY1CCCi4IQUAgDjuzz/T03rXX9zt1qqur19kXhvcfgt1Vp06drZrup956XmrD51tRZ1BoEKAoaiifRdO0jGMHpsHdhdIG9nTJQdCG4mB0AzJsVNZIappm/p15SK6NqlFTIfpl5aiaWTNFUWS54hfJU+kbVfPUtN5Xo4xlbdM0bVZpfW1tPEVRqqqaJyIlNQuSJKmqqiiKpmlqEkmSRFGkVGOLeawBk+dcln5ZG0CONfubGga9YdkXrIaUjH6RplKaZQ2n1cmQApnzZTm1eQry2rrdrJBh07anGq+mXuNiDGNspylFUWiazuwj4rLrx7soYyStjdFXr5p79dLGiWiaJrOQHCB8xrQFpY8kx3EIIUXBo0dOIcuyz+fzlfjwdtmsBw9pxnoz5yKDjHWV8zrKnpGcuwocZU5xgTKiIMSiUVEQ+ERCUeSsaq2rzuhU4VP3BlIrntJB+pelf85LHV2f/UPyX3Wge/TfFZT9PfCo7cIwXugD+yV4AD+cBrctQAE0faqoLv/AGEZrdMjTP1/YekeO3zvDADXfBaBRadqEpmr6pv5vEVXgah1khkATsunprxsqvT897lvnS6JLVWesN6R29brTfzhbzpO3fDeWkNZ5OzVj2LOLWkQMXDilY6ST3k5cHbm+zArTxC39MzHnGdP/iSLv9Abmaz7DMHaHy263uzxeO2dLtkTN+XuczdN6AACOYgrfHUmTJskuXXjNsb1rEA2RiIxE5iOKsyzL5AVRmTVNk3VEUZRlmaIoWkupotYzEokzu820/kFGvqkk5b/Uh6HxDcby2ZiSCKn0fw8t/6xaz6XpQrD1vEltHTc+qbZbBVyzMD45bgvuvEURRlhjJwfS

In [None]:
# Example usage: encode a JPEG document.
# Note that this reassigns image_base64; whichever example cell ran last
# decides which image is sent in the request further down.
image_base64 = image_to_base64("/content/false-insurance-policy.jpeg")

# Print a short summary instead of the whole string: the encoded image is far
# too long to be useful as cell output. image_to_base64 returns None on failure.
if image_base64 is None:
    print("Encoding failed; no base64 data available.")
else:
    print(f"Encoded {len(image_base64)} base64 characters; preview: {image_base64[:60]}...")

/9j/4AAQSkZJRgABAQAAAQABAAD/4gxYSUNDX1BST0ZJTEUAAQEAAAxITGlubwIQAABtbnRyUkdCIFhZWiAHzgACAAkABgAxAABhY3NwTVNGVAAAAABJRUMgc1JHQgAAAAAAAAAAAAAAAAAA9tYAAQAAAADTLUhQICAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAABFjcHJ0AAABUAAAADNkZXNjAAABhAAAAGx3dHB0AAAB8AAAABRia3B0AAACBAAAABRyWFlaAAACGAAAABRnWFlaAAACLAAAABRiWFlaAAACQAAAABRkbW5kAAACVAAAAHBkbWRkAAACxAAAAIh2dWVkAAADTAAAAIZ2aWV3AAAD1AAAACRsdW1pAAAD+AAAABRtZWFzAAAEDAAAACR0ZWNoAAAEMAAAAAxyVFJDAAAEPAAACAxnVFJDAAAEPAAACAxiVFJDAAAEPAAACAx0ZXh0AAAAAENvcHlyaWdodCAoYykgMTk5OCBIZXdsZXR0LVBhY2thcmQgQ29tcGFueQAAZGVzYwAAAAAAAAASc1JHQiBJRUM2MTk2Ni0yLjEAAAAAAAAAAAAAABJzUkdCIElFQzYxOTY2LTIuMQAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAWFlaIAAAAAAAAPNRAAEAAAABFsxYWVogAAAAAAAAAAAAAAAAAAAAAFhZWiAAAAAAAABvogAAOPUAAAOQWFlaIAAAAAAAAGKZAAC3hQAAGNpYWVogAAAAAAAAJKAAAA+EAAC2z2Rlc2MAAAAAAAAAFklFQyBodHRwOi8vd3d3LmllYy5jaAAAAAAAAAAAAAAAFklFQyBodHRwOi8vd3d3LmllYy5jaAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAABkZXNj

In [None]:
import requests
import json
from typing import Dict, Any

def post_request_and_parse_response(
    url: str, payload: Dict[str, Any], timeout: float = 60.0
) -> Dict[str, Any]:
    """
    Sends a POST request with a JSON payload to the specified URL and parses
    the UTF-8 JSON response body.

    Args:
        url (str): The URL to which the POST request is sent.
        payload (Dict[str, Any]): The payload to send; it is serialized to
            JSON by `requests`.
        timeout (float): Seconds to wait for the server before giving up.
            Without a timeout `requests` waits indefinitely, which would hang
            the notebook kernel on a stalled connection. Defaults to 60.0.

    Returns:
        Dict[str, Any]: The parsed response body. This is whatever
        `json.loads` produces; a JSON object (dict) is expected.

    Raises:
        requests.exceptions.RequestException: If the request fails or times out.
        UnicodeDecodeError: If the response body is not valid UTF-8.
        json.JSONDecodeError: If the response body is not valid JSON.
    """
    # Declare the body as JSON explicitly, alongside the `json=` serialization.
    headers = {"Content-Type": "application/json"}

    # NOTE(review): the HTTP status code is not checked here; an error
    # response is parsed like any other and only fails if it is not JSON.
    response = requests.post(url, json=payload, headers=headers, timeout=timeout)

    # The raw bytes are decoded as UTF-8 explicitly rather than relying on
    # the encoding guessed from the response headers.
    return json.loads(response.content.decode("utf-8"))

In [None]:
# Endpoint that receives the encoded image and returns the OCR result.
# NOTE(review): presumably an API Gateway route in front of AWS Textract -- confirm.
url = "https://2tsig211e0.execute-api.us-east-1.amazonaws.com/my_textract"
# image_base64 holds the result of whichever example cell above was run last;
# it is None if that encoding step failed.
payload = {"image": image_base64}
# Send the image and keep the parsed JSON response for the cells below.
result_dict = post_request_and_parse_response(url, payload)

In [None]:
# Sanity check: the response must contain a 'body' entry, which is parsed further down.
'body' in result_dict

True

In [None]:
# List the top-level keys of the parsed response.
result_dict.keys()

dict_keys(['statusCode', 'body'])

In [None]:
# Display the whole parsed response.
# NOTE(review): this output can be very large; consider showing only a slice.
result_dict

In [None]:
from typing import List, Dict, Any

def clean_ocr_results(ocr_data: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """
    Reduces raw OCR records to just their text and bounding box.

    Records without a 'Text' key are dropped. For the remaining ones the
    bounding box is read from record["Geometry"]["BoundingBox"]; if either
    level is missing, an empty dict is used instead.

    :param ocr_data: A list of OCR result dictionaries as parsed from JSON.
    :return: A list of {"text": ..., "bbox": ...} dictionaries, one per input
             record that has a 'Text' key, in the original order.
    """
    return [
        {
            "text": record["Text"],
            # Chained .get() calls tolerate a missing Geometry or BoundingBox.
            "bbox": record.get("Geometry", {}).get("BoundingBox", {}),
        }
        for record in ocr_data
        if "Text" in record
    ]

In [None]:
# The 'body' entry of the response is itself a JSON string: parse it, then
# keep only the text and bounding box of each OCR record.
cleaned_up_results = clean_ocr_results(json.loads(result_dict["body"]))

In [None]:
from typing import List, Dict, Any

def find_closest_match(ocr_results: List[Dict[str, Any]], prompt: str) -> Dict[str, Any]:
    """
    Selects the OCR entry whose 'text' is nearest to the prompt, measured by
    case-insensitive Levenshtein distance.

    :param ocr_results: A list of dictionaries, each containing at least:
                        {
                          "text": <string>,
                          "bbox": {
                            "Width": <float>,
                            "Height": <float>,
                            "Left": <float>,
                            "Top": <float>
                          }
                        }
    :param prompt:      The user prompt (string) to compare each entry against.
    :return:            The entry of 'ocr_results' with the smallest distance
                        to the prompt. On a tie the earliest entry wins. If
                        'ocr_results' is empty, None is returned.
    """

    def levenshtein_distance(str1: str, str2: str) -> int:
        """
        Returns the minimum number of single-character insertions, deletions,
        or substitutions needed to turn str1 into str2.
        """
        # Against an empty string, the distance is the other string's length.
        if not str1:
            return len(str2)
        if not str2:
            return len(str1)

        # Only two rows of the edit-distance table are needed at a time.
        # `previous` starts as the row for the empty prefix of str1.
        previous = list(range(len(str2) + 1))

        for row, char1 in enumerate(str1, start=1):
            # First column: deleting all `row` characters of the str1 prefix.
            current = [row]
            for col, char2 in enumerate(str2, start=1):
                if char1 == char2:
                    # Matching characters cost nothing extra.
                    current.append(previous[col - 1])
                else:
                    current.append(
                        1 + min(
                            previous[col],      # Deletion
                            current[col - 1],   # Insertion
                            previous[col - 1],  # Substitution
                        )
                    )
            previous = current

        return previous[-1]

    def distance_to_prompt(entry: Dict[str, Any]) -> int:
        # Compare in lower case so the match ignores capitalization.
        return levenshtein_distance(entry["text"].lower(), prompt.lower())

    # min() returns the first entry with the smallest key, and the default
    # covers an empty input list.
    return min(ocr_results, key=distance_to_prompt, default=None)


In [None]:
# Look up the OCR entry whose text is closest to the instruction "click on menu".
find_closest_match(cleaned_up_results, "click on menu")

{'text': 'Menu',
 'bbox': {'Width': 0.01513214223086834,
  'Height': 0.00927280355244875,
  'Left': 0.3312719762325287,
  'Top': 0.012341490015387535}}

```python
import pyautogui
from typing import Dict, Any

def screenshot_and_click_on_ocr_result(
    best_matched_result: Dict[str, Any],
    screenshot_path: str = "screenshot_region.png"
) -> None:
    """
    Takes the best OCR match result, converts the bounding box (fractions of
    the screen size) to absolute screen coordinates, captures a screenshot of
    that region, saves it, and then attempts to locate the same screenshot on
    the screen and click on its center.

    Progress and failures are reported with print(); nothing is returned.

    :param best_matched_result: A dictionary with 'text' and 'bbox' keys.
                                Example:
                                {
                                  "text": "Menu",
                                  "bbox": {
                                    "Width": 0.01513214223086834,
                                    "Height": 0.00927280355244875,
                                    "Left": 0.3312719762325287,
                                    "Top": 0.012341490015387535
                                  }
                                }
                                Missing bbox values are treated as 0.0.
    :param screenshot_path:     The file name (or path) to save (and later locate) the screenshot.
    """

    # Step A: Extract bounding box fractions from the OCR result
    bbox = best_matched_result.get("bbox", {})
    width_pct = bbox.get("Width", 0.0)
    height_pct = bbox.get("Height", 0.0)
    left_pct = bbox.get("Left", 0.0)
    top_pct = bbox.get("Top", 0.0)

    # Step B: Convert fractions to absolute screen coordinates
    screen_width, screen_height = pyautogui.size()
    abs_left = int(left_pct * screen_width)
    abs_top = int(top_pct * screen_height)
    abs_width = int(width_pct * screen_width)
    abs_height = int(height_pct * screen_height)

    # A missing bounding box, or one that truncates to zero pixels, leaves
    # nothing to capture or match, so stop before touching the mouse.
    if abs_width <= 0 or abs_height <= 0:
        print(f"Bounding box {bbox} covers no pixels on this screen; nothing to click.")
        return

    region = (abs_left, abs_top, abs_width, abs_height)

    # Step C: Take a screenshot of the specified region
    screenshot = pyautogui.screenshot(region=region)
    screenshot.save(screenshot_path)
    print(f"Screenshot of the specified region saved as '{screenshot_path}'.")

    # Step D: Locate the center of the saved screenshot on the screen.
    # Recent PyAutoGUI versions raise ImageNotFoundException when there is no
    # match, while older ones return None. Handle both. On a version without
    # that exception class the empty tuple makes the except clause a no-op.
    image_not_found = getattr(pyautogui, "ImageNotFoundException", ())
    try:
        center = pyautogui.locateCenterOnScreen(screenshot_path)
    except image_not_found:
        center = None

    if center is not None:
        x, y = center
        print(f"Screenshot image found at coordinates: ({x}, {y}).")

        pyautogui.moveTo(x, y)
        print("Mouse moved to the center of the screenshot image.")

        pyautogui.click()
        print("Clicked on the screenshot image.")
    else:
        print(f"Screenshot image ('{screenshot_path}') not found on the screen.")


if __name__ == "__main__":
    # Script entry point: intentionally does nothing. The commented-out
    # example below shows how to call screenshot_and_click_on_ocr_result
    # with a hand-written OCR match; uncomment it to try the function.
    #
    # Example usage:
    # best_matched_result = {
    #     "text": "Menu",
    #     "bbox": {
    #         "Width": 0.01513214223086834,
    #         "Height": 0.00927280355244875,
    #         "Left": 0.3312719762325287,
    #         "Top": 0.012341490015387535
    #     }
    # }
    #
    # screenshot_and_click_on_ocr_result(best_matched_result)
    pass
```