## Handwritten Text recognition using gpt-4-vision model

In [14]:
%load_ext autoreload
%autoreload 2

import os
curpath = os.getcwd()
os.chdir(curpath.split("core")[0])

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [17]:
import base64
import requests
import os
import json
from typing import List, Any, Union
from tqdm import tqdm
import asyncio

from core.features.utils import calculate_cost_gpt4_turbo, add_dicts, pdf_to_images
from prompt import SIMPLE_GPT_BASED_HTR_PROMPT, SIMPLE_GPT_BASED_HTR_FORMATTED_PROMPT
from core.features.provider import creator, acreator, text_model_defaults, vision_model_defaults

from dotenv import load_dotenv
load_dotenv()

import time

In [16]:

# Function to encode the image
def encode_image(image_path):
    with open(image_path, "rb") as image_file:
        return base64.b64encode(image_file.read()).decode('utf-8')


def encode_and_batch(image_paths, verbose = False):

    encoded = []
    encoded_name = []
    enc = []
    enc_name = []
    for image_path in image_paths:
        enc_name.append(image_path.split("/")[-1])
        enc.append(encode_image(image_path))
        if len(enc) >= batch_size:
            encoded.append(enc)
            encoded_name.append(enc_name)
            enc = []
            enc_name = []
    
    if len(enc) > 0:
        encoded.append(enc)
        encoded_name.append(enc_name)

    if verbose:
        print("Total images:", len(image_paths))
        print("Total batches:", len(encoded))

    return encoded, encoded_name

In [74]:
def process_for_json(text):
    # Find the index of the first '{' and the last '}'
    start_index = text.find('{')
    end_index = text.rfind('}')

    # Extract the text between the first '{' and the last '}'
    if start_index != -1 and end_index != -1 and end_index > start_index:
        extracted_text = text[start_index:end_index + 1]
        # Replace newline characters with their escaped versions
        # extracted_text = extracted_text.replace("\n", "\\n")
    else:
        extracted_text = ''

    return extracted_text

text = """```json
{
  "text": "oldest religion\nSanatan\nJews\n(1brahim)\nChristianity Islam (Arabs)\n(23rd Jesus) (24th Hazrat Mohd)\n{ Mecca - born }\n{ Madina - died }\n\n* Mohd. Bin Kasim - Awarangzeb - (712 - 1707)\n* Aryavart - o first name of India (due to invasion of Aryans\nfrom Central Asia)\n(आर्यावर्त) Vedic Civilization (1500BC - 600 B.C.)from A Asia\n(2500 BC - 1750 BC)\nIndus Valley\nCivilisation\nThese civilisations\nwere in Indus\nvalley .\n\n* 1921 - Dayaram Sahani - Hara appa was discovered\n* 1922 - R.D. Banerjee - Mohenjodaro"
}```"""

text = """```json
{
  "text": "HISTORY\nABHISHEK SIR\nPARAMOUNT\n\n- Greek Word - Historia\n\nHerodotus book, Historica\n\nFather of History\n\nSources-\n1) Archaeological sources\n2) (Fossils, Monuments, inscriptions, coins, statues)\n2) Literary sources\n   -> Religious (Rigveda, Samveda, etc)\n   -> Non-religious (Panchatantra, Patanjali, etc)\n3) Description of foreigner travellers\n\n* Veena - oldest instrument of India\n* Tambura - Arab's oldest instrument.\n\nwww.OnlineStudyPoints.com\nScanned by CamScanner"
}
```"""

repr(text)
# final = process_for_json(text)
# final

# json.loads(final)

# print(final)

'\'```json\\n{\\n  "text": "HISTORY\\nABHISHEK SIR\\nPARAMOUNT\\n\\n- Greek Word - Historia\\n\\nHerodotus book, Historica\\n\\nFather of History\\n\\nSources-\\n1) Archaeological sources\\n2) (Fossils, Monuments, inscriptions, coins, statues)\\n2) Literary sources\\n   -> Religious (Rigveda, Samveda, etc)\\n   -> Non-religious (Panchatantra, Patanjali, etc)\\n3) Description of foreigner travellers\\n\\n* Veena - oldest instrument of India\\n* Tambura - Arab\\\'s oldest instrument.\\n\\nwww.OnlineStudyPoints.com\\nScanned by CamScanner"\\n}\\n```\''

In [72]:
# import logging

# logging.critical(msg = final)

repr("rawtext" + final)
# json.loads(final)

'\'rawtext{\\n  "text": "HISTORY\\nABHISHEK SIR\\nPARAMOUNT\\n\\n- Greek Word - Historia\\n\\nHerodotus book, Historica\\n\\nFather of History\\n\\nSources-\\n1) Archaeological sources\\n2) (Fossils, Monuments, inscriptions, coins, statues)\\n2) Literary sources\\n   -> Religious (Rigveda, Samveda, etc)\\n   -> Non-religious (Panchatantra, Patanjali, etc)\\n3) Description of foreigner travellers\\n\\n* Veena - oldest instrument of India\\n* Tambura - Arab\\\'s oldest instrument.\\n\\nwww.OnlineStudyPoints.com\\nScanned by CamScanner"\\n}\''

In [18]:
def htr_gen(images: List[Any], intel = False):

    # logging.info("encoded the images")
    total_usage = {'prompt_tokens': 0, 'completion_tokens': 0, 'total_tokens': 0}

    conversation = [{"role": "system", "content": SIMPLE_GPT_BASED_HTR_PROMPT}]
    # if intel:
    #     conversation = [{"role": "system", "content": SIMPLE_GPT_BASED_HTR_FORMATTED_PROMPT}]
        
    user_message = {"role": "user", "content": [*map(lambda x :{
          "type": "image_url",
          "image_url": {
            "url": f"data:image/jpeg;base64,{x}"
          }
        }, images)]}
    conversation.append(user_message)

    try:
        response = creator(
                **vision_model_defaults,
                messages = conversation
                )
        
        response = response.model_dump()
        text = response["choices"][0]["message"]["content"]
        text = "{" + text.split("{")[1].split("}")[0] + "}"

        text = json.loads(text)
    except:
        # print(response.json())
        raise ValueError("Error in the response")

    total_usage = add_dicts(total_usage, dict(response["usage"]))

    return text, response["usage"]


async def async_htr_gen(images: List[Any], intel = False):

    # logging.info("encoded the images")
    total_usage = {'prompt_tokens': 0, 'completion_tokens': 0, 'total_tokens': 0}

    conversation = [{"role": "system", "content": SIMPLE_GPT_BASED_HTR_PROMPT}]
    # if intel:
    #     conversation = [{"role": "system", "content": SIMPLE_GPT_BASED_HTR_FORMATTED_PROMPT}]
        
    user_message = {"role": "user", "content": [*map(lambda x :{
          "type": "image_url",
          "image_url": {
            "url": f"data:image/jpeg;base64,{x}"
          }
        }, images)]}
    conversation.append(user_message)

    try:
        response = await creator(
                **vision_model_defaults,
                messages = conversation
                )
        
        response = response.model_dump()
        text = response["choices"][0]["message"]["content"]
        text = "{" + text.split("{")[1].split("}")[0] + "}"

        text = json.loads(text)
    except:
        # print(response.json())
        raise ValueError("Error in the response")

    total_usage = add_dicts(total_usage, dict(response["usage"]))

    return text, response["usage"]

In [26]:

def htr_batch_process(image_paths, batch_size: int):
    
    batches, batch_names = encode_and_batch(image_paths, batch_size=batch_size, verbose=True)

    final_text = []
    total_tokens = {'prompt_tokens': 0, 'completion_tokens': 0, 'total_tokens': 0}
    for batch in tqdm(batches):
        text, _tokens = htr_gen(batch)
        final_text.append(text["text"])
        total_tokens = add_dicts(total_tokens, _tokens)

    return batch_names, final_text, total_tokens


In [34]:
dir_path = r"C:\Users\DELL\Documents\Curate\curate-v1\core\test_data\mmh_english\read.txt"

os.path.basename(dir_path)
# image_paths = []
# for file in os.listdir(dir_path):
#     image_paths.append(os.path.join(dir_path, file))

# batch_names, final_text, total_tokens = asyncio.run(htr_batch_process(image_paths[:10], batch_size=1))

# print(batch_names)

'read.txt'

In [32]:
# import asyncio
# import json
# import os
# from typing import List, Any

# Assuming other necessary imports and functions like `creator`, `add_dicts`, `encode_and_batch` are defined elsewhere

async def htr_gen(images: List[Any], intel=False):
    total_usage = {'prompt_tokens': 0, 'completion_tokens': 0, 'total_tokens': 0}

    conversation = [{"role": "system", "content": SIMPLE_GPT_BASED_HTR_PROMPT}]

    user_message = {"role": "user", "content": [*map(lambda x: {
        "type": "image_url",
        "image_url": {
            "url": f"data:image/jpeg;base64,{x}"
        }
    }, images)]}
    conversation.append(user_message)

    try:
        response = await creator(
            **vision_model_defaults,
            messages=conversation
        )

        response = response.model_dump()
        text = response["choices"][0]["message"]["content"]
        text = "{" + text.split("{")[1].split("}")[0] + "}"
        text = json.loads(text)
    except Exception as e:
        raise ValueError("Error in the response: " + str(e))

    total_usage = add_dicts(total_usage, dict(response["usage"]))

    return text, response["usage"]


async def htr_batch_process(image_paths, batch_size: int):
    batches, batch_names = encode_and_batch(image_paths, batch_size=batch_size, verbose=True)

    final_text = []
    total_tokens = {'prompt_tokens': 0, 'completion_tokens': 0, 'total_tokens': 0}
    
    # Run htr_gen for each batch concurrently
    tasks = [asyncio.create_task(htr_gen(batch)) for batch in batches]
    results = await asyncio.gather(*tasks)

    for text, _tokens in results:
        final_text.append(text["text"])
        total_tokens = add_dicts(total_tokens, _tokens)

    return batch_names, final_text, total_tokens


async def main():
    dir_path = r"C:\Users\DELL\Documents\Curate\curate-v1\core\test_data\mmh_english"
    image_paths = [os.path.join(dir_path, file) for file in os.listdir(dir_path)]
    
    batch_names, final_text, total_tokens = await htr_batch_process(image_paths[:10], batch_size=1)
    print(final_text)
    return batch_names, final_text, total_tokens

# This checks if there is an existing running loop and uses it accordingly
if asyncio.get_event_loop().is_running():
    # Schedule and run the main coroutine without blocking
    # Note: This doesn't wait for the result. Use 'await main()' if you need the result immediately and you're in an async context.
    asyncio.create_task(main())
else:
    # If there's no running loop, create one and run until complete
    result = asyncio.get_event_loop().run_until_complete(main())
    # You can process the result here



Total images: 10
Total batches: 10


Task exception was never retrieved
future: <Task finished name='Task-15' coro=<main() done, defined at C:\Users\DELL\AppData\Local\Temp\ipykernel_5424\2842789778.py:56> exception=ValueError("Error in the response: object ChatCompletion can't be used in 'await' expression")>
Traceback (most recent call last):
  File "C:\Users\DELL\AppData\Local\Temp\ipykernel_5424\2842789778.py", line 22, in htr_gen
    response = await creator(
TypeError: object ChatCompletion can't be used in 'await' expression

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "C:\Users\DELL\AppData\Local\Temp\ipykernel_5424\2842789778.py", line 60, in main
    batch_names, final_text, total_tokens = await htr_batch_process(image_paths[:10], batch_size=1)
  File "C:\Users\DELL\AppData\Local\Temp\ipykernel_5424\2842789778.py", line 47, in htr_batch_process
    results = await asyncio.gather(*tasks)
  File "C:\Users\DELL\AppData\Local\Temp\ipykernel_5424\2842

In [92]:
print(final_text[0])

HISTORY
ABHISHEK SIR
PARAMOUNT
Greek Word - Historia
Herodotus book, Historica
Father of History
Sources-
1) Archaeological sources
2) (Fossils, Monuments, inscriptions, coins, statues)
2) Literary Sources
   -> Religious (Rigveda, Samveda, etc)
   -> Non-religious (Panchatantra, Patanjali, etc)
3) Description of foreign travellers
* Veena - oldest instrument of India
* Tambura - Arab's oldest instrument.

1191 - First Battle of Tarain (Ghori & Prithviraj)
Kannauj (UP)
Gaharwal dynasty
ruler
Jai chand
daughter
Sanyogita (Prithviraj kidnapped her & got
married with her)

1192 - Second Battle of Tarain
Ghori defeated Prithviraj Chauhan & killed him,
from then Muslim rule was started.
1193 - he made Delhi as his capital.

1192 -Khwaja Moinuddin Chishti came to India and
made it his cottage at Ajmer and promoted
Sufi Rule.

1194 - Mohd. Ghori attacked Jaichand in Battle of
Chandawar.
Ghori defeated Jaichand & killed him

A slave and son in law of Mohd. Ghauri,
Qutubuddin Aibak.
Another sla

In [93]:
calculate_cost_gpt4_turbo(total_tokens)

{'usd': 0.18375, 'inr': 15.3191}

In [94]:
total_tokens

{'prompt_tokens': 11394, 'completion_tokens': 2327, 'total_tokens': 13721}