## Handwritten Text recognition using gpt-4-vision model

In [32]:
%load_ext autoreload
%autoreload 2

import os
curpath = os.getcwd()
os.chdir(curpath.split("core")[0])

In [33]:
import base64
import requests
import os
import json
from typing import List, Any, Union
from tqdm import tqdm

from core.features.utils import calculate_cost_gpt4_turbo, add_dicts, pdf_to_images
from prompt import SIMPLE_GPT_BASED_HTR_PROMPT, SIMPLE_GPT_BASED_HTR_FORMATTED_PROMPT
from core.features.provider import creator, text_model_defaults, vision_model_defaults

from dotenv import load_dotenv
load_dotenv()

import time

In [85]:

# Function to encode the image
def encode_image(image_path):
    with open(image_path, "rb") as image_file:
        return base64.b64encode(image_file.read()).decode('utf-8')


def encode_and_batch(image_paths, batch_size=4, verbose = False):

    encoded = []
    encoded_name = []
    enc = []
    enc_name = []
    for image_path in image_paths:
        enc_name.append(image_path.split("/")[-1])
        enc.append(encode_image(image_path))
        if len(enc) >= batch_size:
            encoded.append(enc)
            encoded_name.append(enc_name)
            enc = []
            enc_name = []
    
    if len(enc) > 0:
        encoded.append(enc)
        encoded_name.append(enc_name)

    if verbose:
        print("Total images:", len(image_paths))
        print("Total batches:", len(encoded))

    return encoded, encoded_name

In [86]:
def htr_gen(images: List[Any], intel = False):

    # logging.info("encoded the images")
    total_usage = {'prompt_tokens': 0, 'completion_tokens': 0, 'total_tokens': 0}

    conversation = [{"role": "system", "content": SIMPLE_GPT_BASED_HTR_PROMPT}]
    # if intel:
    #     conversation = [{"role": "system", "content": SIMPLE_GPT_BASED_HTR_FORMATTED_PROMPT}]
        
    user_message = {"role": "user", "content": [*map(lambda x :{
          "type": "image_url",
          "image_url": {
            "url": f"data:image/jpeg;base64,{x}"
          }
        }, images)]}
    conversation.append(user_message)

    try:
        response = creator(
                **vision_model_defaults,
                messages = conversation
                )
        
        response = response.model_dump()
        text = response["choices"][0]["message"]["content"]
        text = "{" + text.split("{")[1].split("}")[0] + "}"

        text = json.loads(text)
    except:
        # print(response.json())
        raise ValueError("Error in the response")

    total_usage = add_dicts(total_usage, dict(response["usage"]))

    return text, response["usage"]

In [87]:

def htr_batch_process(image_paths, batch_size: int):
    
    batches, batch_names = encode_and_batch(image_paths[:10], batch_size=batch_size, verbose=True)

    final_text = []
    total_tokens = {'prompt_tokens': 0, 'completion_tokens': 0, 'total_tokens': 0}
    for batch in tqdm(batches):
        text, _tokens = htr_gen(batch)
        final_text.append(text["text"])
        total_tokens = add_dicts(total_tokens, _tokens)

    return batch_names, final_text, total_tokens


Total images: 10
Total batches: 4


100%|██████████| 4/4 [01:06<00:00, 16.54s/it]


In [90]:
dir_path = r"C:\Users\DELL\Documents\Curate\curate-v1\core\test_data\mmh_english"
image_paths = []
for file in os.listdir(dir_path):
    image_paths.append(os.path.join(dir_path, file))

batch_names, final_text, total_tokens = htr_batch_process(dir_path, batch_size=1)

print(batch_names)

[['C:\\Users\\DELL\\Documents\\Curate\\curate-v1\\core\\test_data\\mmh_english\\mmh_image1.jpg', 'C:\\Users\\DELL\\Documents\\Curate\\curate-v1\\core\\test_data\\mmh_english\\mmh_image10.jpg', 'C:\\Users\\DELL\\Documents\\Curate\\curate-v1\\core\\test_data\\mmh_english\\mmh_image11.jpg'], ['C:\\Users\\DELL\\Documents\\Curate\\curate-v1\\core\\test_data\\mmh_english\\mmh_image12.jpg', 'C:\\Users\\DELL\\Documents\\Curate\\curate-v1\\core\\test_data\\mmh_english\\mmh_image13.jpg', 'C:\\Users\\DELL\\Documents\\Curate\\curate-v1\\core\\test_data\\mmh_english\\mmh_image14.jpg'], ['C:\\Users\\DELL\\Documents\\Curate\\curate-v1\\core\\test_data\\mmh_english\\mmh_image15.jpg', 'C:\\Users\\DELL\\Documents\\Curate\\curate-v1\\core\\test_data\\mmh_english\\mmh_image16.jpg', 'C:\\Users\\DELL\\Documents\\Curate\\curate-v1\\core\\test_data\\mmh_english\\mmh_image17.jpg'], ['C:\\Users\\DELL\\Documents\\Curate\\curate-v1\\core\\test_data\\mmh_english\\mmh_image18.jpg']]


In [92]:
print(final_text[0])

HISTORY
ABHISHEK SIR
PARAMOUNT
Greek Word - Historia
Herodotus book, Historica
Father of History
Sources-
1) Archaeological sources
2) (Fossils, Monuments, inscriptions, coins, statues)
2) Literary Sources
   -> Religious (Rigveda, Samveda, etc)
   -> Non-religious (Panchatantra, Patanjali, etc)
3) Description of foreign travellers
* Veena - oldest instrument of India
* Tambura - Arab's oldest instrument.

1191 - First Battle of Tarain (Ghori & Prithviraj)
Kannauj (UP)
Gaharwal dynasty
ruler
Jai chand
daughter
Sanyogita (Prithviraj kidnapped her & got
married with her)

1192 - Second Battle of Tarain
Ghori defeated Prithviraj Chauhan & killed him,
from then Muslim rule was started.
1193 - he made Delhi as his capital.

1192 -Khwaja Moinuddin Chishti came to India and
made it his cottage at Ajmer and promoted
Sufi Rule.

1194 - Mohd. Ghori attacked Jaichand in Battle of
Chandawar.
Ghori defeated Jaichand & killed him

A slave and son in law of Mohd. Ghauri,
Qutubuddin Aibak.
Another sla

In [93]:
calculate_cost_gpt4_turbo(total_tokens)

{'usd': 0.18375, 'inr': 15.3191}

In [94]:
total_tokens

{'prompt_tokens': 11394, 'completion_tokens': 2327, 'total_tokens': 13721}