## Handwritten text recognition using the gpt-4-vision model

In [43]:
%load_ext autoreload
%autoreload 2

import os
# Move the working directory to the folder that contains the ``core``
# directory (presumably so that ``core.*`` imports resolve -- confirm).
curpath = os.getcwd()
_path_parts = curpath.split(os.sep)
if "core" in _path_parts:
    # Match a whole path component named "core". A plain substring split would
    # also cut inside names such as "score" or "core_v2" and chdir to a
    # truncated, possibly non-existent directory.
    # The trailing separator keeps a bare drive such as "C:" pointing at its root.
    os.chdir(os.sep.join(_path_parts[:_path_parts.index("core")]) + os.sep)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [44]:
import base64
import requests
import os
from typing import List, Any, Union
from tqdm import tqdm

from core.features.utils import calculate_cost_gpt4_turbo, add_dicts
from prompt import SIMPLE_GPT_BASED_HTR_PROMPT, SIMPLE_GPT_BASED_HTR_FORMATTED_PROMPT

# OpenAI API Key
api_key = os.environ.get('OPENAI_API_KEY')


In [45]:

# Function to encode the image
def encode_image(image_path):
    """Return the contents of the file at ``image_path`` as a base64 string.

    The file is read in binary mode; the base64 bytes are decoded as UTF-8 so
    the result is a ``str`` that can be embedded in text.
    """
    with open(image_path, "rb") as handle:
        raw_bytes = handle.read()
    encoded_bytes = base64.b64encode(raw_bytes)
    return encoded_bytes.decode('utf-8')


def get_text(image_paths: List[str], timeout: float = 120):
    """Transcribe images with the ``gpt-4-vision-preview`` chat-completions model.

    Every image is base64-encoded with ``encode_image`` and sent in its own
    request, using ``SIMPLE_GPT_BASED_HTR_PROMPT`` as the system message.

    Args:
        image_paths: List of paths of the image files to send.
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        A ``(retrieved_text, total_usage)`` tuple. ``retrieved_text`` holds one
        entry per input path, in input order: the content of the model's reply,
        or ``None`` when the request for that image failed (the reason is
        printed). ``total_usage`` is the token usage summed with ``add_dicts``
        over the successful requests only.

    Raises:
        ValueError: If ``image_paths`` is not a list.
        RuntimeError: If ``api_key`` is empty (``OPENAI_API_KEY`` was not set).
    """
    if not isinstance(image_paths, list):
        raise ValueError("image_paths should be a list of image paths")
    if not api_key:
        # Without this check the request would be sent with "Bearer None".
        raise RuntimeError("OPENAI_API_KEY is not set; cannot call the OpenAI API")

    # Encode everything up front so that an unreadable file fails before any
    # request has been sent.
    encoded = [encode_image(image_path) for image_path in image_paths]

    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {api_key}"
    }

    total_usage = {'prompt_tokens': 0, 'completion_tokens': 0, 'total_tokens': 0}
    retrieved_text = []
    for image_path, base64_image in tqdm(list(zip(image_paths, encoded))):
        payload = {
            "model": "gpt-4-vision-preview",
            "messages": [
                {"role": "system", "content": SIMPLE_GPT_BASED_HTR_PROMPT},
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "image_url",
                            # NOTE(review): the MIME type is always declared as
                            # JPEG, whatever the file actually is.
                            "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
                        }
                    ],
                },
            ],
            "max_tokens": 3000,
        }

        try:
            response = requests.post(
                "https://api.openai.com/v1/chat/completions",
                headers=headers,
                json=payload,
                timeout=timeout,
            )
            body = response.json()
        except (requests.RequestException, ValueError) as exc:
            # Network failure, timeout, or a body that is not JSON.
            print(f"Request failed for {image_path}: {exc!r}")
            retrieved_text.append(None)
            continue

        try:
            text = body["choices"][0]["message"]["content"]
            usage = dict(body["usage"])
        except (KeyError, IndexError, TypeError):
            # Valid JSON that is not a completion (e.g. an error object):
            # show it instead of reusing values from a previous iteration.
            print(f"Unexpected API response for {image_path}: {body}")
            retrieved_text.append(None)
            continue

        total_usage = add_dicts(total_usage, usage)
        retrieved_text.append(text)

    return retrieved_text, total_usage

In [48]:
# Sample pages, addressed relative to the working directory instead of a
# machine-specific absolute path. This relies on the setup cell at the top of
# the notebook having moved the working directory to the folder that contains
# ``core``.
dir_path = os.path.join("core", "test_data", "mmh_english")

# Sorted so that the selection below is deterministic.
image_paths = sorted(os.path.join(dir_path, file) for file in os.listdir(dir_path))

# Only the first five files are sent.
text, total_tokens = get_text(image_paths[:5])

100%|██████████| 5/5 [00:42<00:00,  8.59s/it]


In [50]:
# Show every transcription that was returned, separated by a banner.
# Iterating over ``text`` itself avoids an IndexError when fewer than five
# results are available.
for page_text in text:
    print(page_text)
    print()
    print("#########################################")
    print()

HISTORY
ABHISHEK SIR
PARAMOUNT

Greek Word - Historia
|
|
Herodotus book Historica
|
Father of History

Sources-
1) Archaeological sources
2) (Fossils, Monuments, inscriptions, coins, statues)
3) Literary Sources
   → Religious (Rigveda, Samveda, etc)
   → Non- religious (Panchtantra, Patanjali, etc)
3) Description of foreign travelers

* Veena - oldest instrument of India
* Tambura: Arab's oldest instrument.

Scanned by CamScanner
www.OnlineStudyPoints.com

#########################################

1191 - First Battle of Tarain (Ghori & Prithviraj)
                                Ghori lost & Prithviraj won

Kannauj (UP)

↓

Chauhan dynasty

↓

ruler

↓

Jai chand

↓

daughter

↓

Sanyogita (Prithviraj kidnapped her & got
married with her)

1192 - Second Battle of Tarain

Ghori defeated Prithviraj Chauhan & killed him,
from then Muslim rule was started.

1193 - he made Delhi as his capital.

In 1192 -Khuaja Moinuddin Chishti came to India and
made it his cottage at Ajmer and preached

In [44]:
total_tokens

{'prompt_tokens': 5860, 'total_tokens': 6903, 'completion_tokens': 1043}

In [45]:
def calculate_cost_gpt4_turbo(token_usage, prompt_price_per_1k=0.01, completion_price_per_1k=0.03):
    """Compute the cost of a token-usage record from per-1000-token prices.

    NOTE(review): a function with this name is also imported from
    ``core.features.utils`` earlier in the notebook; this definition shadows
    it. Confirm which one is meant to be used.

    Args:
        token_usage: Mapping with ``'prompt_tokens'`` and
            ``'completion_tokens'`` counts. Other keys are ignored.
        prompt_price_per_1k: Price charged per 1000 prompt tokens.
        completion_price_per_1k: Price charged per 1000 completion tokens.
            The defaults are the rates that were previously hard-coded
            (presumably USD -- confirm against current pricing).

    Returns:
        The total cost, in the same currency unit as the prices.

    Raises:
        KeyError: If either token count is missing from ``token_usage``.
    """
    prompt = token_usage['prompt_tokens']
    completion = token_usage['completion_tokens']

    return (prompt * prompt_price_per_1k + completion * completion_price_per_1k) / 1000

calculate_cost_gpt4_turbo(total_tokens)

0.08989

### Notes Enhancement

In [46]:

def get_formatted_text(image_paths: List[str], timeout: float = 120):
    """Send images to ``gpt-4-vision-preview`` using the formatted-notes prompt.

    Every image is base64-encoded with ``encode_image`` and sent in its own
    chat-completions request, using ``SIMPLE_GPT_BASED_HTR_FORMATTED_PROMPT``
    as the system message.

    Args:
        image_paths: List of paths of the image files to send.
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        A ``(retrieved_text, total_usage)`` tuple. ``retrieved_text`` holds one
        entry per input path, in input order: the content of the model's reply,
        or ``None`` when the request for that image failed (the reason is
        printed). ``total_usage`` is the token usage summed with ``add_dicts``
        over the successful requests only.

    Raises:
        ValueError: If ``image_paths`` is not a list.
        RuntimeError: If ``api_key`` is empty (``OPENAI_API_KEY`` was not set).
    """
    if not isinstance(image_paths, list):
        raise ValueError("image_paths should be a list of image paths")
    if not api_key:
        # Without this check the request would be sent with "Bearer None".
        raise RuntimeError("OPENAI_API_KEY is not set; cannot call the OpenAI API")

    # Encode everything up front so that an unreadable file fails before any
    # request has been sent.
    encoded = [encode_image(image_path) for image_path in image_paths]

    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {api_key}"
    }

    total_usage = {'prompt_tokens': 0, 'completion_tokens': 0, 'total_tokens': 0}
    retrieved_text = []
    for image_path, base64_image in tqdm(list(zip(image_paths, encoded))):
        payload = {
            "model": "gpt-4-vision-preview",
            "messages": [
                {"role": "system", "content": SIMPLE_GPT_BASED_HTR_FORMATTED_PROMPT},
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "image_url",
                            # NOTE(review): the MIME type is always declared as
                            # JPEG, whatever the file actually is.
                            "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
                        }
                    ],
                },
            ],
            "max_tokens": 3000,
        }

        try:
            response = requests.post(
                "https://api.openai.com/v1/chat/completions",
                headers=headers,
                json=payload,
                timeout=timeout,
            )
            body = response.json()
        except (requests.RequestException, ValueError) as exc:
            # Network failure, timeout, or a body that is not JSON.
            print(f"Request failed for {image_path}: {exc!r}")
            retrieved_text.append(None)
            continue

        try:
            text = body["choices"][0]["message"]["content"]
            usage = dict(body["usage"])
        except (KeyError, IndexError, TypeError):
            # Valid JSON that is not a completion (e.g. an error object):
            # show it instead of reusing values from a previous iteration.
            print(f"Unexpected API response for {image_path}: {body}")
            retrieved_text.append(None)
            continue

        total_usage = add_dicts(total_usage, usage)
        retrieved_text.append(text)

    return retrieved_text, total_usage

In [47]:
# Sample pages, addressed relative to the working directory instead of a
# machine-specific absolute path. This relies on the setup cell at the top of
# the notebook having moved the working directory to the folder that contains
# ``core``.
dir_path = os.path.join("core", "test_data", "mmh_english")

# Sorted so that the selection below is deterministic.
image_paths = sorted(os.path.join(dir_path, file) for file in os.listdir(dir_path))

# Only the first five files are sent.
text, total_tokens = get_formatted_text(image_paths[:5])

100%|██████████| 5/5 [00:47<00:00,  9.41s/it]


In [48]:
import json

# Each reply is expected to contain one JSON object with a 'text' field,
# possibly surrounded by other text. Decoding starts at the first "{" and the
# JSON decoder consumes exactly one object, so "}" characters inside string
# values and nested objects are handled; trailing text is ignored.
json_decoder = json.JSONDecoder()

for raw_reply in text:
    try:
        reply_json, _ = json_decoder.raw_decode(raw_reply[raw_reply.index("{"):])
        print(reply_json['text'])
    except (ValueError, KeyError) as exc:
        # No "{" in the reply, invalid JSON, or no 'text' field: show the raw
        # reply rather than aborting the whole listing.
        print(f"Could not extract the 'text' field ({exc!r}); raw reply follows:")
        print(raw_reply)
    print()
    print("#########################################")
    print()

HISTORY
- Greek Word - Historia
↓
Herodotus book Historica
↓
Father of History

Sources-
1) Archaeological sources
2) (Fossils, Monuments, inscriptions, coins, statues)
2) Literary Sources
	→ Religious (Rigveda, Samveda, etc)
	→ Non-religious (Panchatantra, Patanjali, etc)
3) Description of foreign travellers

* Veena - oldest instrument of India
* Tambura - Arab's oldest instrument.

#########################################

1191 - First Battle of Tarain (Ghori & Prithviraj) where Prithviraj won. Kannauj (U.P.) was under the Gahadvala dynasty ruled by Jai Chand, whose daughter Sanyogita was kidnapped by Prithviraj Chauhan and then married. 1192 - Second Battle of Tarain where Ghori defeated Prithviraj Chauhan and killed him; from then on, Muslim rule was established in the region. 1193 - Ghori made Delhi as his capital. In 1192, Khwaja Moinuddin Chishti came to India and made his cottage at Ajmer which promoted Sufi Rule. 1194 - Ghori attacked Jai Chand in the Battle of Chandawar, wh