In [4]:
!sudo apt install tesseract-ocr
!pip install pytesseract

Reading package lists... Done
Building dependency tree       
Reading state information... Done
The following additional packages will be installed:
  tesseract-ocr-eng tesseract-ocr-osd
The following NEW packages will be installed:
  tesseract-ocr tesseract-ocr-eng tesseract-ocr-osd
0 upgraded, 3 newly installed, 0 to remove and 23 not upgraded.
Need to get 4,850 kB of archives.
After this operation, 16.3 MB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu focal/universe amd64 tesseract-ocr-eng all 1:4.00~git30-7274cfa-1 [1,598 kB]
Get:2 http://archive.ubuntu.com/ubuntu focal/universe amd64 tesseract-ocr-osd all 1:4.00~git30-7274cfa-1 [2,990 kB]
Get:3 http://archive.ubuntu.com/ubuntu focal/universe amd64 tesseract-ocr amd64 4.1.1-2build2 [262 kB]
Fetched 4,850 kB in 0s (9,781 kB/s)
debconf: unable to initialize frontend: Dialog
debconf: (No usable dialog-like program is installed, so the dialog based frontend cannot be used. at /usr/share/perl5/Debconf/Fro

In [5]:
import os
import json
from PIL import Image
import pytesseract
import re

# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------

# Folder containing the images; the JSON file of each image is kept in the
# same folder under the same base name (e.g. testimage1.jpeg / testimage1.json)
image_folder = '/content/images'

# Regex selecting the image files to process (file names ending in ".jpeg")
image_pattern = r'\.jpeg$'

# Regex applied to the OCR text: for every line that contains a run of four
# digits, keep the text from those four digits up to the end of that line
child_text_pattern = r'\d{4}.*'


# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------

def _find_child_text_block(blocks):
    """Return the first block whose 'childText' value is non-blank, or None."""
    for block in blocks:
        if 'childText' in block and block['childText'].strip():
            return block
    return None


def _bbox_to_pixels(bbox, image_width, image_height):
    """Convert a bounding box into a (left, upper, right, lower) pixel tuple.

    The 'Left'/'Width' values are scaled by the image width and the
    'Top'/'Height' values by the image height, then truncated to integers.
    NOTE(review): this presumes the JSON stores the box as fractions of the
    image size - confirm against the annotation files.
    """
    left = int(bbox['Left'] * image_width)
    top = int(bbox['Top'] * image_height)
    width = int(bbox['Width'] * image_width)
    height = int(bbox['Height'] * image_height)
    return (left, top, left + width, top + height)


def _extract_child_texts(image_path, json_path):
    """OCR the child-text region of one image.

    Parameters:
        image_path: path of the image file to read.
        json_path: path of the JSON file describing the image's blocks.

    Returns:
        The list of `child_text_pattern` matches found in the OCR text of the
        cropped region, or None when the JSON has no block with a non-blank
        'childText' (or that block has no bounding box).
    """
    with open(json_path, 'r') as f:
        json_data = json.load(f)

    # The geometry is read from the matched block itself. The original code
    # re-scanned the block list by 'Id' to find it again, which yields the
    # same block as long as block Ids are unique.
    block = _find_child_text_block(json_data['Blocks'])
    if block is None:
        return None

    child_text_bbox = block['Geometry']['BoundingBox']
    if child_text_bbox is None:
        return None

    # Open the image once and close it deterministically; the OCR call stays
    # inside the `with` block so the pixel data is still available to it.
    with Image.open(image_path) as image:
        image_width, image_height = image.size
        crop_box = _bbox_to_pixels(child_text_bbox, image_width, image_height)
        child_text_image = image.crop(crop_box)
        child_text = pytesseract.image_to_string(child_text_image).strip()

    return re.findall(child_text_pattern, child_text)


# ---------------------------------------------------------------------------
# Run the extraction over every image in the folder
# ---------------------------------------------------------------------------

# Extracted values per image, keyed by the image name without its extension
output_dict = {}

# os.listdir returns names in arbitrary order; sorting keeps the processing
# order and the key order of the saved results reproducible between runs.
for filename in sorted(os.listdir(image_folder)):
    if not re.search(image_pattern, filename):
        continue

    image_name = os.path.splitext(filename)[0]
    json_path = os.path.join(image_folder, image_name + '.json')

    # An image without its JSON file is reported and skipped instead of
    # aborting the whole run with FileNotFoundError.
    if not os.path.isfile(json_path):
        print('Skipping {}: {} not found'.format(filename, json_path))
        continue

    child_texts = _extract_child_texts(os.path.join(image_folder, filename), json_path)
    if child_texts is not None:
        output_dict[image_name] = child_texts

# Show the results and save them to a JSON file next to the images
print(output_dict)
output_path = os.path.join(image_folder, 'Rao_Tejaswi_Results.json')
with open(output_path, 'w') as f:
    json.dump(output_dict, f, indent=4)


{'testimage5': ['55213 iBT oo jac 690 00', '0810 20:12]. | na390 jour.', '13390 | Rr'], 'testimage4': ['11092020 [11092020 {11 11056 i L jA | 80.00) 4'], 'testimage2': [], 'testimage1': ['5964 | | a | saan lool 4', '2043, looaldam'], 'testimage3': ['110048 : i i 1 14 48 | 1.00']}


#### Alternative approaches tried

#### 1) Applied filters to reduce noise and improve contrast
#### 2) Applied median blurring to remove noise from the image
#### 3) Used OCRopus instead of Tesseract to compare the results

#### The results obtained are not up to the mark: the approach works perfectly only for testimage4. I will dig deeper and see if I can improve this.

In [None]:
# The output below is the best result I obtained after reducing the noise and improving the contrast.
# This preprocessing seems to have worsened the results, so I opted against using it.

#  {"testimage5.jpeg": 
#  [["_"], ["aA ww & W NY"], ["08"], ["08"], ["08"], ["08"], ["oB8"], ["10"], ["10"], ["10"], ["10"], ["10"], ["10"], 
#  ["20"], ["20"], ["20"], ["38"], ["98"], ["98"], ["98"], ["08"], ["oe"], ["22"], ["10"], ["10"], ["10"], ["10"], 
#  ["10"], ["20"], ["20"], ["12"], ["12"], ["ASS13"], ["assco"], ["ASS13"], ["L3390"], ["13399"], ["LT"], ["LT"], ["ac"], 
#  ["ac"], ["aBC"], ["ABC"], ["ABC"], ["9c"], ["690"], ["90"], ["690"], ["60"], ["60"], ["co"], ["00 3"], ["oo"], ["C9"], 
#  ["09"], ["0\u00b0"]],
# "testimage3.jpeg": [], 
# "testimage1.jpeg": [["i a"], ["ns"], ["aA tL ADILE"]], 
# "testimage2.jpeg": [["4-\u2014-\u2014\u2014-\u2014 \u2014"], ["j\u2014 i"], ["ST TTT TT eT"], ["6 ae"]], 
# "testimage4.jpeg": [["a"], ["fT"]]}