### This notebook contains all the functions.

#### Normalisation function:
This function is used for preprocessing the images for the text extraction.

#### OCR function:
This function is used for text extraction using AWS Textract

In [487]:
# import the necessary packages
from PIL import Image
import pytesseract
import cv2
import os
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
from PIL import Image
import boto3
import pandas as pd

This function is for sample_one. Here the crop size has been chosen to suit the image. The thresholding is also different: an adaptive threshold is used, combining a Gaussian blur with the adaptive Gaussian method and binary thresholding.

We use the adaptive threshold because, rather than setting a single global threshold value, we let the algorithm calculate the threshold for small regions of the image. Thus, we end up with different threshold values for different regions of the image, which is great!

In [488]:
def image_norm_sample_one(img_path):
    """Normalise a "sample one" image for OCR and save it as a JPEG.

    The image is cropped to rows 0-269 and columns 0-981, converted to
    grayscale, upscaled 2x, denoised and binarised with an adaptive Gaussian
    threshold. The result is written to ``normalised/<name>.jpg`` under the
    current working directory, where ``<name>`` is the first
    whitespace-separated token of the input base name before its first dot.
    The ``normalised`` directory is created if it does not exist.

    Args:
        img_path: Path of the image file to normalise.

    Returns:
        The path of the saved, normalised image.

    Raises:
        FileNotFoundError: If OpenCV cannot read ``img_path``.
        OSError: If the normalised image cannot be written.
    """
    # cv2.imread signals failure by returning None instead of raising.
    img = cv2.imread(img_path)
    if img is None:
        raise FileNotFoundError("Could not read image: {}".format(img_path))

    file_name = os.path.basename(img_path).split('.')[0]
    file_name = file_name.split()[0]
    output_dir = os.path.join(os.getcwd(), 'normalised')
    # cv2.imwrite does not create missing directories, so make sure it exists.
    os.makedirs(output_dir, exist_ok=True)

    # Crop to the region of interest for this sample (rows, then columns).
    img = img[0:270, 0:982]

    # Convert to grayscale
    img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    # Scale the image up so that small characters are easier to recognise
    img = cv2.resize(img, None, fx=2, fy=2, interpolation=cv2.INTER_CUBIC)

    # Denoise (h=10, template window 7, search window 21)
    img = cv2.fastNlMeansDenoising(img, None, 10, 7, 21)

    # Apply dilation and erosion to remove some noise.
    # NOTE(review): with a 1x1 kernel these two calls should leave the image
    # unchanged; kept as-is to preserve the tuned output.
    kernel = np.ones((1, 1), np.uint8)
    img = cv2.dilate(img, kernel, iterations=1)
    img = cv2.erode(img, kernel, iterations=1)

    # Adaptive Gaussian threshold (block size 31, constant 2) on the blurred image.
    blurred = cv2.GaussianBlur(img, (1, 1), 0)
    img = cv2.adaptiveThreshold(blurred, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
                                cv2.THRESH_BINARY, 31, 2)

    save_path = os.path.join(output_dir, file_name + ".jpg")
    # cv2.imwrite reports failure through its return value.
    if not cv2.imwrite(save_path, img):
        raise OSError("Could not write normalised image: {}".format(save_path))

    return save_path

Similarly, for sample two, a different crop and threshold are used, depending on the image.

In [489]:
def image_norm_sample_two(img_path):
    """Normalise a "sample two" image for OCR and save it as a JPEG.

    The image is cropped to rows 0-364 and columns 0-981, converted to
    grayscale, upscaled 2x, denoised and binarised with Otsu's threshold
    after a 3x3 Gaussian blur. The result is written to
    ``normalised/<name>.jpg`` under the current working directory, where
    ``<name>`` is the first whitespace-separated token of the input base name
    before its first dot. The ``normalised`` directory is created if it does
    not exist.

    Args:
        img_path: Path of the image file to normalise.

    Returns:
        The path of the saved, normalised image.

    Raises:
        FileNotFoundError: If OpenCV cannot read ``img_path``.
        OSError: If the normalised image cannot be written.
    """
    # cv2.imread signals failure by returning None instead of raising.
    img = cv2.imread(img_path)
    if img is None:
        raise FileNotFoundError("Could not read image: {}".format(img_path))

    file_name = os.path.basename(img_path).split('.')[0]
    file_name = file_name.split()[0]
    output_dir = os.path.join(os.getcwd(), 'normalised')
    # cv2.imwrite does not create missing directories, so make sure it exists.
    os.makedirs(output_dir, exist_ok=True)

    # Crop to the region of interest for this sample (rows, then columns).
    img = img[0:365, 0:982]

    # Convert to grayscale
    img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    # Scale the image up so that small characters are easier to recognise
    img = cv2.resize(img, None, fx=2, fy=2, interpolation=cv2.INTER_CUBIC)

    # Denoise (h=10, template window 7, search window 21)
    img = cv2.fastNlMeansDenoising(img, None, 10, 7, 21)

    # Apply dilation and erosion to remove some noise.
    # NOTE(review): with a 1x1 kernel these two calls should leave the image
    # unchanged; kept as-is to preserve the tuned output.
    kernel = np.ones((1, 1), np.uint8)
    img = cv2.dilate(img, kernel, iterations=1)
    img = cv2.erode(img, kernel, iterations=1)

    # Otsu picks the global threshold itself, so the 0 passed here is ignored;
    # cv2.threshold returns (threshold_used, image) and only the image is kept.
    blurred = cv2.GaussianBlur(img, (3, 3), 0)
    img = cv2.threshold(blurred, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)[1]

    save_path = os.path.join(output_dir, file_name + ".jpg")
    # cv2.imwrite reports failure through its return value.
    if not cv2.imwrite(save_path, img):
        raise OSError("Could not write normalised image: {}".format(save_path))

    return save_path

Similarly, for sample three, a different crop and threshold are used, depending on the image.

In [490]:
def image_norm_sample_three(img_path):
    """Normalise a "sample three" image for OCR and save it as a JPEG.

    The image is cropped to rows 0-424 and columns 0-981, converted to
    grayscale, upscaled 2x, denoised and binarised with Otsu's threshold
    after a 7x7 Gaussian blur. The result is written to
    ``normalised/<name>.jpg`` under the current working directory, where
    ``<name>`` is the first whitespace-separated token of the input base name
    before its first dot. The ``normalised`` directory is created if it does
    not exist.

    Args:
        img_path: Path of the image file to normalise.

    Returns:
        The path of the saved, normalised image.

    Raises:
        FileNotFoundError: If OpenCV cannot read ``img_path``.
        OSError: If the normalised image cannot be written.
    """
    # cv2.imread signals failure by returning None instead of raising.
    img = cv2.imread(img_path)
    if img is None:
        raise FileNotFoundError("Could not read image: {}".format(img_path))

    file_name = os.path.basename(img_path).split('.')[0]
    file_name = file_name.split()[0]
    output_dir = os.path.join(os.getcwd(), 'normalised')
    # cv2.imwrite does not create missing directories, so make sure it exists.
    os.makedirs(output_dir, exist_ok=True)

    # Crop to the region of interest for this sample (rows, then columns).
    img = img[0:425, 0:982]

    # Convert to grayscale
    img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    # Scale the image up so that small characters are easier to recognise
    img = cv2.resize(img, None, fx=2, fy=2, interpolation=cv2.INTER_CUBIC)

    # Denoise (h=10, template window 7, search window 21)
    img = cv2.fastNlMeansDenoising(img, None, 10, 7, 21)

    # Apply dilation and erosion to remove some noise.
    # NOTE(review): with a 1x1 kernel these two calls should leave the image
    # unchanged; kept as-is to preserve the tuned output.
    kernel = np.ones((1, 1), np.uint8)
    img = cv2.dilate(img, kernel, iterations=1)
    img = cv2.erode(img, kernel, iterations=1)

    # Otsu picks the global threshold itself, so the 0 passed here is ignored;
    # cv2.threshold returns (threshold_used, image) and only the image is kept.
    blurred = cv2.GaussianBlur(img, (7, 7), 0)
    img = cv2.threshold(blurred, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)[1]

    save_path = os.path.join(output_dir, file_name + ".jpg")
    # cv2.imwrite reports failure through its return value.
    if not cv2.imwrite(save_path, img):
        raise OSError("Could not write normalised image: {}".format(save_path))

    return save_path

This function uses AWS Textract to extract the text from the image. The response contains the metadata of the image in JSON format, and we use the Text element of its blocks to extract our text. The accuracy of AWS Textract is way better than that of pytesseract.

In [491]:
def ocr(imgpath):
    """Extract the text lines of an image with AWS Textract.

    Args:
        imgpath: Path of the image file to send to Textract.

    Returns:
        A dict with a single entry mapping ``<name>.jpg`` to the list of
        detected line texts, where ``<name>`` is the first
        whitespace-separated token of the input base name before its first
        dot. An empty dict is returned when no LINE block is detected.
    """
    # Key under which the detected lines are reported.
    base = os.path.basename(imgpath).split('.')[0]
    key = base.split()[0] + '.jpg'

    # Load the raw document bytes
    with open(imgpath, 'rb') as handle:
        payload = bytearray(handle.read())

    # Amazon Textract client and text-detection call
    client = boto3.client('textract', region_name='us-east-1')
    result = client.detect_document_text(Document={'Bytes': payload})

    # Keep only the text of LINE blocks, in the order Textract returned them
    lines = [block["Text"] for block in result["Blocks"]
             if block["BlockType"] == "LINE"]

    if not lines:
        return {}
    return {key: lines}

### pytesseract

For pytesseract to work, we need to tell it the path to the Tesseract executable. We first need to install the .exe file, which we can get from https://github.com/UB-Mannheim/tesseract/wiki. The path is set in the cell below.

In [492]:
# Point pytesseract at the locally installed Tesseract executable.
# NOTE(review): this is a machine-specific Windows path; adjust it to match
# your own Tesseract installation.
import pytesseract
pytesseract.pytesseract.tesseract_cmd = r'C:\Users\Siddhant\AppData\Local\Tesseract-OCR\tesseract.exe'

In [493]:
import cv2
import sys
import pytesseract
 

# Image to run through Tesseract. A raw string keeps the Windows backslashes
# literal instead of relying on unrecognised escape sequences.
imPath = r'D:\Analytics\Quarter 4\Applications of AI\Final project\ocr_test\iCard_021979_1_Daker_Sarah.jpg'
# Tesseract options: English language, OCR engine mode 1, page segmentation mode 3
config = '-l eng --oem 1 --psm 3'

# Read image from disk; cv2.imread returns None when the file cannot be read
im = cv2.imread(imPath)
if im is None:
    raise FileNotFoundError("Could not read image: {}".format(imPath))

# Run tesseract OCR on image
text = pytesseract.image_to_string(im, config=config)

# Print recognized text
print(text)

Daker

_Name.

Lot 105

Grave

bate of Gunas VeNuary 15
Aga 76y 6m --

Date of Romoval

MOUNT HOPE CEMETERY

1924

To

INDEA OF INTCRMENTS
