# EASTD143A: Digitalizing old materials with OCR
Date: 2023-03-23 15:45

Instructor: Kwok-leong Tang


## Install Tesseract OCR
Tesseract OCR documentation: https://tesseract-ocr.github.io/

Open your terminal and paste the following command:

```bash
sudo apt-get update && sudo apt-get install tesseract-ocr && sudo apt-get install -y tesseract-ocr-eng tesseract-ocr-chi-sim tesseract-ocr-jpn tesseract-ocr-kor
```


## Install the required Python libraries:

- pytesseract documentation: https://github.com/madmaze/pytesseract
- OpenCV documentation: https://docs.opencv.org/4.x/d6/d00/tutorial_py_root.html


In [None]:
pip install pytesseract opencv-python

In [None]:
# import cv2 and pytesseract

import cv2
import pytesseract

In [None]:
# import matplotlib & pandas
# for matplotlib documentation: https://matplotlib.org/stable/users/index.html
# for pandas documentation: https://pandas.pydata.org/docs/user_guide/index.html

import matplotlib.pyplot as plt
import pandas as pd

In [None]:
# read the image by using cv2.imread()
# cv2.imread() returns None (it does not raise) when the file is missing or
# unreadable, so stop here with a clear message instead of failing later in OCR

image_path = '/workspaces/eastd143a-ocr/data/images/1c_1l_01.png'
image = cv2.imread(image_path)
if image is None:
    raise FileNotFoundError(f"Could not read image: {image_path}")


In [None]:
# choose the OCR language passed to Tesseract
# join several languages with a + sign, e.g. lang = 'eng+chi_tra+chi_sim'

lang = "eng"

In [None]:
# run Tesseract on the loaded image and keep the recognized text in a variable

extracted_text = pytesseract.image_to_string(image=image, lang=lang)

In [None]:
# print the text that was stored in extracted_text

print(extracted_text)

In [None]:
def draw_boxes(image, boxes, color, thickness):
    """Draw one rectangle per box onto ``image`` and return that same image.

    Args:
        image: Image passed straight to ``cv2.rectangle``; it is drawn on
            in place.
        boxes: Iterable of ``(x, y, w, h)`` tuples.
        color: Colour forwarded to ``cv2.rectangle``.
        thickness: Line thickness forwarded to ``cv2.rectangle``.

    Returns:
        The ``image`` object that was passed in.
    """
    for left, top, width, height in boxes:
        first_corner = (left, top)
        opposite_corner = (left + width, top + height)
        cv2.rectangle(image, first_corner, opposite_corner, color, thickness)
    return image

In [None]:
def plot_characters(image_path, lang="eng"):
    """Run OCR on an image, show it with green character boxes, print the text.

    Args:
        image_path: Path of the image file, loaded with ``cv2.imread``.
        lang: Tesseract language code(s), e.g. ``'chi_sim'`` for simplified
            Chinese, ``'jpn'`` for Japanese, ``'kor'`` for Korean; join
            several with ``+``. Defaults to ``'eng'``.

    Raises:
        FileNotFoundError: If OpenCV cannot read an image from ``image_path``.
    """
    # cv2.imread signals failure by returning None rather than raising
    image = cv2.imread(image_path)
    if image is None:
        raise FileNotFoundError(f"Could not read image: {image_path}")
    image_height = image.shape[0]

    # Perform OCR using Tesseract
    extracted_text = pytesseract.image_to_string(image, lang=lang)

    # Get character bounding boxes. Each output line has the form
    # "<char> <left> <bottom> <right> <top> <page>" with the origin at the
    # bottom-left corner, so the vertical axis is flipped to OpenCV's
    # top-left origin before drawing.
    box_list = []
    for line in pytesseract.image_to_boxes(image, lang=lang).splitlines():
        fields = line.split()
        left, bottom, right, top = (int(value) for value in fields[1:5])
        box_list.append(
            (left, image_height - top, right - left, top - bottom))

    # Draw green boxes on a copy so the loaded image stays untouched
    boxed_image = draw_boxes(image.copy(), box_list,
                             color=(0, 255, 0), thickness=2)

    # OpenCV stores pixels as BGR; matplotlib expects RGB
    boxed_image_rgb = cv2.cvtColor(boxed_image, cv2.COLOR_BGR2RGB)

    # Display the boxed image using pyplot, hiding tick values on both axes
    plt.figure(figsize=(16, 10))
    plt.imshow(boxed_image_rgb)
    plt.xticks([])
    plt.yticks([])
    plt.show()

    # Print the extracted text
    print("Extracted Text:")
    print(extracted_text)


In [None]:
# run the box-drawing demo on the sample image
plot_characters(image_path='/workspaces/eastd143a-ocr/data/images/1c_1l_01.png')

In [None]:
def confindence_rate(image_path, lang="eng"):
    """Print every recognized text item in an image with its OCR confidence.

    The function name keeps its original spelling so existing calls still work.

    Args:
        image_path: Path of the image file, loaded with ``cv2.imread``.
        lang: Tesseract language code(s), e.g. ``'chi_sim'`` for simplified
            Chinese, ``'jpn'`` for Japanese, ``'kor'`` for Korean; join
            several with ``+``. Defaults to ``'eng'``.

    Raises:
        FileNotFoundError: If OpenCV cannot read an image from ``image_path``.
    """
    # cv2.imread signals failure by returning None rather than raising
    image = cv2.imread(image_path)
    if image is None:
        raise FileNotFoundError(f"Could not read image: {image_path}")

    # Perform OCR using Tesseract and obtain the result table as a DataFrame
    data = pytesseract.image_to_data(
        image, lang=lang, output_type=pytesseract.Output.DATAFRAME)

    # Keep only rows whose 'text' value is not null
    data = data[data['text'].notnull()]

    # Print each text item with its confidence level.
    # NOTE(review): the label says "Character", but the rows may be whole
    # words rather than single characters; confirm against real output.
    for char, conf in zip(data['text'], data['conf']):
        print(f"Character: {char}, Confidence: {conf}")


In [None]:
# print per-item OCR confidence for the sample image
confindence_rate(image_path='/workspaces/eastd143a-ocr/data/images/1c_1l_01.png')