## 1. Convert PDFs to images

In [None]:
## NOTE: install Tesseract (https://github.com/UB-Mannheim/tesseract/wiki) and Poppler (required by pdf2image) first
# !pip install pytesseract
# !pip install Pillow
# !pip install pdf2image
# !pip install opencv-python

In [1]:
# import statements
from PIL import Image
from pdf2image import convert_from_path
import sys
import os
import numpy as np

In [None]:
# Folder holding the source PDFs. A raw string keeps every backslash literal;
# the previous literal contained the invalid escape sequence "\V".
folder_path = r'C:\Users\Vanessa\Downloads\for_ocr'
file_list = os.listdir(folder_path)

# Keep only PDF files and drop names containing "(1)" (treated as duplicates).
unique_files = [
    file for file in file_list
    if file.lower().endswith('.pdf') and "(1)" not in file
]


# convert each page of a PDF to an image in PNG format
def pdf_to_imgs(folder_path, file, dpi=500, output_dir=None):
    """Render every page of one PDF and save each page as a PNG file.

    Parameters
    ----------
    folder_path : str
        Directory that contains the PDF.
    file : str
        File name of the PDF inside ``folder_path``.
    dpi : int, optional
        Resolution passed to ``convert_from_path`` (default 500).
    output_dir : str, optional
        Directory to save the PNGs in; it is created if missing. ``None``
        (the default) saves relative to the current working directory.

    Notes
    -----
    Pages are saved as ``<stem>_<n>.png`` with ``n`` starting at 1, where
    ``<stem>`` is ``file`` without a trailing ``.pdf`` extension.
    """
    pages = convert_from_path(os.path.join(folder_path, file), dpi)

    # Strip only a trailing ".pdf" (any letter case); a ".pdf" elsewhere in
    # the name is part of the name and must be kept.
    stem, ext = os.path.splitext(file)
    if ext.lower() != '.pdf':
        stem = file

    if output_dir is not None:
        os.makedirs(output_dir, exist_ok=True)

    # for each page, make a filename and save as png
    for img_counter, page in enumerate(pages, start=1):
        filename = f"{stem}_{img_counter}.png"
        print(f'Saving {filename}')
        target = filename if output_dir is None else os.path.join(output_dir, filename)
        page.save(target, 'PNG')

In [None]:
# Render each remaining PDF in folder_path to per-page PNG files.
for pdf_name in unique_files:
    pdf_to_imgs(folder_path, pdf_name)

## 2. Check file integrity, size

In [None]:
folder_path = 'C:\\Users\\Vanessa\\Jupyter Notebooks\\STUFF'
file_list = [f for f in os.listdir(folder_path) if f.endswith('.png')]

print('Total files to check:', len(file_list))

# Track the largest width and height seen across all images, and collect
# every file that cannot be opened or fails verification.
max_width = 0
max_height = 0
unreadable_files = []
for file in file_list:
    try:
        with Image.open(os.path.join(folder_path, file)) as img:
            width, height = img.size
            # Image.open is lazy, so reading .size alone does not detect a
            # damaged file; verify() checks the file contents.
            img.verify()
    except Exception as err:
        # Exception (not a bare except) so Ctrl-C still interrupts the loop;
        # kept broad because the failure type depends on the image plugin.
        unreadable_files.append(file)
        print(file, '-', err)
        continue
    max_width = max(max_width, width)
    max_height = max(max_height, height)

print('Unreadable files:', len(unreadable_files))
print('Maximum Width: ', max_width)
print('Maximum Height: ', max_height)

## 3. Convert images to text with OCR

In [2]:
import cv2 as cv
import pytesseract

# Path of the Tesseract executable that pytesseract should invoke.
pytesseract.pytesseract.tesseract_cmd=r'C:\Program Files\Tesseract-OCR\tesseract.exe'
# Extra command-line options passed to Tesseract on every OCR call.
# NOTE(review): per the Tesseract manual, page segmentation mode 6 assumes a
# single uniform block of text - confirm this suits all three page regions.
custom_config = r' --psm 6'

In [3]:
# OCR an image region after whitening its background and horizontal rules
def remove_header_bg(img, config=None):
    """Whiten the background and horizontal lines of ``img``, then OCR it.

    Parameters
    ----------
    img : numpy.ndarray
        Colour image region. It is converted with ``cv.COLOR_BGR2HSV``, so
        BGR channel order is expected. The array itself is not modified.
    config : str, optional
        Extra Tesseract options. ``None`` (the default) uses the
        notebook-level ``custom_config``.

    Returns
    -------
    str
        The text returned by ``pytesseract.image_to_string``.

    Raises
    ------
    ValueError
        If ``img`` is ``None`` or contains no pixels, e.g. the image failed
        to load or the crop lies outside the page.
    """
    if img is None or img.size == 0:
        raise ValueError('remove_header_bg received an empty image region')
    if config is None:
        config = custom_config

    # convert image to hsv; only saturation and value are used
    _, s, v = cv.split(cv.cvtColor(img, cv.COLOR_BGR2HSV))

    # threshold saturation img: 255 where saturation > 92
    thresh1 = cv.threshold(s, 92, 255, cv.THRESH_BINARY)[1]

    # threshold value img then invert: 255 where value <= 128
    thresh2 = cv.threshold(v, 128, 255, cv.THRESH_BINARY_INV)[1]

    # make mask: non-zero wherever a pixel is saturated or dark
    mask = cv.add(thresh1, thresh2)

    # paint every pixel outside the mask white, on a copy of the input
    lined_img = img.copy()
    lined_img[mask == 0] = (255, 255, 255)

    # convert to greyscale, smooth, and binarise with Otsu (inverted)
    gray = cv.cvtColor(lined_img, cv.COLOR_BGR2GRAY)
    blur = cv.GaussianBlur(gray, (5, 5), 0)
    thresh = cv.threshold(blur, 0, 255, cv.THRESH_BINARY_INV + cv.THRESH_OTSU)[1]

    # remove horizontal lines: opening with a 100x1 kernel keeps only long
    # horizontal runs, whose contours are then drawn over in white
    hor_kernel = cv.getStructuringElement(cv.MORPH_RECT, (100, 1))
    remove_hor = cv.morphologyEx(thresh, cv.MORPH_OPEN, hor_kernel, iterations=2)
    cnts = cv.findContours(remove_hor, cv.RETR_EXTERNAL, cv.CHAIN_APPROX_SIMPLE)
    # findContours returns 2 or 3 values depending on the OpenCV version;
    # pick the contour list in either case
    cnts = cnts[0] if len(cnts) == 2 else cnts[1]
    for c in cnts:
        cv.drawContours(lined_img, [c], -1, (255, 255, 255), 5)

    # try to read text
    return pytesseract.image_to_string(lined_img, config=config)
    

In [4]:
# collect the PNG image files in the current working directory, sorted by name
img_path = os.path.abspath('')
imgs = sorted(name for name in os.listdir(img_path) if name.endswith('.png'))

In [6]:
# Pixel bounds of the three page regions, as (top, bottom, left, right).
# NOTE(review): these presumably match the layout of the page images
# produced in section 1 - confirm before using other inputs.
TITLE_BOX = (1200, 1700, 100, 5900)
HEADER_BOX = (1800, 1950, 100, 5900)
CONTENTS_BOX = (2100, 7100, 100, 5900)

skipped = []
for img in imgs:
    fname = os.path.splitext(img)[0]

    image = cv.imread(os.path.join(img_path, img))
    if image is None:
        # cv.imread reports an unreadable file by returning None, not by raising
        skipped.append(img)
        print(fname, ' could not be read, skipped')
        continue

    # OCR the title, header and contents regions in that order
    sections = [
        remove_header_bg(image[top:bottom, left:right])
        for top, bottom, left, right in (TITLE_BOX, HEADER_BOX, CONTENTS_BOX)
    ]

    # 'w' so re-running the cell overwrites instead of appending duplicate
    # text; utf-8 so characters outside the locale code page can be written
    with open(f'{fname}.txt', 'w', encoding='utf-8') as f:
        f.write(''.join(sections))

    print(fname,' converted')

if skipped:
    print(len(skipped), 'img files skipped')
else:
    print('All img files converted')

HH-001_1  converted
HH-002_1  converted
HH-003_1  converted
HH-004_1  converted
HH-005_1  converted
HH-006_1  converted
HH-006_2  converted
HH-007_1  converted
HH-008_1  converted
HH-010_1  converted
HH-010_2  converted
HH-011_1  converted
HH-011_2  converted
HH-012_1  converted
HH-013_1  converted
HH-013_2  converted
HH-014_1  converted
HH-014_2  converted
HH-015_1  converted
HH-015_2  converted
HH-015_3  converted
HH-015_4  converted
IND-003_1  converted
IND-003_2  converted
IND-004_1  converted
IND-004_2  converted
IND-004_3  converted
IND-005_1  converted
IND-005_2  converted
IND-005_3  converted
IND-012_1  converted
IND-015_1  converted
IND-015_2  converted
IND-015_3  converted
IND-017_1  converted
IND-017_2  converted
IND-017_3  converted
IND-047_1  converted
IND_001_1  converted
IND_002_1  converted
IND_006_1  converted
IND_007_1  converted
IND_008_1  converted
IND_009_1  converted
IND_009_2  converted
IND_010_1  converted
IND_010_2  converted
IND_010_3  converted
IND_010_4  con