In [None]:
import logging
import os
import re
import shutil
from typing import List, Tuple

import cv2
import keybert
import numpy as np
import pytesseract
import spacy
from tqdm.notebook import tqdm

from lib.save_load_data import from_tsv_to_list, dump_json

In [None]:
# Project root that contains the data/, scraped_photos/ and output folders.
# Set the MASTER_DEGREE_DIR environment variable to run the notebook on another machine;
# when it is not set, the original location is used.
main_dir = os.environ.get(
    'MASTER_DEGREE_DIR',
    '/Users/alexdrozdz/Desktop/Studia/00. Seminarium magisterskie/Master_degree',
)

In [None]:
# Read the two parallel TSV files of the test-A split in one go.
# Entries of in_list are later used as image file names, entries of out_list as
# space-separated annotation strings for the image at the same position.
in_list, out_list = (
    from_tsv_to_list(path=f'{main_dir}/data/test-A/in.tsv'),
    from_tsv_to_list(path=f'{main_dir}/data/test-A/out.tsv'),
)

In [None]:
# Score threshold passed to combine_data_for_ocr: an annotation is kept only when
# its score is greater than or equal to this value.
confidence_level = 0.4

In [None]:
# The root logger defaults to the WARNING level, which would silently drop every
# logging.info call in this notebook, so INFO is enabled before the first message.
# basicConfig does nothing when the root logger already has handlers, so an existing
# logging configuration is left untouched.
logging.basicConfig(level=logging.INFO)
logging.info("Preparing data for OCR")

In [None]:
def combine_data_for_ocr(in_list: List, out_list: List, confidence_level: float) -> List:
    """
    Pair every sufficiently confident annotation with the input it belongs to.

    Each element of `out_list` is a string of whitespace-separated annotations, and
    each annotation has the form `label:c0,c1,c2,c3:score`. The element at the same
    position in `in_list` is the input the annotations refer to. The four integer
    coordinates are used by the cropping step of this notebook as x0, y0, x1, y1.

    Args:
        in_list: inputs, parallel to `out_list`.
        out_list: annotation strings, one per input. An empty or blank string means
            "no annotations" and contributes nothing to the result.
        confidence_level: minimum score (inclusive) an annotation needs to be kept.

    Returns:
        A list of `[input, label, c0, c1, c2, c3]` records, in input order.

    Raises:
        ValueError: if `out_list` is longer than `in_list`, or a score or coordinate
            is not numeric.
        IndexError: if an annotation has fewer than three `:`-separated fields or
            fewer than four coordinates.
    """
    if len(out_list) > len(in_list):
        raise ValueError(
            f"out_list has {len(out_list)} entries but in_list only {len(in_list)}"
        )

    innout = []
    for source, annotations in zip(in_list, out_list):
        # split() without an argument ignores leading/trailing whitespace and returns
        # [] for an empty line, whereas split(' ') would yield [''] and crash below
        for annotation in annotations.split():
            fields = annotation.split(':')
            if float(fields[2]) < confidence_level:
                continue
            coords = fields[1].split(',')
            innout.append([source, fields[0], *(int(coords[k]) for k in range(4))])

    return innout

In [None]:
# One record per annotation that reaches the confidence threshold.
innout = combine_data_for_ocr(
    in_list=in_list,
    out_list=out_list,
    confidence_level=confidence_level,
)

In [None]:
# NOTE: the inline loop that used to be here is superseded by combine_data_for_ocr().

In [None]:
def ocr_init() -> str:
    """
    Locate the Tesseract executable and return its path.

    The executable is looked up on PATH first. If it is not found there, the files
    of the Homebrew `tesseract` formula are searched for entries containing `bin`.

    Returns:
        Path of the Tesseract executable.

    Raises:
        FileNotFoundError: if neither lookup yields a path.
    """
    path_cmd = shutil.which("tesseract")
    if path_cmd:
        return path_cmd

    # Homebrew fallback; the with-block closes the pipe opened by os.popen
    with os.popen("brew list tesseract | grep 'bin'") as brew_output:
        candidates = [line.strip() for line in brew_output.read().splitlines() if line.strip()]

    if not candidates:
        raise FileNotFoundError("tesseract executable not found on PATH or via Homebrew")

    # several listed files may contain 'bin'; prefer the one actually named tesseract
    for candidate in candidates:
        if os.path.basename(candidate) == "tesseract":
            return candidate
    return candidates[0]

In [None]:
# Point pytesseract at the Tesseract executable and fail early if there is none.
try:
    tesseract_cmd = ocr_init()
    # an empty result (e.g. the lookup command printed nothing) is a failure as well
    if not tesseract_cmd:
        raise FileNotFoundError("ocr_init() returned an empty path")
except Exception as err:
    # `Exception` rather than a bare except, so KeyboardInterrupt/SystemExit pass through
    logging.error('Tesseract OCR not found, check if you installed it correctly')
    raise ModuleNotFoundError('Tesseract OCR executable not found') from err

pytesseract.pytesseract.tesseract_cmd = tesseract_cmd

In [None]:
# Folder (inside main_dir) that receives the cropped visual contents.
vs_content_dir = "visual_content"
# The path is built from the variable's value; the previous code embedded the literal
# text "vs_content_dir", so a different folder was checked and created.
if not os.path.exists(f"{main_dir}/{vs_content_dir}"):
    logging.info(f"Directory '{vs_content_dir}' doesn't exist, creating one")
    os.makedirs(f"{main_dir}/{vs_content_dir}", exist_ok=True)

In [None]:
def crop_image(image: np.ndarray, x0: int, x1: int, y0: int, y1: int) -> np.ndarray:
    """
    Cut a rectangle out of an image.

    Rows y0..y1 and columns x0..x1 are selected with ordinary slicing, so the end
    bounds are exclusive. Note the argument order: both x bounds come first, then
    both y bounds.
    """
    row_span = slice(y0, y1)
    col_span = slice(x0, x1)
    return image[row_span, col_span]

In [None]:
def image_transform(image: np.ndarray) -> np.ndarray:
    """
    Image transformation pipeline applied before OCR.

    Steps: convert to greyscale, estimate the background (dilation followed by a
    median blur), take the inverted absolute difference between the greyscale image
    and that background, then min-max normalize the result to the 0-255 range.

    Args:
        image: a 3-dimensional array is converted with COLOR_BGR2GRAY; a
            2-dimensional array is treated as already greyscale and used as is.

    Returns:
        The normalized single-channel image.
    """
    # greyscale image
    if image.ndim == 3:
        grey_img = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    else:
        grey_img = image
    # dilate the image to get background (text removal)
    dilated_img = cv2.dilate(grey_img, np.ones((7, 7), np.uint8))
    # use median blur on dilated image to get better background image containing all the shadows and discoloration
    bg_img = cv2.medianBlur(dilated_img, 21)
    # combine new background with the greyscale image; both operands must have the
    # same shape, so the greyscale image is used here, not the multi-channel input
    diff_img = 255 - cv2.absdiff(grey_img, bg_img)
    # normalize the image to get full dynamic range
    norm_img = cv2.normalize(diff_img, None, alpha=0, beta=255, norm_type=cv2.NORM_MINMAX, dtype=cv2.CV_8UC1)

    return norm_img

In [None]:
def ocr_predict(image: np.ndarray) -> str:
    """
    Run pytesseract on an image and return whatever text it produces.
    """
    recognized_text = pytesseract.image_to_string(image)
    return recognized_text

In [None]:
# Characters removed from the display text: everything except ASCII letters, digits,
# whitespace and , . ! ? $ % ' -   (the hyphen is last so it is a literal, not a range).
_RE_CLEAN = re.compile(r"[^a-zA-Z0-9\s,.!?$%'-]")
# Characters removed from the search text: everything except letters, digits,
# whitespace and hyphens (hyphens are turned into spaces afterwards).
_RE_SEARCH = re.compile(r"[^a-zA-Z0-9\s-]")


def ocr_text_clean(text: str, spacy_language_core: spacy.language.Language) -> Tuple:
    """
    Produce a display version and a search version of raw OCR text.

    Args:
        text: raw OCR output.
        spacy_language_core: callable that turns a string into tokens exposing
            `lemma_`, `is_stop` and `is_punct` (a loaded spaCy pipeline).

    Returns:
        A `(clean_txt, search_txt)` tuple. `clean_txt` is the text with line breaks
        joined, unsupported characters removed and runs of spaces collapsed.
        `search_txt` additionally drops punctuation, replaces hyphens with spaces and
        consists of the lower-cased lemmas of all tokens that are neither stop words
        nor punctuation.
    """
    # line-breaks fix: re-join words hyphenated across lines, then flatten the rest
    fixed_text = text.replace('-\n', '').replace('\n', ' ')
    # clean text
    clean_txt = _RE_CLEAN.sub('', fixed_text)
    clean_txt = re.sub(' +', ' ', clean_txt)
    # prepare text for search engine
    search_txt = _RE_SEARCH.sub('', fixed_text)
    search_txt = search_txt.replace('-', ' ')
    search_txt = " ".join(
        token.lemma_.lower()
        for token in spacy_language_core(search_txt)
        if not token.is_stop and not token.is_punct
    )
    search_txt = re.sub(' +', ' ', search_txt)

    return clean_txt, search_txt

In [None]:
def get_keywords(ocr_text: str, keybert_model: keybert.KeyBERT, top_n: int=10) -> List:
    """
    Ask the KeyBERT model for single-word keywords of a text.

    The model's result is treated as a sequence of (keyword, score) pairs; only the
    keywords are returned, in the order given, with repeated keywords listed once.

    Args:
        ocr_text: text to extract keywords from.
        keybert_model: object providing `extract_keywords`.
        top_n: number of keywords requested from the model.
    """
    scored_keywords = keybert_model.extract_keywords(
        ocr_text,
        keyphrase_ngram_range=(1, 1),
        stop_words='english',
        highlight=False,
        top_n=top_n,
    )
    return [keyword for keyword in dict(scored_keywords)]

In [None]:
# final dict to save: crop file name ("vs_<index>.png") -> OCR results for that crop
final_dict = {}

# spacy language core
logging.info("Loading Spacy language core")
try:
    nlp = spacy.load("en_core_web_sm")
except Exception as err:
    # `Exception` rather than a bare except, so KeyboardInterrupt/SystemExit pass through
    logging.error('Spacy language core not found, check if you installed it correctly')
    raise ModuleNotFoundError("Spacy language core 'en_core_web_sm' could not be loaded") from err

# keybert model to extract keywords
logging.info("Loading KeyBERT pretrained model")
kw_model = keybert.KeyBERT(model='all-mpnet-base-v2')

# Folder the crops are written to. It is created here if missing, because cv2.imwrite
# reports an unwritable path only through its return value.
crops_dir = f"{main_dir}/{vs_content_dir}"
os.makedirs(crops_dir, exist_ok=True)

logging.info("Cropping visual contents, transforming, using OCR, cleaning results and saving")
# tqdm wraps the list itself (not the enumerate object) so it can show the total
for i, elem in enumerate(tqdm(innout)):
    origin_file, predicted_label, x0, y0, x1, y1 = elem

    # read image; cv2.imread signals a missing or unreadable file by returning None
    img = cv2.imread(f'{main_dir}/scraped_photos/{origin_file}')
    if img is None:
        logging.warning(f"Could not read image '{origin_file}', skipping annotation {i}")
        continue

    # crop visual content
    cropped_img = crop_image(img, x0, x1, y0, y1)
    if cropped_img.size == 0:
        logging.warning(f"Empty crop for '{origin_file}' (annotation {i}), skipping")
        continue

    # greyscale, background removal and normalization
    transformed_cropped_img = image_transform(cropped_img)

    # ocr
    cropped_img_str = ocr_predict(transformed_cropped_img)

    # text for display and text prepared for search
    clean_txt, search_txt = ocr_text_clean(cropped_img_str, spacy_language_core=nlp)

    # keywords
    keywords_list = get_keywords(search_txt, top_n=10, keybert_model=kw_model)

    # save results: the untransformed crop goes to disk, the texts into final_dict
    crop_name = f"vs_{i}.png"
    if not cv2.imwrite(f"{crops_dir}/{crop_name}", cropped_img):
        logging.error(f"Could not write '{crop_name}' to '{crops_dir}'")

    final_dict[crop_name] = {
        'origin_file': origin_file,
        'predicted_label': predicted_label,
        'ocr_raw_text': cropped_img_str,
        'cleaned_text': clean_txt.strip(),
        'search_text': search_txt.strip(),
        'keywords': keywords_list,
    }

In [None]:
# Folder (inside main_dir) for the OCR results; the path is derived from ocr_dir so
# the folder that is created and the name that is logged cannot drift apart.
ocr_dir = "ocr_results"
if not os.path.exists(f"{main_dir}/{ocr_dir}"):
    logging.info(f"Directory '{ocr_dir}' doesn't exist, creating one")
    os.makedirs(f"{main_dir}/{ocr_dir}", exist_ok=True)

In [None]:
# Write into the results folder under main_dir (the folder created for this purpose),
# not into a path relative to the current working directory.
dump_json(path=f'{main_dir}/{ocr_dir}/vs_ocr_data.json', dict_to_save=final_dict)
logging.info(f"OCR output json saved in '{ocr_dir}' directory")