In [41]:
import pytesseract
from PIL import Image, ImageFilter, ImageEnhance
import numpy as np
from skimage import io
from skimage.color import rgb2gray
from skimage.transform import rotate
from deskew import determine_skew
import glob
import cv2
import numpy as np  
import os
from tqdm.notebook import tqdm 
import time
import pickle
from langdetect import detect_langs


#ignore UserWarning
import warnings
warnings.filterwarnings("ignore", category=UserWarning)


path_to_tesseract = r'C:\Program Files\Tesseract-OCR\tesseract.exe'


In [42]:
def deskew(train_or_test, more_or_less):
    temp = ""
    if more_or_less == "less":
        temp = "less30"
    else:
        temp = "moreeq30"

    
    input_list_names = glob.glob(f"wszystkie_zdj/{train_or_test}/*",)
    

    for photo_name in tqdm(input_list_names):
        name = photo_name.split("\\")[-1]
        name = name.split(".")[0]

        try:    
            image = io.imread(photo_name)
        except:
            print(f"Nie udało się otworzyć pliku {name}")
            continue
        angle = determine_skew(image)
        rotated = rotate(image, angle, resize=True) * 255

        if abs (angle) >= 30:
            io.imsave(f'deskewed/{train_or_test}/{temp}/{name}.png', image.astype(np.uint8))
        else:
            io.imsave(f'deskewed/{train_or_test}/{temp}/{name}.png', rotated.astype(np.uint8))

In [43]:
def increase_contrast(train_or_test, more_or_less):
    # increase contrast
    temp = ""
    if more_or_less == "less":
        temp = "less30"
    else:
        temp = "moreeq30"

    
    input_list_names = glob.glob(f"deskewed/{train_or_test}/{temp}/*",)
    for photo_name in tqdm(input_list_names):
        name = photo_name.split("\\")[-1]
        name = name.split(".")[0]
        original_image = io.imread(photo_name)
        image = Image.fromarray(original_image)
        enhancer = ImageEnhance.Contrast(image)
        image = enhancer.enhance(1.25)
        image = np.array(image)
        io.imsave(f'contrast/{train_or_test}/{temp}/{name}.png', image.astype(np.uint8))


In [44]:
def sharpen(train_or_test, more_or_less):
    # increase contrast
    sharpen_filter = np.array([[-1,-1,-1], [-1,9,-1], [-1,-1,-1]])
    temp = ""
    if more_or_less == "less":
        temp = "less30"
    else:
        temp = "moreeq30"
    input_list_names = glob.glob(f"contrast/{train_or_test}/{temp}/*",)
    for photo_name in tqdm(input_list_names):
        name = photo_name.split("\\")[-1]
        name = name.split(".")[0]
        original_image = io.imread(photo_name)
        sharp_image = cv2.filter2D(original_image, -1, sharpen_filter)
        sharp_image = Image.fromarray(sharp_image)
        sharp_image.save(f'sharpened/{train_or_test}/{temp}/{name}.png')

In [45]:
def try_catch_lang(text):
    try:
        probs = [x.prob for x in detect_langs(text)]
    except:
        probs = [0]
    return probs[0]

In [46]:
def check_rotation(text,photo_name):
    # checking if the default rotation is correct  
    prob_0 = try_catch_lang(text)
    if prob_0<0.97:
        # if not sure then rotate 90 degrees clockwise and check again
        image_1 = cv2.imread(photo_name, cv2.IMREAD_UNCHANGED)
        image_rot1 = cv2.rotate(image_1, cv2.ROTATE_90_CLOCKWISE)
        text_1 = pytesseract.image_to_string(image_rot1)
        prob_1 = try_catch_lang(text_1)
        best_prob = np.argmax([prob_0,prob_1])
        if prob_1<0.95:
            # if still not sure than rotate the original file 90 degrees counter clockwise and check again
            image_rot2 = cv2.rotate(image_1, cv2.ROTATE_90_COUNTERCLOCKWISE)
            text_2 = pytesseract.image_to_string(image_rot2)
            prob_2 = try_catch_lang(text_2)
            # return the rotation with the highest probability
            best_prob = np.argmax([prob_0,prob_1,prob_2])
            if best_prob == 2:
                return text_2
        if best_prob == 1:
            return text_1
    return text

In [47]:
def get_text(train_or_test, more_or_less):
    temp = ""
    if more_or_less == "less":
        temp = "less30"
    else:
        temp = "moreeq30"
    input_list_names = glob.glob(f"sharpened/{train_or_test}/{temp}/*",)
    custom_config = r'--oem 3 --psm 6 -l pol'
    photos_dict = {}
    for photo_name in tqdm(input_list_names):
        name = photo_name.split("\\")[-1]
        name = name.split(".")[0]
        image = io.imread(photo_name)


        # Perform OCR using pytesseract
        text = pytesseract.image_to_string(image, config=custom_config)
        text = check_rotation(text,photo_name)
        
        # count number of \n
        count = text.count("\n")
        # replace \n with space
        text = text.replace("\n", " ")
        # replace \x0c with space
        text = text.replace("\x0c", " ")
    

        photos_dict[name] = (text, count)
    with open(f'photos_dict_{temp}_{train_or_test}.pickle', 'wb') as handle:
        pickle.dump(photos_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)
    return photos_dict

In [48]:
def pipeline(train_or_test, more_or_less, deskew_ = True, increase_contrast_ = True, sharpen_ = True, get_text_ = True):
    temp = ""
    if more_or_less == "less":
        temp = "less30"
    else:
        temp = "moreeq30"

    if deskew_:
        deskew(train_or_test, more_or_less)
    
    if increase_contrast_:
        increase_contrast(train_or_test, more_or_less)
    
    if sharpen_:
        sharpen(train_or_test, more_or_less)
    
    if get_text_:
        get_text(train_or_test, more_or_less)
    # read pickle
    with open(f'photos_dict_{temp}_{train_or_test}.pickle', 'rb') as handle:
        photos_dict = pickle.load(handle)
    return photos_dict

In [49]:
pipeline("test", "more", deskew_=False)
pipeline("train", "more", deskew_=False)
pipeline("test", "less", deskew_=False)
pipeline("train", "less", deskew_=False)


  0%|          | 0/3456 [00:00<?, ?it/s]

  0%|          | 0/3456 [00:00<?, ?it/s]

  0%|          | 0/3456 [00:00<?, ?it/s]

  0%|          | 0/10742 [00:00<?, ?it/s]

  0%|          | 0/10742 [00:00<?, ?it/s]

  0%|          | 0/10742 [00:00<?, ?it/s]

{'0001ea99-0c87-4ed7-b658-fecb779186ef': ('BASIE WEIGHT SPECIĘJCATION CABARRUS COUNTY EFFECTIVE DATE, _ 06/10/1996 GENCO ULTRA LIGHTS 100 REGULAR SP STANDARD PRODUCTIGH BRAND CODE-UAAPI CST) „PHYSICAL PARAMETERS, YOBACEO RGD LENGTA 4 a.0 wi TIPPING LENSTA , 36.000 sh CIGARETTE PAPER WIDTA + 2r.00 Ha CIRCUNFERENCE : 26.80 mi CALEULATED VOLUME 3 3:82,1463 GUBIE Wł CALCULATED DENSITY + 22. Tann €GHS/CE -IARGRTS.| © PAGE 0, v. , 12.0 x ww STEM EXTRACT RATE x: 1.00 x ww ŁGOSE ENDS * < ir000 GNS/50 FIRMNESS TARGET b 3.40 m RANGE 4 FROM 5.00 T0 3:80 HA <ONE WEEK eż) RANGE 1 ROM 3.30 TO3.5D WR <EIGHT WEEK M=26) FILLER WEJEWT : 21555165 GRAS ERLEULATED WEIGHT: 1.5656 135. 71000 PAPER AND ADHESIYE | + .0571906 GRAKS ROD WETGKT : „aoan GRAS TIBPING ARD ADHEGIVE + 10667575 GRANS TOTAL LESS PLUG : „8500705 GRRHS E) REASON FDR CHANGE _ + O69Z0£STD)-GENERALS ULTRA LIGHTS 100 REGULAR SP 3 + ADDED TO BASE SPEC OBRPLCSTD) AS NEM GENERIC BRAND S + START-UP AT C/C. NO OTNER CHANGES. PREPARED BY + ALLYSON 

In [54]:
import pandas as pd