In [1]:
import pytesseract
from PIL import Image, ImageFilter, ImageEnhance
import numpy as np
from skimage import io
from skimage.color import rgb2gray
from skimage.transform import rotate
from deskew import determine_skew
import glob
import cv2
import numpy as np  
import os
from tqdm.notebook import tqdm 
import time
import pickle
from langdetect import detect_langs
import pandas as pd


# Silence UserWarning messages for the rest of the notebook session.
import warnings
warnings.filterwarnings("ignore", category=UserWarning)


# Absolute path of a Tesseract executable (Windows-style location).
# NOTE(review): none of the visible cells assigns this value to
# pytesseract.pytesseract.tesseract_cmd, so pytesseract presumably locates the
# binary through PATH instead — confirm whether this variable is still needed.
path_to_tesseract = r'C:\Program Files\Tesseract-OCR\tesseract.exe'


In [2]:
def deskew(train_or_test, more_or_less):
    """Straighten every image of one dataset split and store it as PNG.

    Images are read from ``wszystkie_zdj/{train_or_test}/`` and written to
    ``deskewed/{train_or_test}/{less30|moreeq30}/{name}.png``.

    Args:
        train_or_test: Name of the dataset sub-folder (the notebook passes
            ``"train"`` or ``"test"``).
        more_or_less: ``"less"`` selects the ``less30`` output folder; any
            other value selects ``moreeq30``. The argument only picks the
            output folder, it does not filter images by their skew angle.

    Files that cannot be read are reported and skipped. An image is saved
    unrotated when no skew angle could be determined or when the detected
    angle is 30 degrees or more in absolute value; otherwise it is rotated by
    the detected angle.
    """
    temp = "less30" if more_or_less == "less" else "moreeq30"
    input_list_names = glob.glob(f"wszystkie_zdj/{train_or_test}/*")

    for photo_name in tqdm(input_list_names):
        # os.path.basename works with both "/" and "\\" separators, so the
        # output file name no longer depends on running under Windows.
        name = os.path.basename(photo_name).split(".")[0]

        try:
            image = io.imread(photo_name)
        except Exception:
            # Best effort: skip unreadable files, but let KeyboardInterrupt
            # and SystemExit propagate so a long run can still be stopped.
            print(f"Nie udało się otworzyć pliku {name}")
            continue

        angle = determine_skew(image)
        if angle is None or abs(angle) >= 30:
            # No reliable angle (determine_skew may return None) or a skew
            # that is too large: keep the image as it is.
            result = image
        else:
            # rotate() yields a float image (presumably in [0, 1]), hence the
            # rescale by 255 before the uint8 cast below.
            result = rotate(image, angle, resize=True) * 255

        io.imsave(f'deskewed/{train_or_test}/{temp}/{name}.png', result.astype(np.uint8))

In [3]:
def increase_contrast(train_or_test, more_or_less, factor=1.25):
    """Boost the contrast of every deskewed image of one dataset split.

    Images are read from ``deskewed/{train_or_test}/{less30|moreeq30}/`` and
    written to ``contrast/{train_or_test}/{less30|moreeq30}/{name}.png``.

    Args:
        train_or_test: Name of the dataset sub-folder.
        more_or_less: ``"less"`` selects the ``less30`` folders; any other
            value selects ``moreeq30``.
        factor: Enhancement factor handed to ``ImageEnhance.Contrast``.
            Defaults to 1.25, the value that used to be hard-coded.
    """
    temp = "less30" if more_or_less == "less" else "moreeq30"
    input_list_names = glob.glob(f"deskewed/{train_or_test}/{temp}/*")

    for photo_name in tqdm(input_list_names):
        # os.path.basename handles "/" as well as "\\", so the output name is
        # correct on non-Windows systems too.
        name = os.path.basename(photo_name).split(".")[0]

        original_image = io.imread(photo_name)
        enhanced = ImageEnhance.Contrast(Image.fromarray(original_image)).enhance(factor)
        io.imsave(
            f'contrast/{train_or_test}/{temp}/{name}.png',
            np.array(enhanced).astype(np.uint8),
        )


In [4]:
def sharpen(train_or_test, more_or_less):
    """Sharpen every contrast-enhanced image of one dataset split.

    Images are read from ``contrast/{train_or_test}/{less30|moreeq30}/``,
    convolved with a 3x3 sharpening kernel and written to
    ``sharpened/{train_or_test}/{less30|moreeq30}/{name}.png``.

    Args:
        train_or_test: Name of the dataset sub-folder.
        more_or_less: ``"less"`` selects the ``less30`` folders; any other
            value selects ``moreeq30``.
    """
    # Sharpening kernel: centre weight 9, all eight neighbours -1 (sum is 1).
    sharpen_filter = np.array([[-1, -1, -1], [-1, 9, -1], [-1, -1, -1]])
    temp = "less30" if more_or_less == "less" else "moreeq30"
    input_list_names = glob.glob(f"contrast/{train_or_test}/{temp}/*")

    for photo_name in tqdm(input_list_names):
        # os.path.basename handles "/" as well as "\\", so the output name is
        # correct on non-Windows systems too.
        name = os.path.basename(photo_name).split(".")[0]

        original_image = io.imread(photo_name)
        # ddepth=-1 keeps the output depth equal to the input depth.
        sharp_image = cv2.filter2D(original_image, -1, sharpen_filter)
        Image.fromarray(sharp_image).save(f'sharpened/{train_or_test}/{temp}/{name}.png')

In [5]:
def try_catch_lang(text):
    """Return the probability of the first language guess for ``text``.

    Args:
        text: String handed to ``detect_langs``.

    Returns:
        The ``prob`` attribute of the first entry returned by
        ``detect_langs`` (langdetect lists its guesses most-probable first),
        or ``0`` when detection raises or yields no guesses.
    """
    try:
        guesses = detect_langs(text)
    except Exception:
        # Detection failures (e.g. text without usable features) count as
        # "no confidence". KeyboardInterrupt/SystemExit are deliberately not
        # caught so a long-running loop can still be interrupted.
        return 0
    # An empty result would otherwise raise IndexError on guesses[0].
    return guesses[0].prob if guesses else 0

In [6]:
def check_rotation(text, photo_name):
    """Pick the page orientation whose OCR text looks most like a language.

    Up to three candidates are scored with ``try_catch_lang``: the image as
    stored (using the already extracted ``text``), the image turned 90 degrees
    clockwise, and the image turned 90 degrees counter-clockwise. The file at
    ``photo_name`` is always rewritten with the winning orientation.

    Args:
        text: OCR text already obtained for the stored orientation.
        photo_name: Path of the image file; read and overwritten in place.

    Returns:
        The OCR text belonging to the winning orientation.
    """
    stored_score = try_catch_lang(text)
    stored_image = cv2.imread(photo_name, cv2.IMREAD_UNCHANGED)

    # Each candidate is (score, image, text); index 0 is the stored orientation.
    candidates = [(stored_score, stored_image, text)]

    if stored_score < 0.97:
        # Not confident enough: try the clockwise quarter turn.
        cw_image = cv2.rotate(stored_image, cv2.ROTATE_90_CLOCKWISE)
        cw_text = pytesseract.image_to_string(cw_image)
        cw_score = try_catch_lang(cw_text)
        candidates.append((cw_score, cw_image, cw_text))

        if cw_score < 0.95:
            # Still unsure: also try the counter-clockwise quarter turn of
            # the stored image.
            ccw_image = cv2.rotate(stored_image, cv2.ROTATE_90_COUNTERCLOCKWISE)
            ccw_text = pytesseract.image_to_string(ccw_image)
            candidates.append((try_catch_lang(ccw_text), ccw_image, ccw_text))

    # argmax returns the first index on ties, so earlier candidates win.
    winner = int(np.argmax([score for score, _, _ in candidates]))
    _, best_image, best_text = candidates[winner]
    cv2.imwrite(photo_name, best_image)
    return best_text

In [7]:
def get_text(train_or_test, more_or_less):
    """OCR every sharpened image of one dataset split and cache the result.

    Images are read from ``sharpened/{train_or_test}/{less30|moreeq30}/``.
    The result is also pickled to
    ``photos_dict_{less30|moreeq30}_{train_or_test}.pickle`` in the current
    working directory.

    Args:
        train_or_test: Name of the dataset sub-folder.
        more_or_less: ``"less"`` selects the ``less30`` folder; any other
            value selects ``moreeq30``.

    Returns:
        Dict mapping the image name (file name up to its first dot) to a
        ``(text, count)`` tuple: ``text`` is the OCR output with newlines and
        form feeds replaced by spaces, ``count`` is the number of newlines in
        the raw OCR output.
    """
    temp = "less30" if more_or_less == "less" else "moreeq30"
    input_list_names = glob.glob(f"sharpened/{train_or_test}/{temp}/*")
    # Tesseract options: engine mode 3, page segmentation mode 6, Polish model.
    custom_config = r'--oem 3 --psm 6 -l pol'
    photos_dict = {}

    for photo_name in tqdm(input_list_names):
        # os.path.basename handles "/" as well as "\\", so dictionary keys
        # are bare file names on non-Windows systems too.
        name = os.path.basename(photo_name).split(".")[0]
        image = io.imread(photo_name)

        # Perform OCR using pytesseract.
        text = pytesseract.image_to_string(image, config=custom_config)
        # Rotation correction is currently disabled:
        # text = check_rotation(text, photo_name)

        # Count line breaks before they are flattened away.
        count = text.count("\n")
        text = text.replace("\n", " ").replace("\x0c", " ")

        photos_dict[name] = (text, count)

    with open(f'photos_dict_{temp}_{train_or_test}.pickle', 'wb') as handle:
        pickle.dump(photos_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)
    return photos_dict

In [8]:
def pipeline(train_or_test, more_or_less, deskew_ = True, increase_contrast_ = True, sharpen_ = True, get_text_ = True):
    """Run the selected preprocessing/OCR stages, then load the cached OCR dict.

    Args:
        train_or_test: Name of the dataset sub-folder, forwarded to each stage.
        more_or_less: ``"less"`` or anything else, forwarded to each stage and
            used to pick the ``less30``/``moreeq30`` pickle file.
        deskew_: Run ``deskew`` when true.
        increase_contrast_: Run ``increase_contrast`` when true.
        sharpen_: Run ``sharpen`` when true.
        get_text_: Run ``get_text`` when true.

    Returns:
        The object unpickled from
        ``photos_dict_{less30|moreeq30}_{train_or_test}.pickle``. The file is
        read regardless of which stages ran.
    """
    # Stages in execution order. The lambdas postpone the name lookup so a
    # disabled stage does not need to be defined in the current kernel.
    stages = (
        (deskew_, lambda: deskew(train_or_test, more_or_less)),
        (increase_contrast_, lambda: increase_contrast(train_or_test, more_or_less)),
        (sharpen_, lambda: sharpen(train_or_test, more_or_less)),
        (get_text_, lambda: get_text(train_or_test, more_or_less)),
    )
    for enabled, run_stage in stages:
        if enabled:
            run_stage()

    folder_tag = "less30" if more_or_less == "less" else "moreeq30"
    # Read the cached OCR results back from disk.
    with open(f'photos_dict_{folder_tag}_{train_or_test}.pickle', 'rb') as cache_file:
        cached = pickle.load(cache_file)
    return cached

In [9]:
# Run the pipeline for one split/folder combination at a time; the
# commented-out calls are the remaining combinations. Deskewing is skipped
# here (deskew_=False), so the later stages read whatever already sits in the
# deskewed/ folders.
# pipeline("test", "more", deskew_=False)
# pipeline("train", "more", deskew_=False)
# pipeline("test", "less", deskew_=False)
pipeline("train", "less", deskew_=False)


  0%|          | 0/88 [00:00<?, ?it/s]

  0%|          | 0/88 [00:00<?, ?it/s]

  0%|          | 0/88 [00:00<?, ?it/s]

  0%|          | 0/108 [00:00<?, ?it/s]

  0%|          | 0/108 [00:00<?, ?it/s]

  0%|          | 0/108 [00:00<?, ?it/s]

  0%|          | 0/3456 [00:00<?, ?it/s]

  0%|          | 0/3456 [00:00<?, ?it/s]

  0%|          | 0/3456 [00:00<?, ?it/s]

  0%|          | 0/10742 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:


# Load the OCR results cached for the "less30" training images.
# pickle.load can execute arbitrary code, so only open pickles written by
# this notebook.
# NOTE(review): the variable name says "moreeq30" although the file loaded is
# the "less30" one; the name is kept in case other cells rely on it.
with open('photos_dict_less30_train.pickle', 'rb') as handle:
    photos_dict_moreeq30_train = pickle.load(handle)

# One row per image name; column 0 holds the OCR text, column 1 the number of
# line breaks (the (text, count) tuples stored by get_text).
df = pd.DataFrame.from_dict(photos_dict_moreeq30_train, orient='index')

# Show the recognised text for image 0c360979-3b99-48cc-b4e0-72a1d7d8a4ac.
text = df.loc["0c360979-3b99-48cc-b4e0-72a1d7d8a4ac", 0]
text


'syuowetg SoetL  '

In [2]:
from objects_metrics import *

In [3]:
# Build the metrics table for the training images.
# generate_metrics and get_filenames are not defined in this notebook;
# presumably they come from `from objects_metrics import *` — verify.
df1 = generate_metrics(get_filenames("wszystkie_zdj/train/"))

  0%|          | 0/10849 [00:00<?, ?it/s]

wszystkie_zdj/train/0001ea99-0c87-4ed7-b658-fecb779186ef.tiff
wszystkie_zdj/train/00133f89-017d-44f5-a549-e11cf5b5221a.tiff
wszystkie_zdj/train/0016b62a-bec9-45e1-b49b-aa76ed28b24c.tiff
wszystkie_zdj/train/0025418f-0afe-458e-bb61-aa26fbcb2cf9.jpg
wszystkie_zdj/train/002604f4-259f-44d3-bd0c-37ea1ead946f.tiff
wszystkie_zdj/train/002ca5b9-1d65-4748-a619-f0b7406e0741.tiff
wszystkie_zdj/train/002dbe76-e499-4b44-b2f4-c9d039592984.jpg
wszystkie_zdj/train/002e7e22-048e-4cf8-be5a-d3789d656b35.tiff
wszystkie_zdj/train/003a221a-1383-4695-ad08-87c9ee201f0d.jpg
wszystkie_zdj/train/003a526b-7082-4673-b87f-7229853e2f5d.tiff
wszystkie_zdj/train/00441da6-0614-442f-b754-6028d47dd2b8.tiff
wszystkie_zdj/train/0049bd7c-6d39-48bd-8069-431533dae9bd.jpg
wszystkie_zdj/train/0059d81f-65ca-43f4-9842-4c1ad34e7481.tiff
wszystkie_zdj/train/005b8895-6f45-4ee4-9bbb-ccb6173ab7cb.tiff
wszystkie_zdj/train/005c1334-2d06-4965-a274-32017b1d113e.tiff
wszystkie_zdj/train/0064d0a7-dba8-46c6-a57d-241f3b154e19.tiff
wszystkie_zd

In [7]:
# Build the metrics table for the test images.
# generate_metrics and get_filenames are not defined in this notebook;
# presumably they come from `from objects_metrics import *` — verify.
df2 = generate_metrics(get_filenames("wszystkie_zdj/test/"))

  0%|          | 0/3491 [00:00<?, ?it/s]

wszystkie_zdj/test/0009d485-11a3-4299-b19d-1cc37bc0f7fb.tiff
wszystkie_zdj/test/000ea643-eda4-4ab6-aa39-65f3787d522d.tiff
wszystkie_zdj/test/00246a52-7855-4888-bc8a-45353c0a5888.tiff
wszystkie_zdj/test/002d3899-1822-4a81-9387-86dfe0387feb.tiff
wszystkie_zdj/test/002d3ab1-166e-4ce7-9c8e-e9961cecb736.tiff
wszystkie_zdj/test/00349ed3-4c1b-4bec-b300-66db619b6873.tiff
wszystkie_zdj/test/003f245d-8ee4-4dbd-acc8-b363022264d0.tiff
wszystkie_zdj/test/00623b03-7d6f-42ec-bdfb-f5e780eb07b9.tiff
wszystkie_zdj/test/00acb951-dfa6-4bfb-a24d-53c0677e63f0.tiff
wszystkie_zdj/test/00c761a6-3f30-4c7a-a107-39d6e5bdd6b2.tiff
wszystkie_zdj/test/00d15557-30a0-4d3f-a538-d251f8e06c97.tiff
wszystkie_zdj/test/00f03acd-9d50-4c72-b927-da68b1a44caf.tiff
wszystkie_zdj/test/00f0abc8-3216-477c-a2e1-b26b9c38317a.tiff
wszystkie_zdj/test/00f98979-242f-468f-af65-cdf605f6ef91.tiff
wszystkie_zdj/test/010897c4-2bf1-444d-8af0-cbf995ad44be.jpg
wszystkie_zdj/test/01533ed1-d7dd-4cf2-9f27-dc6dca2778e1.tiff
wszystkie_zdj/test/015b44

In [5]:
# Save the training metrics table to CSV (the index is written as well).
df1.to_csv("train_metrics.csv")


In [6]:
# Display the training metrics table.
df1

Unnamed: 0,filepath,hasTable,numberEmpty,nonWhiteFraction,possibleShapes,possibleImages
0,wszystkie_zdj/train/0001ea99-0c87-4ed7-b658-fe...,True,0.042373,0.021114,0,0
1,wszystkie_zdj/train/00133f89-017d-44f5-a549-e1...,True,0.146341,0.575719,4,2
2,wszystkie_zdj/train/0016b62a-bec9-45e1-b49b-aa...,True,0.064103,0.023731,0,0
3,wszystkie_zdj/train/0025418f-0afe-458e-bb61-aa...,True,0.091116,0.762817,2,1
4,wszystkie_zdj/train/002604f4-259f-44d3-bd0c-37...,True,0.223570,0.479270,2,1
...,...,...,...,...,...,...
10844,wszystkie_zdj/train/ffbb1b89-8091-4791-8e90-fc...,True,0.178571,0.106515,11,0
10845,wszystkie_zdj/train/ffc089f7-a830-4381-a5d7-d5...,True,0.214286,0.001893,0,0
10846,wszystkie_zdj/train/ffe55741-2b8b-4ba8-8113-23...,True,0.100200,0.049482,0,0
10847,wszystkie_zdj/train/ffe73bd2-0a89-4c8e-a2a2-23...,True,0.117057,0.079227,2,1


In [8]:
# Save the test metrics table to CSV (the index is written as well).
df2.to_csv("test_metrics.csv")