In [1]:
# utils
import re
import os
from glob import glob
import concurrent.futures
import time
import pandas as pd
from datetime import datetime

# tesseract
import pytesseract

# opencv
import cv2

# Set the OMP_THREAD_LIMIT environment variable for this process and every
# process started from it.
# NOTE(review): presumably meant to cap the OpenMP threads used by Tesseract;
# the OCR cell also runs 6 worker processes, so confirm that 6 threads per
# worker is the intended combination.
os.environ['OMP_THREAD_LIMIT'] = '6'

In [2]:
# Load the input table. The next cell expects it to contain process_id,
# document_id and page_number, plus every column listed in unused_columns.
df = pd.read_csv('./is_ocr.csv')

In [3]:
# Columns from the input table that are not needed in the OCR result table.
unused_columns = [
    'process_class',
    'process_processing_date',
    'page_is_ocr',
    'process_process_time',
    'process_is_complete',
    'document_processing_date',
    'document_processing_time',
    'page_image',
    'page_piece'
]

# Build the working table in one chain:
#   1. remove the unused columns,
#   2. add an empty column that will later receive the Tesseract output,
#   3. index the rows by "<process_id>_<document_id>_<page_number>" and sort by it.
new_df = (
    df.drop(columns=unused_columns)
    .assign(crappy_ocr_text=None)
    .assign(
        id=lambda pages: (
            pages['process_id'].map(str) + '_'
            + pages['document_id'].map(str) + '_'
            + pages['page_number'].map(str)
        )
    )
    .set_index('id')
    .sort_index()
)

new_df.head()

Unnamed: 0_level_0,process_id,document_id,page_body,page_text_extract,page_number,crappy_ocr_text
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1000008_310636853_1,1000008,310636853,RECURSO EXTRAORDINÁRIO 1.000.008 PARANÁ\n\nREG...,...,1,
1000008_310653251_1,1000008,310653251,Ó%/aléúnn O%ÃÁ///IM/ O%/k]fa/,...,1,
1000008_310958131_1,1000008,310958131,&ª fezaek Zã..í.,Supremo Tribuna...,1,
1000032_310629282_1,1000032,310629282,RECURSO EXTRAORDINÁRIO COM AGRAVO 1.000.032 MA...,...,1,
1000035_15338842013_1,1000035,15338842013,á? fezaki Z É Ád\nQ%gãaú %Wegwâdwção&%wa\n\nAR...,...,1,


In [4]:
def extract_and_add_to_df(path):
    """
    Run Tesseract OCR on one image and return its id together with the text.

    Despite its name, this function does not modify any dataframe; it only
    returns the result, and the caller is responsible for storing it.

    Parameters
    ----------
    path : str
        Path to an image whose file name looks like
        ``crappy_<digits>_<digits>_<digits>.jpg``.

    Returns
    -------
    list
        ``[current_id, text]``. ``current_id`` is the ``<digits>_<digits>_<digits>``
        part of the file name, or ``path`` unchanged when the file name does
        not follow that pattern. ``text`` is the string returned by Tesseract,
        or ``None`` when OpenCV could not read the image.
    """

    # get image id from the file name alone, so the result does not depend on
    # the directory prefix or on the path separator of the platform
    file_name = os.path.basename(path)
    match = re.fullmatch(r'crappy_(\d+_\d+_\d+)\.jpg', file_name)
    # unexpected names fall back to the path itself, as the previous re.sub did
    current_id = match.group(1) if match else path

    # read image; cv2.imread returns None instead of raising when the file is
    # missing or cannot be decoded
    image = cv2.imread(path)
    if image is None:
        # report "no text" for this page rather than letting pytesseract fail
        # on a None image and abort every other pending result
        return [current_id, None]

    # extract text
    text = pytesseract.image_to_string(image, lang='por+eng')

    return [current_id, text]

In [8]:
crappy_folder = './crappy_images/'

image_list = glob(crappy_folder + '*.jpg')
size = len(image_list)


def _timestamp():
    """Return the current local time formatted for the progress messages."""
    return datetime.now().strftime('%Y-%m-%d %H:%M:%S')


# Each entry is the [id, text] pair returned by extract_and_add_to_df.
values = []
with concurrent.futures.ProcessPoolExecutor(max_workers=6) as executor:
    print('[{}] Info: Pipeline has started. Ammount of data: {}'.format(_timestamp(), size))
    # executor.map yields the results in the same order as image_list
    ocr_results = executor.map(extract_and_add_to_df, image_list)
    for finished, result in enumerate(ocr_results, start=1):
        values.append(result)
        # one progress line per 1000 finished images
        if finished % 1000 == 0:
            print('[{}] Info: Pipeline has processed and generated {} crappy images.'.format(_timestamp(), finished))
        


[2019-11-11 00:08:36] Info: Pipeline has started. Ammount of data: 80624
[2019-11-11 00:12:24] Info: Pipeline has processed and generated 1000 crappy images.
[2019-11-11 00:15:51] Info: Pipeline has processed and generated 2000 crappy images.
[2019-11-11 00:19:13] Info: Pipeline has processed and generated 3000 crappy images.
[2019-11-11 00:23:10] Info: Pipeline has processed and generated 4000 crappy images.
[2019-11-11 00:26:26] Info: Pipeline has processed and generated 5000 crappy images.
[2019-11-11 00:29:34] Info: Pipeline has processed and generated 6000 crappy images.
[2019-11-11 00:32:57] Info: Pipeline has processed and generated 7000 crappy images.
[2019-11-11 00:36:14] Info: Pipeline has processed and generated 8000 crappy images.
[2019-11-11 00:39:57] Info: Pipeline has processed and generated 9000 crappy images.
[2019-11-11 00:43:35] Info: Pipeline has processed and generated 10000 crappy images.
[2019-11-11 00:46:33] Info: Pipeline has processed and generated 11000 crapp

In [9]:
# Attach the OCR output to the rows it belongs to.
# values holds [id, text] pairs; when an id occurs more than once the last
# pair wins, exactly as it would when writing the pairs one after another.
ocr_text_by_id = {current_id: text for current_id, text in values}

# Write only to ids that already exist in the index. Assigning through .loc
# with an unknown label would append a new row that is empty apart from
# crappy_ocr_text, which then ends up in the exported table.
has_ocr = new_df.index.isin(list(ocr_text_by_id))
if has_ocr.any():
    new_df.loc[has_ocr, 'crappy_ocr_text'] = [
        ocr_text_by_id[page_id] for page_id in new_df.index[has_ocr]
    ]

# Make skipped results visible instead of dropping them silently.
unmatched_ids = sorted(set(ocr_text_by_id) - set(new_df.index))
if unmatched_ids:
    print('[{}] Warning: {} OCR results matched no row and were skipped, e.g. {}'.format(
        datetime.now().strftime('%Y-%m-%d %H:%M:%S'), len(unmatched_ids), unmatched_ids[:5]))
        

In [10]:
# Write the table to disk; the 'id' index is saved as the first column.
new_df.to_csv('./ocr_results.csv')