In [54]:
# utils
import re
import os
from glob import glob
import concurrent.futures
import time
import pandas as pd
from datetime import datetime

# tesseract
import pytesseract

# opencv
import cv2

os.environ['OMP_THREAD_LIMIT'] = '6'

In [4]:
df = pd.read_csv('./is_ocr.csv')

In [117]:
# drop unused columns
unused_columns = [
    'process_class',
    'process_processing_date',
    'page_is_ocr',
    'process_process_time',
    'process_is_complete',
    'document_processing_date',
    'document_processing_time',
    'page_image',
    'page_piece'
]
new_df = df
for column in unused_columns:
    new_df = new_df.drop(column, axis=1)
    
# add new column to dataframe
new_df['crappy_ocr_text'] = None
    
# add new column to join process_id and document_id
new_df['id'] = new_df['process_id'].map(lambda x: str(x) + '_') + new_df['document_id'].map(lambda x: str(x) + '_') + new_df['page_number'].map(str)
new_df = new_df.set_index('id')
new_df = new_df.sort_index()

new_df.head()

Unnamed: 0_level_0,process_id,document_id,page_body,page_text_extract,page_number,crappy_ocr_text
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1000008_310636853_1,1000008,310636853,RECURSO EXTRAORDINÁRIO 1.000.008 PARANÁ\n\nREG...,...,1,
1000008_310653251_1,1000008,310653251,Ó%/aléúnn O%ÃÁ///IM/ O%/k]fa/,...,1,
1000008_310958131_1,1000008,310958131,&ª fezaek Zã..í.,Supremo Tribuna...,1,
1000032_310629282_1,1000032,310629282,RECURSO EXTRAORDINÁRIO COM AGRAVO 1.000.032 MA...,...,1,
1000035_15338842013_1,1000035,15338842013,á? fezaki Z É Ád\nQ%gãaú %Wegwâdwção&%wa\n\nAR...,...,1,


In [104]:
def extract_and_add_to_df(path):
    """
    Receive a image path, extract it's content using tesseract and add it to dataframe.
    """

    # get image id
    current_id = re.sub('./crappy_images\/crappy_([\d]+_[\d]+_[\d]+)\.jpg', '\\1', path)

    # read image
    image = cv2.imread(path)
    
    # extract text
    text = pytesseract.image_to_string(image, lang='por+eng')
    
#     # add to dataframe
#     new_df.loc[current_id, 'crappy_ocr_text'] = text
    return [current_id, text]

In [96]:
extract_and_add_to_df('./crappy_images/crappy_1018649_311322053_1.jpg')

./crappy_images/crappy_1018649_311322053_1.jpg 1018649_311322053_1
1018649_311322053_1 0


['1018649_311322053_1', '']

In [121]:
crappy_folder = './crappy_images/'

image_list = glob(crappy_folder + '*.jpg')
size = len(image_list)

values = []
with concurrent.futures.ProcessPoolExecutor(max_workers=6) as executor:
    count = 0
    current_line = 0
    times = 0
    print('[{}] Info: Pipeline has started. Ammount of data: {}'.format(datetime.now().strftime('%Y-%m-%d %H:%M:%S'), size))
    for img_path, result in zip(image_list, executor.map(extract_and_add_to_df, image_list[:1000])):
        values.append(result)
        current_line += 1
        count += 1
        if count == 1000:
            times += 1
            print('[{}] Info: Pipeline has processed and generated {} crappy images.'.format(datetime.now().strftime('%Y-%m-%d %H:%M:%S'), times * count))
            count = 0
        


[2019-11-10 21:40:49] Info: Pipeline has started. Ammount of data: 80624
[2019-11-10 21:45:26] Info: Pipeline has processed and generated 1000 crappy images.


In [122]:
for value in values:
    current_id, text = value
    print(current_id, len(text))
    new_df.loc[current_id, 'crappy_ocr_text'] = text
        

1018649_311322053_1 0
997215_310785435_1 0
1000816_310421887_2 0
1004537_310540147_1 0
1009332_310698499_5 461
1000929_310425558_23 561
1129687_314277807_205 1000
715112_309783646_24 0
1002012_310456385_176 0
950500_309123561_1 84
1025513_311218242_44 0
667552_1003774_15 123
667552_1003774_35 85
732779_2488267_8 1488
1020388_311083956_1 684
686615_1281047_5 656
818789_4668744_13 438
818703_4664472_16 94
1044873_311736837_5 0
808459_5179757_78 507
1128190_314237427_1 248
818595_4659739_21 18
667500_1002582_27 0
715112_309785360_88 0
1068934_312508918_20 0
1140287_314608062_2 0
1081338_312907111_1 0
1040384_311658130_1 321
1096784_313391324_9 19
981934_309861808_155 0
1025513_311218234_139 244
1070954_312584450_3 0
1091328_313239754_27 0
995716_310292023_39 0
669983_1033221_8 0
1016684_310941272_27 33
982435_309956406_39 1383
799803_4104316_1 379
1002012_310456387_20 0
1085863_313406107_1 0
821227_4758760_1 536
692503_1401594_14 302
991664_310196610_52 0
987553_310085307_44 476
673902_11

In [114]:
values

AttributeError: 'list' object has no attribute 'map'

In [123]:
new_df.to_csv('./ocr_results.csv')