# Applied Project in Big Data on Industrial Dataset

## DATA COLLECTION AND PROCESSING TECHNIQUES
## Part III. OCR preprocessing: Tesseract

### 1. Libraries and config parameters

In [None]:
import os
import glob
import json
import pytesseract
from tqdm.auto import tqdm
from pdf2image import convert_from_path
import matplotlib.pyplot as plt
from PIL import Image

In [None]:
DATA_PATH = '/home/jovyan/__AIKNOWLEDGEBASE'
PDFS_PATH = f'{DATA_PATH}/rawbooks'
IMGS_CACHE = f'{DATA_PATH}/imgcache'
TXTS_PATH = f'{DATA_PATH}/texts_pytess'

os.makedirs(IMGS_CACHE, exist_ok=True)
os.makedirs(TXTS_PATH, exist_ok=True)

### 2. Files to process

In [None]:
# Collect every PDF below PDFS_PATH (sub-folders included). The result is
# sorted because glob returns matches in arbitrary order, which would make the
# sample selected below differ from run to run.
pdf_files = sorted(glob.glob(f'{PDFS_PATH}/**/*.pdf', recursive=True))

# Restrict the run to the first two files.
pdf_files = pdf_files[:2]
print('files to process:', len(pdf_files))

# Indexing an empty list would raise IndexError, so report that case instead.
if pdf_files:
    print('pdf files:', '\nfirst:', pdf_files[0], '\nlast:', pdf_files[-1])
else:
    print('no pdf files found under', PDFS_PATH)

### 3. Tesseract OCR

In [None]:
# OCR every PDF in `pdf_files`:
#   1. empty the JPEG cache,
#   2. render the PDF's pages into the cache as JPEG files,
#   3. run Tesseract (English + Russian) on each page image,
#   4. write the page texts, joined by single spaces, to one .txt in TXTS_PATH.
# After the loop `text` holds the OCR result of the last PDF that was processed.
for pdf_file in tqdm(pdf_files, desc='pdf files'):
    # Guard clause: anything that is not a .pdf file is reported and skipped.
    if not pdf_file.lower().endswith('.pdf'):
        print('file skiped:', pdf_file)
        continue

    # clean cache, so only the current PDF's pages are found in it below
    for stale_image in glob.glob(f'{IMGS_CACHE}/*.jpg'):
        os.remove(stale_image)

    # convert PDF to images (written into IMGS_CACHE)
    pdf_pages = convert_from_path(
        pdf_path=pdf_file,
        dpi=200,
        output_folder=IMGS_CACHE,
        first_page=None,
        last_page=None,
        fmt='JPEG'
    )
    # The returned page objects are not used (pages are re-read from the cache
    # in sorted order), so release them right away instead of keeping one open
    # image per page for the rest of the iteration.
    for page in pdf_pages:
        page.close()

    # convert images to text, one page at a time in file-name order
    page_names = sorted(
        name for name in os.listdir(IMGS_CACHE) if name.endswith('.jpg')
    )
    page_texts = []
    for img_name in tqdm(page_names, desc='images'):
        # The context manager closes the image file once Tesseract is done.
        with Image.open(f'{IMGS_CACHE}/{img_name}') as page_image:
            page_texts.append(
                str(pytesseract.image_to_string(page_image, lang='eng+rus'))
            )
    # Join once at the end: linear instead of re-copying the growing string on
    # every page, and without the stray leading space of incremental joining.
    text = ' '.join(page_texts)

    # write resulting text
    # The output name is the PDF's path relative to PDFS_PATH with the slashes
    # removed. NOTE(review): different sub-folder/file combinations can flatten
    # to the same name and overwrite each other - confirm this is acceptable.
    flat_name = pdf_file.replace(PDFS_PATH, '').replace('/', '')
    # Swap only the real extension; '.pdf' elsewhere in the name is kept.
    file_name = f'{os.path.splitext(flat_name)[0]}.txt'
    file_path = f'{TXTS_PATH}/pytess_{file_name}'
    # Explicit UTF-8: the OCR output contains Cyrillic, which a non-UTF-8
    # default locale encoding may be unable to write.
    with open(file_path, 'w', encoding='utf-8') as file:
        file.write(text)

### 4. Results

In [None]:
# Pick one cached page image to inspect. os.listdir returns names in arbitrary
# order and may include non-image files, so filter to the JPEG pages and sort.
cached_pages = sorted(
    name for name in os.listdir(IMGS_CACHE) if name.endswith('.jpg')
)
if not cached_pages:
    raise FileNotFoundError(f'no page images found in {IMGS_CACHE}')
# Prefer the fifth page; fall back to the last one for shorter documents
# instead of failing with an IndexError.
file_path = cached_pages[min(4, len(cached_pages) - 1)]

In [None]:
# Display the selected cached page image. The context manager closes the
# underlying file as soon as matplotlib has taken the image from it.
with Image.open(f'{IMGS_CACHE}/{file_path}') as img:
    plt.imshow(img)
plt.show()

In [None]:
# Show the OCR output. `text` is reassigned for each processed PDF in the OCR
# loop, so this prints the text of the most recently processed file only.
print(text)