# Deep Learning for Business Applications course

## TOPIC 8: More Tasks for Deep Learning. OCR with Tesseract

### 1. Libraries and parameters

This demo is based on the [Tesseract OCR](https://github.com/tesseract-ocr/tesseract) engine, used from Python through the `pytesseract` wrapper.

In [None]:
import os
import pytesseract
from tqdm.auto import tqdm
from pdf2image import convert_from_path
from PIL import Image

### 2. Document preprocessing

In [None]:
def pdf2img(file_path, img_dir, first_page, last_page, dpi=200):
    """
    Render a range of pages of a PDF file to JPEG images in a directory.

    Parameters
    ----------
    file_path : path of the PDF file to convert.
    img_dir : directory that receives the images; created if missing.
    first_page : first page to render (passed to `convert_from_path`).
    last_page : last page to render (passed to `convert_from_path`).
    dpi : rendering resolution passed to `convert_from_path` (default 200).

    Returns
    -------
    The value returned by `pdf2image.convert_from_path` (per the pdf2image
    documentation, a list of PIL images, one per rendered page).

    """
    # `exist_ok=True` replaces the separate existence check: it is a single
    # call and cannot fail if the directory appears between check and create.
    os.makedirs(img_dir, exist_ok=True)
    return convert_from_path(
        pdf_path=file_path,
        dpi=dpi,
        output_folder=img_dir,  # page images are written here
        first_page=first_page,
        last_page=last_page,
        fmt='JPEG'
    )


def ocr_text(img_dir, lang='eng'):
    """
    Run Tesseract OCR on every `.jpg` image in a directory and join the text.

    Files are processed in sorted file-name order, with a progress bar.

    Parameters
    ----------
    img_dir : directory containing the page images to recognize.
    lang : Tesseract language code(s); use e.g. `eng+rus` for a document
        that mixes two languages (default `eng`).

    Returns
    -------
    A single string in which the text of each image is preceded by one
    space; an empty string if the directory holds no `.jpg` files.

    """
    page_texts = []
    for img_name in tqdm(sorted(os.listdir(img_dir))):
        # Test the extension itself, not a substring anywhere in the name.
        if not img_name.endswith('.jpg'):
            continue
        # Build the path from the `img_dir` argument so the function does not
        # depend on a module-level variable; `with` closes the file handle.
        with Image.open(os.path.join(img_dir, img_name)) as img:
            page_texts.append(
                str(pytesseract.image_to_string(img, lang=lang))
            )
    # The leading '' keeps the established output format (one space before
    # each page's text) while joining once instead of on every iteration.
    return ' '.join(['', *page_texts])

#### 2.1. First test

In [None]:
# Directory that receives the rendered page images (created by pdf2img if missing).
IMG_PATH = 'aiimg'
# Directory that holds the source PDF documents.
PDF_PATH = '/home/jovyan/__DATA/DLBA_F24/topic_08'
# Render pages 1 to 3 of the document to JPEG files inside IMG_PATH.
pdf_pages = pdf2img(
    file_path=f'{PDF_PATH}/AI_for_mapping_SDGs.pdf',
    img_dir=IMG_PATH,
    first_page=1,
    last_page=3
)

In [None]:
# Recognize the text of the images in IMG_PATH (default language: 'eng').
text = ocr_text(img_dir=IMG_PATH)

In [None]:
# Show the recognized text of the first document.
print(text)

#### 2.2. Second test

In [None]:
# Separate image directory for the second document, so its pages are not
# mixed with the images produced by the first test.
IMG_PATH = 'bdimg'
# Same source directory as in the first test.
PDF_PATH = '/home/jovyan/__DATA/DLBA_F24/topic_08'
# Render pages 1 to 3 of the document to JPEG files inside IMG_PATH.
pdf_pages = pdf2img(
    file_path=f'{PDF_PATH}/Sherlock_Holmes_The_Blue_Diamond.pdf',
    img_dir=IMG_PATH,
    first_page=1,
    last_page=3
)

In [None]:
# Recognize the text of the images in IMG_PATH (default language: 'eng').
text = ocr_text(img_dir=IMG_PATH)

In [None]:
# Show the recognized text of the second document.
print(text)