# Applied Project in Big Data on Industrial Dataset

## DATA COLLECTION AND PROCESSING TECHNIQUES
## Part III. OCR preprocessing: Yandex Vision OCR

### 1. Libraries and config parameters

In [None]:
import os
import time
import glob
import json
import base64
import requests
import numpy as np
from tqdm.auto import tqdm
from pdf2image import convert_from_path
import matplotlib.pyplot as plt
from PIL import Image

In [None]:
# root folder of the knowledge base; every other location lives below it
DATA_PATH = '/home/jovyan/__AIKNOWLEDGEBASE'

# input PDFs, scratch folder for rendered page images,
# raw OCR responses and final plain-text results
PDFS_PATH = DATA_PATH + '/rawbooks'
IMGS_CACHE = DATA_PATH + '/imgcache'
RAWOCR_PATH = DATA_PATH + '/rawocr'
TXTS_PATH = DATA_PATH + '/texts_yaocr'

# value sent as the `model` field of each OCR request
OCR_MODEL = 'table'  # or `page`
# pause (seconds) after each OCR request
SLEEP = 1

# make sure the working folders exist before anything is written to them
for _dir in (IMGS_CACHE, RAWOCR_PATH, TXTS_PATH):
    os.makedirs(_dir, exist_ok=True)

### 2. Files to process

In [None]:
# cap on the number of PDFs to process; set to None to process every file found
MAX_FILES = 2

# glob returns paths in arbitrary, filesystem-dependent order;
# sorting makes the capped selection reproducible between runs
pdf_files = sorted(glob.glob(f'{PDFS_PATH}/**/*.pdf', recursive=True))
pdf_files = pdf_files[:MAX_FILES]
print('files to process:', len(pdf_files))
if pdf_files:
    print('pdf files:', '\nfirst:', pdf_files[0], '\nlast:', pdf_files[-1])
else:
    # indexing an empty list would raise IndexError, so report it instead
    print('no pdf files found in:', PDFS_PATH)

### 3. Utils

In [None]:
def json_data(file_path):
    """Read the JSON document stored at *file_path* and return the parsed object."""
    with open(file_path) as fh:
        return json.load(fh)


def ocr_json(json_data, file_path):
    """Serialise *json_data* to JSON and store it at *file_path*, replacing any existing file."""
    serialised = json.dumps(json_data)
    with open(file_path, 'w') as fh:
        fh.write(serialised)


def encode_file(file_path):
    """Return the bytes of the file at *file_path* as a base64 text string."""
    with open(file_path, 'rb') as fh:
        encoded = base64.b64encode(fh.read())
    return encoded.decode('utf-8')


# load the API credentials kept in a hidden JSON file next to the data;
# only the key names are printed, never the values
creds = json_data(file_path=DATA_PATH + '/.accessyaapi')
print(creds.keys())

### 4. Yandex Vision OCR

In [None]:
# upper bound on attempts per page, so a permanent failure cannot spin forever
MAX_RETRIES = 10
# seconds to wait for the OCR service before a request is treated as failed
REQUEST_TIMEOUT = 60

# endpoint and headers are the same for every page, so they are built once
url = 'https://ocr.api.cloud.yandex.net/ocr/v1/recognizeText'
headers = {
    'Content-Type': 'application/json',
    'Authorization': f'Api-Key {creds["secret_key"]}',
    'x-folder-id': creds['folder_id'],
    'x-data-logging-enabled': 'true'
}

for pdf_file in tqdm(pdf_files, desc='pdf files'):
    if '.pdf' not in pdf_file:
        print('file skiped:', pdf_file)
        continue

    # clean cache
    for file_name in glob.glob(f'{IMGS_CACHE}/*.jpg'):
        os.remove(file_name)

    # convert PDF to images; pages are written to IMGS_CACHE and read back
    # from disk below, so only their paths are requested instead of keeping
    # every rendered page open as an image object
    convert_from_path(
        pdf_path=pdf_file,
        dpi=200,
        output_folder=IMGS_CACHE,
        first_page=None,
        last_page=None,
        fmt='JPEG',
        paths_only=True
    )

    # create path for raw ocr (mirrors the PDF's location under RAWOCR_PATH)
    rawocr_dir = pdf_file.replace(
        PDFS_PATH,
        RAWOCR_PATH
    ).replace(
        '.pdf', ''
    )
    os.makedirs(rawocr_dir, exist_ok=True)
    # NOTE(review): the image names (and therefore the json names) come from
    # pdf2image; it appears to use a new random prefix on every call, so a
    # re-run may add a second set of json files to rawocr_dir — confirm

    # convert images to text
    text = ''
    for img_name in tqdm(sorted(os.listdir(f'{IMGS_CACHE}')), desc='images'):
        if '.jpg' not in img_name:
            continue

        # the request body does not change between retries, so encode it once
        payload = json.dumps({
            'mimeType': 'JPEG',
            'languageCodes': ['*'],
            'model': OCR_MODEL,
            'content': encode_file(f'{IMGS_CACHE}/{img_name}')
        })
        rawocr_path = f'{rawocr_dir}/{img_name.replace(".jpg", ".json")}'

        for attempt in range(1, MAX_RETRIES + 1):
            try:
                r = requests.post(
                    url=url,
                    headers=headers,
                    data=payload,
                    timeout=REQUEST_TIMEOUT
                )
            except requests.RequestException as err:
                # connection problems and timeouts are worth another attempt
                print('error ->', err)
                time.sleep(SLEEP)
                continue

            # pause after every request, successful or not
            time.sleep(SLEEP)

            if r.status_code == 200:
                # parse the response once and reuse it
                response = r.json()
                # save raw ocr
                ocr_json(response, rawocr_path)
                # extract text
                text_tmp = response['result']['textAnnotation']['fullText']
                text = ' '.join([text, text_tmp])
                break

            print('error ->', r.text)
            # client errors other than timeout / rate limiting will not go
            # away on retry (bad credentials, rejected image), so stop here
            if 400 <= r.status_code < 500 and r.status_code not in (408, 429):
                raise RuntimeError(
                    f'OCR request rejected with status {r.status_code} '
                    f'for {img_name} ({pdf_file})'
                )
        else:
            # the loop ended without a successful response
            raise RuntimeError(
                f'OCR request failed after {MAX_RETRIES} attempts '
                f'for {img_name} ({pdf_file})'
            )

In [None]:
def _table_to_text(tbl):
    """Render one OCR table as pipe-delimited text, one line per table row.

    Cells are placed by their `rowIndex` / `columnIndex`; line breaks inside
    a cell become spaces. Positions without text still produce an empty
    cell, so every row keeps the same number of columns.
    """
    rows = int(tbl['rowCount'])
    cols = int(tbl['columnCount'])
    grid = [[''] * cols for _ in range(rows)]
    for cell in tbl.get('cells') or []:
        cell_text = cell.get('text') or ''
        grid[int(cell['rowIndex'])][int(cell['columnIndex'])] = cell_text.replace('\n', ' ')
    return ''.join(
        '|' + ''.join(cell_text + '|' for cell_text in row) + '\n'
        for row in grid
    )


for pdf_file in tqdm(pdf_files, desc='pdf files'):
    if '.pdf' not in pdf_file:
        print('file skiped:', pdf_file)
        continue

    rawocr_dir = pdf_file.replace(
        PDFS_PATH,
        RAWOCR_PATH
    ).replace(
        '.pdf', ''
    )

    # read json files and make a text; glob order is arbitrary, so sort to
    # keep pages in file-name order
    # (assumes the page counter in the file names is zero-padded — TODO confirm)
    json_files = sorted(glob.glob(f'{rawocr_dir}/*.json'))

    pages = []
    for n_page, json_file in enumerate(tqdm(json_files, desc='json files'), start=1):
        annotation = json_data(json_file)['result']['textAnnotation']
        # block to print out tables to text; a response without a `tables`
        # entry is treated as a page without tables
        tables_text = ''.join(
            '\n\n' + _table_to_text(tbl)
            for tbl in annotation.get('tables') or []
        )
        pages.append('\npage {}\n\n{}\n{}'.format(
            n_page,
            annotation.get('fullText', ''),
            tables_text
        ))
    # join once instead of re-formatting the whole text for every page
    text = ''.join(pages)

    # write resulting text: the PDF's path relative to PDFS_PATH is flattened
    # into a single file name
    file_path = pdf_file.replace(
        PDFS_PATH + '/',
        'yaocrai_'
    ).replace(
        '/',
        '_'
    ).replace(
        '.pdf', '.txt'
    )
    # explicit encoding: the OCR text is not limited to ASCII and must not
    # depend on the platform's default encoding
    with open(f'{TXTS_PATH}/{file_path}', 'w', encoding='utf-8') as file:
        file.write(text)

### 5. Results

In [None]:
# index of the raw OCR response to inspect
SAMPLE_INDEX = 44

# sorted so the same index shows the same file on every run
rawocr_files = sorted(glob.glob(f'{RAWOCR_PATH}/**/*.json', recursive=True))
if not rawocr_files:
    raise FileNotFoundError(f'no raw OCR json files found under {RAWOCR_PATH}')
# fall back to the last file when fewer than SAMPLE_INDEX + 1 responses exist
file_path = rawocr_files[min(SAMPLE_INDEX, len(rawocr_files) - 1)]

In [None]:
# parse the selected raw OCR response for inspection
res = json_data(file_path=file_path)

In [None]:
# show the parsed raw OCR response as the cell output
res

In [None]:
# os.listdir returns names in arbitrary order; sort so the same file is
# shown on every run
txt_files = sorted(os.listdir(TXTS_PATH))
if not txt_files:
    raise FileNotFoundError(f'no text files found in {TXTS_PATH}')
file_path = txt_files[0]

In [None]:
# print the selected result file; the encoding is stated explicitly because
# the OCR text is not limited to ASCII and the platform default may differ
with open(f'{TXTS_PATH}/{file_path}', 'r', encoding='utf-8') as file:
    print(file.read())