In [1]:
INPUT_DIR = './Davar' # Directory that contains subdirectories with ALTO newspapers
OUTPUT_DIR = './output' # Directory in which the output CSVs are saved
SKIP_EXISTING = True # If True, will skip newspapers whose outputs already exist in the output folder, if False will process them again

import datetime
import glob
import os
from os import path
import xml.etree.ElementTree as ET
import re


from tqdm import tqdm
import pandas as pd
from kraken.lib import models

from readers import OliveReader, AltoReader, build_mets, NewAltoReader
from ocr import ImageOCR


baseline_model = None  # vgsl.TorchVGSLModel.load_model('blla.mlmodel')
model = models.load_any('model_best_9379_140624.mlmodel')
ocr = ImageOCR(model=model, bw_threshold=150, baseline_model=baseline_model)

def list_alto_pages(alto_dir):
    return sorted(list(glob.glob(path.join(alto_dir, '**/ALTO/*.xml'), recursive=True)))

def list_mets_files(parent_dir):
    return sorted(list(glob.glob(path.join(parent_dir, '**/*-METS.xml'), recursive=True)))
    
def get_alto_pages(mets_path):
    return sorted(list(glob.glob(path.join(path.dirname(mets_path), 'ALTO/*.xml'))))

def process_alto_page(page):
    olr = NewAltoReader(page)
    blocks = olr.get_text_blocks()
    process_blocks(blocks, olr, page)
    return blocks

def process_blocks(blocks, olr, page_file):
    for block in tqdm(blocks):
        block['page_file'] = page_file
        try:
            block['ocr_text'] = '\n'.join(ocr.get_text(olr.get_image_for_block(block)))
        except Exception as e:
            print('Error in text: ', e)
            block['ocr_text'] = ''

def get_mets_save_name(mets_file):
    """
    Get CSV output file name according to path to METS file, taking the mets file name and prepending the newsletter name,
    assuming the newsletter name is in the alphabetical directory recursively containing the mets file:
    '/path/to/.../Davar/1957/01/01_01/19570101_01-METS.xml' -> 'Davar_19570101_01.csv'
    """
    full_path = path.abspath(path.dirname(mets_file)).replace('\\', '/') # '/path/to/Davar/1957/01/02/...'
    first_alphabetic_dir = [d for d in reversed(full_path.split('/')) if re.match('[a-zA-Z]+', d) is not None][0] # 'Davar'
    return first_alphabetic_dir + '_' + path.basename(mets_file).replace('.xml', '.csv').replace('-METS', '')  # 'Davar_19570101_01.csv'

def alto_dir_pipeline(alto_dir, output_dir, skip_existing=True):
    mets_files = list_mets_files(alto_dir)

    # output_file = datetime.datetime.now().strftime('%Y-%m-%d_%H-%M') + '_' + alto_dir.replace('/', '_').replace('\\', '_') + '.csv'
    
    save_interval = 3
    process_i = 0
    
    for mets_file in mets_files:
        output_file = os.path.join(output_dir, get_mets_save_name(mets_file))
        if skip_existing and os.path.exists(output_file):
            print(f'Skipping existing: {output_file}')
            continue
        
        pages_results = []
        print(f'Processing METS: {mets_file}')
        pages = get_alto_pages(mets_file)
        
        for page in tqdm(pages):
            page_results = process_alto_page(page)
            pages_results.extend(page_results)
            # process_i += 1
            # if process_i % save_interval == 0:
            #     # Save every save_interval pages
            #     results_df = pd.DataFrame(pages_results)
            #     results_df.to_csv(output_file)

        results_df = pd.DataFrame(pages_results)
        # results_df.to_csv(output_file)        
    
        results_df = results_df.set_index('block_id')
        with open(mets_file, 'r', encoding='utf-8') as mets:
            tree = ET.parse(mets)
            mets_root = tree.getroot()
            mets_data = build_mets(mets_root)
            
            for article in mets_data:
                ocr_texts = []
                for begin in article['begins']:
                    try:
                        block = results_df.loc[begin]
                        if not pd.isna(block['ocr_text']):
                            ocr_texts.append(block['ocr_text'])
                    except KeyError:
                        print(f'Block {begin} was not found in data')
                article['ocr_text'] = '\n'.join(ocr_texts)
        
        if not os.path.exists(output_dir):
            os.makedirs(output_dir, exist_ok=True)
        pd.DataFrame(mets_data).to_csv(output_file)
alto_dir_pipeline("חירות", "Herut_output", SKIP_EXISTING)

scikit-learn version 1.2.2 is not supported. Minimum required version: 0.17. Maximum required version: 1.1.2. Disabling scikit-learn conversion API.
Torch version 2.0.1+cu117 has not been tested with coremltools. You may run into unexpected errors. Torch 2.0.0 is the most recent version that has been tested.


Skipping existing: Herut_output/azureuser_19481003_01.csv
Skipping existing: Herut_output/azureuser_19481006_01.csv
Skipping existing: Herut_output/azureuser_19481007_01.csv
Skipping existing: Herut_output/azureuser_19481008_01.csv
Skipping existing: Herut_output/azureuser_19481010_01.csv
Skipping existing: Herut_output/azureuser_19481011_01.csv
Skipping existing: Herut_output/azureuser_19481012_01.csv
Skipping existing: Herut_output/azureuser_19481014_01.csv
Skipping existing: Herut_output/azureuser_19481015_01.csv
Skipping existing: Herut_output/azureuser_19481017_01.csv
Skipping existing: Herut_output/azureuser_19481019_01.csv
Skipping existing: Herut_output/azureuser_19481020_01.csv
Skipping existing: Herut_output/azureuser_19481021_01.csv
Skipping existing: Herut_output/azureuser_19481022_01.csv
Skipping existing: Herut_output/azureuser_19481024_01.csv
Skipping existing: Herut_output/azureuser_19481026_01.csv
Skipping existing: Herut_output/azureuser_19481027_01.csv
Skipping exist

  0%|                                                                                                                                                                        | 0/4 [00:00<?, ?it/s]
  0%|                                                                                                                                                                       | 0/89 [00:00<?, ?it/s][A
  1%|█▊                                                                                                                                                             | 1/89 [00:01<02:29,  1.70s/it][A
  2%|███▌                                                                                                                                                           | 2/89 [00:07<06:08,  4.23s/it][A
  3%|█████▎                                                                                                                                                         | 3/89 [00:08<03:48,  2.66s/it][A
  4%|███

IndexError: list index out of range