In [None]:
# Run configuration.
# NOTE(review): in the visible part of this file the pipeline call at the bottom passes its own
# literal arguments and does not read INPUT_DIR / OUTPUT_DIR / SKIP_EXISTING / NUM_PROCESSES.
INPUT_DIR = './Davar' # Directory that contains subdirectories with ALTO newspapers
OUTPUT_DIR = './out' # Directory in which the output CSVs are saved
SKIP_EXISTING = True # If True, skip newspapers whose output CSV already exists in the output folder; if False, process them again
NUM_PROCESSES = 1 # Number of processes to run simultaneously; should be around the number of processors on the machine

DISABLE_TQDM = True # Passed as `disable=` to the tqdm progress bars in process_mets_files; True hides them

import datetime
import glob
import os
from os import path
import xml.etree.ElementTree as ET
import re
import multiprocessing
import logging
from functools import partial
import math


from tqdm import tqdm
import pandas as pd
from kraken.lib import models

from readers import OliveReader, AltoReader, build_mets, NewAltoReader
from ocr import ImageOCR


# Configure the root logger: INFO and above, each record stamped with the time and
# the PID of the emitting process so that output from parallel workers can be told apart.
logging.basicConfig(
    datefmt="%Y-%m-%d %H:%M:%S",
    format="%(asctime)s - [PID: %(process)d] - %(levelname)s - %(message)s",
    level=logging.INFO,
)


def list_alto_pages(alto_dir):
    """Return the sorted paths of all '*.xml' files inside any 'ALTO' directory found recursively under alto_dir."""
    pattern = path.join(alto_dir, '**/ALTO/*.xml')
    return sorted(glob.glob(pattern, recursive=True))

def list_mets_files(parent_dir):
    """Return the sorted paths of all files ending in '-METS.xml' found recursively under parent_dir."""
    pattern = path.join(parent_dir, '**/*-METS.xml')
    return sorted(glob.glob(pattern, recursive=True))
    
def get_alto_pages(mets_path):
    """Return the sorted paths of the '*.xml' files in the 'ALTO' directory that sits next to the given METS file."""
    issue_dir = path.dirname(mets_path)
    pattern = path.join(issue_dir, 'ALTO/*.xml')
    return sorted(glob.glob(pattern))
    
def get_mets_save_name(mets_file):
    """
    Build the CSV output file name for a METS file.

    The name is the newsletter name, an underscore, and the METS file name with '-METS' removed
    and '.xml' replaced by '.csv'. The newsletter name is taken from the closest ancestor directory
    of the METS file whose name starts with a Latin or Hebrew letter:
    '/path/to/.../Davar/1957/01/01_01/19570101_01-METS.xml' -> 'Davar_19570101_01.csv'

    Raises IndexError when no ancestor directory name starts with a letter.
    """
    # Normalise Windows separators so the path can always be split on '/'.
    parent_dir = path.abspath(path.dirname(mets_file)).replace('\\', '/')

    # Walk from the innermost directory outwards, keeping the names that start with a letter.
    alphabetic_dirs = []
    for segment in parent_dir.split('/')[::-1]:
        if re.match('[a-zA-Zא-ת]+', segment):
            alphabetic_dirs.append(segment)
    newsletter = alphabetic_dirs[0]  # e.g. 'Davar'

    csv_name = path.basename(mets_file).replace('.xml', '.csv').replace('-METS', '')  # e.g. '19570101_01.csv'
    return f'{newsletter}_{csv_name}'


def process_mets_files(mets_files, alto_dir, output_dir, skip_existing):
    """
    OCR every newspaper issue in mets_files and write one CSV of articles per issue.

    The OCR model is loaded once per call and reused for all the given METS files.
    For each METS file:
      1. every ALTO page next to it (see get_alto_pages) is read with NewAltoReader and each of
         its text blocks is OCR'd; the text is stored on the block under 'ocr_text';
      2. the METS file is parsed with build_mets, and each article gets an 'ocr_text' made of the
         OCR text of the blocks listed in its 'begins', joined with newlines;
      3. the articles are written to output_dir under the name given by get_mets_save_name.

    Errors never propagate out of this function: a block that fails OCR gets an empty 'ocr_text',
    a page that fails is skipped, and a METS file that fails is skipped; each case is logged.

    :param mets_files: paths of the METS files to process
    :param alto_dir: not used by this function; present because alto_dir_pipeline passes it
    :param output_dir: directory in which the CSV files are written (created if missing)
    :param skip_existing: if True, METS files whose output CSV already exists are skipped
    """
    logging.getLogger('kraken').setLevel(logging.WARNING)
    baseline_model = None  # vgsl.TorchVGSLModel.load_model('blla.mlmodel')
    model = models.load_any('model_best_9379_140624.mlmodel')
    ocr = ImageOCR(model=model, bw_threshold=150, baseline_model=baseline_model)

    logging.info(f'Started process with {len(mets_files)} files: {mets_files}')

    def process_blocks(blocks, olr, page_file):
        """OCR each block in place, recording the page it came from; failed blocks get empty text."""
        for block in tqdm(blocks, disable=DISABLE_TQDM):
            block['page_file'] = page_file
            try:
                block['ocr_text'] = '\n'.join(ocr.get_text(olr.get_image_for_block(block)))
            except Exception:
                # Best effort: keep the block with empty text, but log through the configured
                # logger so the record carries the timestamp, PID, page and traceback.
                logging.exception(f'Error in text of a block in page {page_file}')
                block['ocr_text'] = ''

    def process_alto_page(page):
        """Read the text blocks of one ALTO page and return them with their OCR text attached."""
        olr = NewAltoReader(page)
        blocks = olr.get_text_blocks()
        process_blocks(blocks, olr, page)
        return blocks

    for mets_file in mets_files:
        try:
            output_file = os.path.join(output_dir, get_mets_save_name(mets_file))
            if skip_existing and os.path.exists(output_file):
                logging.info(f'Skipping existing: {output_file}')
                continue

            pages_results = []
            logging.info(f'Processing METS: {mets_file}')
            pages = get_alto_pages(mets_file)

            for pi, page in enumerate(tqdm(pages, disable=DISABLE_TQDM)):
                try:
                    logging.info(f'Starting page {pi + 1} of {mets_file}')
                    page_results = process_alto_page(page)
                    pages_results.extend(page_results)
                    logging.info(f'Done with page {pi + 1} of {mets_file}')
                except Exception:
                    logging.exception(f'Failed to process page {pi + 1} in file {mets_file}')

            # Index the OCR'd blocks by id so that articles can look their blocks up.
            # If no block was collected there is no 'block_id' column; set_index then raises and
            # the whole file is reported as failed by the outer handler (no CSV is written).
            results_df = pd.DataFrame(pages_results)
            results_df = results_df.set_index('block_id')

            # The file only needs to stay open while it is being parsed.
            with open(mets_file, 'r', encoding='utf-8') as mets:
                tree = ET.parse(mets)
            mets_root = tree.getroot()
            mets_data = build_mets(mets_root)

            for article in mets_data:
                ocr_texts = []
                for begin in article['begins']:
                    try:
                        block = results_df.loc[begin]
                        if not pd.isna(block['ocr_text']):
                            ocr_texts.append(block['ocr_text'])
                    except KeyError:
                        # Name the METS file: several workers log at the same time.
                        logging.warning(f'Block {begin} was not found in data ({mets_file})')
                article['ocr_text'] = '\n'.join(ocr_texts)

            os.makedirs(output_dir, exist_ok=True)
            pd.DataFrame(mets_data).to_csv(output_file)
        except Exception:
            logging.exception(f'Failed to process file: {mets_file}')


def chunk_list(data, chunk_size):
    """Lazily yield consecutive slices of data, each holding at most chunk_size items."""
    offsets = range(0, len(data), chunk_size)
    yield from (data[start:start + chunk_size] for start in offsets)

def alto_dir_pipeline(alto_dir, output_dir, skip_existing, num_processes):
    """
    Process every METS file found under alto_dir, spreading the work over worker processes.

    The METS files that still need processing are split into contiguous chunks, at most
    num_processes of them, and each chunk is handled by process_mets_files in a worker
    process. The call blocks until all chunks are done.

    :param alto_dir: directory that is searched recursively for '*-METS.xml' files
    :param output_dir: directory in which the output CSVs are saved
    :param skip_existing: if True, METS files whose output CSV already exists are not processed
    :param num_processes: maximum number of worker processes; must be at least 1
    :raises ValueError: if there are files to process and num_processes is less than 1
    """
    mets_files = []  # Files that actually need processing
    for mets_file in list_mets_files(alto_dir):
        output_file = os.path.join(output_dir, get_mets_save_name(mets_file))
        if skip_existing and os.path.exists(output_file):
            logging.info(f'Skipping existing: {output_file}')
            continue
        mets_files.append(mets_file)

    if not mets_files:
        logging.info('No files to process!')
        return

    if num_processes < 1:
        raise ValueError(f'num_processes must be at least 1, got {num_processes}')

    chunk_size = math.ceil(len(mets_files) / num_processes)
    mets_chunks = list(chunk_list(mets_files, chunk_size))

    process_with_args = partial(process_mets_files, alto_dir=alto_dir, output_dir=output_dir, skip_existing=skip_existing)

    # There can be fewer chunks than num_processes (e.g. 5 files over 4 processes gives 3 chunks
    # of size 2), so size the pool by the chunks to avoid starting workers that would stay idle.
    with multiprocessing.Pool(processes=len(mets_chunks)) as pool:
        logging.info(f'Starting: {process_with_args}')
        # Blocks until every chunk has been processed.
        pool.map(process_with_args, mets_chunks)

# Entry point. The guard keeps the pipeline from being launched again when this module is
# imported, e.g. by worker processes under the 'spawn' start method of multiprocessing.
# NOTE(review): the arguments are literals; the INPUT_DIR / OUTPUT_DIR / SKIP_EXISTING /
# NUM_PROCESSES settings defined at the top of the file are not used here.
if __name__ == '__main__':
    alto_dir_pipeline("HaYom", "Hayom_output", skip_existing=True, num_processes=4)

2024-11-25 04:50:58 - [PID: 1406] - INFO - Starting: functools.partial(<function process_mets_files at 0x7661fa7afe20>, alto_dir='HaYom', output_dir='Hayom_output', skip_existing=True)
2024-11-25 04:51:31 - [PID: 9530] - INFO - Started process with 16 files: ['HaYom/1966/10/19_01/19661019_01-METS.xml', 'HaYom/1966/10/20_01/19661020_01-METS.xml', 'HaYom/1966/10/21_01/19661021_01-METS.xml', 'HaYom/1966/10/23_01/19661023_01-METS.xml', 'HaYom/1966/10/24_01/19661024_01-METS.xml', 'HaYom/1966/10/25_01/19661025_01-METS.xml', 'HaYom/1966/10/26_01/19661026_01-METS.xml', 'HaYom/1966/10/27_01/19661027_01-METS.xml', 'HaYom/1966/10/28_01/19661028_01-METS.xml', 'HaYom/1966/10/30_01/19661030_01-METS.xml', 'HaYom/1966/10/31_01/19661031_01-METS.xml', 'HaYom/1966/11/01_01/19661101_01-METS.xml', 'HaYom/1966/11/02_01/19661102_01-METS.xml', 'HaYom/1966/11/03_01/19661103_01-METS.xml', 'HaYom/1966/11/04_01/19661104_01-METS.xml', 'HaYom/1966/11/06_01/19661106_01-METS.xml']
2024-11-25 04:51:31 - [PID: 9530] - 

Block P1_CB00003 was not found in data
Block P1_CB00004 was not found in data
Block P1_CB00005 was not found in data
Block P1_CB00006 was not found in data
Block P1_CB00007 was not found in data
Block P1_CB00008 was not found in data
Block P1_CB00009 was not found in data
Block P1_CB00010 was not found in data
Block P1_CB00011 was not found in data
Block P1_CB00012 was not found in data
Block P2_CB00002 was not found in data
Block P2_CB00003 was not found in data
Block P3_CB00006 was not found in data
Block P3_CB00007 was not found in data
Block P4_CB00001 was not found in data
Block P5_CB00001 was not found in data
Block P5_CB00002 was not found in data
Block P5_CB00003 was not found in data
Block P5_CB00004 was not found in data
Block P5_CB00005 was not found in data
Block P5_CB00006 was not found in data
Block P6_CB00001 was not found in data
Block P6_CB00002 was not found in data
Block P6_CB00003 was not found in data
Block P6_CB00004 was not found in data
Block P6_CB00005 was not 

2024-11-25 07:46:17 - [PID: 9530] - INFO - Processing METS: HaYom/1966/10/20_01/19661020_01-METS.xml
2024-11-25 07:46:17 - [PID: 9530] - INFO - Starting page 1 of HaYom/1966/10/20_01/19661020_01-METS.xml
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
2024-11-25 07:49:32 - [PID: 9531] - INFO - Done with page 5 of HaYom/1966/11/07_01/19661107_01-METS.xml
2024-11-25 07:49:32 - [PID: 9531] - INFO - Starting page 6 of HaYom/1966/11/07_01/19661107_01-METS.xml
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
2024-11-25 07:54:03 - [PID: 9533] - INFO - Done with page 6 of HaYom/1966/12/14_01/19661214_01-METS.xml


Article DIVL646 does not have content and was skipped.
Block P1_CB00004 was not found in data
Block P1_CB00005 was not found in data
Block P1_CB00006 was not found in data
Block P1_CB00007 was not found in data
Block P1_CB00008 was not found in data
Block P1_CB00009 was not found in data
Block P1_CB00010 was not found in data
Block P2_CB00002 was not found in data
Block P4_CB00005 was not found in data
Block P4_CB00006 was not found in data
Block P4_CB00007 was not found in data
Block P4_CB00008 was not found in data
Block P4_CB00009 was not found in data
Block P5_CB00001 was not found in data
Block P5_CB00002 was not found in data
Block P5_CB00003 was not found in data
Block P5_CB00004 was not found in data
Block P5_CB00005 was not found in data
Block P5_CB00006 was not found in data
Block P5_CB00007 was not found in data
Block P5_CB00008 was not found in data
Block P5_CB00009 was not found in data
Block P5_CB00010 was not found in data
Block P5_CB00011 was not found in data
Block P5_

2024-11-25 07:54:04 - [PID: 9533] - INFO - Processing METS: HaYom/1966/12/15_01/19661215_01-METS.xml
2024-11-25 07:54:04 - [PID: 9533] - INFO - Starting page 1 of HaYom/1966/12/15_01/19661215_01-METS.xml
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return v/np.amax(v)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
2024-11-25 08:14:12 - [PID: 9533] - INFO - Done with page 1 of HaYom/1966/12/15_01/19661215_01-METS.xml
2024-11-25 08:14:12 - [PID: 9533] - INFO - Starting page 2 of HaYom/1966/12/15_01/19661215_01-METS.xml
2024-11-25 08:15:26 - [PID: 9532] - INFO - Done with page 6 of HaYom/1966/11/25_01/19661125_01-METS.xml
2024-11-25 08:15:26 - [PID: 9532] - INFO - Starting page 7 of HaYom/1966/11/25_01/19661125_01-METS.xml
2024-11-25 08:19:06 - [PID: 9531] - INFO - Done with page 6 of HaYom/1966/11/07_01/19661107_01-M

Article DIVL529 does not have content and was skipped.
Block P1_CB00004 was not found in data
Block P1_CB00005 was not found in data
Block P1_CB00006 was not found in data
Block P1_CB00007 was not found in data
Block P1_CB00008 was not found in data
Block P1_CB00009 was not found in data
Block P1_CB00010 was not found in data
Block P1_CB00011 was not found in data
Block P2_CB00001 was not found in data
Block P4_CB00005 was not found in data
Block P4_CB00006 was not found in data
Block P4_CB00007 was not found in data
Block P4_CB00008 was not found in data
Block P4_CB00009 was not found in data
Block P4_CB00010 was not found in data
Block P5_CB00003 was not found in data
Block P5_CB00004 was not found in data
Block P5_CB00005 was not found in data
Block P5_CB00006 was not found in data
Block P5_CB00007 was not found in data
Block P5_CB00008 was not found in data
Block P5_CB00009 was not found in data
Block P5_CB00010 was not found in data
Block P5_CB00011 was not found in data
Block P5_

2024-11-25 08:19:07 - [PID: 9531] - INFO - Processing METS: HaYom/1966/11/08_01/19661108_01-METS.xml
2024-11-25 08:19:07 - [PID: 9531] - INFO - Starting page 1 of HaYom/1966/11/08_01/19661108_01-METS.xml
2024-11-25 08:22:29 - [PID: 9530] - INFO - Done with page 1 of HaYom/1966/10/20_01/19661020_01-METS.xml
2024-11-25 08:22:29 - [PID: 9530] - INFO - Starting page 2 of HaYom/1966/10/20_01/19661020_01-METS.xml
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
2024-11-25 08:38:10 - [PID: 9532] - INFO - Done with page 7 of HaYom/1966/11/25_01/19661125_01-METS.xml
2024-11-25 08:38:10 - [PID: 9532] - INFO - Starting page 8 of HaYom/1966/11/25_01/19661125_01-METS.xml
2024-11-25 08:54:26 - [PID: 9531] - INFO - Done with page 1 of HaYom/1966/11/08_01/19661108_01-METS.xml
2024-11-25 08:54:26 - [PID: 9531] - INFO - Starting page 2 of HaYom/1966/11/08_01/19661108_01-METS.xml
