In [114]:
from lxml import etree
from tqdm.notebook import tqdm
import os
from typing import NamedTuple, Iterable
from dataclasses import dataclass

URN_PREFIX = 'URN:NBN:'
    
@dataclass
class AltoString:
    id: str
    token: str    
        
@dataclass
class TextLine:
    id: str
    strings: Iterable[AltoString]
    
@dataclass
class TextBlock:
    id: str
    text_lines: str
    
@dataclass
class ComposedBlock:
    id: str
    text_blocks: Iterable[TextBlock]
    
@dataclass
class PageData:
    composed_blocks: Iterable[ComposedBlock]
    

class Book:
    def __init__(self, path: str):
        self.path = path
        self.pages = []
        self.parse()
        
    @property  
    def urn_suffix(self):
        return os.path.basename(self.path).split('.')[0]  
    
    @property
    def urn(self):
        return URN_PREFIX + self.urn_suffix  
    
    def __repr__(self):
        return f'<Book {self.urn}>'
        
    def parse(self):
        for path, _, files in os.walk(self.path):
            for file in tqdm(files):
                if file.endswith('.xml'):
                    self.pages.append(Page(os.path.join(path, file)))
                    
    def print_pages(self):
        for page in self.pages:
            print(page.path)
            
    def __getitem__(self, slice):
        return self.pages[slice]
    
    def export_to_txt(self, path: str):
        os.makedirs(path, exist_ok=True)        
        for page in self.pages:
            with open(os.path.join(path, page.urn_suffix + '.txt'), 'w') as f:
                for line in page.text:
                    f.write(line)
        
        
        
class Page:
    def __init__(self, path: str):
        self.path = path
        self.components = []
        self.parse()
    
    @property  
    def urn_suffix(self):
        return os.path.basename(self.path).split('.')[0]  
    
    @property
    def urn(self):
        return URN_PREFIX + self.urn_suffix      
    
    def __repr__(self):
        return f'<Page {self.urn}>'

    def parse(self):
         # Parse the ALTO XML file
        tree = etree.parse(self.path)
        namespaces = {'alto': 'http://www.loc.gov/standards/alto/ns-v3#'}
        
        data = PageData([])
        self.components.append(data)
        
        
        composed_blocks = tree.xpath('.//alto:ComposedBlock', namespaces=namespaces)
        
        for composed_block in composed_blocks:
            id = composed_block.get('ID')
            target_composed_block = ComposedBlock(id, [])
            data.composed_blocks.append(target_composed_block)
                
            blocks = composed_block.xpath('./alto:TextBlock', namespaces=namespaces)
            for block in blocks:
                target_block = TextBlock(block.get('ID'), [])
                target_composed_block.text_blocks.append(target_block)
                
                lines = block.xpath('./alto:TextLine', namespaces=namespaces)
                
                for line in lines:
                    target_line = TextLine(line.get('ID'), [])
                    target_block.text_lines.append(target_line)
                    
                    strings = line.xpath('./alto:String', namespaces=namespaces)
                    strings = [AltoString(s.get('ID'), s.get('CONTENT')) for s in strings]
                    target_line.strings = strings
                    
                    
    def print_text(self):
        for data in self.components:
            for composed_block in data.composed_blocks:
                for block in composed_block.text_blocks:
                    for line in block.text_lines:
                        print(" ".join([t.token for t in line.strings]))
                print()
            print()
    
    @property    
    def text(self):
        for data in self.components:
            for composed_block in data.composed_blocks:
                for block in composed_block.text_blocks:
                    for line in block.text_lines:
                        yield " ".join([t.token for t in line.strings])
                    yield "\n"
                yield "\n"
            yield "\n"
                    
                    
        

In [115]:
test_path = "data/test/no-nb_digibok_2006080900007"

test = Book(test_path)
# test.print_pages()

0it [00:00, ?it/s]

  0%|          | 0/203 [00:00<?, ?it/s]

In [116]:
test

<Book URN:NBN:no-nb_digibok_2006080900007>

In [117]:
test.export_to_txt('data/test_txt')

In [87]:
test.urn

AttributeError: 'Book' object has no attribute 'urn'

In [118]:
test.pages[2:3].print_text()

AttributeError: 'list' object has no attribute 'print_text'