In [6]:
import shapely
from PIL import Image, ImageDraw
import numpy as np
import os
import re
from lxml import etree
import xml.etree.ElementTree as ET

In [7]:
# Directory holding the ALTO XML samples. Set the ALTO_SAMPLES_DIR environment
# variable to point the notebook at another location; when it is unset the
# original local path is used.
directory = os.environ.get(
    "ALTO_SAMPLES_DIR",
    "/Users/Theo/Desktop/14-18/transcription/new_samples/",
)

In [8]:
# Show the image file name recorded in every ALTO document of the directory
alto_file_name_query = ".//{http://www.loc.gov/standards/alto/ns-v4#}fileName"

for entry in os.listdir(directory):
    # Only ALTO exports are of interest; skip everything else
    if not entry.endswith('.xml'):
        continue

    # Parse the document and grab its root element
    document_root = ET.parse(os.path.join(directory, entry)).getroot()

    # Print the text of each fileName element found below the root
    for node in document_root.findall(alto_file_name_query):
        print(node.text)

435116_archives_FRAD075RM_D4R1_2071_0265_D.jpg
802253_archives_FRAD075RM_D4R1_1628_0691_D.jpg
435116_archives_FRAD075RM_D4R1_2071_0264_D.jpg
394626_archives_FRAD075RM_D4R1_1102_0220_D.jpg
815346_archives_FRAD075RM_D4R1_1685_0044_D.jpg


In [4]:
# Code that changes the file paths in the ALTO documents.
# Every .xml file in `directory` is rewritten IN PLACE: namespace prefixes are
# stripped from the tags, the root element gets a fixed set of ALTO v4
# attributes, and each <fileName> is reduced to its bare file name.
ns_dict = {
    'alto': 'http://www.loc.gov/standards/alto/ns-v4#',
    'xsi': 'http://www.w3.org/2001/XMLSchema-instance'
}
ns_prefix = '{http://www.loc.gov/standards/alto/ns-v4#}'

# Loop over all files in the directory
for filename in os.listdir(directory):
    if not filename.endswith('.xml'):
        continue

    xml_path = os.path.join(directory, filename)

    # Load the XML file into an ElementTree object
    tree = ET.parse(xml_path)

    # Rename the root element and replace its attributes so that the ALTO
    # namespace is declared once, as the default namespace.
    # NOTE: this discards any other attribute the root carried.
    root = tree.getroot()
    if root.tag.startswith(ns_prefix):
        root.tag = 'alto'
        root.attrib = {
            'xmlns:xsi': ns_dict['xsi'],
            'xmlns': ns_dict['alto'],
            'xsi:schemaLocation': 'http://www.loc.gov/standards/alto/ns-v4# http://www.loc.gov/standards/alto/v4/alto-4-3.xsd'
        }

    # Remove the namespace prefix from every descendant in the ALTO namespace
    for elem in tree.findall('.//' + ns_prefix + '*'):
        elem.tag = re.sub(r'{.*}', '', elem.tag)

    # Tags are un-namespaced at this point, so a plain 'fileName' lookup matches
    for file_name_elem in tree.findall('.//fileName', ns_dict):
        # An empty <fileName/> has text None; re.sub would raise TypeError and
        # abort the loop with only part of the directory rewritten.
        if not file_name_elem.text:
            continue
        # Drop everything up to and including the last slash, keeping only
        # the file name itself
        file_name_elem.text = re.sub(r'^.*/', '', file_name_elem.text)

    # Write the modified XML file back to disk
    tree.write(xml_path, encoding='UTF-8', xml_declaration=True)
        
        

In [None]:
# Code that calculates the IoU on the polygons of each document

In [None]:
# ALTO document used for the mask / IoU experiments below. Set the
# ALTO_XML_FILE environment variable to analyse another document; when it is
# unset the original local path is used.
xml_file = os.environ.get(
    "ALTO_XML_FILE",
    "/Users/Theo/Desktop/14-18/transcription/test_2/245309_archives_FRAD075RM_D4R1_1054_0751_D.xml",
)

In [None]:
# Build a mask image of the TYPE_6 text blocks of `xml_file`:
#   - dict_region maps a running index to the list of (x, y) polygon vertices
#   - mask is the page-sized RGB array (yellow background, pink regions)
#   - the rendered image is saved as output.jpg in the working directory
alto_ns = '{http://www.loc.gov/standards/alto/ns-v4#}'

# Parse by path rather than through a text-mode file handle, so the parser
# decodes the bytes itself according to the XML declaration instead of
# relying on the locale's default encoding.
root = ET.parse(xml_file)

list_region_image = root.findall(".//" + alto_ns + "TextBlock[@TAGREFS='TYPE_6']")

coords_image = []
for region in list_region_image:
    polygon_elem = region.find('.//' + alto_ns + 'Polygon')
    # A block without an outline (or with an empty POINTS attribute) cannot
    # be drawn; skip it instead of failing on a missing element.
    if polygon_elem is None:
        continue
    points = (polygon_elem.get('POINTS') or '').strip()
    if points:
        coords_image.append(points)

dict_region = {}
for index, coords in enumerate(coords_image):
    # Accept both "x y x y" and "x,y x,y" layouts and float-formatted values
    coords = [int(float(value)) for value in re.split(r'[\s,]+', coords)]

    # Pair consecutive values into (x, y) vertices; a trailing unpaired
    # value is ignored
    dict_region[index] = [(coords[i], coords[i + 1]) for i in range(0, len(coords) - 1, 2)]

# Page size; going through float() tolerates values such as "2480.0"
dimensions = root.findall('.//' + alto_ns + 'Page')[0].attrib
imageWidth = int(float(dimensions['WIDTH']))
imageHeight = int(float(dimensions['HEIGHT']))

image = Image.new('RGB', (imageWidth, imageHeight), color='yellow')

# A single drawing context is enough for all regions
draw = ImageDraw.Draw(image)
for clef in dict_region:
    draw.polygon(dict_region[clef], fill='pink')

# Pixel array of the rendered mask, kept for inspection
mask = np.array(image)

image.save('output.jpg', 'JPEG')
    

In [None]:
# Display the mapping of region index -> list of (x, y) vertex tuples
dict_region

In [None]:
from shapely.geometry import MultiPoint, Polygon

# Sanity check on the first region: how much of its envelope (the
# axis-aligned bounding rectangle) does the polygon actually fill?
default_points = Polygon(dict_region[0])
simplify_points = default_points.envelope

# Areas of the overlap and of the combined shape
intersect = default_points.intersection(simplify_points).area
union = default_points.union(simplify_points).area

# Intersection over union of the polygon and its envelope
iou = intersect / union
iou


In [None]:
def polygon_envelope_iou(points):
    """Return the IoU between a polygon and its envelope (bounding rectangle).

    Parameters
    ----------
    points : sequence of (x, y) tuples
        Vertices of the polygon outline.

    Returns
    -------
    float
        Intersection area divided by union area. 0.0 is returned when the
        outline has fewer than three vertices or when the union has no area
        (e.g. all vertices collinear), so one degenerate region does not
        abort the whole batch.
    """
    # Polygon() refuses outlines with fewer than three vertices
    if len(points) < 3:
        return 0.0

    raw_polygon = Polygon(points)
    # The envelope only depends on the bounds, so take it from the raw outline
    simplify_points = raw_polygon.envelope

    # Overlay operations can fail on self-intersecting outlines; buffer(0)
    # is the usual way to obtain a valid geometry from them
    default_points = raw_polygon if raw_polygon.is_valid else raw_polygon.buffer(0)

    union = default_points.union(simplify_points).area
    if union == 0:
        return 0.0
    intersect = default_points.intersection(simplify_points).area
    return intersect / union


# One IoU value per region, in the order the regions were stored
iou_liste = [polygon_envelope_iou(points) for points in dict_region.values()]

iou_liste