# Jupyter Notebook: SSML Debugging Tools

#### Cell 1: Setup and Imports

In [1]:
import os
import re
from xml.etree import ElementTree as ET
from xml.etree.ElementTree import ParseError
import html
from lxml import etree
from IPython.display import display
import ipywidgets as widgets

#### Cell 2: Function to Clean SSML Tags

In [16]:
def clean_ssml_tags(file_path):
    """
    Cleans the SSML markup stored in a file and overwrites the file in place.

    The cleaning steps are, in order:

    1. HTML entities are decoded with ``html.unescape``; any ampersand that is
       not part of a valid XML entity afterwards is re-escaped as ``&amp;`` so
       that the text stays parseable.
    2. Every attribute-less ``<break>`` (``<break>``, ``<break/>`` or
       ``<break></break>``) becomes ``<break time="1s"/>``.
    3. Every ``<w ...>`` start tag that has no ``role=`` gets
       ``role="amazon:NN"``.
    4. Every element whose tag is not in the allowed set is removed together
       with its content; the text that FOLLOWS the removed element is kept.

    Args:
        file_path (str): Path to the SSML file to be cleaned.

    Returns:
        None. Progress and errors are reported with ``print``; on a parse
        error the file is left untouched.
    """
    allowed_tags = {"break", "lang", "p", "phoneme", "s", "speak", "sub", "w"}

    def ensure_role_attribute(tag):
        if 'role=' not in tag:
            tag = tag.replace('<w', '<w role="amazon:NN"', 1)
        return tag

    def remove_unwanted_tags(element):
        # ElementTree stores the text that follows an element in that
        # element's ``tail``; a plain ``remove`` would silently drop it, so the
        # tail is re-attached to whatever now precedes it.
        previous_kept = None
        for child in list(element):
            if child.tag in allowed_tags:
                remove_unwanted_tags(child)
                previous_kept = child
                continue
            if child.tail:
                if previous_kept is None:
                    element.text = (element.text or '') + child.tail
                else:
                    previous_kept.tail = (previous_kept.tail or '') + child.tail
            element.remove(child)

    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            content = file.read()

        content = html.unescape(content)
        # html.unescape turns "&amp;" into a bare "&", which is not valid XML.
        # Re-escape every ampersand that does not start a valid XML entity.
        # NOTE(review): a decoded "&lt;" that was meant as literal text still
        # becomes "<" and can break parsing; it cannot be told apart from an
        # escaped tag here.
        content = re.sub(
            r'&(?!(?:amp|lt|gt|quot|apos|#[0-9]+|#[xX][0-9a-fA-F]+);)',
            '&amp;',
            content,
        )

        # Only attribute-less breaks are matched, so none of them has a time
        # yet. The optional "</break>" is consumed too, otherwise the
        # self-closing replacement would leave a dangling end tag.
        content = re.sub(
            r'<break\s*(?:/>|>(?:\s*</break>)?)',
            '<break time="1s"/>',
            content,
        )
        # The lookahead restricts the match to real <w> elements, so a tag
        # that merely starts with "w" is not rewritten.
        content = re.sub(
            r'<w(?=[\s/>])[^>]*>',
            lambda m: ensure_role_attribute(m.group(0)),
            content,
        )

        # The file may hold several top-level nodes, so it is parsed inside a
        # synthetic wrapper element that is stripped again after cleaning.
        root = ET.fromstring(f"<root>{content}</root>")
        remove_unwanted_tags(root)

        serialized = ET.tostring(root, encoding='unicode')
        if serialized.startswith('<root>') and serialized.endswith('</root>'):
            cleaned_ssml = serialized[len('<root>'):-len('</root>')]
        else:
            # An empty wrapper is serialized as "<root />"; nothing is left.
            cleaned_ssml = ''

        with open(file_path, 'w', encoding='utf-8') as file:
            file.write(cleaned_ssml)

        print(f"SSML tags cleaned and saved to {file_path}")

    except ParseError as e:
        print(f"Error parsing the SSML text: {e}")
    except Exception as e:
        print(f"An error occurred: {e}")

def handle_uploaded_file(uploaded_file):
    """
    Writes one uploaded file into the current working directory.

    Args:
        uploaded_file: Mapping with a ``'name'`` entry (the file name reported
            by the upload widget) and a ``'content'`` entry holding the raw
            data as ``memoryview`` or ``bytes``.

    Returns:
        str: The file name (relative to the working directory) that was written.

    Raises:
        ValueError: If the reported name contains no usable file name.
    """
    # The name comes from the browser, so only its last path component is
    # used; otherwise a name such as "../x" would write outside the working
    # directory. Backslashes are normalised so Windows-style paths are
    # handled on every platform.
    reported_name = str(uploaded_file['name']).replace('\\', '/')
    file_path = os.path.basename(reported_name)
    if file_path in ('', '.', '..'):
        raise ValueError("Uploaded file has no usable file name.")

    with open(file_path, 'wb') as f:
        # bytes() accepts both a memoryview and an existing bytes object.
        f.write(bytes(uploaded_file['content']))
    return file_path

# Usage Example
file_selector = widgets.FileUpload(accept='.txt', multiple=False)

def on_clean_ssml_tags(b, selector=file_selector):
    """
    Button callback: saves every file held by the upload widget and cleans it.

    Args:
        b: The button that was clicked (supplied by ``on_click``; unused).
        selector: Upload widget to read from. The default is evaluated when
            this function is defined, so the callback keeps using THIS cell's
            widget even though later cells rebind the global ``file_selector``.
    """
    for uploaded_file in selector.value:
        file_path = handle_uploaded_file(uploaded_file)
        clean_ssml_tags(file_path)

clean_button = widgets.Button(description="Clean SSML Tags")
clean_button.on_click(on_clean_ssml_tags)

display(file_selector, clean_button)


FileUpload(value=(), accept='.txt', description='Upload')

Button(description='Clean SSML Tags', style=ButtonStyle())

Error parsing the SSML text: mismatched tag: line 282, column 4


#### Cell 3: Function to Check Nested `<w>` Tags

In [15]:
def check_nested_w_tags_from_file(file_path):
    """
    Scans an SSML file for <w>/<sub> elements in unexpected places.

    Two conditions are reported, each with the source line of the element:
    a <w> or <sub> whose parent is anything other than <speak>, <s> or <p>
    (the document root has no parent and is therefore reported as well), and
    a <phoneme> that is a direct child of <sub>.

    Args:
        file_path (str): Path to the SSML file to be checked.

    Returns:
        list or str: The list of findings in document order, or a message
        string when nothing was found or when the file could not be processed.
    """
    try:
        with open(file_path, 'rb') as source:
            document_root = etree.parse(source).getroot()

        reports = []

        # iter() walks the subtree in document order, i.e. the same order in
        # which a parent-first recursive descent would visit the elements.
        for node in document_root.iter():
            container = node.getparent()
            container_tag = None if container is None else container.tag

            if node.tag in ('w', 'sub') and container_tag not in ['speak', 's', 'p']:
                reports.append(f'Nested <{node.tag}> tag found in <{container_tag}> at line {node.sourceline}.')
            if node.tag == 'phoneme' and container_tag == 'sub':
                reports.append(f'Nested <phoneme> tag found in <sub> at line {node.sourceline}.')

        if not reports:
            return "No nested <w>, <sub>, or <phoneme> tags found."
        return reports

    except etree.XMLSyntaxError as e:
        return "Error parsing the SSML: " + str(e)
    except FileNotFoundError:
        return "Error: The file specified does not exist."
    except Exception as e:
        return "An unexpected error occurred: " + str(e)

# Usage Example
file_selector = widgets.FileUpload(accept='.txt', multiple=False)

def on_check_nested_tags(b, selector=file_selector):
    """
    Button callback: saves every file held by the upload widget and prints
    the nested-tag report for it.

    Args:
        b: The button that was clicked (supplied by ``on_click``; unused).
        selector: Upload widget to read from. The default is evaluated when
            this function is defined, so the callback keeps using THIS cell's
            widget even though other cells rebind the global ``file_selector``.
    """
    for uploaded_file in selector.value:
        file_path = handle_uploaded_file(uploaded_file)
        result = check_nested_w_tags_from_file(file_path)
        print(result)

check_button = widgets.Button(description="Check Nested Tags")
check_button.on_click(on_check_nested_tags)

display(file_selector, check_button)


FileUpload(value=(), accept='.txt', description='Upload')

Button(description='Check Nested Tags', style=ButtonStyle())

Error parsing the SSML: Opening and ending tag mismatch: s line 281 and p, line 282, column 7 (On_Faith_SSML_chunk_1.txt, line 282)


#### Cell 4: Function to Remove `<w>` Tags

In [4]:
def remove_w_tags_from_file(file_path):
    """
    Removes all <w> elements from the SSML file and saves the result to a new
    file named ``modified_<original name>`` in the same directory.

    Each <w> element is removed together with its own content, but the text
    that follows it (its tail) is kept in place.

    Args:
        file_path (str): Path to the SSML file to be modified.

    Returns:
        str: Message indicating the result of the operation.
    """
    try:
        print(f"Attempting to open the file at: {file_path}")

        if not os.path.exists(file_path):
            return "Error: The file specified does not exist."

        with open(file_path, 'rb') as file:
            tree = etree.parse(file)
            root = tree.getroot()

        # The root element has no parent it could be removed from.
        if root.tag == 'w':
            return "Error: The root element is a <w> tag and cannot be removed."

        # lxml's parent.remove(child) also discards the text following the
        # child. strip_elements(..., with_tail=False) deletes the elements but
        # leaves that trailing text in the document.
        # NOTE(review): the words wrapped by <w> are still dropped, as before;
        # if they should be kept, etree.strip_tags(tree, 'w') is the call to
        # use instead - confirm the intended behaviour.
        etree.strip_elements(tree, 'w', with_tail=False)

        modified_path = os.path.join(
            os.path.dirname(file_path),
            'modified_' + os.path.basename(file_path),
        )

        print(f"Saving the modified file to: {modified_path}")
        tree.write(modified_path, pretty_print=True, xml_declaration=True, encoding='UTF-8')
        return f"All <w> tags have been removed and the modified file is saved as {modified_path}."

    except etree.XMLSyntaxError as e:
        return f"Error parsing the SSML: {str(e)}"
    except PermissionError as e:
        return f"Permission Error: {str(e)}"
    except OSError as e:
        return f"OS Error: {str(e)}"
    except Exception as e:
        return f"An unexpected error occurred: {str(e)}"

# Usage Example
file_selector = widgets.FileUpload(accept='.txt', multiple=False)

def on_remove_w_tags(b, selector=file_selector):
    """
    Button callback: saves every file held by the upload widget, removes its
    <w> elements and prints the result message.

    Args:
        b: The button that was clicked (supplied by ``on_click``; unused).
        selector: Upload widget to read from. The default is evaluated when
            this function is defined, so the callback keeps using THIS cell's
            widget even if the global ``file_selector`` is rebound elsewhere.
    """
    for uploaded_file in selector.value:
        file_path = handle_uploaded_file(uploaded_file)
        result = remove_w_tags_from_file(file_path)
        print(result)

remove_button = widgets.Button(description="Remove <w> Tags")
remove_button.on_click(on_remove_w_tags)

display(file_selector, remove_button)


FileUpload(value=(), accept='.txt', description='Upload')

Button(description='Remove <w> Tags', style=ButtonStyle())

Attempting to open the file at: Fulgentius_of_Ruspe_On_Faith_and_For_the_Catholic_Faith_Translations_SSML_chunk_1.txt
Saving the modified file to: modified_Fulgentius_of_Ruspe_On_Faith_and_For_the_Catholic_Faith_Translations_SSML_chunk_1.txt
All <w> tags have been removed and the modified file is saved as modified_Fulgentius_of_Ruspe_On_Faith_and_For_the_Catholic_Faith_Translations_SSML_chunk_1.txt.
