In [1]:
import os
import xml.etree.ElementTree as ET
from collections import Counter


def count_languages_in_all_folders(root_path):
    """Tally the ``lang`` attribute of ``<s>`` elements in every XML file under a tree.

    Walks ``root_path`` recursively, parses each file whose name ends with
    ``.xml`` and counts the ``lang`` attribute of every ``<s>`` element found
    below the document root. Elements whose ``lang`` attribute is missing or
    empty are ignored. Files that are not well-formed XML are reported on
    stdout and skipped; they contribute nothing to the totals.

    Args:
        root_path: Directory to scan.

    Returns:
        A dict mapping each language code to
        ``{'count': <int>, 'percentage': <float>}``, where ``percentage`` is
        that language's share (0-100) of all counted ``<s>`` elements. Keys
        appear in the order the languages were first encountered. The dict is
        empty when no ``lang`` attribute was found.
    """
    language_counts = Counter()

    for dirpath, _dirnames, filenames in os.walk(root_path):
        for filename in filenames:
            if not filename.endswith('.xml'):
                continue
            full_path = os.path.join(dirpath, filename)

            # Only the parse itself can raise ParseError, so keep the try
            # body down to that single call.
            try:
                root = ET.parse(full_path).getroot()
            except ET.ParseError:
                print(
                    f"Error parsing {full_path}: File may be corrupted or not well-formed XML.")
                continue

            # ".//s" matches <s> elements at any depth beneath the root.
            language_counts.update(
                lang
                for lang in (s.get('lang') for s in root.findall(".//s"))
                if lang
            )

    # When nothing was counted the comprehension below has no items, so the
    # division is never evaluated with a zero total.
    total_language_occurrences = sum(language_counts.values())
    return {
        lang: {
            'count': count,
            'percentage': count / total_language_occurrences * 100,
        }
        for lang, count in language_counts.items()
    }

# Scan the corpus directory and collect per-language statistics.
root_path = './all'
language_details = count_languages_in_all_folders(root_path)

# Report one line per language: raw count and its share of the total.
for lang, details in language_details.items():
    count, percentage = details['count'], details['percentage']
    print(f"{lang}: Count = {count}, Percentage = {percentage:.4f}%")

la: Count = 44781, Percentage = 77.4664%
de: Count = 12949, Percentage = 22.4004%
el: Count = 76, Percentage = 0.1315%
he: Count = 1, Percentage = 0.0017%
