# PageRank algorithm using MapReduce

In [1]:
import xml.etree.ElementTree as ET
import os # Optional: to check if file exists

def _is_unit_value(raw_value) -> bool:
    """Return True when a link's ``value`` attribute is numerically 1."""
    if raw_value is None:
        return False
    try:
        # Compare numerically so "1", "1.0" and "1.0000" are all accepted.
        return float(raw_value) == 1.0
    except ValueError:
        # Non-numeric weights cannot denote a marriage tie; skip the link.
        return False


def parse_padgett_data(xml_file_path: str, output_txt_path: str) -> bool:
    """
    Parses the Padgett Florentine families XML file to extract nodes
    and marriage links (PADGM network with value equal to 1), writing
    them to a text file suitable for MRJob input.

    Format:
        NODE_ID\t!NODE  (for declaring all nodes)
        SOURCE_ID\tTARGET_ID (for representing marriage links)

    Node declarations are written first, sorted by ID; links follow in
    document order. Links whose source or target is not a declared
    'agent' node are ignored.

    Args:
        xml_file_path (str): Path to the input padgett.xml file.
        output_txt_path (str): Path where the output .txt file will be saved.

    Returns:
        bool: True if the output file was written, False if the input is
        missing or unreadable, the XML is malformed, or writing failed.
        Errors are reported on stdout rather than raised.
    """
    try:
        if not os.path.exists(xml_file_path):
            print(f"Error: Input file not found at {xml_file_path}")
            return False

        # Parse the XML tree. A read failure is reported separately so it is
        # not mistaken for a failure to write the output file.
        try:
            root = ET.parse(xml_file_path).getroot()
        except OSError as e:
            print(f"Error reading input file '{xml_file_path}': {e}")
            return False

        # --- Extract all node IDs from the 'agent' nodeclass ---
        agent_nodes = root.findall("./MetaNetwork/nodes/nodeclass[@type='agent']/node")
        nodes = {node.get('id') for node in agent_nodes if node.get('id')}

        # --- Extract relevant links (marriage network PADGM, value == 1) ---
        # Membership in `nodes` also rejects missing (None/empty) endpoints,
        # because only non-empty IDs are ever added to the set.
        marriage_links = root.findall("./MetaNetwork/networks/network[@id='PADGM']/link")
        edges = [
            (link.get('source'), link.get('target'))
            for link in marriage_links
            if _is_unit_value(link.get('value'))
            and link.get('source') in nodes
            and link.get('target') in nodes
        ]

        # Explicit encoding keeps the output independent of the platform locale.
        with open(output_txt_path, 'w', encoding='utf-8') as f_out:
            # Sorting is not required, but it makes the output deterministic.
            f_out.writelines(f"{node_id}\t!NODE\n" for node_id in sorted(nodes))
            f_out.writelines(f"{source}\t{target}\n" for source, target in edges)

        print(f"Successfully parsed '{xml_file_path}' and created '{output_txt_path}'.")
        print(f"Found {len(nodes)} nodes and {len(edges)} marriage links.")
        return True

    except ET.ParseError as e:
        print(f"Error parsing XML file '{xml_file_path}': {e}")
        return False
    except OSError as e:
        print(f"Error writing to output file '{output_txt_path}': {e}")
        return False
    except Exception as e:
        # Callers rely on a boolean result, so unexpected failures are
        # reported and converted to False instead of propagating.
        print(f"An unexpected error occurred: {e}")
        return False

# Paths of the XML source and of the tab-separated file generated from it.
input_xml = './data/padgett.xml'
output_txt = './data/padgett_input.txt'

# Announce the output path only when the conversion reported success.
if parse_padgett_data(xml_file_path=input_xml, output_txt_path=output_txt):
    print(f"file saved: {output_txt}")

Successfully parsed './data/padgett.xml' and created './data/padgett_input.txt'.
Found 16 nodes and 40 marriage links.
file saved: ./data/padgett_input.txt
