In [None]:
import os
import glob
import xml.etree.ElementTree as ET
import csv
import logging

# Root logger setup: INFO and above go to a console stream handler,
# each record rendered as "<timestamp> <LEVEL>:<message>".
logging.basicConfig(
    handlers=[logging.StreamHandler()],
    format="%(asctime)s %(levelname)s:%(message)s",
    level=logging.INFO,
)

# Tags that mark an argument unit: 'prem' (premise) and 'conc' (conclusion).
_ARGUMENT_TAGS = ('prem', 'conc')


def _normalized_text(elem):
    """Return all text under *elem* with every whitespace run collapsed to one space."""
    # str.split() with no argument splits on any run of whitespace (spaces,
    # tabs, newlines), so indented line breaks never leave double spaces.
    return ' '.join(''.join(elem.itertext()).split())


def _relations_from_attribute(elem, attr_name, relation, target_id, args_dict, file_path):
    """Build relation rows for the '|'-separated source IDs in attribute *attr_name*."""
    raw_ids = elem.attrib.get(attr_name)
    if not raw_ids:
        return []

    target = args_dict[target_id]
    rows = []
    for src_id in (part.strip() for part in raw_ids.split('|')):
        if not src_id:
            continue
        source = args_dict.get(src_id)
        if source is None:
            logging.warning("%s source %s not found in %s for target %s",
                            attr_name, src_id, file_path, target_id)
            continue
        rows.append({
            'source_ID': src_id,
            'target_ID': target_id,
            'relation': relation,
            'source_type': source['type'],
            'target_type': target['type'],
            'source_text': source['text'],
            'target_text': target['text'],
        })
    return rows


def parse_xml_file(file_path):
    """
    Parse one XML file to extract argument units and relations.

    Argument units are the 'prem' and 'conc' elements that carry an 'ID'
    attribute. Relations are read from the 'SUP' (support) and 'ATT' (attack)
    attributes of those elements; each attribute holds one or more source IDs
    separated by '|', and the element carrying the attribute is the target.

    Args:
        file_path: path of the XML file to read.

    Returns:
        args_dict: dict mapping arg_id -> {type, text}, where text has all
          whitespace runs collapsed to single spaces. If an ID occurs more
          than once, the last occurrence wins.
        relations: list of dicts with keys:
          source_ID, target_ID, relation, source_type, target_type,
          source_text, target_text

        If the file cannot be read or parsed, the error is logged and
        ({}, []) is returned; no exception is propagated.
    """
    args_dict = {}
    relations = []
    try:
        root = ET.parse(file_path).getroot()
    except ET.ParseError as e:
        logging.error("Failed to parse XML %s: %s", file_path, e)
        return args_dict, relations
    except Exception as e:
        # Deliberately broad: one unreadable file must not abort a batch run.
        logging.error("Error reading file %s: %s", file_path, e)
        return args_dict, relations

    # First pass: register every premise/conclusion so that relations may
    # refer to units that appear later in the document.
    units = []
    for elem in root.iter():
        if elem.tag not in _ARGUMENT_TAGS:
            continue
        arg_id = elem.attrib.get('ID')
        if not arg_id:
            logging.warning("Missing ID for %s in %s", elem.tag, file_path)
            continue
        args_dict[arg_id] = {'type': elem.tag, 'text': _normalized_text(elem)}
        units.append((arg_id, elem))

    # Second pass: per target element, emit its support rows, then its attack rows.
    for target_id, elem in units:
        relations.extend(_relations_from_attribute(
            elem, 'SUP', 'support', target_id, args_dict, file_path))
        relations.extend(_relations_from_attribute(
            elem, 'ATT', 'attack', target_id, args_dict, file_path))

    return args_dict, relations


def extract_all_relations(xml_folder, output_csv):
    """
    Collect the relations from every '*.xml' file in a folder into one CSV.

    Each file is parsed with parse_xml_file; every relation row is extended
    with 'file_name' (base name of the source file) and 'type_pair'
    ("<source_type>-<target_type>") before being written.

    Args:
        xml_folder: directory searched (non-recursively) for '*.xml' files.
        output_csv: path of the CSV file to create or overwrite (UTF-8).

    Returns:
        None. If no XML files are found, an error is logged and no CSV is
        written. Errors while writing the CSV are logged, not raised.
    """
    # Escape the folder so characters such as '[' or '*' in its name are
    # matched literally, and sort so the CSV row order is reproducible
    # (glob returns entries in arbitrary order).
    pattern = os.path.join(glob.escape(os.fspath(xml_folder)), '*.xml')
    xml_files = sorted(glob.glob(pattern))
    if not xml_files:
        logging.error(f"No XML files found in folder: {xml_folder}")
        return

    logging.info(f"Found {len(xml_files)} XML files to process.")

    all_relations = []
    for file_path in xml_files:
        logging.info(f"Processing {file_path} ...")
        _, relations = parse_xml_file(file_path)
        filename = os.path.basename(file_path)
        # Build new row dicts rather than mutating the parser's output.
        all_relations.extend(
            {
                **rel,
                'file_name': filename,
                'type_pair': f"{rel['source_type']}-{rel['target_type']}",
            }
            for rel in relations
        )

    # Write CSV
    csv_columns = ['source_ID', 'source_text', 'target_ID', 'target_text',
                   'relation', 'source_type', 'target_type', 'file_name', 'type_pair']

    try:
        with open(output_csv, 'w', encoding='utf-8', newline='') as csvfile:
            # restval fills any missing column with '', extrasaction drops
            # keys outside csv_columns.
            writer = csv.DictWriter(csvfile, fieldnames=csv_columns,
                                    restval='', extrasaction='ignore')
            writer.writeheader()
            writer.writerows(all_relations)
        logging.info(f"CSV successfully written to {output_csv}. Total rows: {len(all_relations)}")
    except Exception as e:
        # Deliberately broad: a failed write is reported, never raised to the caller.
        logging.error(f"Error writing CSV file {output_csv}: {e}")


if __name__ == "__main__":
    # Entry point: export the relations of every XML file in the input
    # folder to a single CSV file.
    output_file = "PC_SA.csv"
    folder_path = "all_xml_cleaned"  # folder containing XML files
    extract_all_relations(xml_folder=folder_path, output_csv=output_file)


2025-09-21 18:28:17,521 INFO:Found 40 XML files to process.
2025-09-21 18:28:17,521 INFO:Processing all_xml/R2011_France Télécom SA v European Commission.xml ...
2025-09-21 18:28:17,522 INFO:Processing all_xml/A2018_Commission v Spain.xml ...
2025-09-21 18:28:17,523 INFO:Processing all_xml/R2017_European Commission v Frucona Košice a.xml ...
2025-09-21 18:28:17,524 INFO:Processing all_xml/A2018_Dirk Andres v European Commission.xml ...
2025-09-21 18:28:17,525 INFO:Processing all_xml/A2017_European Commission v Italian Republic_DT.xml ...
2025-09-21 18:28:17,525 INFO:Processing all_xml/R2016_DTS Distribuidora de Televisión Digital.xml ...
2025-09-21 18:28:17,526 INFO:Processing all_xml/A2018_Scuola Elementare Maria Montessori Srl v European Commission.xml ...
2025-09-21 18:28:17,527 INFO:Processing all_xml/R2021_FVE Holýšov I and Others v Commission.xml ...
2025-09-21 18:28:17,528 INFO:Processing all_xml/R2016_European Commission v Hansestadt Lübeck.xml ...
2025-09-21 18:28:17,529 INFO: