In [5]:
import os
import glob
import xml.etree.ElementTree as ET
import csv
import logging

# Configure the root logger once for this notebook: INFO level and above,
# each record prefixed with a timestamp and its level name, emitted through
# a single StreamHandler.
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s %(levelname)s:%(message)s',
                    handlers=[logging.StreamHandler()])

def parse_xml_file(file_path):
    """
    Parse one XML file to extract argument units and relations.

    Argument units are the <prem> and <conc> elements that carry a non-empty
    ID attribute. A unit's SUP / ATT attribute lists, separated by '|', the
    IDs of the units that support / attack it, so the element holding the
    attribute is the *target* and every listed ID is a *source*.

    Args:
        file_path: path of the XML file to read.

    Returns:
        args_dict: dict mapping arg_id -> {type, text}
        relations: list of dicts with keys:
          source_ID, target_ID, relation, source_type, target_type,
          source_text, target_text
        Both are empty when the file cannot be read or parsed; the error is
        logged instead of being raised.
    """
    args_dict = {}
    relations = []
    try:
        root = ET.parse(file_path).getroot()
    except ET.ParseError as e:
        logging.error(f"Failed to parse XML {file_path}: {e}")
        return args_dict, relations
    except Exception as e:
        # Best effort by design: one unreadable file must not abort a batch.
        logging.error(f"Error reading file {file_path}: {e}")
        return args_dict, relations

    # Pass 1: register every premise/conclusion so that a relation may refer
    # to a unit that appears later in the document.
    units = []
    for elem in root.iter():
        if elem.tag not in ('prem', 'conc'):
            continue
        # Strip the ID: the IDs listed in SUP/ATT are stripped as well, so a
        # padded ID would otherwise never match its references.
        arg_id = (elem.attrib.get('ID') or '').strip()
        if not arg_id:
            logging.warning(f"Missing ID for {elem.tag} in {file_path}")
            continue
        # Join the text of the element and its children, then collapse every
        # whitespace run (newlines, tabs, repeated spaces, non-breaking
        # spaces) into a single space.
        text = ' '.join(''.join(elem.itertext()).split())
        args_dict[arg_id] = {'type': elem.tag, 'text': text}
        units.append((arg_id, elem))

    # Pass 2: turn the SUP (support) and ATT (attack) attributes into edges,
    # in document order and with SUP handled before ATT for each unit.
    for target_id, elem in units:
        target = args_dict[target_id]
        for attr_name, relation in (('SUP', 'support'), ('ATT', 'attack')):
            raw_ids = elem.attrib.get(attr_name)
            if not raw_ids:
                continue
            for src_id in (part.strip() for part in raw_ids.split('|')):
                if not src_id:
                    continue
                source = args_dict.get(src_id)
                if source is None:
                    logging.warning(f"{attr_name} source {src_id} not found in {file_path} for target {target_id}")
                    continue
                relations.append({
                    'source_ID': src_id,
                    'target_ID': target_id,
                    'relation': relation,
                    'source_type': source['type'],
                    'target_type': target['type'],
                    'source_text': source['text'],
                    'target_text': target['text'],
                })

    return args_dict, relations


def extract_all_relations(xml_folder, output_csv):
    """
    Extract the support/attack relations of every ``*.xml`` file directly
    inside ``xml_folder`` and write them to a single CSV file.

    Each CSV row is one relation returned by ``parse_xml_file`` plus two
    derived columns: ``file_name`` (base name of the XML file) and
    ``type_pair`` ("<source_type>-<target_type>").

    Args:
        xml_folder: folder searched (non-recursively) for ``*.xml`` files.
        output_csv: path of the CSV file to create or overwrite.

    Returns:
        None. Problems are logged rather than raised: nothing is written
        when no XML file is found, and a failure while writing the CSV is
        reported through the logger.
    """
    # glob gives no ordering guarantee; sort so the CSV row order is the same
    # on every machine and every run.
    xml_files = sorted(glob.glob(os.path.join(xml_folder, '*.xml')))
    if not xml_files:
        logging.error(f"No XML files found in folder: {xml_folder}")
        return

    logging.info(f"Found {len(xml_files)} XML files to process.")

    all_relations = []
    for file_path in xml_files:
        logging.info(f"Processing {file_path} ...")
        _, relations = parse_xml_file(file_path)
        filename = os.path.basename(file_path)
        for rel in relations:
            rel['file_name'] = filename
            rel['type_pair'] = f"{rel['source_type']}-{rel['target_type']}"
        all_relations.extend(relations)

    # Column order of the output file.
    csv_columns = ['source_ID', 'source_text', 'target_ID', 'target_text',
                   'relation', 'source_type', 'target_type', 'file_name', 'type_pair']

    try:
        # newline='' lets the csv module control line endings itself.
        with open(output_csv, 'w', encoding='utf-8', newline='') as csvfile:
            writer = csv.DictWriter(csvfile, fieldnames=csv_columns)
            writer.writeheader()
            writer.writerows({key: rel.get(key, '') for key in csv_columns}
                             for rel in all_relations)
        logging.info(f"CSV successfully written to {output_csv}. Total rows: {len(all_relations)}")
    except Exception as e:
        # Best effort by design: report the failure instead of raising.
        logging.error(f"Error writing CSV file {output_csv}: {e}")


# Entry point of this cell: extract every support/attack relation from the
# XML folder into one CSV file.
# NOTE(review): the captured log output below this cell shows paths under
# 'all_xml/', so it appears to come from a run with a different folder name
# than the one set here — confirm before relying on that output.
if __name__ == '__main__':
    folder_path = 'all_xml_cleaned'   # folder containing XML files
    output_file = 'PC_SA.csv'         # CSV that receives the extracted relations
    extract_all_relations(folder_path, output_file)


2025-09-21 18:28:17,521 INFO:Found 40 XML files to process.
2025-09-21 18:28:17,521 INFO:Processing all_xml/R2011_France Télécom SA v European Commission.xml ...
2025-09-21 18:28:17,522 INFO:Processing all_xml/A2018_Commission v Spain.xml ...
2025-09-21 18:28:17,523 INFO:Processing all_xml/R2017_European Commission v Frucona Košice a.xml ...
2025-09-21 18:28:17,524 INFO:Processing all_xml/A2018_Dirk Andres v European Commission.xml ...
2025-09-21 18:28:17,525 INFO:Processing all_xml/A2017_European Commission v Italian Republic_DT.xml ...
2025-09-21 18:28:17,525 INFO:Processing all_xml/R2016_DTS Distribuidora de Televisión Digital.xml ...
2025-09-21 18:28:17,526 INFO:Processing all_xml/A2018_Scuola Elementare Maria Montessori Srl v European Commission.xml ...
2025-09-21 18:28:17,527 INFO:Processing all_xml/R2021_FVE Holýšov I and Others v Commission.xml ...
2025-09-21 18:28:17,528 INFO:Processing all_xml/R2016_European Commission v Hansestadt Lübeck.xml ...
2025-09-21 18:28:17,529 INFO:

# Add no-relation pairs between arguments of the same file: 125 no-relation edges per file, i.e. 5000 no-relation edges in total for the 40 files. Stand-alone nodes are prioritized: their no-relation edges are selected first. No-relation self-edges (an argument paired with itself) are discarded.

In [None]:
import csv
import random
from collections import defaultdict
from pathlib import Path
import xml.etree.ElementTree as ET

def load_arguments(xml_folder):
    """
    Load all argument texts and types per file per ID from XMLs.

    Only <prem> and <conc> elements with a non-empty ID attribute are kept.
    Files that fail to parse are reported on stdout and skipped.

    Args:
        xml_folder: folder searched (non-recursively) for ``*.xml`` files.

    Returns:
        dict file_name -> dict id -> {'text': text, 'type': prem/conc}
        (a defaultdict; files are inserted in sorted file-name order and a
        file without any argument gets no entry)
    """
    args_data = defaultdict(dict)

    # Path.glob gives no ordering guarantee. Callers consume a seeded random
    # generator while iterating this dict, so a stable file order is needed
    # for their sampling to be reproducible.
    for xml_file in sorted(Path(xml_folder).glob("*.xml")):
        try:
            root = ET.parse(xml_file).getroot()
        except ET.ParseError as e:
            print(f"Parse error in {xml_file.name}: {e}")
            continue

        for elem in root.iter():
            if elem.tag not in ('prem', 'conc'):
                continue
            arg_id = elem.attrib.get('ID', '').strip()
            if not arg_id:
                continue
            # Collapse every whitespace run (newlines, indentation, repeated
            # spaces) into one space so the text is a clean single line.
            text = ' '.join(''.join(elem.itertext()).split())
            args_data[xml_file.name][arg_id] = {'text': text, 'type': elem.tag}

    return args_data

def load_existing_relations(csv_file):
    """
    Load existing relations from CSV.

    The CSV must have the columns file_name, source_ID and target_ID; a
    missing column raises KeyError.

    Returns:
        rows: list with one dict per CSV row, in file order
        relations_set: set of (file_name, source_ID, target_ID) tuples, used
          by callers to avoid generating an edge that already exists
    """
    rows = []
    relations_set = set()

    # newline='' is what the csv module expects: without it, line breaks
    # embedded in quoted fields are rewritten while reading.
    with open(csv_file, 'r', encoding='utf-8', newline='') as f:
        for row in csv.DictReader(f):
            relations_set.add((row['file_name'], row['source_ID'], row['target_ID']))
            rows.append(row)
    return rows, relations_set

def generate_no_relation_edges_with_text(
        xml_folder,
        existing_csv,
        standalone_premises,
        standalone_conclusions,
        edges_per_file=125,
        output_csv='3 PC_SANR_including_standalone_PC_no_relation.csv'):
    """
    Add sampled 'no-relation' edges to an existing relations CSV.

    For every XML file, the candidates are all ordered pairs of distinct
    arguments that are not already a (source, target) edge in
    ``existing_csv``. Candidates touching a stand-alone node are taken
    first; the remainder is drawn at random until ``edges_per_file`` edges
    are selected (fewer when the file does not have enough candidates).
    The existing rows followed by the new rows are written to ``output_csv``.

    Args:
        xml_folder: folder with the XML files (read via ``load_arguments``).
        existing_csv: CSV with the existing relations
            (read via ``load_existing_relations``).
        standalone_premises: iterable of (argument ID, XML file name) pairs.
        standalone_conclusions: iterable of (argument ID, XML file name) pairs.
        edges_per_file: maximum number of no-relation edges per file.
        output_csv: path of the combined CSV to create or overwrite.

    Returns:
        None. The result is written to ``output_csv`` and a summary is printed.
    """
    from itertools import chain, permutations

    args_data = load_arguments(xml_folder)
    existing_rows, existing_relations_set = load_existing_relations(existing_csv)

    # Group the existing edges per file once instead of rescanning the whole
    # relation set for every file.
    edges_by_file = defaultdict(set)
    for fname, src, tgt in existing_relations_set:
        edges_by_file[fname].add((src, tgt))

    # Premises and conclusions are treated alike when prioritising.
    standalone_by_file = defaultdict(set)
    for node_id, fname in chain(standalone_premises, standalone_conclusions):
        standalone_by_file[fname].add(node_id)

    no_relation_rows = []

    random.seed(42)

    for file_name, args_dict in args_data.items():
        # Stand-alone IDs that really exist in this file.
        standalone_ids = {node_id for node_id in standalone_by_file.get(file_name, ())
                          if node_id in args_dict}

        # All ordered pairs of distinct arguments (no self loops), minus the
        # pairs that already carry a relation.
        candidates = set(permutations(args_dict, 2)) - edges_by_file.get(file_name, set())

        # Set iteration order of string tuples changes between interpreter
        # runs (hash randomisation), so sort before any order-dependent step;
        # otherwise the fixed seed would not give a reproducible sample.
        priority_edges = sorted(
            edge for edge in candidates
            if edge[0] in standalone_ids or edge[1] in standalone_ids)

        if len(priority_edges) < edges_per_file:
            # Keep every priority edge and fill up with random other edges.
            other_edges = sorted(candidates.difference(priority_edges))
            random.shuffle(other_edges)
            remaining_needed = edges_per_file - len(priority_edges)
            no_rel_edges_file = priority_edges + other_edges[:remaining_needed]
        else:
            # More priority edges than needed: sample among them.
            random.shuffle(priority_edges)
            no_rel_edges_file = priority_edges[:edges_per_file]

        # Convert no-relation edges into CSV rows
        for src_id, tgt_id in no_rel_edges_file:
            src_data = args_dict[src_id]
            tgt_data = args_dict[tgt_id]
            no_relation_rows.append({
                'source_ID': src_id,
                'source_text': src_data['text'],
                'target_ID': tgt_id,
                'target_text': tgt_data['text'],
                'relation': 'no-relation',
                'source_type': src_data['type'],
                'target_type': tgt_data['type'],
                'file_name': file_name,
                'type_pair': f"{src_data['type']}-{tgt_data['type']}",
            })

    # Columns order
    csv_columns = ['source_ID', 'source_text', 'target_ID', 'target_text',
                   'relation', 'source_type', 'target_type', 'file_name', 'type_pair']

    # Existing rows first; fill empty source/target texts from the XML data
    # and restrict every row to the output columns.
    combined_rows = []
    for row in existing_rows:
        args_in_file = args_data.get(row.get('file_name', ''), {})
        if not row.get('source_text'):
            row['source_text'] = args_in_file.get(row.get('source_ID', ''), {}).get('text', '')
        if not row.get('target_text'):
            row['target_text'] = args_in_file.get(row.get('target_ID', ''), {}).get('text', '')
        combined_rows.append({col: row.get(col, '') for col in csv_columns})

    combined_rows.extend(no_relation_rows)

    # Write combined CSV
    with open(output_csv, 'w', encoding='utf-8', newline='') as f_out:
        writer = csv.DictWriter(f_out, fieldnames=csv_columns)
        writer.writeheader()
        writer.writerows(combined_rows)

    print(f"Combined CSV with support, attack, and no-relation edges written to {output_csv}")
    print(f"Total rows: {len(combined_rows)}")

# Entry point of this cell: combine the existing relations CSV with sampled
# no-relation edges (at most 125 per XML file).
if __name__ == '__main__':
    xml_folder = 'all_xml_cleaned'
    existing_csv = '2 PC_SANR_including_standalone_PC.csv'

    # (argument ID, XML file name) pairs; edges touching these premises are
    # selected before any random sampling.
    # presumably these are premises without any support/attack edge — verify
    # against the annotation data.
    standalone_premises = [
        ('E3bis', 'R2016_DTS Distribuidora de Televisión Digital.xml'),
        ('B42', 'R2021_Prosegur Compañía de Seguridad SA, established in Madrid (Spain) v Commission.xml'),
        ('B7bis', 'R2002_associação dos refinadores de açúcar portugueses.xml'),
        ('E21', 'R2002_associação dos refinadores de açúcar portugueses.xml'),
        ('A23bis', 'A2013_European Commission v Ireland and Others.xml'),
        ('A25bis', 'A2013_European Commission v Ireland and Others.xml'),
        ('D28bis', 'A2016_European Commission v Aer Lingus Ltd and Ryanair Designated Activity Company.xml')
    ]

    # Same (argument ID, XML file name) format, for conclusions.
    standalone_conclusions = [
        ('F3', 'R2021_FVE Holýšov I and Others v Commission.xml')
    ]

    generate_no_relation_edges_with_text(
        xml_folder,
        existing_csv,
        standalone_premises,
        standalone_conclusions,
        edges_per_file=125
    )


Combined CSV with support, attack, and no-relation edges written to PC_SA_all_relations.csv
Total rows: 7425


In [12]:
import csv
from collections import defaultdict

def print_edge_types_first_dataset(csv_file):
    """
    Print, per file, how many edges of each relation exist per type pair.

    Reads the columns file_name, relation and type_pair from ``csv_file``.
    Rows whose type_pair contains anything other than 'prem' or 'conc' are
    ignored. Files are printed in sorted order; within a file only the
    relations support, attack and no-relation are listed (in that order),
    each with its type pairs sorted alphabetically.
    """
    argument_types = ('prem', 'conc')

    # file name -> relation -> type pair -> number of rows
    tally = {}
    with open(csv_file, 'r', encoding='utf-8') as handle:
        for record in csv.DictReader(handle):
            name = record['file_name']
            kind = record['relation']
            pair = record['type_pair']
            # Skip pairs involving any non-argument type.
            if any(part not in argument_types for part in pair.split('-')):
                continue
            per_pair = tally.setdefault(name, {}).setdefault(kind, {})
            per_pair[pair] = per_pair.get(pair, 0) + 1

    for name in sorted(tally):
        print(f"File: {name}")
        per_relation = tally[name]
        for kind in ('support', 'attack', 'no-relation'):
            if kind not in per_relation:
                continue
            print(f"  {kind.capitalize()} edges:")
            per_pair = per_relation[kind]
            for pair in sorted(per_pair):
                print(f"    {pair}: {per_pair[pair]}")
        # Blank line between files.
        print()

# Entry point of this cell: print the per-file edge statistics of the
# premise/conclusion dataset.
if __name__ == '__main__':
    csv_file_path = '3 PC_SANR_final.csv'  # CSV whose edges are counted
    print_edge_types_first_dataset(csv_file_path)


File: A2008_Commission of the European Communities v Salzgitter AG.xml
  Support edges:
    prem-conc: 10
    prem-prem: 38
  Attack edges:
    prem-prem: 2
  No-relation edges:
    conc-conc: 3
    conc-prem: 15
    prem-conc: 16
    prem-prem: 91

File: A2009_3F v Commission of the European Communities.xml
  Support edges:
    prem-conc: 15
    prem-prem: 44
  Attack edges:
    conc-prem: 2
    prem-prem: 2
  No-relation edges:
    conc-prem: 11
    prem-conc: 9
    prem-prem: 105

File: A2009_Commission of the European Communities v Koninklijke FrieslandCampina NV_M.xml
  Support edges:
    prem-conc: 4
    prem-prem: 44
  No-relation edges:
    conc-prem: 8
    prem-conc: 8
    prem-prem: 109

File: A2010_NDSHT Nya Destination Stockholm Hotell & Teaterpaket AB v European Commission.xml
  Support edges:
    prem-conc: 3
    prem-prem: 36
  Attack edges:
    prem-prem: 1
  No-relation edges:
    conc-prem: 5
    prem-conc: 2
    prem-prem: 118

File: A2011_European Commission (C-106_

# Adding the non-argumentative (NA) texts: they become extra nodes that can take part in no-relation edges

In [9]:
import csv
import random
from collections import defaultdict
from pathlib import Path

def load_existing_relations(csv_file):
    """
    Load existing support and attack relations from PC_SA.csv

    The CSV must have the columns file_name, source_ID, source_text,
    source_type, target_ID, target_text and target_type; a missing column
    raises KeyError.

    Returns:
        relations: list with one dict per CSV row, in file order
        relations_set: set of (file_name, source_ID, target_ID) tuples
        args_by_file: dict file_name -> id -> (text, type). Only arguments
          that occur in at least one row are known; when an ID occurs in
          several rows, the values of the last occurrence win.
    """
    relations = []
    relations_set = set()
    args_by_file = defaultdict(dict)

    # newline='' is what the csv module expects: without it, line breaks
    # embedded in quoted text fields are rewritten while reading.
    with open(csv_file, 'r', encoding='utf-8', newline='') as f:
        for row in csv.DictReader(f):
            file_name = row['file_name']
            src_id = row['source_ID']
            tgt_id = row['target_ID']
            relations_set.add((file_name, src_id, tgt_id))
            relations.append(row)

            # Remember text and type of both endpoints of the relation.
            known_args = args_by_file[file_name]
            known_args[src_id] = (row['source_text'], row['source_type'])
            known_args[tgt_id] = (row['target_text'], row['target_type'])

    return relations, relations_set, args_by_file

def load_non_argumentative_texts(pcna_folder):
    """
    Load non-argumentative texts from annotated csv files in pcna_folder.

    Only files named ``<case>___annotated_judgment.csv`` are read; each is
    attributed to the XML file ``<case>.xml``. A row is non-argumentative
    when its ``class`` column is the string '0'. Both the ``class`` and the
    ``text`` column are required; a missing column raises KeyError.

    Returns:
        dict file_name -> list of tuples (na_id, text)
          na_id is generated as 'NA_1', 'NA_2', ... unique per file.
    """
    suffix = '___annotated_judgment.csv'
    non_arg_texts = defaultdict(list)

    # Sorted so the result is built in the same order on every run.
    for pcna_csv_file in sorted(Path(pcna_folder).glob('*' + suffix)):
        file_name = pcna_csv_file.name.replace(suffix, '.xml')

        # newline='' is what the csv module expects: without it, line breaks
        # inside a quoted multi-line text are rewritten while reading.
        with open(pcna_csv_file, 'r', encoding='utf-8', newline='') as f:
            na_count = 0
            for row in csv.DictReader(f):
                if row['class'] != '0':  # keep non-argumentative rows only
                    continue
                na_count += 1
                non_arg_texts[file_name].append((f'NA_{na_count}', row['text'].strip()))

    return non_arg_texts

def generate_no_relation_edges_new_dataset(
        pc_sa_csv,
        pcna_folder,
        output_csv,
        total_no_relation_edges=5000,
        edges_per_file=125):
    """
    Build a dataset of support, attack and no-relation edges in which
    non-argumentative (NA) texts take part in the no-relation edges.

    For every file present in ``pc_sa_csv`` the nodes are the arguments seen
    in that CSV plus the file's NA texts. Every ordered pair of distinct
    nodes that is not already an existing (source, target) edge is a
    candidate; the candidates are shuffled with a fixed seed and the first
    ``edges_per_file`` are kept (fewer when there are not enough). The
    existing rows followed by the new rows are written to ``output_csv``.

    Args:
        pc_sa_csv: CSV with the existing support/attack relations.
        pcna_folder: folder with the ``*___annotated_judgment.csv`` files.
        output_csv: path of the CSV to create or overwrite.
        total_no_relation_edges: accepted for backward compatibility only;
            it is never read, so the total is bounded solely by
            ``edges_per_file`` times the number of files.
        edges_per_file: maximum number of no-relation edges per file.

    Returns:
        None. The result is written to ``output_csv`` and a summary is printed.
    """
    random.seed(42)

    # Load existing support and attack relations
    existing_rels, existing_rels_set, args_by_file = load_existing_relations(pc_sa_csv)

    # Load non-argumentative texts
    non_arg_texts = load_non_argumentative_texts(pcna_folder)

    columns = ['source_ID', 'source_text', 'target_ID', 'target_text',
               'relation', 'source_type', 'target_type', 'file_name', 'type_pair']

    # Existing relations come first; restrict them to the output columns so
    # an unexpected extra column in the input cannot make DictWriter raise.
    all_rows = [{col: rel.get(col, '') for col in columns} for rel in existing_rels]

    # Candidate pools are concatenated in this fixed order; the order matters
    # because it determines what the seeded shuffle produces.
    type_pair_order = [
        ('prem', 'prem'), ('prem', 'conc'), ('conc', 'prem'), ('conc', 'conc'),
        ('prem', 'na'), ('na', 'prem'), ('conc', 'na'), ('na', 'conc'),
        ('na', 'na'),
    ]

    for file_name, args in args_by_file.items():
        # NA texts of this file as (na_id, text), e.g. ('NA_1', '...').
        nas = non_arg_texts.get(file_name, [])

        # id -> (text, type) for every node; an argument wins over an NA text
        # should the two ever share an ID.
        nodes = {na_id: (text, 'na') for na_id, text in nas}
        nodes.update(args)

        ids_by_type = {
            'prem': [aid for aid, (_, typ) in args.items() if typ == 'prem'],
            'conc': [aid for aid, (_, typ) in args.items() if typ == 'conc'],
            'na': [na_id for na_id, _ in nas],
        }

        candidate_edges = []
        for src_type, tgt_type in type_pair_order:
            for s in ids_by_type[src_type]:
                for t in ids_by_type[tgt_type]:
                    # No self loops and no pair that already has a relation.
                    if s != t and (file_name, s, t) not in existing_rels_set:
                        candidate_edges.append((s, t))

        random.shuffle(candidate_edges)

        # Keep edges_per_file edges, or fewer if not enough candidates.
        for src, tgt in candidate_edges[:edges_per_file]:
            src_text, src_type = nodes[src]
            tgt_text, tgt_type = nodes[tgt]
            all_rows.append({
                'source_ID': src,
                'source_text': src_text,
                'target_ID': tgt,
                'target_text': tgt_text,
                'relation': 'no-relation',
                'source_type': src_type,
                'target_type': tgt_type,
                'file_name': file_name,
                'type_pair': f"{src_type}-{tgt_type}",
            })

    # Write all combined rows to output CSV
    with open(output_csv, 'w', encoding='utf-8', newline='') as f_out:
        writer = csv.DictWriter(f_out, fieldnames=columns)
        writer.writeheader()
        writer.writerows(all_rows)

    print(f"New dataset with support, attack, and no_relation (including NA) edges written to {output_csv}")
    print(f"Total number of rows: {len(all_rows)}")

# Entry point of this cell: build the dataset whose no-relation edges may
# also involve non-argumentative texts.
if __name__ == '__main__':
    pc_sa_csv = '2 PC_SA_including_standalone_PC.csv'   # existing support/attack relations
    pcna_folder = 'PCNA_csv'                            # folder with *___annotated_judgment.csv files
    output_csv = '4 PCNA_SANR_all_relations.csv'        # combined dataset written by the call below

    # NOTE(review): total_no_relation_edges is accepted by the function but
    # never read there; only edges_per_file limits the generated edges.
    generate_no_relation_edges_new_dataset(
        pc_sa_csv,
        pcna_folder,
        output_csv,
        total_no_relation_edges=5000,
        edges_per_file=125
    )


New dataset with support, attack, and no_relation (including NA) edges written to 4 PCNA_SANR_all_relations.csv
Total number of rows: 7425


In [13]:
import csv
from collections import defaultdict

def print_edge_types_new_dataset(csv_file):
    """
    Print, per file, how many edges of each relation exist per type pair.

    Reads the columns file_name, relation and type_pair from ``csv_file``.
    Files are printed in sorted order; within a file only the relations
    support, attack and no-relation are listed (in that order). Type pairs
    are sorted by their raw value and shown with descriptive names
    (prem -> premise, conc -> conclusion, na -> non-arg); unknown type names
    are printed unchanged.
    """
    readable = {'prem': 'premise', 'conc': 'conclusion', 'na': 'non-arg'}

    # (file name, relation, type pair) -> number of rows
    tally = defaultdict(int)
    with open(csv_file, 'r', encoding='utf-8') as handle:
        for record in csv.DictReader(handle):
            tally[record['file_name'], record['relation'], record['type_pair']] += 1

    # Regroup the flat tally: file name -> relation -> [(type pair, count)]
    grouped = {}
    for (name, kind, pair), count in tally.items():
        grouped.setdefault(name, {}).setdefault(kind, []).append((pair, count))

    for name in sorted(grouped):
        print(f"File: {name}")
        per_relation = grouped[name]
        for kind in ('support', 'attack', 'no-relation'):
            if kind not in per_relation:
                continue
            print(f"  {kind.capitalize()} edges:")
            for pair, count in sorted(per_relation[kind]):
                # A type pair has the form "<source type>-<target type>".
                left, right = pair.split('-')
                left_name = readable.get(left, left)
                right_name = readable.get(right, right)
                print(f"    {left_name} - {right_name}: {count}")
        # Blank line between files.
        print()

# Entry point of this cell: print the per-file edge statistics of the
# dataset that includes non-argumentative texts.
if __name__ == '__main__':
    csv_file_path = '4 PCNA_SANR_final.csv'  # CSV whose edges are counted
    print_edge_types_new_dataset(csv_file_path)


File: A2008_Commission of the European Communities v Salzgitter AG.xml
  Support edges:
    premise - conclusion: 10
    premise - premise: 38
  Attack edges:
    premise - premise: 2
  No-relation edges:
    conclusion - non-arg: 2
    conclusion - premise: 2
    non-arg - conclusion: 4
    non-arg - non-arg: 68
    non-arg - premise: 22
    premise - conclusion: 2
    premise - non-arg: 18
    premise - premise: 7

File: A2009_3F v Commission of the European Communities.xml
  Support edges:
    premise - conclusion: 15
    premise - premise: 44
  Attack edges:
    conclusion - premise: 2
    premise - premise: 2
  No-relation edges:
    conclusion - non-arg: 4
    conclusion - premise: 1
    non-arg - conclusion: 2
    non-arg - non-arg: 65
    non-arg - premise: 22
    premise - non-arg: 25
    premise - premise: 6

File: A2009_Commission of the European Communities v Koninklijke FrieslandCampina NV_M.xml
  Support edges:
    premise - conclusion: 4
    premise - premise: 44
  No-re