# file checking

In [7]:
import os

# Folders holding the XML files and the CSV files that should accompany them.
xml_folder = 'all_xml'
csv_folder = 'PCNA_csv'

# os.listdir returns entries in arbitrary order; sorting keeps the report
# below identical from one run to the next.
xml_files = sorted(f for f in os.listdir(xml_folder) if f.endswith('.xml'))
xml_basenames = [os.path.splitext(f)[0] for f in xml_files]

# Each XML file is expected to have a CSV named "<basename>___annotated_judgment.csv".
expected_csv_files = [name + '___annotated_judgment.csv' for name in xml_basenames]
actual_csv_files = sorted(f for f in os.listdir(csv_folder) if f.endswith('.csv'))

# Set lookups keep the two comparisons linear in the number of files.
expected_csv_set = set(expected_csv_files)
actual_csv_set = set(actual_csv_files)

missing_csv_files = [fname for fname in expected_csv_files if fname not in actual_csv_set]
extra_csv_files = [fname for fname in actual_csv_files if fname not in expected_csv_set]

print("Missing CSV files:")
for fname in missing_csv_files:
    print(fname)

print("Extra CSV files:")
for fname in extra_csv_files:
    print(fname)


Missing CSV files:
Extra CSV files:


# Check the number of premises, conclusions, and support/attack edges

In [22]:
import os
import xml.etree.ElementTree as ET
from pathlib import Path

def count_xml_relations_and_types(xml_folder):
    """Count relations and unique premise/conclusion IDs across a folder of XML files.

    Every ``*.xml`` file directly inside ``xml_folder`` is parsed. For each
    element, the ``SUP`` and ``ATT`` attributes are split on ``|`` and every
    non-blank reference counts as one support / attack relation. ``prem`` and
    ``conc`` elements contribute their stripped ``ID`` attribute to the
    premise / conclusion totals.

    Args:
        xml_folder: Path (str or path-like) of the folder to scan.

    Returns:
        dict with the keys ``support_relations``, ``attack_relations``,
        ``num_premises`` and ``num_conclusions``.

    Notes:
        * Files that fail to parse are reported on stdout and skipped.
        * Elements whose ``ID`` is missing or blank are not counted as a
          premise/conclusion (their relations are still counted).
        * IDs are pooled across all files, so an ID that occurs in several
          files is counted only once.
    """
    support_count = 0
    attack_count = 0
    premise_ids = set()
    conclusion_ids = set()
    # Route each element tag to the ID set it contributes to.
    ids_by_tag = {'prem': premise_ids, 'conc': conclusion_ids}

    for xml_file in Path(xml_folder).glob("*.xml"):
        try:
            root = ET.parse(xml_file).getroot()
        except ET.ParseError as e:
            print(f"XML parse error in {xml_file.name}: {e}")
            continue

        for elem in root.iter():
            id_set = ids_by_tag.get(elem.tag)
            if id_set is not None:
                elem_id = elem.attrib.get('ID', '').strip()
                # A blank ID does not identify anything; counting it would
                # inflate the total by one phantom entry.
                if elem_id:
                    id_set.add(elem_id)

            for rel_type in ('SUP', 'ATT'):
                if rel_type not in elem.attrib:
                    continue
                # References are '|'-separated; ignore empty fragments.
                refs = [x.strip() for x in elem.attrib[rel_type].split('|') if x.strip()]
                if rel_type == 'SUP':
                    support_count += len(refs)
                else:
                    attack_count += len(refs)

    return {
        'support_relations': support_count,
        'attack_relations': attack_count,
        'num_premises': len(premise_ids),
        'num_conclusions': len(conclusion_ids)
    }

# Example usage: tally relations and unique premise/conclusion IDs over the
# XML files in 'all_xml_cleaned' and print the resulting dict.
xml_stats = count_xml_relations_and_types('all_xml_cleaned')
print(xml_stats)


{'support_relations': 2272, 'attack_relations': 145, 'num_premises': 343, 'num_conclusions': 108}


In [20]:
import csv

def count_unique_prem_conc_and_relations(csv_file):
    """Count relation rows and unique premise/conclusion IDs in an edge CSV.

    The CSV must have a header row containing the columns ``source_type``,
    ``source_ID``, ``target_type``, ``target_ID`` and ``relation``.

    Args:
        csv_file: Path of the UTF-8 encoded CSV file to read.

    Returns:
        dict with the keys ``support_relations``, ``attack_relations``,
        ``no_relation``, ``unique_premises`` and ``unique_conclusions``.
        Rows whose ``relation`` is not ``support``, ``attack`` or
        ``no-relation`` are not counted in any relation total.

    Raises:
        KeyError: if one of the required columns is missing.
    """
    premises = set()
    conclusions = set()
    # Route each *_type value to the ID set it contributes to.
    ids_by_type = {'prem': premises, 'conc': conclusions}
    relation_counts = {'support': 0, 'attack': 0, 'no-relation': 0}

    # newline='' lets the csv module handle line endings itself, which it
    # needs in order to read quoted fields containing line breaks correctly.
    with open(csv_file, 'r', encoding='utf-8', newline='') as f:
        for row in csv.DictReader(f):
            # Source and target ends are handled identically.
            for side in ('source', 'target'):
                id_set = ids_by_type.get(row[f'{side}_type'])
                if id_set is not None:
                    id_set.add(row[f'{side}_ID'])

            rel = row['relation']
            if rel in relation_counts:
                relation_counts[rel] += 1

    return {
        'support_relations': relation_counts['support'],
        'attack_relations': relation_counts['attack'],
        'no_relation': relation_counts['no-relation'],
        'unique_premises': len(premises),
        'unique_conclusions': len(conclusions),
    }

# Usage
csv_file_path = 'PC_SA_including_standalone_PC.csv'
stats = count_unique_prem_conc_and_relations(csv_file_path)
print(stats)


{'support_relations': 2272, 'attack_relations': 145, 'no_relation': 8, 'unique_premises': 343, 'unique_conclusions': 108}


# Find missing premises and conclusions (standalone premises and conclusions that neither support/attack nor are supported/attacked by any other argument)

In [18]:
import os
import xml.etree.ElementTree as ET
from pathlib import Path
import csv

def read_xml_prem_conc_ids(xml_folder):
    """Collect premise and conclusion IDs from every ``*.xml`` file in a folder.

    Returns a pair of dicts ``(premise_id -> file name, conclusion_id -> file name)``.
    IDs are whitespace-stripped and blank IDs are ignored. A file that cannot
    be parsed is reported on stdout and skipped.
    """
    prem_id_to_file = {}
    conc_id_to_file = {}
    # Element tag -> mapping that records where the ID was seen.
    mapping_for_tag = {'prem': prem_id_to_file, 'conc': conc_id_to_file}

    for xml_path in Path(xml_folder).glob('*.xml'):
        try:
            document_root = ET.parse(xml_path).getroot()
        except ET.ParseError as err:
            print(f"XML parse error in {xml_path.name}: {err}")
            continue

        for node in document_root.iter():
            mapping = mapping_for_tag.get(node.tag)
            if mapping is None:
                continue
            identifier = node.attrib.get('ID', '').strip()
            if identifier:
                mapping[identifier] = xml_path.name

    return prem_id_to_file, conc_id_to_file

def read_csv_unique_ids(csv_file):
    """Return the sets of premise IDs and conclusion IDs referenced in a CSV.

    The CSV must have a header row containing the columns ``source_type``,
    ``source_ID``, ``target_type`` and ``target_ID``. An ID is collected from
    either end of a row when that end's type is ``prem`` or ``conc`` and the
    ID is non-empty.

    Args:
        csv_file: Path of the UTF-8 encoded CSV file to read.

    Returns:
        Tuple ``(premise_ids, conclusion_ids)`` of sets of ID strings.

    Raises:
        KeyError: if one of the required columns is missing.
    """
    csv_prem_ids = set()
    csv_conc_ids = set()
    # Route each *_type value to the ID set it contributes to.
    ids_by_type = {'prem': csv_prem_ids, 'conc': csv_conc_ids}

    # newline='' lets the csv module handle line endings itself, which it
    # needs in order to read quoted fields containing line breaks correctly.
    with open(csv_file, 'r', encoding='utf-8', newline='') as f:
        for row in csv.DictReader(f):
            # Source and target ends are handled identically.
            for side in ('source', 'target'):
                id_set = ids_by_type.get(row[f'{side}_type'])
                arg_id = row[f'{side}_ID']
                if id_set is not None and arg_id:
                    id_set.add(arg_id)

    return csv_prem_ids, csv_conc_ids

def find_missing_ids(xml_folder, csv_file):
    """Report premise/conclusion IDs present in the XML files but absent from the CSV.

    Prints one line per missing ID (with the XML file it was read from) and
    returns ``(missing_premises, missing_conclusions)``, each a dict mapping
    the missing ID to that file name.
    """
    prem_files_by_id, conc_files_by_id = read_xml_prem_conc_ids(xml_folder)
    known_prem_ids, known_conc_ids = read_csv_unique_ids(csv_file)

    # Keep only the XML IDs that the CSV never mentions.
    missing_prem = {
        pid: fname
        for pid, fname in prem_files_by_id.items()
        if pid not in known_prem_ids
    }
    missing_conc = {
        cid: fname
        for cid, fname in conc_files_by_id.items()
        if cid not in known_conc_ids
    }

    print(f"Missing Premises ({len(missing_prem)}):")
    for pid in missing_prem:
        print(f"  Premise ID {pid} from file {missing_prem[pid]}")

    print(f"\nMissing Conclusions ({len(missing_conc)}):")
    for cid in missing_conc:
        print(f"  Conclusion ID {cid} from file {missing_conc[cid]}")

    return missing_prem, missing_conc

if __name__ == '__main__':
    # Compare the IDs found in the XML files of 'all_xml_cleaned' against the
    # IDs referenced in the CSV below; find_missing_ids prints the report.
    xml_folder = 'all_xml_cleaned'
    csv_file = 'PC_SA_including_standalone_PC.csv' # change this filename to check a different CSV
    missing_premises, missing_conclusions = find_missing_ids(xml_folder, csv_file)


Missing Premises (0):

Missing Conclusions (0):


# No-relation edges added in the "including standalone" file

In [19]:
import pandas as pd

# Read both edge lists (file names suggest one with and one without standalone
# premises/conclusions).
df_including = pd.read_csv('PC_SA_including_standalone_PC.csv')
df_without = pd.read_csv('PC_SA_without_standalone_PC.csv')

# Left-join on every column the two frames share; the indicator column marks
# rows that found no match in df_without as "left_only".
merged = df_including.merge(df_without, how='left', indicator=True)
only_in_including = merged['_merge'] == 'left_only'
difference = merged.loc[only_in_including].drop(columns='_merge')

# Show the rows that exist only in the "including" file.
print(difference)


     source_ID                                        source_text target_ID  \
419      E3bis  it states at the outset that its appeal does n...        E1   
593         F3  Consequently, the sixth ground of appeal must ...        H4   
1616     B7bis  Since it is not notified before it is implemen...        B8   
1663       E21  Finally, it must be pointed out that the fact ...       E13   
2079       B42  First, it observes that the aim of that measur...       B40   
2340    A23bis  secondly, the fact that those exemptions were ...       A25   
2342    A25bis  The General Court concluded that, in the parti...        B3   
2416    D28bis   "that assessment is irrelevant to the recover...       C14   

                                            target_text     relation  \
419   It should be noted that the third ground of ap...  no-relation   
593   However, as the General Court found in paragra...  no-relation   
1616  However, the background to the present dispute...  no-relation   


# Attributes

In [8]:
import os
from pathlib import Path
import xml.etree.ElementTree as ET


def extract_unique_attributes(xml_folder):
    """Gather the attribute names used on ``prem`` and ``conc`` elements.

    Scans every ``*.xml`` file directly inside ``xml_folder`` and returns a
    pair of sets ``(premise attribute names, conclusion attribute names)``.
    A file that cannot be parsed is reported on stdout and skipped.
    """
    prem_attrs, conc_attrs = set(), set()

    for xml_path in Path(xml_folder).glob("*.xml"):
        try:
            document_root = ET.parse(xml_path).getroot()
        except ET.ParseError as err:
            print(f"XML parse error in {xml_path.name}: {err}")
            continue

        # iter(tag) walks the whole tree (root included) but yields only
        # elements with that tag.
        for node in document_root.iter('prem'):
            prem_attrs.update(node.attrib)
        for node in document_root.iter('conc'):
            conc_attrs.update(node.attrib)

    return prem_attrs, conc_attrs


if __name__ == '__main__':
    # Scan the XML folder once, then print each attribute set as a sorted,
    # indented list under its own heading.
    folder = 'all_xml'
    unique_premise_attributes, unique_conclusion_attributes = extract_unique_attributes(folder)

    report_sections = (
        ("Unique Premise Attributes:", unique_premise_attributes),
        ("\nUnique Conclusion Attributes:", unique_conclusion_attributes),
    )
    for heading, attribute_names in report_sections:
        print(heading)
        for attr in sorted(attribute_names):
            print(f"  {attr}")


Unique Premise Attributes:
  ATT
  ID
  INH
  REPH
  S
  SFF
  SUP
  T

Unique Conclusion Attributes:
  ID
  SFF
  SUP


# Conclusions are never attacked. A premise can attack another premise, and a conclusion can attack a premise, but a premise never attacks a conclusion, and a conclusion never attacks another conclusion.

### Now let's see how many no-relation pairs can be made

There are a total of 336 premises and 107 conclusions, i.e. 443 arguments in total, in PC_SA.csv, so there are 443 × 443 = 196249 possible (source, target) pairs. Note that this counts ordered pairs and includes each argument paired with itself.

So the number of no-relation pairs = total − support − attack = 196249 − 2269 − 145 = 193835.

we need to sample from these
