In [16]:
import os

def compare_folders(txt_folder, xml_folder):
    # Get list of filenames without extensions
    txt_files = {os.path.splitext(file)[0] for file in os.listdir(txt_folder) if file.endswith('.csv')}
    xml_files = {os.path.splitext(file)[0] for file in os.listdir(xml_folder) if file.endswith('.csv')}
    
    # Find matched and unmatched files
    matched_files = txt_files.intersection(xml_files)
    unmatched_files = xml_files.difference(txt_files)

    # Print the results
    print(f"Total matched documents: {len(matched_files)}")
    if matched_files:
        print("Matched documents:")
        for file in matched_files:
            print(file)
    
    print(f"\nTotal unmatched documents: {len(unmatched_files)}")
    if unmatched_files:
        print("Unmatched documents:")
        for file in unmatched_files:
            print(file)

# Example usage:
txt_folder = 'prem_vs_conc_csv_files'  # Replace with the path to your 'original_txt_files' folder
xml_folder = 'argumentative_nonargumentative_FinalCSVs'          # Replace with the path to your 'xml_files' folder

compare_folders(txt_folder, xml_folder)


Total matched documents: 38
Matched documents:
R2016_Netherlands Maritime Technology Association formerly Scheepsbouw Nederland v European Commission
A2011_European Commission (C-106_09 P) and Kingdom of Spain (C-107_09 P) v Government of Gibraltar and United Kingdom of Great Britain and Northern Ireland
R2021_FVE Holýšov I and Others v Commission
R2012_European Commission v Électricité de France (EDF)
R2004_Italian Republic v Commission of the European Communities
R2004_Daewoo Electronics Manufacturing España SA and Territorio Histórico de Álava - Diputación Foral de Álava v Commission of the European Communities
A2017_European Commission v Italian Republic_DT
R2016_Orange v European Commission
R2016_Hellenic Republic v European Commission
R2016_European Commission v Hansestadt Lübeck
A2013_European Commission v Ireland and Others
R2000_French Republic v Ladbroke Racing Ltd and Commission of the European Communitie
R2011_European Commission v Kronoply GmbH & Co
R2015_European Commissi

In [19]:
import os

def compare_annotated_csvs_with_xml(csv_folder, xml_folder):
    # Extract base names (without .xml) from xml_files
    xml_files = {os.path.splitext(file)[0] for file in os.listdir(xml_folder) if file.endswith('.xml')}

    # Extract base names from CSVs by stripping '___annotated_judgment.csv'
    csv_files = {
        file.replace('___annotated_judgment', '').replace('.txt', '')
        for file in os.listdir(csv_folder)
        if file.endswith('.txt') and '___annotated_judgment' in file
    }

    # Compare
    matched_files = xml_files.intersection(csv_files)
    unmatched_files = xml_files.difference(csv_files)

    # Output
    print(f"Total matched documents: {len(matched_files)}")
    if matched_files:
        print("Matched documents:")
        for file in sorted(matched_files):
            print(file)

    print("\n" + "-"*40 + "\n")

    print(f"Total unmatched documents: {len(unmatched_files)}")
    if unmatched_files:
        print("Unmatched documents:")
        for file in sorted(unmatched_files):
            print(file)

# Example usage:
csv_folder = 'annotated'  # Replace with actual path
xml_folder = 'xml_files'                                 # Replace with actual path

compare_annotated_csvs_with_xml(csv_folder, xml_folder)


Total matched documents: 39
Matched documents:
A2008_Commission of the European Communities v Salzgitter AG
A2009_3F v Commission of the European Communities
A2009_Commission of the European Communities v Koninklijke FrieslandCampina NV_M
A2010_NDSHT Nya Destination Stockholm Hotell & Teaterpaket AB v European Commission
A2011_European Commission (C-106_09 P) and Kingdom of Spain (C-107_09 P) v Government of Gibraltar and United Kingdom of Great Britain and Northern Ireland
A2012_BNP Paribas and Banca Nazionale del Lavoro SpA (BNL) v European Commission
A2013_European Commission v Ireland and Others
A2013_Frucona Košice a.s. v European Commission
A2016_European Commission v Aer Lingus Ltd and Ryanair Designated Activity Company
A2016_European_Commission_v_World_Duty_Free
A2017_Ellinikos Chrysos AE Metalleion kai Viomichanias Chrysou v European Commission
A2017_European Commission v Italian Republic_DT
A2017_European Commission v TV2_Danmark A_S
A2018_Commission v Spain
A2018_Dirk Andre

In [13]:
compare_annotated_csvs_with_xml('prem_vs_conc_csv_files', xml_folder)

Total matched documents: 40
Matched documents:
A2008_Commission of the European Communities v Salzgitter AG
A2009_3F v Commission of the European Communities
A2009_Commission of the European Communities v Koninklijke FrieslandCampina NV_M
A2010_NDSHT Nya Destination Stockholm Hotell & Teaterpaket AB v European Commission
A2011_European Commission (C-106_09 P) and Kingdom of Spain (C-107_09 P) v Government of Gibraltar and United Kingdom of Great Britain and Northern Ireland
A2012__BNP Paribas and Banca Nazionale del Lavoro SpA (BNL) v European Commission
A2013_European Commission v Ireland and Others
A2013_Frucona Košice a.s. v European Commission
A2016_European Commission v Aer Lingus Ltd and Ryanair Designated Activity Company
A2016_European_Commission_v_World_Duty_Free
A2017_Ellinikos Chrysos AE Metalleion kai Viomichanias Chrysou v European Commission
A2017_European Commission v Italian Republic_DT
A2017_European Commission v TV2_Danmark A_S
A2018_Commission v Spain
A2018_Dirk Andr

In [20]:
import os
import xml.etree.ElementTree as ET

def count_support_and_attack_relations(xml_folder_path):
    results = []

    for filename in os.listdir(xml_folder_path):
        if filename.endswith(".xml"):
            file_path = os.path.join(xml_folder_path, filename)
            try:
                tree = ET.parse(file_path)
                root = tree.getroot()

                support_count = 0
                attack_count = 0

                for elem in root.iter():
                    # Check for SUP attribute
                    if 'SUP' in elem.attrib:
                        supported = elem.attrib['SUP'].split('|')
                        support_count += len(supported)

                    # Check for ATT attribute
                    if 'ATT' in elem.attrib:
                        attacked = elem.attrib['ATT'].split('|')
                        attack_count += len(attacked)

                results.append({
                    'filename': filename,
                    'support_count': support_count,
                    'attack_count': attack_count
                })

            except ET.ParseError as e:
                print(f"Error parsing {filename}: {e}")

    # Print results
    for res in results:
        print(f"File: {res['filename']}")
        print(f"  Support Relations: {res['support_count']}")
        print(f"  Attack Relations:  {res['attack_count']}\n")

# Call the function with the path to your xml_files folder
count_support_and_attack_relations('xml_files')


File: R2011_France Télécom SA v European Commission.xml
  Support Relations: 77
  Attack Relations:  0

File: A2018_Commission v Spain.xml
  Support Relations: 77
  Attack Relations:  5

File: R2017_European Commission v Frucona Košice a.xml
  Support Relations: 56
  Attack Relations:  0

File: A2018_Dirk Andres v European Commission.xml
  Support Relations: 73
  Attack Relations:  5

File: A2017_European Commission v Italian Republic_DT.xml
  Support Relations: 59
  Attack Relations:  1

File: R2016_DTS Distribuidora de Televisión Digital.xml
  Support Relations: 84
  Attack Relations:  6

File: A2018_Scuola Elementare Maria Montessori Srl v European Commission.xml
  Support Relations: 107
  Attack Relations:  3

File: R2021_FVE Holýšov I and Others v Commission.xml
  Support Relations: 34
  Attack Relations:  8

File: R2016_European Commission v Hansestadt Lübeck.xml
  Support Relations: 66
  Attack Relations:  0

File: R2006_European Commission v Italian Republic.xml
  Support Relat

In [11]:
import pandas as pd
CSV_PATH = 'filtered_all.csv'

df = pd.read_csv(CSV_PATH)
file_names = df['file_name'].unique()
assert len(file_names) == 40, f"Expected 40 files, found {len(file_names)}"
