In [3]:
import os
import xml.etree.ElementTree as ET
import pandas as pd
from difflib import SequenceMatcher
from tqdm import tqdm

In [4]:
def text_similarity(a, b):
    return SequenceMatcher(None, a.strip().lower(), b.strip().lower()).ratio()

def parse_annotated_file(annotated_path):
    """Parse annotated text file into segments"""
    segments = []
    with open(annotated_path, 'r', encoding='utf-8') as f:
        for line in f:
            parts = line.split('||')
            if len(parts) != 2:
                continue
            text = parts[0].strip()
            label = parts[1].strip()
            
            if label == 'NA':
                segments.append({'text': text, 'type': 'non-argumentative', 'id': None})
            elif label.startswith('P_'):
                segments.append({'text': text, 'type': 'premise', 'id': label[2:]})
            elif label.startswith('C_'):
                segments.append({'text': text, 'type': 'conclusion', 'id': label[2:]})
    return segments


In [5]:
def parse_xml_relations(xml_path):
    tree = ET.parse(xml_path)
    relations = []
    for elem in tree.iter():
        if elem.tag not in ['prem', 'conc']:
            continue
        source_id = elem.attrib['ID']
        for target in elem.attrib.get('SUP', '').split('|'):
            if target:
                relations.append(('support', source_id, target))
        for target in elem.attrib.get('ATT', '').split('|'):
            if target:
                relations.append(('attack', source_id, target))
    return relations

In [6]:
def create_id_mapping(segments):
    id_map = {}
    for seg in segments:
        if seg['id'] is None:
            continue
        existing = id_map.get(seg['id'])
        if existing is None or text_similarity(seg['text'], existing['text']) < 0.8:
            id_map[seg['id']] = seg
    return id_map

def process_document(base_name, annotated_folder, xml_folder):
    annotated_path = os.path.join(annotated_folder, f"{base_name}___annotated_judgment.txt")
    if not os.path.exists(annotated_path):
        return []
    segments = parse_annotated_file(annotated_path)
    id_map = create_id_mapping(segments)
    xml_path = os.path.join(xml_folder, f"{base_name}.xml")
    if not os.path.exists(xml_path):
        return []
    relations = parse_xml_relations(xml_path)
    pairs = []
    for i, src in enumerate(segments):
        for j, tgt in enumerate(segments):
            if i == j:
                continue
            relation = 'no-relation'
            if src['type'] != 'non-argumentative' and tgt['type'] != 'non-argumentative':
                src_id = src['id']
                tgt_id = tgt['id']
                for rel_type, s_id, t_id in relations:
                    if (text_similarity(src_id, s_id) > 0.8 and 
                        text_similarity(tgt_id, t_id) > 0.8):
                        relation = rel_type
                        break
            pairs.append({
                'source_text': src['text'],
                'target_text': tgt['text'],
                'relation': relation,
                'source_type': src['type'],
                'target_type': tgt['type'],
                'file_name': base_name
            })
    return pairs


In [7]:
def process_split(annotated_folder, xml_folder, output_file):
    annotated_files = [f for f in os.listdir(annotated_folder)
                       if f.endswith('___annotated_judgment.txt')]
    all_pairs = []
    for annotated_file in tqdm(annotated_files, desc=f"Processing {output_file}"):
        base_name = annotated_file.replace('___annotated_judgment.txt', '')
        pairs = process_document(base_name, annotated_folder, xml_folder)
        all_pairs.extend(pairs)
    df = pd.DataFrame(all_pairs)
    if not df.empty:
        df = df.drop_duplicates(subset=['source_text', 'target_text', 'file_name'])
        df.to_csv(output_file, index=False)
        print(f"Dataset created with {len(df)} pairs. Saved to {output_file}")
    else:
        print(f"No data found for {output_file}")


In [8]:
annotated_train_folder = 'annotated_train'
xml_train_folder = 'xml_train'
annotated_test_folder = 'annotated_test'
xml_test_folder = 'xml_test'

# Output files
train_output = 'relations_train.csv'
test_output = 'relations_test.csv'

# Process train and test splits
process_split(annotated_train_folder, xml_train_folder, train_output)
process_split(annotated_test_folder, xml_test_folder, test_output)

Processing relations_train.csv: 100%|██████████| 32/32 [00:43<00:00,  1.35s/it]


Dataset created with 1396641 pairs. Saved to relations_train.csv


Processing relations_test.csv: 100%|██████████| 8/8 [00:35<00:00,  4.41s/it]


Dataset created with 297765 pairs. Saved to relations_test.csv


In [None]:
# data_folder = './'
# output_file = 'final_relations_dataset.csv'

# # Get list of annotated files
# annotated_files = [f for f in os.listdir(os.path.join(data_folder, 'annotated')) 
#                     if f.endswith('___annotated_judgment.txt')]

# # Process all documents
# all_pairs = []
# for annotated_file in tqdm(annotated_files):
#     base_name = annotated_file.replace('___annotated_judgment.txt', '')
#     pairs = process_document(base_name, data_folder)
#     all_pairs.extend(pairs)

# # Create DataFrame and save
# df = pd.DataFrame(all_pairs)
# df = df.drop_duplicates(subset=['source_text', 'target_text', 'file_name'])
# df.to_csv(output_file, index=False)
# print(f"Dataset created with {len(df)} pairs. Saved to {output_file}")


100%|██████████| 39/39 [01:13<00:00,  1.88s/it]


Dataset created with 1601063 pairs. Saved to final_relations_dataset.csv


In [9]:
# %% [Cell 1] Add Enhanced Cleaning Functions
import pandas as pd
import re

def remove_leading_numbers(text):
    """Remove 1-3 digit numbers at start of text with trailing period/space"""
    return re.sub(r'^\d{1,3}\.*\s*', '', text).strip()

def filter_short_phrases(df):
    """Remove rows with fewer than 3 words in source_text or target_text"""
    return df[
        (df['source_text'].apply(lambda x: len(str(x).split()) >= 4)) &
        (df['target_text'].apply(lambda x: len(str(x).split()) >= 4))
    ]

# %% [Cell 2] Enhanced Processing Pipeline
def enhanced_processing(input_csv, output_csv):
    # Load cleaned dataset
    df = pd.read_csv(input_csv)
    
    print("Before enhanced cleaning:")
    print(f"Total rows: {len(df)}")
    print("Sample rows with numbers:")
    print(df[df['source_text'].str.match(r'^\d{1,3}\.')].head(2))
    print(df[df['target_text'].str.match(r'^\d{1,3}\.')].head(3))

        
    # Remove leading numbers
    df['source_text'] = df['source_text'].apply(remove_leading_numbers)
    df['target_text'] = df['target_text'].apply(remove_leading_numbers)
        
    # Filter short phrases
    initial_count = len(df)
    df = filter_short_phrases(df)
    
    print("\nAfter enhanced cleaning:")
    print(f"Removed {initial_count - len(df)} rows")
    print(f"Remaining rows: {len(df)}")
    print("Sample cleaned rows:")
    print(df.head(3))
    
    # Save final dataset
    df.to_csv(output_csv, index=False)
    return df

final_df = enhanced_processing('relations_train.csv', 'relations_train_clean.csv')


Before enhanced cleaning:
Total rows: 1396641
Sample rows with numbers:
    source_text                       target_text     relation  \
306          5.                   JUDGMENT OF 16.  no-relation   
307          5.  * Language of the case: English.  no-relation   

           source_type        target_type  \
306  non-argumentative  non-argumentative   
307  non-argumentative  non-argumentative   

                                             file_name  
306  R2002_associacao dos refinadores de acucar por...  
307  R2002_associacao dos refinadores de acucar por...  
         source_text target_text     relation        source_type  \
0    JUDGMENT OF 16.          5.  no-relation  non-argumentative   
299  JUDGMENT OF 16.          1.  no-relation  non-argumentative   
300  JUDGMENT OF 16.          2.  no-relation  non-argumentative   

           target_type                                          file_name  
0    non-argumentative  R2002_associacao dos refinadores de acucar por...

In [10]:
final_df_test = enhanced_processing('relations_test.csv', 'relations_test_clean.csv')


Before enhanced cleaning:
Total rows: 297765
Sample rows with numbers:
                                        source_text  \
2016  37.5 billion (equivalent to EUR 5.7 billion).   
2017  37.5 billion (equivalent to EUR 5.7 billion).   

                          target_text     relation        source_type  \
2016                             C.G.  no-relation  non-argumentative   
2017  * Language of the case: French.  no-relation  non-argumentative   

            target_type                           file_name  
2016  non-argumentative  R2016_Orange v European Commission  
2017  non-argumentative  R2016_Orange v European Commission  
    source_text                                    target_text     relation  \
13         C.G.  37.5 billion (equivalent to EUR 5.7 billion).  no-relation   
140        C.G.                                             1.  no-relation   
141        C.G.                                             2.  no-relation   

           source_type        target_typ

In [11]:
import os
import pandas as pd

# Verify dataset file exists
dataset_file = 'relations_test_clean.csv'
if not os.path.exists(dataset_file):
    print("Error: Dataset file not found. Existing files:")
    print('\n'.join(f for f in os.listdir('.') if f.endswith('.csv')))
else:
    df = pd.read_csv(dataset_file)
    
    # Overall statistics
    overall = df['relation'].value_counts().to_dict()
    
    # Per-file statistics
    per_file = df.groupby('file_name')['relation'].value_counts().unstack(fill_value=0)
    
    print(f"\nOverall relation counts ({len(df)} total pairs):")
    print(pd.DataFrame.from_dict(overall, orient='index', columns=['count']))
    
    print("\nPer-file relation counts:")
    print(per_file)



Overall relation counts (232080 total pairs):
              count
no-relation  231510
support         507
attack           63

Per-file relation counts:
relation                                            attack  no-relation  \
file_name                                                                 
R2016_Hellenic Republic v European Commission            0        13290   
R2016_Netherlands Maritime Technology Associati...       8        33237   
R2016_Orange v European Commission                       9        19376   
R2017_European Commission v Frucona Košice a             0        31096   
R2017_Viasat Broadcasting UK Ltd v European Com...       3        14231   
R2021_FVE Holýšov I and Others v Commission              7        23219   
R2021_Prosegur Compañía de Seguridad SA, establ...      24        55320   
R2021_World Duty Free v. Commission                     12        41741   

relation                                            support  
file_name                        

In [12]:
import os
import pandas as pd

# Verify dataset file exists
dataset_file = 'relations_train_clean.csv'
if not os.path.exists(dataset_file):
    print("Error: Dataset file not found. Existing files:")
    print('\n'.join(f for f in os.listdir('.') if f.endswith('.csv')))
else:
    df = pd.read_csv(dataset_file)
    
    # Overall statistics
    overall = df['relation'].value_counts().to_dict()
    
    # Per-file statistics
    per_file = df.groupby('file_name')['relation'].value_counts().unstack(fill_value=0)
    
    print(f"\nOverall relation counts ({len(df)} total pairs):")
    print(pd.DataFrame.from_dict(overall, orient='index', columns=['count']))
    
    print("\nPer-file relation counts:")
    print(per_file)



Overall relation counts (924188 total pairs):
              count
no-relation  922734
support        1397
attack           57

Per-file relation counts:
relation                                            attack  no-relation  \
file_name                                                                 
A2008_Commission of the European Communities v ...       2        23823   
A2009_3F v Commission of the European Communities        4        25699   
A2009_Commission of the European Communities v ...       0        16482   
A2010_NDSHT Nya Destination Stockholm Hotell & ...       0         9097   
A2011_European Commission (C-106_09 P) and King...       2        70994   
A2012_BNP Paribas and Banca Nazionale del Lavor...       3        49478   
A2013_European Commission v Ireland and Others           0         9266   
A2013_Frucona Košice a.s. v European Commission          1        30059   
A2016_European Commission v Aer Lingus Ltd and ...       2        40139   
A2016_European_Commis

In [13]:
import pandas as pd

# Load the cleaned dataset
input_file = "relations_train_clean.csv"
output_file = "balanced_relations_train_clean.csv.csv"

try:
    df = pd.read_csv(input_file)
    
    # Separate relations
    support = df[df['relation'] == 'support']
    attack = df[df['relation'] == 'attack']
    no_relation = df[df['relation'] == 'no-relation']

    # Sample 75 no-relation per file
    sampled_no_relation = no_relation.groupby('file_name', group_keys=False)\
                                    .apply(lambda x: x.sample(n=min(75, len(x)), 
                                                             random_state=42))

    # Combine all relations
    balanced_df = pd.concat([support, attack, sampled_no_relation], ignore_index=True)

    # Shuffle the dataset
    balanced_df = balanced_df.sample(frac=1, random_state=42).reset_index(drop=True)

    # Save to new file
    balanced_df.to_csv(output_file, index=False)
    
    # Show statistics
    print("Final dataset composition:")
    print(balanced_df['relation'].value_counts())
    print(f"\nTotal samples: {len(balanced_df)}")
    print(f"Saved to: {output_file}")

except FileNotFoundError:
    print(f"Error: File '{input_file}' not found. Please verify:")
    print("1. The file exists in the current directory")
    print("2. The filename is spelled correctly")
    print("3. The file path is correct")


Final dataset composition:
relation
no-relation    2400
support        1397
attack           57
Name: count, dtype: int64

Total samples: 3854
Saved to: balanced_relations_train_clean.csv.csv


  .apply(lambda x: x.sample(n=min(75, len(x)),


In [14]:
import pandas as pd

# Load the cleaned dataset
input_file = "relations_test_clean.csv"
output_file = "balanced_relations_test_clean.csv"

try:
    df = pd.read_csv(input_file)
    
    # Separate relations
    support = df[df['relation'] == 'support']
    attack = df[df['relation'] == 'attack']
    no_relation = df[df['relation'] == 'no-relation']

    # Sample 75 no-relation per file
    sampled_no_relation = no_relation.groupby('file_name', group_keys=False)\
                                    .apply(lambda x: x.sample(n=min(75, len(x)), 
                                                             random_state=42))

    # Combine all relations
    balanced_df = pd.concat([support, attack, sampled_no_relation], ignore_index=True)

    # Shuffle the dataset
    balanced_df = balanced_df.sample(frac=1, random_state=42).reset_index(drop=True)

    # Save to new file
    balanced_df.to_csv(output_file, index=False)
    
    # Show statistics
    print("Final dataset composition:")
    print(balanced_df['relation'].value_counts())
    print(f"\nTotal samples: {len(balanced_df)}")
    print(f"Saved to: {output_file}")

except FileNotFoundError:
    print(f"Error: File '{input_file}' not found. Please verify:")
    print("1. The file exists in the current directory")
    print("2. The filename is spelled correctly")
    print("3. The file path is correct")


Final dataset composition:
relation
no-relation    600
support        507
attack          63
Name: count, dtype: int64

Total samples: 1170
Saved to: balanced_relations_test_clean.csv


  .apply(lambda x: x.sample(n=min(75, len(x)),


In [15]:
import pandas as pd

def analyze_relations_dataset(file_path):
    try:
        df = pd.read_csv(file_path)
        
        # Create type pair column
        df['type_pair'] = df['source_type'] + '-' + df['target_type']
        
        # 1. Overall relation counts
        relation_counts = df['relation'].value_counts().to_dict()
        
        # 2. Type pair distribution
        type_pair_counts = df['type_pair'].value_counts().to_dict()
        
        # 3. Relation breakdown per type pair
        pair_relations = df.groupby(['type_pair', 'relation']).size().unstack(fill_value=0)
        
        # 4. Detailed metrics
        metrics = {
            'premise-premise': {
                'support': pair_relations.loc['premise-premise', 'support'] if 'premise-premise' in pair_relations.index else 0,
                'attack': pair_relations.loc['premise-premise', 'attack'] if 'premise-premise' in pair_relations.index else 0,
                'total': type_pair_counts.get('premise-premise', 0)
            },
            'premise-conclusion': {
                'support': pair_relations.loc['premise-conclusion', 'support'] if 'premise-conclusion' in pair_relations.index else 0,
                'attack': pair_relations.loc['premise-conclusion', 'attack'] if 'premise-conclusion' in pair_relations.index else 0,
                'total': type_pair_counts.get('premise-conclusion', 0)
            },
            'conclusion-premise': {
                'support': pair_relations.loc['conclusion-premise', 'support'] if 'conclusion-premise' in pair_relations.index else 0,
                'attack': pair_relations.loc['conclusion-premise', 'attack'] if 'conclusion-premise' in pair_relations.index else 0,
                'total': type_pair_counts.get('conclusion-premise', 0)
            },
            'conclusion-conclusion': {
                'support': pair_relations.loc['conclusion-conclusion', 'support'] if 'conclusion-conclusion' in pair_relations.index else 0,
                'attack': pair_relations.loc['conclusion-conclusion', 'attack'] if 'conclusion-conclusion' in pair_relations.index else 0,
                'total': type_pair_counts.get('conclusion-conclusion', 0)
            },
            'non-argumentative-premise': {
                'total': type_pair_counts.get('non-argumentative-premise', 0)
            },
            'non-argumentative-conclusion': {
                'total': type_pair_counts.get('non-argumentative-conclusion', 0)
            },
            'premise-non-argumentative': {
                'total': type_pair_counts.get('premise-non-argumentative', 0)
            },
            'conclusion-non-argumentative': {
                'total': type_pair_counts.get('conclusion-non-argumentative', 0)
            },
            'non-argumentative-non-argumentative': {
                'total': type_pair_counts.get('non-argumentative-non-argumentative', 0)
            }
        }
        
        # Calculate percentages
        total_pairs = len(df)
        for pair in metrics:
            if 'total' in metrics[pair]:
                metrics[pair]['percentage'] = (metrics[pair]['total'] / total_pairs) * 100
        
        return {
            'total_pairs': total_pairs,
            'relation_distribution': relation_counts,
            'type_pair_metrics': metrics,
            'detailed_breakdown': pair_relations.to_dict()
        }
    
    except FileNotFoundError:
        print(f"Error: File '{file_path}' not found.")
        print("Please verify:")
        print("1. The file exists in the current directory")
        print("2. You're using the correct filename (case-sensitive)")
        print("3. The file path is correct")
        return None

# Run analysis
results = analyze_relations_dataset('balanced_relations_test_clean.csv')


In [19]:
results_train = analyze_relations_dataset('balanced_relations_train_clean.csv')


In [17]:
if results:
    print("\nTotal Pairs:", results['total_pairs'])
    print("\nRelation Distribution:")
    print(pd.DataFrame.from_dict(results['relation_distribution'], orient='index', columns=['Count']))
    
    print("\nType Pair Metrics:")
    print(pd.DataFrame(results['type_pair_metrics']).T)



Total Pairs: 1170

Relation Distribution:
             Count
no-relation    600
support        507
attack          63

Type Pair Metrics:
                                     support  attack  total  percentage
premise-premise                        392.0    54.0  563.0   48.119658
premise-conclusion                       0.0     9.0   18.0    1.538462
conclusion-premise                     115.0     0.0  123.0   10.512821
conclusion-conclusion                    0.0     0.0    0.0    0.000000
non-argumentative-premise                NaN     NaN  120.0   10.256410
non-argumentative-conclusion             NaN     NaN   15.0    1.282051
premise-non-argumentative                NaN     NaN  133.0   11.367521
conclusion-non-argumentative             NaN     NaN    9.0    0.769231
non-argumentative-non-argumentative      NaN     NaN  189.0   16.153846


In [20]:
if results_train:
    print("\nTotal Pairs:", results_train['total_pairs'])
    print("\nRelation Distribution:")
    print(pd.DataFrame.from_dict(results_train['relation_distribution'], orient='index', columns=['Count']))
    
    print("\nType Pair Metrics:")
    print(pd.DataFrame(results_train['type_pair_metrics']).T)



Total Pairs: 3854

Relation Distribution:
             Count
no-relation   2400
support       1397
attack          57

Type Pair Metrics:
                                     support  attack   total  percentage
premise-premise                       1176.0    52.0  1516.0   39.335755
premise-conclusion                       0.0     5.0    23.0    0.596783
conclusion-premise                     221.0     0.0   237.0    6.149455
conclusion-conclusion                    0.0     0.0     1.0    0.025947
non-argumentative-premise                NaN     NaN   491.0   12.740010
non-argumentative-conclusion             NaN     NaN    28.0    0.726518
premise-non-argumentative                NaN     NaN   486.0   12.610275
conclusion-non-argumentative             NaN     NaN    29.0    0.752465
non-argumentative-non-argumentative      NaN     NaN  1043.0   27.062792


In [1]:
import pandas as pd

def remove_conclusion_premise_rows(input_file, output_file):
    try:
        # Read the input CSV file
        df = pd.read_csv(input_file)
        
        # Create the type pair column
        df['type_pair'] = df['source_type'] + '-' + df['target_type']
        
        # Filter out rows where the type_pair is 'conclusion-premise'
        filtered_df = df[df['type_pair'] != 'conclusion-premise']
        
        # Save the filtered data to a new file
        filtered_df.to_csv(output_file, index=False)
        print(f"File saved as: {output_file}")
        
    except FileNotFoundError:
        print(f"Error: File '{input_file}' not found.")
        print("Please verify:")
        print("1. The file exists in the current directory")
        print("2. You're using the correct filename (case-sensitive)")
        print("3. The file path is correct")
        
# Apply the function to train and test files
remove_conclusion_premise_rows('balanced_relations_train_clean.csv', 'filtered_relations_train_clean_removed_conclusion_source.csv')
remove_conclusion_premise_rows('balanced_relations_test_clean.csv', 'filtered_relations_test_clean_removed_conclusion_source.csv')


File saved as: filtered_relations_train_clean_removed_conclusion_source.csv
File saved as: filtered_relations_test_clean_removed_conclusion_source.csv
