In [4]:
import nltk
nltk.download(['punkt', 'wordnet'])

# %% [Cell 2] Import Libraries
import pandas as pd
import re
from nltk.tokenize import PunktSentenceTokenizer


[nltk_data] Downloading package punkt to /home/suyamoon/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /home/suyamoon/nltk_data...


In [5]:
# %% [Cell 3] Text Reconstruction with Cleaning
def reconstruct_and_clean(annotated_path):
    """Rebuild text with paragraph number removal and position tracking"""
    full_text = []
    fragments = []
    current_pos = 0
    para_pattern = re.compile(r'^\d+(?:\.|\s{2,})\s*')

    with open(annotated_path, 'r', encoding='utf') as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            
            # Split text and annotation
            text_part, _, tag_part = line.partition('||')
            text = para_pattern.sub('', text_part.strip())  # Remove paragraph numbers
            tag = tag_part.strip()
            
            if not text:
                continue

            # Track positions
            start = current_pos
            end = start + len(text)
            full_text.append(text)
            fragments.append({
                'text': text,
                'tag': tag,
                'start': start,
                'end': end
            })
            current_pos = end + 1  # Add space between fragments

    return ' '.join(full_text), fragments



In [6]:
full_text, fragments = reconstruct_and_clean('annotated/A2010_NDSHT Nya Destination Stockholm Hotell & Teaterpaket AB v European Commission___annotated_judgment.txt')
print("RECONSTRUCTED TEXT SAMPLE:")
print(full_text[:300] + "...\n")
print("CLEANED FRAGMENTS EXAMPLE:")
for frag in fragments[:5]:
    print(f"[{frag['start']:04}-{frag['end']:04}] {frag['text'][:50]}... | Tag: {frag['tag']}")


RECONSTRUCTED TEXT SAMPLE:
I  -  11914 JUDGMENT OF 18. 2010 — CASE C-322/09 P JUDGMENT OF THE COURT (Third Chamber) 18 November 2010 * In Case C-322/09 P, APPEAL under Article 56 of the Statute of the Court of Justice, brought on 8 August 2009, NDSHT Nya Destination Stockholm Hotell & Teaterpaket AB, established in Stockholm ...

CLEANED FRAGMENTS EXAMPLE:
[0000-0011] I  -  11914... | Tag: 
[0012-0027] JUDGMENT OF 18.... | Tag: NA
[0028-0050] 2010 — CASE C-322/09 P... | Tag: 
[0051-0088] JUDGMENT OF THE COURT (Third Chamber)... | Tag: 
[0089-0107] 18 November 2010 *... | Tag: 


In [7]:
def legal_sentence_tokenizer(text):
    """Custom tokenizer for legal text with position tracking"""
    tokenizer = PunktSentenceTokenizer()
    sentences = []
    char_offset = 0
    
    for sent in tokenizer.tokenize(text):
        # Find sentence boundaries in original text
        start = text.find(sent, char_offset)
        end = start + len(sent)
        
        # Legal text adjustments
        if sent.endswith(('v.', 'No.', 'Art.', 'para.', 'pp.')):
            next_char = text[end:end+1]
            if next_char.islower() or next_char.isnumeric():
                continue  # Merge abbreviation with next sentence
        
        sentences.append({
            'text': sent,
            'start': start,
            'end': end
        })
        char_offset = end
    
    return sentences


In [8]:
sentences = legal_sentence_tokenizer(full_text)
print("\nTOKENIZED SENTENCES:")
print(f"Total sentences: {len(sentences)}")
print("Sample sentences:")
for i, sent in enumerate(sentences[:50]):
    print(f"{i+1}. [{sent['start']:04}-{sent['end']:04}] {sent['text'][:80]}...")




TOKENIZED SENTENCES:
Total sentences: 149
Sample sentences:
1. [0000-0027] I  -  11914 JUDGMENT OF 18....
2. [0028-0596] 2010 — CASE C-322/09 P JUDGMENT OF THE COURT (Third Chamber) 18 November 2010 * ...
3. [0597-1421] I  -  11915 NDSHT v COMMISSION THE COURT (Third Chamber), composed of K. Lenaert...
4. [1422-1764] 2010 — CASE C-322/09 P NDSHT’s action for annulment of the decision contained in...
5. [1765-1964] Legal context 2 As is apparent from recital 2 in its preamble, Council Regulatio...
6. [1965-2368] 1), as amended by the Act concerning the conditions of accession of the Czech Re...
7. [2369-2537] 33) (‘Regulation No 659/1999’), codifies and reinforces the practice for examini...
8. [2538-3143] 3 Under Article  1(b)(i) of that regulation, ‘existing aid’ means ‘without preju...
9. [3144-3497] I  -  11917 NDSHT v COMMISSION 4 In accordance with Article 1(h) of that regulat...
10. [3498-3637] 5 Article 4(1) to (4) of Regulation No 659/1999, in Chapter II thereof, which is...
1

In [9]:
def map_annotations(sentences, fragments):
    dataset = []
    
    for sent in sentences:
        class_label = "NA"  # Default to non-argumentative
        sent_start = sent['start']
        sent_end = sent['end']
        
        for frag in fragments:
            # Skip empty tags and NA fragments
            if frag['tag'] in ('', 'NA'):
                continue
                
            # Extract argument type (P/C) from tag
            tag_type = frag['tag'].split('_')[0]
            if tag_type not in ('P', 'C'):
                continue

            # Calculate overlap
            overlap_start = max(sent_start, frag['start'])
            overlap_end = min(sent_end, frag['end'])
            
            if overlap_start < overlap_end:
                overlap_ratio = (overlap_end - overlap_start) / (frag['end'] - frag['start'])
                if overlap_ratio > 0.6:
                    class_label = tag_type  # Set to P/C
                    break  # First qualifying fragment determines class
        
        dataset.append({
            'text': sent['text'].strip(),
            'class': class_label
        })
    
    return pd.DataFrame(dataset)


In [10]:
df = map_annotations(sentences, fragments)


In [11]:
print("\nDATASET PREVIEW:")
print(f"Class Distribution:\n{df['class'].value_counts()}\n")
print("Sample Argumentative Sentences:")
print(df[df['class'] == 1].head(3))
print("\nSample Non-Argumentative Sentences:")
print(df[df['class'] == 0].head(3))




DATASET PREVIEW:
Class Distribution:
class
NA    108
P      39
C       2
Name: count, dtype: int64

Sample Argumentative Sentences:
Empty DataFrame
Columns: [text, class]
Index: []

Sample Non-Argumentative Sentences:
Empty DataFrame
Columns: [text, class]
Index: []


In [12]:
def final_clean(text):
    """Remove residual noise while preserving legal references"""
    # Remove standalone numbers and bullet points
    text = re.sub(r'^\d+\.?\s*', '', text)
    # Normalize quotation marks
    text = re.sub(r'[”“â€˜â€™]', "'", text)
    # Fix encoding artifacts
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# Apply final cleaning
df['cleaned_text'] = df['text'].apply(final_clean)
df = df[df['cleaned_text'].ne('')]

# Export to CSV
output_path = 'cleaned_legal_dataset.csv'
df[['text', 'class']].to_csv(output_path, index=False) 

print("\nFINAL DATASET SAMPLE:")
print(f"Class Distribution:\n{df['class'].value_counts()}\n")
print("Sample Premises:")
print(df[df['class'] == 'P'].head(3))
print("\nSample Conclusions:")
print(df[df['class'] == 'C'].head(3))



FINAL DATASET SAMPLE:
Class Distribution:
class
NA    108
P      39
C       2
Name: count, dtype: int64

Sample Premises:
                                                 text class  \
97  I  -  11929 NDSHT v COMMISSION Findings of the...     P   
98  41 In that regard, it must be observed that it...     P   
99  42 However, it is clear that, contrary to the ...     P   

                                         cleaned_text  
97  I - 11929 NDSHT v COMMISSION Findings of the C...  
98  In that regard, it must be observed that it fo...  
99  However, it is clear that, contrary to the Com...  

Sample Conclusions:
                                                  text class  \
102       It follows that that argument is admissible.     C   
137  63 In those circumstances, the judgment under ...     C   

                                          cleaned_text  
102       It follows that that argument is admissible.  
137  In those circumstances, the judgment under app...  


In [13]:
import os

def process_directory(input_dir, output_dir):
    os.makedirs(output_dir, exist_ok=True)
    
    for filename in os.listdir(input_dir):
        if filename.endswith('_annotated_judgment.txt'):
            input_path = os.path.join(input_dir, filename)
            full_text, fragments = reconstruct_and_clean(input_path)
            sentences = legal_sentence_tokenizer(full_text)
            df = map_annotations(sentences, fragments)
            output_path = os.path.join(output_dir, filename.replace('.txt', '.csv'))
            df.to_csv(output_path, index=False)

# Example usage:


In [14]:
process_directory('annotated', 'P_C_NA csv')


In [15]:
# %% [Cell 1] Add Enhanced Cleaning Functions
import pandas as pd
import re

def remove_leading_numbers(text):
    """Remove 1-3 digit numbers at start of text with trailing period/space"""
    return re.sub(r'^\d{1,3}\.*\s*', '', text).strip()

def filter_short_phrases(df):
    """Remove rows with fewer than 3 words"""
    return df[df['text'].apply(lambda x: len(str(x).split()) >= 3)]

# %% [Cell 2] Enhanced Processing Pipeline
def enhanced_processing(input_csv, output_csv):
    # Load cleaned dataset
    df = pd.read_csv(input_csv)
    
    print("Before enhanced cleaning:")
    print(f"Total rows: {len(df)}")
    
    # Remove leading numbers
    df['text'] = df['text'].apply(remove_leading_numbers)
    
    # Filter short phrases
    initial_count = len(df)
    df = filter_short_phrases(df)
    
    # Preserve NA labels explicitly
    df['class'] = df['class'].fillna('NA')  # New line to fix missing labels
    
    print("\nAfter enhanced cleaning:")
    print(f"Removed {initial_count - len(df)} rows")
    print(f"Remaining rows: {len(df)}")
    
    # Save final dataset
    df.to_csv(output_csv, index=False)
    return df



In [16]:
def process_directory(input_dir, output_dir):
    # Create output directory if not exists
    os.makedirs(output_dir, exist_ok=True)
    
    # Verify input directory exists
    if not os.path.exists(input_dir):
        raise FileNotFoundError(f"Input directory '{input_dir}' not found")
    
    # Get list of CSV files
    csv_files = [f for f in os.listdir(input_dir) if f.endswith('.csv')]
    
    if not csv_files:
        print(f"No CSV files found in {input_dir}")
        return
    
    # Process each file
    for csv_file in csv_files:
        input_path = os.path.join(input_dir, csv_file)
        output_path = os.path.join(output_dir, csv_file)
        
        print(f"Processing {csv_file}...")
        final_df = enhanced_processing(input_path, output_path)
        
        if final_df is not None:
            print(f"Processed {len(final_df)} rows -> {output_path}")



In [17]:
# Run the processing
process_directory(
    'P_C_NA csv',
    'P_C_NA csv final combined2'
)


Processing R2015_European Commission v MOL Magyar Olaj- és Gázipari Nyrt___annotated_judgment.csv...
Before enhanced cleaning:
Total rows: 172

After enhanced cleaning:
Removed 2 rows
Remaining rows: 170
Processed 170 rows -> P_C_NA csv final combined2/R2015_European Commission v MOL Magyar Olaj- és Gázipari Nyrt___annotated_judgment.csv
Processing R2000_French Republic v Ladbroke Racing Ltd and Commission of the European Communitie___annotated_judgment.csv...
Before enhanced cleaning:
Total rows: 163

After enhanced cleaning:
Removed 2 rows
Remaining rows: 161
Processed 161 rows -> P_C_NA csv final combined2/R2000_French Republic v Ladbroke Racing Ltd and Commission of the European Communitie___annotated_judgment.csv
Processing R2004_Ramondín SA and Ramondín Cápsulas SA (C-186_02 P) and Territorio Histórico de Álava - Diputación Foral de Álava (C-188_02 P) v Commission of the European Communities___annotated_judgment.csv...
Before enhanced cleaning:
Total rows: 139

After enhanced cle