In [2]:
import nltk
nltk.download(['punkt', 'wordnet'])

# %% [Cell 2] Import Libraries
import pandas as pd
import re
from nltk.tokenize import PunktSentenceTokenizer


[nltk_data] Downloading package punkt to /home/suyamoon/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/suyamoon/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [3]:
# %% [Cell 3] Text Reconstruction with Cleaning
def reconstruct_and_clean(annotated_path):
    """Rebuild text with paragraph number removal and position tracking"""
    full_text = []
    fragments = []
    current_pos = 0
    para_pattern = re.compile(r'^\d+(?:\.|\s{2,})\s*')

    with open(annotated_path, 'r', encoding='utf') as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            
            # Split text and annotation
            text_part, _, tag_part = line.partition('||')
            text = para_pattern.sub('', text_part.strip())  # Remove paragraph numbers
            tag = tag_part.strip()
            
            if not text:
                continue

            # Track positions
            start = current_pos
            end = start + len(text)
            full_text.append(text)
            fragments.append({
                'text': text,
                'tag': tag,
                'start': start,
                'end': end
            })
            current_pos = end + 1  # Add space between fragments

    return ' '.join(full_text), fragments



In [4]:
full_text, fragments = reconstruct_and_clean('annotated/A2009_Commission of the European Communities v Koninklijke FrieslandCampina NV_M___annotated_judgment.txt')
print("RECONSTRUCTED TEXT SAMPLE:")
print(full_text[:300] + "...\n")
print("CLEANED FRAGMENTS EXAMPLE:")
for frag in fragments[:5]:
    print(f"[{frag['start']:04}-{frag['end']:04}] {frag['text'][:50]}... | Tag: {frag['tag']}")


RECONSTRUCTED TEXT SAMPLE:
JUDGMENT OF 17. 2009 — CASE C-519/07 P JUDGMENT OF THE COURT (Third Chamber) 17 September 2009 * In Case C-519/07 P, APPEAL pursuant to Article 56 of the Statute of the Court of Justice, brought on 21 November 2007, Commission of the European Communities, represented by H. van Vliet and S. Noë, acti...

CLEANED FRAGMENTS EXAMPLE:
[0000-0015] JUDGMENT OF 17.... | Tag: NA
[0016-0038] 2009 — CASE C-519/07 P... | Tag: 
[0039-0076] JUDGMENT OF THE COURT (Third Chamber)... | Tag: 
[0077-0096] 17 September 2009 *... | Tag: 
[0097-0116] In Case C-519/07 P,... | Tag: 


In [5]:
def legal_sentence_tokenizer(text):
    """Custom tokenizer for legal text with position tracking"""
    tokenizer = PunktSentenceTokenizer()
    sentences = []
    char_offset = 0
    
    for sent in tokenizer.tokenize(text):
        # Find sentence boundaries in original text
        start = text.find(sent, char_offset)
        end = start + len(sent)
        
        # Legal text adjustments
        if sent.endswith(('v.', 'No.', 'Art.', 'para.', 'pp.')):
            next_char = text[end:end+1]
            if next_char.islower() or next_char.isnumeric():
                continue  # Merge abbreviation with next sentence
        
        sentences.append({
            'text': sent,
            'start': start,
            'end': end
        })
        char_offset = end
    
    return sentences


In [6]:
sentences = legal_sentence_tokenizer(full_text)
print("\nTOKENIZED SENTENCES:")
print(f"Total sentences: {len(sentences)}")
print("Sample sentences:")
for i, sent in enumerate(sentences[:50]):
    print(f"{i+1}. [{sent['start']:04}-{sent['end']:04}] {sent['text'][:80]}...")




TOKENIZED SENTENCES:
Total sentences: 188
Sample sentences:
1. [0000-0015] JUDGMENT OF 17....
2. [0016-0650] 2009 — CASE C-519/07 P JUDGMENT OF THE COURT (Third Chamber) 17 September 2009 *...
3. [0651-0794] I - 8526 COMMISSION / KONINKLIJKE FRIESLANDCAMPINA THE COURT (Third Chamber), co...
4. [0795-1616] Cunha Rodrigues, J. Klučka and A. Arabadjiev (Rapporteur), Judges, Advocate Gene...
5. [1617-1647] 52; ‘the contested decision’)....
6. [1648-1674] I - 8527 1 JUDGMENT OF 17....
7. [1675-2078] 2009 — CASE C-519/07 P The national legal framework 2 The Law of 13 December 199...
8. [2079-2356] 1996, No 651) inserted Article 15b into the 1969 Law on Corporation Tax (Wet op ...
9. [2357-2406] That scheme entered into force on 1 January 1997....
10. [2407-3092] 3 The first sentence of Article 15b(1) of the 1969 Law states: ‘In relation to a...
11. [3093-3181] The amounts thus set aside can be used for various purposes as provided for by t...
12. [3182-3522] Thus, according to Article 15b(5

In [7]:
def map_annotations(sentences, fragments):
    dataset = []
    
    for sent in sentences:
        class_label = 0
        sent_start = sent['start']
        sent_end = sent['end']
        
        # Check for overlapping argumentative fragments
        for frag in fragments:
            if frag['tag'] in ('NA', ''):
                continue
                
            # Calculate overlap
            overlap_start = max(sent_start, frag['start'])
            overlap_end = min(sent_end, frag['end'])
            
            if overlap_start < overlap_end:
                overlap_ratio = (overlap_end - overlap_start) / (frag['end'] - frag['start'])
                if overlap_ratio > 0.6:  # 60% overlap threshold
                    class_label = 1
                    break
        
        dataset.append({
            'text': sent['text'].strip(),
            'class': class_label
        })
    
    return pd.DataFrame(dataset)


In [8]:
df = map_annotations(sentences, fragments)


In [9]:
print("\nDATASET PREVIEW:")
print(f"Class Distribution:\n{df['class'].value_counts()}\n")
print("Sample Argumentative Sentences:")
print(df[df['class'] == 1].head(3))
print("\nSample Non-Argumentative Sentences:")
print(df[df['class'] == 0].head(3))




DATASET PREVIEW:
Class Distribution:
class
0    135
1     53
Name: count, dtype: int64

Sample Argumentative Sentences:
                                                 text  class
89  Findings of the Court the 11 July 2001 47 Unde...      1
91  2009 — CASE C-519/07 P 48 In relation to the f...      1
92  49 In the present case, as the Court of First ...      1

Sample Non-Argumentative Sentences:
                                                text  class
0                                    JUDGMENT OF 17.      0
1  2009 — CASE C-519/07 P JUDGMENT OF THE COURT (...      0
2  I - 8526 COMMISSION / KONINKLIJKE FRIESLANDCAM...      0


In [10]:
def final_clean(text):
    """Remove residual noise while preserving legal references"""
    # Remove standalone numbers and bullet points
    text = re.sub(r'^\d+\.?\s*', '', text)
    # Normalize quotation marks
    text = re.sub(r'[”“â€˜â€™]', "'", text)
    # Fix encoding artifacts
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# Apply final cleaning
df['cleaned_text'] = df['text'].apply(final_clean)
df = df[df['cleaned_text'].ne('')]

# Export to CSV
output_path = 'cleaned_legal_dataset.csv'
df[['cleaned_text', 'class']].to_csv(output_path, index=False)

# Verification
print("\nFINAL DATASET SAMPLE:")
print(df[['cleaned_text', 'class']].head(10))
print(f"\nDataset saved to {output_path}")



FINAL DATASET SAMPLE:
                                        cleaned_text  class
0                                    JUDGMENT OF 17.      0
1  — CASE C-519/07 P JUDGMENT OF THE COURT (Third...      0
2  I - 8526 COMMISSION / KONINKLIJKE FRIESLANDCAM...      0
3  Cunha Rodrigues, J. Klučka and A. Arabadjiev (...      0
4                       ; ‘the contested decision’).      0
5                         I - 8527 1 JUDGMENT OF 17.      0
6  — CASE C-519/07 P The national legal framework...      0
7  , No 651) inserted Article 15b into the 1969 L...      0
8  That scheme entered into force on 1 January 1997.      0
9  The first sentence of Article 15b(1) of the 19...      0

Dataset saved to cleaned_legal_dataset.csv


In [11]:
import os

def process_directory(input_dir, output_dir):
    os.makedirs(output_dir, exist_ok=True)
    
    for filename in os.listdir(input_dir):
        if filename.endswith('_annotated_judgment.txt'):
            input_path = os.path.join(input_dir, filename)
            full_text, fragments = reconstruct_and_clean(input_path)
            sentences = legal_sentence_tokenizer(full_text)
            df = map_annotations(sentences, fragments)
            output_path = os.path.join(output_dir, filename.replace('.txt', '.csv'))
            df.to_csv(output_path, index=False)

# Example usage:
process_directory('annotated', 'clean_csv_datasets')


In [17]:
# %% [Cell 1] Add Enhanced Cleaning Functions
import pandas as pd
import re

def remove_leading_numbers(text):
    """Remove 1-3 digit numbers at start of text with trailing period/space"""
    return re.sub(r'^\d{1,3}\.*\s*', '', text).strip()

def filter_short_phrases(df):
    """Remove rows with fewer than 3 words"""
    return df[df['text'].apply(lambda x: len(str(x).split()) >= 3)]

# %% [Cell 2] Enhanced Processing Pipeline
def enhanced_processing(input_csv, output_csv):
    # Load cleaned dataset
    df = pd.read_csv(input_csv)
    
    print("Before enhanced cleaning:")
    print(f"Total rows: {len(df)}")
    print("Sample rows with numbers:")
    print(df[df['text'].str.match(r'^\d{1,3}\.')].head(3))
    
    # Remove leading numbers
    df['text'] = df['text'].apply(remove_leading_numbers)
    
    # Filter short phrases
    initial_count = len(df)
    df = filter_short_phrases(df)
    
    print("\nAfter enhanced cleaning:")
    print(f"Removed {initial_count - len(df)} rows")
    print(f"Remaining rows: {len(df)}")
    print("Sample cleaned rows:")
    print(df.head(3))
    
    # Save final dataset
    df.to_csv(output_csv, index=False)
    return df



In [19]:
def process_directory(input_dir, output_dir):
    # Create output directory if not exists
    os.makedirs(output_dir, exist_ok=True)
    
    # Verify input directory exists
    if not os.path.exists(input_dir):
        raise FileNotFoundError(f"Input directory '{input_dir}' not found")
    
    # Get list of CSV files
    csv_files = [f for f in os.listdir(input_dir) if f.endswith('.csv')]
    
    if not csv_files:
        print(f"No CSV files found in {input_dir}")
        return
    
    # Process each file
    for csv_file in csv_files:
        input_path = os.path.join(input_dir, csv_file)
        output_path = os.path.join(output_dir, csv_file)
        
        print(f"Processing {csv_file}...")
        final_df = enhanced_processing(input_path, output_path)
        
        if final_df is not None:
            print(f"Processed {len(final_df)} rows -> {output_path}")

# Run the processing
process_directory(
    'clean_csv_datasets',
    'clean_csv_datasets2'
)


Processing R2016_Orange v European Commission___annotated_judgment.csv...
Before enhanced cleaning:
Total rows: 147
Sample rows with numbers:
Empty DataFrame
Columns: [text, class]
Index: []

After enhanced cleaning:
Removed 1 rows
Remaining rows: 146
Sample cleaned rows:
                                                text  class
0  Reports of Cases JUDGMENT OF THE COURT (First ...      0
1  Fernlund and S. Rodin, Judges, Advocate Genera...      0
2  ECLI:EU:C:2016:798\t1 Judgment By its appeal, ...      0
Processed 146 rows -> clean_csv_datasets2/R2016_Orange v European Commission___annotated_judgment.csv
Processing R2017_European Commission v Frucona Košice a___annotated_judgment.csv...
Before enhanced cleaning:
Total rows: 181
Sample rows with numbers:
Empty DataFrame
Columns: [text, class]
Index: []

After enhanced cleaning:
Removed 3 rows
Remaining rows: 178
Sample cleaned rows:
                                                text  class
0  Reports of Cases JUDGMENT OF THE COURT 