In [1]:
import os

def _stems_with_extension(folder, extension):
    """Return the set of entry names in `folder` ending with `extension`, extension removed."""
    return {
        os.path.splitext(entry)[0]
        for entry in os.listdir(folder)
        if entry.endswith(extension)
    }


def find_missing_files(annotated_folder, csv_folder):
    """
    Find files that exist in annotated folder but not in csv folder
    based on filename before extension.

    Args:
        annotated_folder: Path of the folder scanned for '.txt' entries.
        csv_folder: Path of the folder scanned for '.csv' entries.

    Returns:
        Sorted list of base names (no extension) that have a '.txt' entry in
        `annotated_folder` but no '.csv' entry in `csv_folder`. An empty list
        is returned, after printing an error, when either path is not an
        existing directory.
    """
    # isdir (rather than exists) so a path that points at a regular file is
    # reported as an error instead of crashing inside os.listdir.
    for folder in (annotated_folder, csv_folder):
        if not os.path.isdir(folder):
            print(f"Error: '{folder}' folder not found!")
            return []

    annotated_stems = _stems_with_extension(annotated_folder, '.txt')
    csv_stems = _stems_with_extension(csv_folder, '.csv')

    # Set difference: annotated base names without a CSV counterpart.
    return sorted(annotated_stems - csv_stems)

# Define folder paths
annotated_folder = "annotated"
csv_folder = "P_C_NA csv final combined"

# Find missing files (base names present as .txt but lacking a .csv)
missing_files = find_missing_files(annotated_folder, csv_folder)

# Display results
print(f"Files in '{annotated_folder}' folder but not in '{csv_folder}' folder:")
print("=" * 70)

# NOTE: find_missing_files also returns an empty list after printing an error
# when a folder is missing, so the "no missing files" message may follow one.
if missing_files:
    for i, filename in enumerate(missing_files, 1):
        print(f"{i}. {filename}")
    print(f"\nTotal missing files: {len(missing_files)}")
else:
    print("No missing files found. All files from annotated folder have corresponding CSV files.")

# Optional: Save results to a text file
save_to_file = input("\nDo you want to save the results to a file? (y/n): ").strip().lower()
if save_to_file == 'y':
    # Explicit UTF-8: the file names contain non-ASCII characters and the
    # platform default encoding is not guaranteed to represent them.
    with open("missing_files_report.txt", "w", encoding="utf-8") as f:
        f.write(f"Files in '{annotated_folder}' folder but not in '{csv_folder}' folder:\n")
        f.write("=" * 70 + "\n\n")
        if missing_files:
            for i, filename in enumerate(missing_files, 1):
                f.write(f"{i}. {filename}\n")
            f.write(f"\nTotal missing files: {len(missing_files)}\n")
        else:
            f.write("No missing files found.\n")
    print("Results saved to 'missing_files_report.txt'")


Files in 'annotated' folder but not in 'P_C_NA csv final combined' folder:
1. A2010_NDSHT Nya Destination Stockholm Hotell & Teaterpaket AB v European Commission___annotated_judgment
2. A2017_Ellinikos Chrysos AE Metalleion kai Viomichanias Chrysou v European Commission___annotated_judgment
3. A2018_Scuola Elementare Maria Montessori Srl v European Commission___annotated_judgment
4. R2000_French Republic v Ladbroke Racing Ltd and Commission of the European Communitie___annotated_judgment
5. R2002_Associação dos Refinadores de Açúcar Portugueses___annotated_judgment
6. R2004_Ramondín SA and Ramondín Cápsulas SA (C-186_02 P) and Territorio Histórico de Álava - Diputación Foral de Álava (C-188_02 P) v Commission of the European Communities___annotated_judgment
7. R2006_European Commission v Italian Republic___annotated_judgment
8. R2011_European Commission v Kronoply GmbH & Co___annotated_judgment
9. R2011_France Télécom SA v European Commission___annotated_judgment
10. R2015_European Com

In [2]:
import os

def delete_files_except_specified(folder_path, files_to_keep):
    """
    Delete all .txt files from the folder except those specified in files_to_keep list.

    Args:
        folder_path: Folder whose '.txt' entries are examined.
        files_to_keep: Iterable of base names (without extension) to preserve.

    Returns:
        Tuple ``(deleted_files, kept_files)`` of file names (with extension).
        Both lists are empty when `folder_path` is not an existing directory,
        so callers can always unpack the result.
    """
    # Files to keep (without extension); a set gives constant-time lookups
    keep_set = set(files_to_keep)

    deleted_files = []
    kept_files = []

    if not os.path.isdir(folder_path):
        print(f"Error: Folder '{folder_path}' does not exist!")
        # Same shape as the normal return so tuple-unpacking callers do not crash.
        return deleted_files, kept_files

    # Only entries ending in .txt are candidates; everything else is left alone
    txt_files = [f for f in os.listdir(folder_path) if f.endswith('.txt')]

    print(f"Found {len(txt_files)} .txt files in '{folder_path}' folder")
    print("Processing files...\n")

    for file in txt_files:
        # Compare on the name without extension, matching files_to_keep
        filename_without_ext = os.path.splitext(file)[0]

        if filename_without_ext in keep_set:
            kept_files.append(file)
            print(f"KEEPING: {file}")
            continue

        try:
            os.remove(os.path.join(folder_path, file))
        except OSError as e:
            # Best effort: report the failure and carry on with the remaining files
            print(f"ERROR deleting {file}: {e}")
        else:
            deleted_files.append(file)
            print(f"DELETED: {file}")

    # Summary
    print(f"\n{'='*60}")
    print("SUMMARY:")
    print(f"Files kept: {len(kept_files)}")
    print(f"Files deleted: {len(deleted_files)}")
    print(f"{'='*60}")

    return deleted_files, kept_files

# Define the folder path
annotated_folder = "annotated"

# List of files to keep (base names, without the .txt extension)
files_to_keep = [
    "A2010_NDSHT Nya Destination Stockholm Hotell & Teaterpaket AB v European Commission___annotated_judgment",
    "A2017_Ellinikos Chrysos AE Metalleion kai Viomichanias Chrysou v European Commission___annotated_judgment",
    "A2018_Scuola Elementare Maria Montessori Srl v European Commission___annotated_judgment",
    "R2000_French Republic v Ladbroke Racing Ltd and Commission of the European Communitie___annotated_judgment",
    "R2002_Associação dos Refinadores de Açúcar Portugueses___annotated_judgment",
    "R2004_Ramondín SA and Ramondín Cápsulas SA (C-186_02 P) and Territorio Histórico de Álava - Diputación Foral de Álava (C-188_02 P) v Commission of the European Communities___annotated_judgment",
    "R2006_European Commission v Italian Republic___annotated_judgment",
    "R2011_European Commission v Kronoply GmbH & Co___annotated_judgment",
    "R2011_France Télécom SA v European Commission___annotated_judgment",
    "R2015_European Commission v MOL Magyar Olaj- és Gázipari Nyrt___annotated_judgment",
    "R2016_DTS Distribuidora de Televisión Digital___annotated_judgment"
]

# Show what is about to happen before anything is removed
print("WARNING: This will permanently delete files from the 'annotated' folder!")
print(f"Files to KEEP: {len(files_to_keep)} files")
print("\nFiles that will be KEPT:")
for i, filename in enumerate(files_to_keep, 1):
    print(f"{i}. {filename}")

# Confirmation prompt: deletion is irreversible, so require an explicit 'y'
confirm = input("\nProceed with deleting all other .txt files? (y/n): ").strip().lower()
if confirm == 'y':
    result = delete_files_except_specified(annotated_folder, files_to_keep)
    # Tolerate a None result so a missing folder does not crash the unpacking
    deleted_files, kept_files = result if result is not None else ([], [])

    # Optional: Create a log file
    create_log = input("\nDo you want to create a log file of the deletion operation? (y/n): ").strip().lower()
    if create_log == 'y':
        # Explicit UTF-8: the file names contain non-ASCII characters
        with open("deletion_log.txt", "w", encoding="utf-8") as log_file:
            log_file.write("File Deletion Log\n")
            log_file.write("="*50 + "\n\n")
            log_file.write(f"Files kept ({len(kept_files)}):\n")
            for file in kept_files:
                log_file.write(f"  - {file}\n")
            log_file.write(f"\nFiles deleted ({len(deleted_files)}):\n")
            for file in deleted_files:
                log_file.write(f"  - {file}\n")
        print("Log saved to 'deletion_log.txt'")
else:
    # Keep the result names defined for any later cell that reads them
    deleted_files, kept_files = [], []
    print("Aborted: no files were deleted.")


Files to KEEP: 11 files

Files that will be KEPT:
1. A2010_NDSHT Nya Destination Stockholm Hotell & Teaterpaket AB v European Commission___annotated_judgment
2. A2017_Ellinikos Chrysos AE Metalleion kai Viomichanias Chrysou v European Commission___annotated_judgment
3. A2018_Scuola Elementare Maria Montessori Srl v European Commission___annotated_judgment
4. R2000_French Republic v Ladbroke Racing Ltd and Commission of the European Communitie___annotated_judgment
5. R2002_Associação dos Refinadores de Açúcar Portugueses___annotated_judgment
6. R2004_Ramondín SA and Ramondín Cápsulas SA (C-186_02 P) and Territorio Histórico de Álava - Diputación Foral de Álava (C-188_02 P) v Commission of the European Communities___annotated_judgment
7. R2006_European Commission v Italian Republic___annotated_judgment
8. R2011_European Commission v Kronoply GmbH & Co___annotated_judgment
9. R2011_France Télécom SA v European Commission___annotated_judgment
10. R2015_European Commission v MOL Magyar Olaj

In [2]:
import os
import pandas as pd

folder_path = 'data/all'
class_map = {'P': 1, 'C': 2, 'NA': 0}

# Label and code lists used for the vectorised membership checks below
known_labels = list(class_map)
numeric_codes = list(class_map.values())

# Pass 1: read and validate every CSV before any file is modified, so an
# invalid file cannot leave the folder half-converted.
pending = []  # (file_path, converted DataFrame) pairs awaiting write-back
for filename in sorted(os.listdir(folder_path)):
    if not filename.endswith('.csv'):
        continue
    file_path = os.path.join(folder_path, filename)

    # Read CSV with keep_default_na=False to prevent "NA" from being converted to NaN
    df = pd.read_csv(file_path, keep_default_na=False)

    # Check if 'class' column exists
    if 'class' not in df.columns:
        raise ValueError(f"'class' column not found in {filename}")

    # A column that already holds only the numeric codes was converted by an
    # earlier run; skipping it makes re-running this cell safe.
    if df['class'].isin(numeric_codes).all():
        continue

    # Check for unknown class values (report the first offending row;
    # +2 accounts for the header line and 1-based line numbering)
    unknown = ~df['class'].isin(known_labels)
    if unknown.any():
        idx = int(unknown.to_numpy().argmax())
        value = df['class'].iloc[idx]
        raise ValueError(f"Unknown class '{value}' found in file {filename} at line {idx+2}")

    # Convert class values
    df['class'] = df['class'].map(class_map)
    pending.append((file_path, df))

# Pass 2: every file validated, so save each one back to the same path
for file_path, df in pending:
    df.to_csv(file_path, index=False)

print('All files processed successfully.')


All files processed successfully.
