In [1]:
import os
import pandas as pd

# Define the input and output directories
input_dir = 'sir p_c_na'
output_dir = 'arg_vs_non-arg'

# Create output directory if it doesn't exist
os.makedirs(output_dir, exist_ok=True)

# Check if input directory exists
if not os.path.exists(input_dir):
    print(f"Error: Input directory '{input_dir}' does not exist.")
    print("Please ensure the 'data/all' folder exists and contains CSV files.")
else:
    # List all CSV files in the input directory
    csv_files = [f for f in os.listdir(input_dir) if f.endswith('.csv')]
    
    if not csv_files:
        print("No CSV files found in the input directory.")
    else:
        print(f"Found {len(csv_files)} CSV files to process:")
        
        # Process each CSV file
        for file in csv_files:
            try:
                # Read the CSV file
                input_path = os.path.join(input_dir, file)
                df = pd.read_csv(input_path)
                
                # Check if required columns exist
                if 'text' not in df.columns or 'label' not in df.columns:
                    print(f"Warning: {file} doesn't have required 'text' and 'label' columns. Skipping.")
                    continue
                
                # Update labels: convert 'premise' and 'conclusion' to 'argumentative'
                df['label'] = df['label'].replace({
                    'premise': 'argumentative', 
                    'conclusion': 'argumentative'
                })
                
                # Save the updated CSV to the output directory
                output_path = os.path.join(output_dir, file)
                df.to_csv(output_path, index=False)
                
                print(f"✓ Processed {file} - Labels updated and saved to {output_dir}")
                
                # Show label distribution for this file
                label_counts = df['label'].value_counts()
                print(f"  Label distribution: {dict(label_counts)}")
                
            except Exception as e:
                print(f"Error processing {file}: {str(e)}")
        
        print(f"\nAll files processed successfully! Updated CSVs saved in '{output_dir}' folder.")


Found 42 CSV files to process:
✓ Processed 34.csv - Labels updated and saved to arg_vs_non-arg
  Label distribution: {'non-argumentative': 279, 'argumentative': 153}
✓ Processed 01.csv - Labels updated and saved to arg_vs_non-arg
  Label distribution: {'non-argumentative': 189, 'argumentative': 66}
✓ Processed 05.csv - Labels updated and saved to arg_vs_non-arg
  Label distribution: {'non-argumentative': 80, 'argumentative': 40}
✓ Processed 22.csv - Labels updated and saved to arg_vs_non-arg
  Label distribution: {'non-argumentative': 332, 'argumentative': 197}
✓ Processed 09.csv - Labels updated and saved to arg_vs_non-arg
  Label distribution: {'non-argumentative': 307, 'argumentative': 32}
✓ Processed 25.csv - Labels updated and saved to arg_vs_non-arg
  Label distribution: {'non-argumentative': 166, 'argumentative': 58}
✓ Processed 31.csv - Labels updated and saved to arg_vs_non-arg
  Label distribution: {'non-argumentative': 39, 'argumentative': 22}
✓ Processed 33.csv - Labels upd

In [2]:
import os
import pandas as pd

# Define the input and output directories
input_dir = 'sir p_c_na'  # Using original data to get premise/conclusion labels
output_dir = 'prem_vs_conc'

# Create output directory if it doesn't exist
os.makedirs(output_dir, exist_ok=True)

# Check if input directory exists
if not os.path.exists(input_dir):
    print(f"Error: Input directory '{input_dir}' does not exist.")
    print("Please ensure the 'data/all' folder exists and contains CSV files.")
else:
    # List all CSV files in the input directory
    csv_files = [f for f in os.listdir(input_dir) if f.endswith('.csv')]
    
    if not csv_files:
        print("No CSV files found in the input directory.")
    else:
        print(f"Found {len(csv_files)} CSV files to process:")
        
        # Process each CSV file
        for file in csv_files:
            try:
                # Read the CSV file
                input_path = os.path.join(input_dir, file)
                df = pd.read_csv(input_path)
                
                # Check if required columns exist
                if 'text' not in df.columns or 'label' not in df.columns:
                    print(f"Warning: {file} doesn't have required 'text' and 'label' columns. Skipping.")
                    continue
                
                # Filter to keep only 'premise' and 'conclusion' labels
                filtered_df = df[df['label'].isin(['premise', 'conclusion'])].copy()
                
                # Check if there are any rows left after filtering
                if filtered_df.empty:
                    print(f"Warning: {file} has no 'premise' or 'conclusion' labels. Skipping.")
                    continue
                
                # Save the filtered CSV to the output directory
                output_path = os.path.join(output_dir, file)
                filtered_df.to_csv(output_path, index=False)
                
                print(f"✓ Processed {file}")
                print(f"  Original rows: {len(df)}")
                print(f"  Filtered rows: {len(filtered_df)}")
                
                # Show label distribution for this file
                label_counts = filtered_df['label'].value_counts()
                print(f"  Label distribution: {dict(label_counts)}")
                print()
                
            except Exception as e:
                print(f"Error processing {file}: {str(e)}")
        
        print(f"All files processed successfully! Filtered CSVs saved in '{output_dir}' folder.")
        print("Only 'premise' and 'conclusion' labels are retained.")


Found 42 CSV files to process:
✓ Processed 34.csv
  Original rows: 432
  Filtered rows: 153
  Label distribution: {'premise': 105, 'conclusion': 48}

✓ Processed 01.csv
  Original rows: 255
  Filtered rows: 66
  Label distribution: {'premise': 46, 'conclusion': 20}

✓ Processed 05.csv
  Original rows: 120
  Filtered rows: 40
  Label distribution: {'premise': 25, 'conclusion': 15}

✓ Processed 22.csv
  Original rows: 529
  Filtered rows: 197
  Label distribution: {'premise': 147, 'conclusion': 50}

✓ Processed 09.csv
  Original rows: 339
  Filtered rows: 32
  Label distribution: {'premise': 21, 'conclusion': 11}

✓ Processed 25.csv
  Original rows: 224
  Filtered rows: 58
  Label distribution: {'premise': 43, 'conclusion': 15}

✓ Processed 31.csv
  Original rows: 61
  Filtered rows: 22
  Label distribution: {'premise': 15, 'conclusion': 7}

✓ Processed 33.csv
  Original rows: 80
  Filtered rows: 34
  Label distribution: {'premise': 22, 'conclusion': 12}

✓ Processed 21.csv
  Original ro