In [2]:
import pandas as pd
import os
import numpy as np
from pathlib import Path

In [9]:
def delete_diagnosis_0_samples(csv_path, train_dir, num_to_delete=20000):
    """
    Remove up to num_to_delete rows with level == 0 from the labels CSV and
    delete the matching ``<image>.jpeg`` files from train_dir.

    The original CSV is never modified; the reduced table is written next to it
    as ``<name>_updated<ext>``. Rows are sampled with a fixed random_state (42),
    so the same rows are selected on every run against the same CSV.

    A row is dropped from the updated CSV only when its image is gone from disk
    afterwards (deleted now, or already missing). If deleting an image fails,
    its row is kept so the CSV and the directory stay consistent.

    Args:
        csv_path (str or os.PathLike): Path to the labels CSV; must contain the
            columns 'image' and 'level'.
        train_dir (str or os.PathLike): Directory containing the .jpeg images.
        num_to_delete (int): Number of level == 0 samples to delete
            (default: 20000). Capped at the number of such rows available.

    Returns:
        None

    Raises:
        ValueError: If the CSV lacks the 'image' or 'level' column.
    """
    # Read CSV
    df = pd.read_csv(csv_path)

    # Validate CSV
    required_columns = ['image', 'level']
    if not all(col in df.columns for col in required_columns):
        raise ValueError(f"CSV must contain columns: {required_columns}")

    # Filter samples with diagnosis=0 (stored in the 'level' column)
    diag_0_df = df[df['level'] == 0]
    if len(diag_0_df) < num_to_delete:
        print(f"Warning: Only {len(diag_0_df)} samples with diagnosis=0 available, less than {num_to_delete}")
        num_to_delete = len(diag_0_df)

    # Randomly select num_to_delete samples (fixed seed for reproducibility)
    delete_indices = diag_0_df.sample(n=num_to_delete, random_state=42).index

    # Bookkeeping for the final report and for deciding which rows to drop
    deleted_files = []
    missing_files = []
    failed_files = []
    removed_indices = []

    # Delete corresponding images
    for idx in delete_indices:
        img_id = df.loc[idx, 'image']
        img_path = os.path.join(train_dir, f"{img_id}.jpeg")
        if os.path.exists(img_path):
            try:
                os.remove(img_path)
            except OSError as e:
                # The image is still on disk, so its label row must survive.
                print(f"Error deleting {img_path}: {e}")
                failed_files.append(img_path)
                continue
            deleted_files.append(img_path)
        else:
            missing_files.append(img_path)
        removed_indices.append(idx)

    # Update DataFrame by removing only the rows whose image is no longer present
    df_updated = df.drop(index=removed_indices).reset_index(drop=True)

    # Build the output name from the file extension only. A plain
    # str.replace('.csv', ...) would also rewrite '.csv' inside directory
    # names and would return the input path unchanged (overwriting the
    # original) when the extension is not exactly '.csv'.
    root, ext = os.path.splitext(os.fspath(csv_path))
    output_csv = f"{root}_updated{ext or '.csv'}"
    df_updated.to_csv(output_csv, index=False)

    # Report results
    print(f"Deleted {len(deleted_files)} images from {train_dir}")
    if missing_files:
        print(f"Warning: {len(missing_files)} images not found: {missing_files[:5]}...")
    if failed_files:
        print(f"Warning: {len(failed_files)} images could not be deleted; their rows were kept in the CSV")
    print(f"Updated CSV saved to {output_csv}")
    print(f"Original CSV rows: {len(df)}, Updated CSV rows: {len(df_updated)}")
    print(f"Class distribution after deletion:\n{df_updated['level'].value_counts()}")

In [10]:
# Inputs for this run: the labels table, the image folder it describes,
# and how many level-0 samples to remove.
CSV_PATH = "trainLabels/trainLabels.csv"
TRAIN_DIR = "train"
NUM_TO_DELETE = 20_000

# Fail fast, before anything is deleted, if either input location is absent.
if not os.path.exists(CSV_PATH):
    raise FileNotFoundError(f"CSV file {CSV_PATH} does not exist")
if not os.path.exists(TRAIN_DIR):
    raise FileNotFoundError(f"Train directory {TRAIN_DIR} does not exist")

# NOTE: this call removes image files from TRAIN_DIR on disk.
delete_diagnosis_0_samples(
    csv_path=CSV_PATH,
    train_dir=TRAIN_DIR,
    num_to_delete=NUM_TO_DELETE,
)

Deleted 20000 images from train
Updated CSV saved to trainLabels/trainLabels_updated.csv
Original CSV rows: 35126, Updated CSV rows: 15126
Class distribution after deletion:
level
0    5810
2    5292
1    2443
3     873
4     708
Name: count, dtype: int64
