In [13]:
import pandas as pd
import shutil
import os
import cv2
import numpy as np
import albumentations as A
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import logging
import matplotlib.pyplot as plt
import math

# Set up logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.StreamHandler(),
        logging.FileHandler("preprocessing.log")
    ]
)
logger = logging.getLogger(__name__)

def create_directories(base_dir, labels):
    """Create all necessary directories for the project."""
    logger.info("Creating necessary directories...")
    for folder in ["data/augmented", "data/train", "data/val", "data/test", "uploads", "models", "reports", "logs"]:
        dir_path = os.path.join(base_dir, folder.replace("/", os.sep))
        os.makedirs(dir_path, exist_ok=True)
    
    # Create label-specific directories in each data split folder
    for split in ["augmented", "train", "val", "test"]:
        for label in labels:
            label_dir = os.path.join(base_dir, "data", split, label)
            os.makedirs(label_dir, exist_ok=True)

def perform_augmentation(base_dir, labels, target_count=12000):
    """Perform image augmentation to ensure each class has exactly target_count images."""
    logger.info(f"Starting image augmentation with target of {target_count} images per class...")
    
    # Define augmentation pipeline
    transform = A.Compose([
        A.Resize(256, 256),
        A.CLAHE(clip_limit=2.0, tile_grid_size=(8, 8), p=1.0),
        A.Rotate(limit=15, p=0.5),
        A.HorizontalFlip(p=0.5),
        A.RandomBrightnessContrast(p=0.3),
        A.Sharpen(alpha=(0.2, 0.5), lightness=(0.5, 1.0), p=0.3),
        A.ElasticTransform(alpha=1, sigma=50, alpha_affine=50, p=0.2)
    ])
    
    augmented_count = 0
    augmented_per_class = {}
    
    for label in labels:
        label_dir = os.path.join(base_dir, "data", "augmented", label)
        if not os.path.exists(label_dir):
            logger.warning(f"Directory not found for {label}, creating it")
            os.makedirs(label_dir, exist_ok=True)
            continue
        
        # Only consider original images (not already augmented ones)
        images = [f for f in os.listdir(label_dir) if not "_aug" in f]
        
        if not images:
            logger.warning(f"No images found for {label}")
            continue
        
        original_count = len(images)
        logger.info(f"Found {original_count} original images for {label}")
        
        # Calculate how many augmentations needed per image to reach target
        if original_count >= target_count:
            logger.info(f"Class {label} already has {original_count} images, no augmentation needed")
            continue
            
        # Calculate required augmentations
        remaining_images = target_count - original_count
        aug_per_image = math.ceil(remaining_images / original_count)
        
        logger.info(f"Need to create {remaining_images} more images for {label}")
        logger.info(f"Will create {aug_per_image} augmentations per original image")
        
        class_aug_count = 0
        
        # Use tqdm for progress tracking
        for img_name in tqdm(images, desc=f"Augmenting {label}"):
            img_path = os.path.join(label_dir, img_name)
            try:
                img = cv2.imread(img_path)
                if img is None:
                    logger.warning(f"Could not read {img_path}")
                    continue
                
                # Convert to grayscale if the image is in color
                if len(img.shape) == 3 and img.shape[2] == 3:
                    gray_img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
                    # Convert back to 3-channel for compatibility with albumentations
                    img = cv2.cvtColor(gray_img, cv2.COLOR_GRAY2BGR)
                elif len(img.shape) == 2:
                    # If already grayscale, convert to 3-channel for compatibility
                    img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR)
                
                base_name = os.path.splitext(img_name)[0]
                ext = os.path.splitext(img_name)[1]
                
                # Stop augmenting if we've reached our target
                if original_count + class_aug_count >= target_count:
                    break
                
                # Create the calculated number of augmentations per image
                for i in range(aug_per_image):
                    # Break if we've reached our target
                    if original_count + class_aug_count >= target_count:
                        break
                        
                    try:
                        # Apply augmentation
                        augmented = transform(image=img)["image"]
                        # Save the augmented image
                        output_path = os.path.join(label_dir, f"{base_name}_aug{i}{ext}")
                        cv2.imwrite(output_path, augmented)
                        class_aug_count += 1
                        augmented_count += 1
                    except Exception as e:
                        logger.error(f"Error augmenting {img_path} (iteration {i}): {e}")
            
            except Exception as e:
                logger.error(f"Error processing {img_path}: {e}")
        
        final_count = original_count + class_aug_count
        augmented_per_class[label] = class_aug_count
        logger.info(f"Created {class_aug_count} augmented images for {label}. Total count: {final_count}/{target_count}")
        
        # In case we need additional augmentations to exactly reach the target
        if final_count < target_count:
            logger.warning(f"Still need {target_count - final_count} more images for {label}")
            # Augment from already augmented images if needed
            all_images = os.listdir(label_dir)
            additional_needed = target_count - final_count
            
            if all_images:
                additional_per_image = math.ceil(additional_needed / len(all_images))
                additional_count = 0
                
                for img_name in tqdm(all_images, desc=f"Additional augmentation for {label}"):
                    if final_count + additional_count >= target_count:
                        break
                        
                    img_path = os.path.join(label_dir, img_name)
                    try:
                        img = cv2.imread(img_path)
                        if img is None:
                            continue
                            
                        base_name = os.path.splitext(img_name)[0]
                        ext = os.path.splitext(img_name)[1]
                        
                        for i in range(additional_per_image):
                            if final_count + additional_count >= target_count:
                                break
                                
                            # Use a stronger augmentation for these additional images
                            strong_transform = A.Compose([
                                A.Resize(256, 256),
                                A.CLAHE(clip_limit=2.0, tile_grid_size=(8, 8), p=1.0),
                                A.Rotate(limit=30, p=0.8),
                                A.HorizontalFlip(p=0.5),
                                A.VerticalFlip(p=0.3),
                                A.RandomBrightnessContrast(p=0.5),
                                A.GaussNoise(p=0.3),
                                A.Sharpen(alpha=(0.2, 0.5), lightness=(0.5, 1.0), p=0.5)
                            ])
                            
                            augmented = strong_transform(image=img)["image"]
                            output_path = os.path.join(label_dir, f"{base_name}_extra{i}{ext}")
                            cv2.imwrite(output_path, augmented)
                            additional_count += 1
                            augmented_count += 1
                            
                    except Exception as e:
                        logger.error(f"Error in additional augmentation for {img_path}: {e}")
                
                logger.info(f"Created {additional_count} additional augmented images for {label}")
                final_count += additional_count
                augmented_per_class[label] += additional_count
        
        logger.info(f"Final count for {label}: {final_count}/{target_count}")
    
    logger.info(f"Augmentation complete. Created {augmented_count} augmented images across all classes.")
    return augmented_per_class


def find_image(image_name, archive_dir, image_folders):
    """Find an image in all available image folders."""
    for folder in image_folders:
        img_path = os.path.join(archive_dir, folder, image_name)
        if os.path.exists(img_path):
            return img_path
    return None

def main():
    # Setup directories
    base_dir = r"C:\projects_ml\Radi_Assist"
    archive_dir = os.path.join(base_dir, "data", "raw", "archive")
    
    labels = ["Atelectasis", "Cardiomegaly", "Effusion", "Infiltration", "Mass", "Nodule",
              "Pneumonia", "Pneumothorax", "Consolidation", "Edema", "Emphysema", "Fibrosis",
              "Pleural_Thickening", "Hernia"]
    
    # Target count for each class
    target_count = 12000
    
    # Create all necessary directories
    create_directories(base_dir, labels)
    
    # Load CSV file
    csv_path = os.path.join(archive_dir, "Data_Entry_2017.csv")
    if not os.path.exists(csv_path):
        logger.error(f"Cannot find CSV file at {csv_path}")
        # Try to find alternative CSV
        csv_files = [f for f in os.listdir(archive_dir) if f.endswith('.csv')]
        if csv_files:
            csv_path = os.path.join(archive_dir, csv_files[0])
            logger.info(f"Using alternative CSV file: {csv_path}")
        else:
            logger.error(f"No CSV files found in {archive_dir}")
            return
    
    logger.info(f"Loading data from {csv_path}...")
    df = pd.read_csv(csv_path)
    logger.info(f"CSV loaded with {len(df)} records and columns: {df.columns.tolist()}")
    
    # Check if required columns exist
    required_cols = ["Image Index", "Finding Labels"]
    for col in required_cols:
        if col not in df.columns:
            similar_cols = [c for c in df.columns if c.lower().replace(" ", "_") == col.lower().replace(" ", "_")]
            if similar_cols:
                logger.info(f"Renaming column '{similar_cols[0]}' to '{col}'")
                df = df.rename(columns={similar_cols[0]: col})
            else:
                logger.error(f"Required column '{col}' not found in CSV")
                return
    
    # Find image folders
    image_folders = []
    for i in range(1, 13):
        folder_name = f"images_{str(i).zfill(3)}"
        folder_path = os.path.join(archive_dir, folder_name)
        if os.path.exists(folder_path):
            # Check if folder contains any images
            files = os.listdir(folder_path)
            image_files = [f for f in files if f.lower().endswith(('.png', '.jpg', '.jpeg'))]
            if image_files:
                image_folders.append(folder_name)
                logger.info(f"Found image folder {folder_name} with {len(image_files)} images")
            else:
                logger.warning(f"Folder {folder_name} exists but contains no image files")
        else:
            logger.warning(f"Image folder {folder_name} not found")
    
    # Verify images in augmented folders
    image_counts = verify_images(base_dir, labels)
    logger.info("Current images per class:")
    for label, count in image_counts.items():
        logger.info(f"- {label}: {count}")
    
    # Only perform augmentation if needed
    if sum(image_counts.values()) == 0:
        logger.warning("No images found in augmented folders. Check if images were copied correctly.")
        # If no images were copied to the augmented folders, we need to do that first
        logger.info("Attempting to copy original images to augmented folders...")
        
        copied_images = 0
        label_counts = {label: 0 for label in labels}
        
        for idx, row in tqdm(df.iterrows(), total=len(df), desc="Copying original images"):
            try:
                img_name = row["Image Index"]
                findings = row["Finding Labels"].split("|")
                
                if "No Finding" in findings:
                    continue
                    
                label = findings[0]
                if label not in labels:
                    continue
                    
                # Find the image
                src_path = find_image(img_name, archive_dir, image_folders)
                if src_path:
                    dest_path = os.path.join(base_dir, "data", "augmented", label, img_name)
                    shutil.copy(src_path, dest_path)
                    copied_images += 1
                    label_counts[label] += 1
                    
                    if copied_images % 500 == 0:
                        logger.info(f"Copied {copied_images} images so far...")
                else:
                    if idx % 1000 == 0:
                        logger.warning(f"Could not find image for row {idx}: {img_name}")
                    
            except Exception as e:
                logger.error(f"Error processing row {idx}: {e}")
        
        logger.info(f"Copied {copied_images} original images to augmented folders")
        for label, count in label_counts.items():
            logger.info(f"- {label}: {count}")
    
    # Perform image augmentation with target count
    augmented_counts = perform_augmentation(base_dir, labels, target_count=target_count)

if __name__ == "__main__":
    main()

2025-05-15 20:10:54,414 - INFO - Creating necessary directories...
2025-05-15 20:10:54,431 - INFO - Loading data from C:\projects_ml\Radi_Assist\data\raw\archive\Data_Entry_2017.csv...
2025-05-15 20:10:54,838 - INFO - CSV loaded with 112120 records and columns: ['Image Index', 'Finding Labels', 'Follow-up #', 'Patient ID', 'Patient Age', 'Patient Gender', 'View Position', 'OriginalImage[Width', 'Height]', 'OriginalImagePixelSpacing[x', 'y]', 'Unnamed: 11']
2025-05-15 20:10:54,867 - INFO - Found image folder images_001 with 4999 images
2025-05-15 20:10:54,920 - INFO - Found image folder images_002 with 10000 images
2025-05-15 20:10:54,975 - INFO - Found image folder images_003 with 10000 images
2025-05-15 20:10:55,031 - INFO - Found image folder images_004 with 10000 images
2025-05-15 20:10:55,085 - INFO - Found image folder images_005 with 10000 images
2025-05-15 20:10:55,138 - INFO - Found image folder images_006 with 10000 images
2025-05-15 20:10:55,188 - INFO - Found image folder im

In [8]:
import pandas as pd
import shutil
import os
import cv2
import numpy as np
import albumentations as A
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import logging
import matplotlib.pyplot as plt
import math

# Set up logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.StreamHandler(),
        logging.FileHandler("preprocessing.log")
    ]
)

logger = logging.getLogger(__name__)


def create_directories(base_dir, labels):
    """Create all necessary directories for the project."""
    logger.info("Creating necessary directories...")
    for folder in ["data/augmented", "data/train", "data/val", "data/test", "uploads", "models", "reports", "logs"]:
        dir_path = os.path.join(base_dir, folder.replace("/", os.sep))
        os.makedirs(dir_path, exist_ok=True)
    
    # Create label-specific directories in each data split folder
    for split in ["augmented", "train", "val", "test"]:
        for label in labels:
            label_dir = os.path.join(base_dir, "data", split, label)
            os.makedirs(label_dir, exist_ok=True)

def create_train_val_test_split(base_dir, labels, train_ratio=0.7, val_ratio=0.1, test_ratio=0.2):
    """Create and save train/validation/test splits directly from augmented data."""
    logger.info(f"Creating train/val/test splits with ratio {train_ratio}/{val_ratio}/{test_ratio}...")
    
    try:
        total_train = 0
        total_val = 0
        total_test = 0
        
        for label in labels:
            # Get all images from augmented folder for this label
            aug_dir = os.path.join(base_dir, "data", "augmented", label)
            if not os.path.exists(aug_dir):
                logger.warning(f"No augmented directory for {label}")
                continue
            
            # Create destination directories if they don't exist
            train_dir = os.path.join(base_dir, "data", "train", label)
            val_dir = os.path.join(base_dir, "data", "val", label)
            test_dir = os.path.join(base_dir, "data", "test", label)
            
            os.makedirs(train_dir, exist_ok=True)
            os.makedirs(val_dir, exist_ok=True)
            os.makedirs(test_dir, exist_ok=True)
            
            # Get all images (both original and augmented)
            all_images = os.listdir(aug_dir)
            if not all_images:
                logger.warning(f"No images found for {label} in augmented folder")
                continue
            
            logger.info(f"Creating split for {label} with {len(all_images)} images")
            
            # Group by base image name (to keep augmented versions together)
            image_groups = {}
            for img in all_images:
                base_name = img.split('_aug')[0] if '_aug' in img else img
                base_name = base_name.split('_extra')[0] if '_extra' in base_name else base_name
                if base_name not in image_groups:
                    image_groups[base_name] = []
                image_groups[base_name].append(img)
            
            # Split by base image
            base_images = list(image_groups.keys())
            
            # Calculate split sizes
            n_total = len(base_images)
            n_train = int(n_total * train_ratio)
            n_val = int(n_total * val_ratio)
            # n_test will be the remainder
            
            # Shuffle and split
            np.random.shuffle(base_images)
            train_bases = base_images[:n_train]
            val_bases = base_images[n_train:n_train+n_val]
            test_bases = base_images[n_train+n_val:]
            
            # Get all images for each split
            train_images = [img for base in train_bases for img in image_groups[base]]
            val_images = [img for base in val_bases for img in image_groups[base]]
            test_images = [img for base in test_bases for img in image_groups[base]]
            
            # Copy images to respective directories
            for img in tqdm(train_images, desc=f"Copying {label} train images"):
                src_path = os.path.join(aug_dir, img)
                dst_path = os.path.join(base_dir, "data", "train", label, img)
                shutil.copy(src_path, dst_path)
            
            for img in tqdm(val_images, desc=f"Copying {label} validation images"):
                src_path = os.path.join(aug_dir, img)
                dst_path = os.path.join(base_dir, "data", "val", label, img)
                shutil.copy(src_path, dst_path)
            
            for img in tqdm(test_images, desc=f"Copying {label} test images"):
                src_path = os.path.join(aug_dir, img)
                dst_path = os.path.join(base_dir, "data", "test", label, img)
                shutil.copy(src_path, dst_path)
            
            # Update counts
            total_train += len(train_images)
            total_val += len(val_images)
            total_test += len(test_images)
            
            logger.info(f"Split for {label}: Train={len(train_images)}, Val={len(val_images)}, Test={len(test_images)}")
        
        logger.info(f"Total split: Train={total_train}, Val={total_val}, Test={total_test}")
        
        # Create label distribution plots if the function exists
        try:
            create_label_distribution_plot(base_dir, labels)
        except NameError:
            logger.warning("create_label_distribution_plot function not found, skipping plot creation")
        
        return total_train, total_val, total_test
    
    except Exception as e:
        logger.error(f"Error creating splits: {e}")
        import traceback
        traceback.print_exc()
        return 0, 0, 0

def main():
    # Setup directories
    base_dir = r"C:\projects_ml\Radi_Assist"
    archive_dir = os.path.join(base_dir, "data", "raw", "archive")
    
    labels = ["Atelectasis", "Cardiomegaly", "Effusion", "Infiltration", "Mass", "Nodule",
              "Pneumonia", "Pneumothorax", "Consolidation", "Edema", "Emphysema", "Fibrosis",
              "Pleural_Thickening", "Hernia"]
    
    # Create all necessary directories
    create_directories(base_dir, labels)
    
    # Verify images in augmented folders
    image_counts = {}
    for label in labels:
        aug_dir = os.path.join(base_dir, "data", "augmented", label)
        if os.path.exists(aug_dir):
            image_counts[label] = len(os.listdir(aug_dir))
        else:
            image_counts[label] = 0
    
    logger.info("Current images per class in augmented folders:")
    for label, count in image_counts.items():
        logger.info(f"- {label}: {count}")
    
    # Create train/val/test splits - this is the crucial part that was causing issues
    train_count, val_count, test_count = create_train_val_test_split(
        base_dir, labels, train_ratio=0.7, val_ratio=0.1, test_ratio=0.2
    )
    
    # Verify images in each split
    train_images = count_images_in_split(base_dir, "train", labels)
    val_images = count_images_in_split(base_dir, "val", labels)
    test_images = count_images_in_split(base_dir, "test", labels)
    
    logger.info("\nSplitting process complete!")
    logger.info(f"Final dataset:")
    logger.info(f"- Train: {train_images} images")
    logger.info(f"- Validation: {val_images} images")
    logger.info(f"- Test: {test_images} images")
    logger.info(f"Total: {train_images + val_images + test_images} images")

if __name__ == "__main__":
    main()

2025-05-16 05:01:02,864 - INFO - Creating necessary directories...
2025-05-16 05:01:02,956 - INFO - Current images per class in augmented folders:
2025-05-16 05:01:02,957 - INFO - - Atelectasis: 2500
2025-05-16 05:01:02,958 - INFO - - Cardiomegaly: 2500
2025-05-16 05:01:02,959 - INFO - - Effusion: 2500
2025-05-16 05:01:02,960 - INFO - - Infiltration: 2500
2025-05-16 05:01:02,961 - INFO - - Mass: 2500
2025-05-16 05:01:02,962 - INFO - - Nodule: 2500
2025-05-16 05:01:02,963 - INFO - - Pneumonia: 2500
2025-05-16 05:01:02,964 - INFO - - Pneumothorax: 2500
2025-05-16 05:01:02,965 - INFO - - Consolidation: 2500
2025-05-16 05:01:02,966 - INFO - - Edema: 2500
2025-05-16 05:01:02,967 - INFO - - Emphysema: 2500
2025-05-16 05:01:02,968 - INFO - - Fibrosis: 2500
2025-05-16 05:01:02,969 - INFO - - Pleural_Thickening: 2500
2025-05-16 05:01:02,970 - INFO - - Hernia: 2500
2025-05-16 05:01:02,971 - INFO - Creating train/val/test splits with ratio 0.7/0.1/0.2...
2025-05-16 05:01:02,980 - INFO - Creating 

NameError: name 'count_images_in_split' is not defined