In [None]:
import pandas as pd
import os
import random
from sklearn.model_selection import train_test_split
import logging

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

def normalize_path(path):
    """Normalize path to remove trailing slashes and ensure forward slashes."""
    return os.path.normpath(path).replace('\\', '/')

def validate_paths(df, base_path):
    """Validate and complete paths in the DataFrame."""
    invalid_rows = []
    folder_images = {}
    
    for idx, row in df.iterrows():
        # Complete Extracted_Face_Path
        id_path = os.path.join(base_path, row['Extracted_Face_Path'])
        id_path = normalize_path(id_path)
        folder_path = normalize_path(row['Face_Photo_Path'])
        
        # Validate ID image
        if not os.path.isfile(id_path):
            logger.warning(f"ID image not found: {id_path}")
            invalid_rows.append(idx)
            continue
        
        # Validate folder and list images
        if not os.path.isdir(folder_path):
            logger.warning(f"Folder not found: {folder_path}")
            invalid_rows.append(idx)
            continue
        
        # Get images in folder (.jpg, .png, .jpeg)
        images = [normalize_path(os.path.join(folder_path, f)) 
                 for f in os.listdir(folder_path) 
                 if f.lower().endswith(('.jpg', '.png', '.jpeg'))]
        
        if not images:
            logger.warning(f"No images in folder: {folder_path}")
            invalid_rows.append(idx)
            continue
        
        folder_images[folder_path] = images
        df.at[idx, 'Extracted_Face_Path'] = id_path
        df.at[idx, 'Face_Photo_Path'] = folder_path
    
    # Drop invalid rows
    if invalid_rows:
        logger.info(f"Dropping {len(invalid_rows)} invalid rows")
        df = df.drop(invalid_rows).reset_index(drop=True)
    
    # Log folder_images keys for debugging
    logger.info(f"folder_images keys: {list(folder_images.keys())[:5]} (first 5)")
    logger.info(f"Face_Photo_Path sample: {df['Face_Photo_Path'].head().tolist()}")
    
    return df, folder_images

def create_true_pairs(df, folder_images):
    """Create true pairs (ID image vs. person images, label 1)."""
    pairs = []
    for idx, row in df.iterrows():
        id_path = row['Extracted_Face_Path']
        folder_path = row['Face_Photo_Path']
        person_images = folder_images.get(folder_path, [])
        
        if not person_images:
            logger.warning(f"No images found for folder: {folder_path}")
            continue
        
        for person_img in person_images:
            pairs.append({
                'id_image_path': id_path,
                'person_image_path': person_img,
                'label': 1
            })
    
    return pairs

def create_false_pairs(df, folder_images, num_false_per_id=5):
    """Create false pairs (ID image vs. different person's images, label 0)."""
    pairs = []
    folder_paths = list(folder_images.keys())
    
    for _, row in df.iterrows():
        id_path = row['Extracted_Face_Path']
        current_folder = row['Face_Photo_Path']
        
        # Select different folders for false pairs
        other_folders = [f for f in folder_paths if f != current_folder]
        if len(other_folders) < num_false_per_id:
            logger.warning(f"Not enough different folders for ID {row['ID']}, reducing false pairs")
            selected_folders = other_folders
        else:
            selected_folders = random.sample(other_folders, num_false_per_id)
        
        for folder in selected_folders:
            person_img = random.choice(folder_images[folder])
            pairs.append({
                'id_image_path': id_path,
                'person_image_path': person_img,
                'label': 0
            })
    
    return pairs

def main():
    # Configuration
    csv_path = r"D:\Projects\PhotosWorkl\face_id_database.csv"  # Update if path differs
    base_path = r"D:\Projects\PhotosWorkl"
    output_dir = r"D:\Projects\PhotosWorkl"
    num_false_per_id = 5  # Match ~5 true pairs per ID
    train_ratio = 0.8
    
     # Create output directory
    os.makedirs(output_dir, exist_ok=True)
    
    # Load CSV
    logger.info("Loading CSV")
    try:
        df = pd.read_csv(csv_path)
    except Exception as e:
        logger.error(f"Failed to load CSV: {e}")
        return
    
    # Validate and complete paths
    logger.info("Validating paths")
    df, folder_images = validate_paths(df, base_path)
    if df.empty:
        logger.error("No valid data after validation")
        return
    
    logger.info(f"Found {len(df)} valid IDs with {sum(len(imgs) for imgs in folder_images.values())} person images")
    
    # Create true pairs
    logger.info("Creating true pairs")
    true_pairs = create_true_pairs(df, folder_images)
    if not true_pairs:
        logger.error("No true pairs generated. Check Face_Photo_Path and folder_images.")
        return
    logger.info(f"Generated {len(true_pairs)} true pairs")
    
    # Create false pairs (match number of true pairs)
    logger.info("Creating false pairs")
    num_false_pairs = len(true_pairs)  # Balance true and false
    num_false_per_id = max(1, num_false_pairs // len(df))  # Adjust false pairs per ID
    false_pairs = create_false_pairs(df, folder_images, num_false_per_id)
    logger.info(f"Generated {len(false_pairs)} false pairs")
    
    # Combine pairs
    all_pairs = true_pairs + false_pairs
    pairs_df = pd.DataFrame(all_pairs)
    logger.info(f"Total pairs: {len(pairs_df)} (Label 1: {len(pairs_df[pairs_df['label'] == 1])}, Label 0: {len(pairs_df[pairs_df['label'] == 0])})")
    
    # Save unified CSV
    all_pairs_path = os.path.join(output_dir, 'all_pairs.csv')
    pairs_df.to_csv(all_pairs_path, index=False)
    logger.info(f"Saved unified CSV: {all_pairs_path}")
    
    # Split into train and test
    logger.info("Splitting into train and test")
    train_df, test_df = train_test_split(
        pairs_df,
        test_size=1 - train_ratio,
        stratify=pairs_df['label'],
        random_state=42
    )
    
    # Save train and test CSVs
    train_path = os.path.join(output_dir, 'train_pairs.csv')
    test_path = os.path.join(output_dir, 'test_pairs.csv')
    train_df.to_csv(train_path, index=False)
    test_df.to_csv(test_path, index=False)
    
    logger.info(f"Saved train CSV: {train_path} ({len(train_df)} pairs)")
    logger.info(f"Saved test CSV: {test_path} ({len(test_df)} pairs)")
    logger.info(f"Train label distribution: Label 1: {len(train_df[train_df['label'] == 1])}, Label 0: {len(train_df[train_df['label'] == 0])}")
    logger.info(f"Test label distribution: Label 1: {len(test_df[test_df['label'] == 1])}, Label 0: {len(test_df[test_df['label'] == 0])}")


In [6]:
if __name__ == "__main__":
    main()

2025-05-21 18:23:05,187 - INFO - Loading CSV
2025-05-21 18:23:05,189 - INFO - Validating paths
2025-05-21 18:23:05,311 - INFO - folder_images keys: ['D:/Projects/finalGPT/originals/1159', 'D:/Projects/finalGPT/originals/116', 'D:/Projects/finalGPT/originals/1161', 'D:/Projects/finalGPT/originals/1162', 'D:/Projects/finalGPT/originals/1163'] (first 5)
2025-05-21 18:23:05,312 - INFO - Face_Photo_Path sample: ['D:/Projects/finalGPT/originals/1159', 'D:/Projects/finalGPT/originals/116', 'D:/Projects/finalGPT/originals/1161', 'D:/Projects/finalGPT/originals/1162', 'D:/Projects/finalGPT/originals/1163']
2025-05-21 18:23:05,312 - INFO - Found 728 valid IDs with 3636 person images
2025-05-21 18:23:05,312 - INFO - Creating true pairs
2025-05-21 18:23:05,333 - INFO - Generated 3636 true pairs
2025-05-21 18:23:05,334 - INFO - Creating false pairs
2025-05-21 18:23:05,384 - INFO - Generated 2912 false pairs
2025-05-21 18:23:05,389 - INFO - Total pairs: 6548 (Label 1: 3636, Label 0: 2912)
2025-05-21