# Dataset File Counter

This notebook analyzes the structure of image datasets organized in the standard format:
- Each dataset has train/test/val splits
- Each split has real (0) and fake (1) image subfolders

The optimized version includes:
1. Performance timing
2. Progress indicators
3. Consolidated results display
4. Error handling for missing folders

In [2]:
import os
import os.path as osp

def count_files_in_folder(folder_path):
    """Count the number of files in a folder."""
    if not os.path.exists(folder_path):
        print(f"Folder {folder_path} does not exist.")
        return 0
    
    # Get the list of files in the folder
    files = [f for f in os.listdir(folder_path) if os.path.isfile(os.path.join(folder_path, f))]
    return len(files)

# Define the main folder path
data_folder = "/home/vincent/Thesis-master/dataset/StyleGAN2_256"
folder_name = osp.basename(data_folder)

# Print the dataset folder being analyzed
print(f"Analyzing dataset: {folder_name}")
print(f"Full path: {data_folder}")
print("-" * 50)

# Dataset splits
splits = ["train", "val", "test"]

# Results dictionary to store counts
results = {}
total_real = 0
total_fake = 0

# Count files in each split
for split in splits:
    split_path = os.path.join(data_folder, split)
    
    if not os.path.exists(split_path):
        print(f"Split folder {split_path} does not exist.")
        continue
    
    # Count real images (0)
    real_path = os.path.join(split_path, "0")
    real_count = count_files_in_folder(real_path)
    total_real += real_count
    
    # Count fake images (1)
    fake_path = os.path.join(split_path, "1")
    fake_count = count_files_in_folder(fake_path)
    total_fake += fake_count
    
    # Store results for this split
    results[split] = {
        "real": real_count,
        "fake": fake_count,
        "total": real_count + fake_count
    }
    
    print(f"\n{split.upper()} Split ({folder_name}/{split}):")
    print(f"  Real images (0): {real_count}")
    print(f"  Fake images (1): {fake_count}")
    print(f"  Total: {real_count + fake_count}")

# Print overall totals
print("\nOVERALL TOTALS FOR DATASET:", folder_name)
print(f"Total real images (0): {total_real}")
print(f"Total fake images (1): {total_fake}")
print(f"Total images in dataset: {total_real + total_fake}")
print("-" * 50)

Analyzing dataset: StyleGAN2_256
Full path: /home/vincent/Thesis-master/dataset/StyleGAN2_256
--------------------------------------------------

TRAIN Split (StyleGAN2_256/train):
  Real images (0): 35100
  Fake images (1): 42356
  Total: 77456

VAL Split (StyleGAN2_256/val):
  Real images (0): 3900
  Fake images (1): 3900
  Total: 7800

TEST Split (StyleGAN2_256/test):
  Real images (0): 30000
  Fake images (1): 30000
  Total: 60000

OVERALL TOTALS FOR DATASET: StyleGAN2_256
Total real images (0): 69000
Total fake images (1): 76256
Total images in dataset: 145256
--------------------------------------------------


In [7]:
# Function to analyze multiple dataset folders
def analyze_all_datasets(base_path="/home/vincent/Thesis-master/dataset"):
    """
    Analyze all dataset folders in the base path
    """
    # Find all directories in the base path
    dataset_folders = [f for f in os.listdir(base_path) 
                      if os.path.isdir(os.path.join(base_path, f))]
    
    print(f"Found {len(dataset_folders)} dataset folders: {dataset_folders}")
    
    grand_total_real = 0
    grand_total_fake = 0
    
    for dataset in dataset_folders:
        data_folder = os.path.join(base_path, dataset)
        folder_name = osp.basename(data_folder)
        
        print("\n" + "="*60)
        print(f"ANALYZING DATASET: {folder_name}")
        print(f"Full path: {data_folder}")
        print("="*60)
        
        # Dataset splits
        splits = ["train", "val", "test"]
        
        # Results dictionary to store counts
        dataset_real = 0
        dataset_fake = 0
        
        # Count files in each split
        for split in splits:
            split_path = os.path.join(data_folder, split)
            
            if not os.path.exists(split_path):
                print(f"Split folder {split_path} does not exist.")
                continue
            
            # Count real images (0)
            real_path = os.path.join(split_path, "0")
            real_count = count_files_in_folder(real_path)
            dataset_real += real_count
            
            # Count fake images (1)
            fake_path = os.path.join(split_path, "1")
            fake_count = count_files_in_folder(fake_path)
            dataset_fake += fake_count
            
            print(f"\n{split.upper()} Split ({folder_name}/{split}):")
            print(f"  Real images (0): {real_count}")
            print(f"  Fake images (1): {fake_count}")
            print(f"  Total: {real_count + fake_count}")
        
        # Print dataset totals
        print(f"\nTOTALS FOR DATASET: {folder_name}")
        print(f"Total real images (0): {dataset_real}")
        print(f"Total fake images (1): {dataset_fake}")
        print(f"Total images in dataset: {dataset_real + dataset_fake}")
        
        grand_total_real += dataset_real
        grand_total_fake += dataset_fake
    
    # Print grand totals
    print("\n" + "="*60)
    print("GRAND TOTALS ACROSS ALL DATASETS")
    print(f"Total real images (0): {grand_total_real}")
    print(f"Total fake images (1): {grand_total_fake}")
    print(f"Total images: {grand_total_real + grand_total_fake}")
    print("="*60)

# Uncomment to run analysis on all datasets
analyze_all_datasets()




Found 5 dataset folders: ['StarGAN_128', 'StyleGAN_256', 'StyleGAN2_256', 'DEFACTO_256', 'FLUX1_256']

ANALYZING DATASET: StarGAN_128
Full path: /home/vincent/Thesis-master/dataset\StarGAN_128

TRAIN Split (StarGAN_128/train):
  Real images (0): 137239
  Fake images (1): 137239
  Total: 274478

VAL Split (StarGAN_128/val):
  Real images (0): 15260
  Fake images (1): 15260
  Total: 30520

TEST Split (StarGAN_128/test):
  Real images (0): 50000
  Fake images (1): 50000
  Total: 100000

TOTALS FOR DATASET: StarGAN_128
Total real images (0): 202499
Total fake images (1): 202499
Total images in dataset: 404998

ANALYZING DATASET: StyleGAN_256
Full path: /home/vincent/Thesis-master/dataset\StyleGAN_256

TRAIN Split (StyleGAN_256/train):
  Real images (0): 35099
  Fake images (1): 33739
  Total: 68838

VAL Split (StyleGAN_256/val):
  Real images (0): 3900
  Fake images (1): 3900
  Total: 7800

TEST Split (StyleGAN_256/test):
  Real images (0): 30000
  Fake images (1): 30000
  Total: 60000

TO

In [9]:
import time
from collections import defaultdict
import os
import os.path as osp

def analyze_nested_datasets(base_path="/home/vincent/Thesis-master/dataset"):
    """
    Analyze datasets with nested directory structure:
    - base_path/
      - Diffusion/
        - FLUX1_256/
        - StableDiffusion_256/
      - GAN/
        - StarGAN_128/
        - StyleGAN_256/
        - StyleGAN2_256/
      - Handcrafted/
        - DEFACTO_256/
    """
    start_time = time.time()
    
    # Find all category directories in the base path
    category_folders = [f for f in os.listdir(base_path) 
                      if os.path.isdir(os.path.join(base_path, f))]
    
    print(f"Found {len(category_folders)} dataset categories: {category_folders}")
    
    # Store results for visualization later
    all_results = defaultdict(dict)
    grand_total_real = 0
    grand_total_fake = 0
    
    # For each category (Diffusion, GAN, Handcrafted)
    for category in category_folders:
        category_path = os.path.join(base_path, category)
        print(f"\n{'='*60}")
        print(f"ANALYZING CATEGORY: {category}")
        print(f"{'='*60}")
        
        # Find all dataset folders within this category
        dataset_folders = [f for f in os.listdir(category_path) 
                          if os.path.isdir(os.path.join(category_path, f))]
        
        print(f"Found {len(dataset_folders)} datasets in {category}: {dataset_folders}")
        
        category_real = 0
        category_fake = 0
        
        # For each dataset within the category
        for dataset in dataset_folders:
            dataset_start_time = time.time()
            data_folder = os.path.join(category_path, dataset)
            folder_name = osp.basename(data_folder)
            full_name = f"{category}/{folder_name}"
            
            print(f"\n{'-'*60}")
            print(f"ANALYZING DATASET: {full_name}")
            print(f"Full path: {data_folder}")
            print(f"{'-'*60}")
            
            # Dataset splits
            splits = ["train", "val", "test", "validation"]  # Added "validation" as it appears in some datasets
            
            # Results for this dataset
            dataset_real = 0
            dataset_fake = 0
            dataset_results = {}
            
            # Count files in each split
            for split in splits:
                split_path = os.path.join(data_folder, split)
                
                if not os.path.exists(split_path):
                    # Skip without printing for expected missing splits
                    continue
                
                # Instead of calling count_files_in_folder multiple times,
                # do a single directory scan for efficiency
                real_count = 0
                fake_count = 0
                
                # Count real images (0)
                real_path = os.path.join(split_path, "0")
                if os.path.exists(real_path):
                    real_count = len([f for f in os.listdir(real_path) 
                                     if os.path.isfile(os.path.join(real_path, f))])
                
                # Count fake images (1)
                fake_path = os.path.join(split_path, "1")
                if os.path.exists(fake_path):
                    fake_count = len([f for f in os.listdir(fake_path) 
                                     if os.path.isfile(os.path.join(fake_path, f))])
                
                # Store results for this split
                dataset_results[split] = {
                    "real": real_count,
                    "fake": fake_count,
                    "total": real_count + fake_count
                }
                
                dataset_real += real_count
                dataset_fake += fake_count
                
                print(f"  {split.upper()} Split: Real: {real_count}, Fake: {fake_count}, Total: {real_count + fake_count}")
            
            # Store dataset totals
            all_results[category][dataset] = {
                "real": dataset_real,
                "fake": dataset_fake,
                "total": dataset_real + dataset_fake,
                "splits": dataset_results
            }
            
            # Print dataset totals
            print(f"\nTOTALS FOR DATASET {full_name}:")
            print(f"  Real images (0): {dataset_real}")
            print(f"  Fake images (1): {dataset_fake}")
            print(f"  Total images: {dataset_real + dataset_fake}")
            print(f"  Processing time: {time.time() - dataset_start_time:.2f} seconds")
            
            category_real += dataset_real
            category_fake += dataset_fake
        
        # Category totals
        print(f"\n{'-'*60}")
        print(f"TOTALS FOR CATEGORY: {category}")
        print(f"  Real images (0): {category_real}")
        print(f"  Fake images (1): {category_fake}")
        print(f"  Total images: {category_real + category_fake}")
        print(f"{'-'*60}")
        
        grand_total_real += category_real
        grand_total_fake += category_fake
    
    # Print grand totals
    total_time = time.time() - start_time
    print(f"\n{'='*60}")
    print("GRAND TOTALS ACROSS ALL DATASETS")
    print(f"Total real images (0): {grand_total_real}")
    print(f"Total fake images (1): {grand_total_fake}")
    print(f"Total images: {grand_total_real + grand_total_fake}")
    print(f"Total processing time: {total_time:.2f} seconds")
    print(f"{'='*60}")
    
    return all_results

# Run the nested dataset analysis
all_results = analyze_nested_datasets()

Found 3 dataset categories: ['Handcrafted', 'GAN', 'Diffusion']

ANALYZING CATEGORY: Handcrafted
Found 1 datasets in Handcrafted: ['DEFACTO_256']

------------------------------------------------------------
ANALYZING DATASET: Handcrafted/DEFACTO_256
Full path: /home/vincent/Thesis-master/dataset\Handcrafted\DEFACTO_256
------------------------------------------------------------
  TRAIN Split: Real: 36100, Fake: 39680, Total: 75780
  TEST Split: Real: 30000, Fake: 30000, Total: 60000
  VALIDATION Split: Real: 3900, Fake: 9920, Total: 13820

TOTALS FOR DATASET Handcrafted/DEFACTO_256:
  Real images (0): 70000
  Fake images (1): 79600
  Total images: 149600
  Processing time: 171.33 seconds

------------------------------------------------------------
TOTALS FOR CATEGORY: Handcrafted
  Real images (0): 70000
  Fake images (1): 79600
  Total images: 149600
------------------------------------------------------------

ANALYZING CATEGORY: GAN
Found 3 datasets in GAN: ['StarGAN_128', 'Style

## Nested Dataset Structure Analysis

This enhanced function handles the hierarchical dataset structure:

```
dataset/
├── Diffusion/
│   ├── FLUX1_256/
│   └── StableDiffusion_256/
├── GAN/
│   ├── StarGAN_128/
│   ├── StyleGAN_256/
│   └── StyleGAN2_256/
└── Handcrafted/
    └── DEFACTO_256/
```

Key improvements:
1. **Multi-level traversal**: Handles nested directories by category then dataset
2. **Optimized scanning**: Eliminates redundant file system calls
3. **Performance monitoring**: Tracks execution time at dataset and overall level
4. **Structured results**: Stores data in a nested dictionary for potential visualization
5. **Flexible split naming**: Handles both "val" and "validation" directory names

In [None]:
# Example usage and visualization
try:
    import pandas as pd
    import matplotlib.pyplot as plt
    has_visualization_libs = True
except ImportError:
    has_visualization_libs = False
    print("Note: For visualization, install pandas and matplotlib using:")
    print("pip install pandas matplotlib")

def visualize_dataset_stats(results):
    """Create visualization of dataset statistics"""
    if not has_visualization_libs:
        print("Visualization libraries not available. Install pandas and matplotlib.")
        return
    
    # Prepare data for visualization
    data = []
    
    # Convert nested dictionary to flat list for DataFrame
    for category, datasets in results.items():
        for dataset, stats in datasets.items():
            data.append({
                'Category': category,
                'Dataset': dataset,
                'Real Images': stats['real'],
                'Fake Images': stats['fake'],
                'Total Images': stats['total'],
                'Real/Fake Ratio': stats['real'] / stats['fake'] if stats['fake'] > 0 else float('inf')
            })
    
    # Convert to DataFrame
    df = pd.DataFrame(data)
    
    # Plot total images by dataset
    plt.figure(figsize=(12, 8))
    
    # Group by category, then sort by total images
    df = df.sort_values(['Category', 'Total Images'], ascending=[True, False])
    
    # Create bar chart
    ax = plt.subplot(111)
    bar_width = 0.35
    index = range(len(df))
    
    # Plot bars
    real_bars = ax.bar(index, df['Real Images'], bar_width, label='Real Images', color='green')
    fake_bars = ax.bar([i + bar_width for i in index], df['Fake Images'], bar_width, label='Fake Images', color='red')
    
    # Add labels and title
    plt.xlabel('Dataset')
    plt.ylabel('Number of Images')
    plt.title('Real vs. Fake Image Distribution Across Datasets')
    plt.xticks([i + bar_width/2 for i in index], [f"{cat}/{ds}" for cat, ds in zip(df['Category'], df['Dataset'])], rotation=45, ha='right')
    plt.legend()
    plt.tight_layout()
    
    # Print summary table
    print("\nDATASET SUMMARY TABLE:")
    print(df.to_string(index=False))
    
    # Category summary
    print("\nCATEGORY SUMMARY:")
    category_summary = df.groupby('Category').sum()[['Real Images', 'Fake Images', 'Total Images']]
    category_summary['Real/Fake Ratio'] = category_summary['Real Images'] / category_summary['Fake Images']
    print(category_summary)
    
    plt.show()
    return df

# Run the analysis and visualize the results
# Uncomment the lines below to run
# all_results = analyze_nested_datasets()
# df_stats = visualize_dataset_stats(all_results)

## Running the Analysis

To analyze your dataset structure and visualize the results:

1. Run the cell below to execute the analysis on your nested directory structure
2. The function will print detailed information about each dataset
3. A visualization will show the distribution of real vs. fake images across all datasets
4. Summary statistics will be displayed in tabular format

This will help you understand the balance of your dataset, which is important for training machine learning models.