In [3]:
import os
import numpy as np
import pandas as pd
from PIL import Image
from collections import Counter
from tqdm import tqdm
import matplotlib.pyplot as plt

In [6]:
def dataset_analysis(base_path='./cat', max_cats_to_check=1000):
    """Summarise a folder-per-subject image dataset and print basic statistics.

    Every immediate sub-directory of ``base_path`` is treated as one cat
    (subject). Up to ``max_cats_to_check`` of them, taken in sorted name
    order, are scanned: the image files in each are counted and one sample
    image per folder (the first file name in sorted order) is opened to
    record its pixel size and format.

    Args:
        base_path: Directory whose sub-directories each hold one cat's images.
        max_cats_to_check: Upper bound on the number of sub-directories to
            scan. ``None`` scans all of them.

    Returns:
        dict with three lists:
            'images_per_cat': image-file count for each scanned folder.
            'image_sizes': ``(width, height)`` of each sample image that
                could be opened.
            'formats': format reported by PIL for each such sample image.
        'image_sizes' and 'formats' stay aligned with each other, but skip
        folders that contain no images or whose sample image was unreadable,
        so they can be shorter than 'images_per_cat'.

    Raises:
        OSError: if ``base_path`` or one of its sub-directories cannot be
            listed (propagated from ``os.listdir``).
    """
    image_exts = ('.jpg', '.jpeg', '.png', '.bmp')

    cat_folders = sorted(
        d for d in os.listdir(base_path)
        if os.path.isdir(os.path.join(base_path, d))
    )
    print(f"Found {len(cat_folders)} cats folders (subjects)")

    # Slice once so the loop and the report agree on what was scanned.
    scanned_folders = cat_folders[:max_cats_to_check]

    images_per_cat = []
    image_sizes = []
    formats = []
    unreadable = []  # (path, exception) for sample images that failed to open

    for cat_folder in tqdm(scanned_folders, desc="Scanning"):
        folder_path = os.path.join(base_path, cat_folder)
        # os.listdir order is arbitrary; sort so the sampled image is
        # the same on every run and every filesystem.
        images = sorted(
            f for f in os.listdir(folder_path)
            if f.lower().endswith(image_exts)
        )
        images_per_cat.append(len(images))

        if not images:
            continue

        img_path = os.path.join(folder_path, images[0])
        try:
            with Image.open(img_path) as im:
                size, fmt = im.size, im.format
        except Exception as exc:
            # Best-effort scan: keep going, but remember the failure so it
            # can be reported instead of vanishing silently.
            unreadable.append((img_path, exc))
            continue

        # Append both together so the two lists never get out of step.
        image_sizes.append(size)
        formats.append(fmt)

    # Report the number of folders actually scanned, not the requested cap.
    print(f"\nSTATISTICS (from {len(scanned_folders)} cats):")
    if images_per_cat:
        images_array = np.array(images_per_cat)
        print(f"   Images per cat - Mean: {images_array.mean():.2f}, "
              f"Min: {images_array.min()}, Max: {images_array.max()}")
    else:
        # min()/max() of an empty array raise, so say so explicitly instead.
        print("   No cat folders were scanned")

    if image_sizes:
        widths = [s[0] for s in image_sizes]
        heights = [s[1] for s in image_sizes]
        print(f"   Image width - Mean: {np.mean(widths):.0f}px, "
              f"Range: [{min(widths)}, {max(widths)}]")
        print(f"   Image height - Mean: {np.mean(heights):.0f}px, "
              f"Range: [{min(heights)}, {max(heights)}]")
    if formats:
        format_counts = Counter(formats)
        print(f"   Formats: {dict(format_counts)}")
    if unreadable:
        first_path, first_exc = unreadable[0]
        print(f"   WARNING: {len(unreadable)} sample image(s) could not be read "
              f"(first: {first_path}: {first_exc!r})")

    return {
        'images_per_cat': images_per_cat,
        'image_sizes': image_sizes,
        'formats': formats
    }

# Cap of 164100 matches the folder count reported in the recorded run below,
# so every subject folder is scanned.
stats = dataset_analysis(base_path='./cat', max_cats_to_check=164100)

Found 164100 cats folders (subjects)


Scanning: 100%|██████████| 164100/164100 [00:10<00:00, 15144.06it/s]



STATISTICS (from 164100 cats):
   Images per cat - Mean: 3.92, Min: 2, Max: 10
   Image width - Mean: 224px, Range: [224, 224]
   Image height - Mean: 224px, Range: [224, 224]
   Formats: {'PNG': 164100}
