In [26]:
import os
from PIL import Image
from collections import Counter


In [30]:
def check(folder):
    """Tally the PIL colour modes of the ``.jpg`` files directly inside ``folder``.

    Entries whose name does not end in ``.jpg`` (case-insensitive) are printed
    and left out of the tally. Sub-directories are not descended into.

    Args:
        folder: Path of the directory to scan.

    Returns:
        A ``collections.Counter`` mapping each ``Image.mode`` string
        (e.g. ``'RGB'``, ``'P'``, ``'L'``) to the number of files with that mode.
    """
    counter = Counter()
    for entry in os.listdir(folder):
        if not entry.lower().endswith(".jpg"):
            print(entry)
            continue
        # Image.open keeps the underlying file open; the context manager
        # releases the handle immediately instead of leaking one per image.
        with Image.open(f"{folder}/{entry}") as img:
            counter[img.mode] += 1

    return counter

In [31]:
cat_dir = "datasets/PetImages/Cat"
dog_dir = "datasets/PetImages/Dog"

# Print the colour-mode tally of each class folder, cats first.
for pet_dir in (cat_dir, dog_dir):
    modes = check(pet_dir)
    print(modes)

Counter({'RGB': 12470, 'P': 26, 'L': 3})
Counter({'RGB': 12461, 'P': 31, 'CMYK': 3, 'RGBA': 2, 'L': 2})


In [None]:

# NOTE(review): `filter_out_from` is not defined in any cell visible here, so
# this cell raises NameError on a fresh kernel unless it is defined elsewhere.
# Presumably it removes unwanted files from the given folder, which would
# explain why both folders are listed again afterwards — TODO confirm.
cat_images = os.listdir(cat_dir)
filter_out_from(cat_images, cat_dir)
dog_images = os.listdir(dog_dir)
filter_out_from(dog_images, dog_dir)
# Re-list both folders so the counts below reflect their current contents.
cat_images = os.listdir(cat_dir)
dog_images = os.listdir(dog_dir)
print(f"Number of cat images: {len(cat_images)}")
print(f"Number of dog images: {len(dog_images)}")


# Scratch check: print the colour mode of a single file named "image.png"
# (resolved relative to the current working directory).
img = Image.open("image.png")
print(img.mode)

In [36]:
import os
from PIL import Image

image_folder = "datasets/Cat"
# Dot-prefixed (hidden) entries are left out of the scan.
image_files = [f for f in os.listdir(image_folder) if not f.startswith(".")]

# Report every file that Pillow cannot open or that fails its integrity check.
# Nothing is deleted or moved; offending files are only printed.
for file_name in image_files:
    path = os.path.join(image_folder, file_name)
    try:
        # The context manager guarantees the file handle is released even
        # when verify() raises part-way through.
        with Image.open(path) as img:
            img.verify()  # check if it's a valid image
    except Exception as e:
        print(f"Skipping invalid image: {file_name}, reason: {e}")

In [37]:
import urllib.request
import zipfile
import os
import shutil
from PIL import Image


# Zip archive of the cats-and-dogs images, served from download.microsoft.com.
KAGGLE_CATS_DOGS_URL = (
    "https://download.microsoft.com/download/3/e/1/"
    "3e1c3f21-ecdb-4869-8368-6deba77b919f/kagglecatsanddogs_5340.zip"
)
# Folder that receives the downloaded archive and its extracted contents.
DOWNLOADED_DIR = "downloaded"
# Folder expected to exist once the archive has been extracted.
UNZIPPED_DIR = DOWNLOADED_DIR + "/PetImages"
# Folder that receives the filtered copies of the images.
DATASETS_DIR = "datasets"


def download_data():
    """Download and unpack the image archive, skipping steps already done.

    Steps:
      1. If ``UNZIPPED_DIR`` already exists, report it and do nothing else.
      2. Otherwise fetch ``KAGGLE_CATS_DOGS_URL`` into
         ``DOWNLOADED_DIR/catsdogs.zip`` unless that file is already present.
      3. Extract the archive into ``DOWNLOADED_DIR``.

    The download is written to a ``.part`` file and renamed only after it
    completes, so an interrupted transfer never leaves a truncated
    ``catsdogs.zip`` that a later run would mistake for a finished download.

    Raises:
        Whatever ``urllib.request.urlretrieve`` or ``zipfile`` raise; the
        partial download is removed before the error propagates.
    """
    os.makedirs(DOWNLOADED_DIR, exist_ok=True)
    download_path = f"{DOWNLOADED_DIR}/catsdogs.zip"

    if os.path.exists(UNZIPPED_DIR):
        print(f"Found unzipped {UNZIPPED_DIR}")
        return

    if os.path.exists(download_path):
        print(f"Found data {download_path}")
    else:
        partial_path = f"{download_path}.part"
        try:
            urllib.request.urlretrieve(KAGGLE_CATS_DOGS_URL, partial_path)
        except BaseException:
            # BaseException also covers KeyboardInterrupt, i.e. the user
            # interrupting the notebook cell in the middle of the transfer.
            if os.path.exists(partial_path):
                os.remove(partial_path)
            raise
        os.replace(partial_path, download_path)
        print(f"Downloaded to {download_path}")

    with zipfile.ZipFile(download_path, 'r') as zip_ref:
        zip_ref.extractall(DOWNLOADED_DIR)
    print(f"Unzipped to {DOWNLOADED_DIR}")


def extract_data_from(entries, folder):
    """Copy the usable images of one class into the datasets directory.

    A file is copied from ``UNZIPPED_DIR/<folder>`` to ``DATASETS_DIR/<folder>``
    only when its name ends in ``.jpg`` (case-insensitive), it is not empty,
    and Pillow reports its mode as ``'RGB'`` or ``'RGBA'``.

    Args:
        entries: File names (not paths) found in ``UNZIPPED_DIR/<folder>``.
        folder: Class sub-folder name, e.g. ``"Cat"`` or ``"Dog"``. The
            matching destination folder must already exist.
    """
    for file_name in entries:
        file_path = f"{UNZIPPED_DIR}/{folder}/{file_name}"
        if not file_name.lower().endswith(".jpg") or os.path.getsize(file_path) == 0:
            continue
        try:
            # Only the header is needed to learn the mode; the context manager
            # closes the handle instead of leaking one per image.
            with Image.open(file_path) as img:
                mode = img.mode
        except OSError as e:
            # Pillow signals unreadable/unidentifiable files with OSError
            # subclasses; one bad file should not abort the whole extraction.
            print(f"Skipping unreadable image: {file_path}, reason: {e}")
            continue
        if mode in ('RGB', 'RGBA'):
            shutil.copyfile(file_path, f"{DATASETS_DIR}/{folder}/{file_name}")


def extract_datasets():
    """Build ``DATASETS_DIR/Cat`` and ``DATASETS_DIR/Dog`` from the unpacked images.

    Creates the two class folders (re-running is harmless if they already
    exist), copies the accepted images of each class via
    ``extract_data_from``, then prints how many files each dataset folder
    holds next to the number of source entries, plus the combined total.
    """
    for label in ("Cat", "Dog"):
        os.makedirs(f"{DATASETS_DIR}/{label}", exist_ok=True)

    cat_images = os.listdir(f"{UNZIPPED_DIR}/Cat")
    dog_images = os.listdir(f"{UNZIPPED_DIR}/Dog")
    extract_data_from(cat_images, "Cat")
    extract_data_from(dog_images, "Dog")

    cat_ds = os.listdir(f"{DATASETS_DIR}/Cat")
    dog_ds = os.listdir(f"{DATASETS_DIR}/Dog")
    # The backslash is escaped explicitly: a lone "\ " is an invalid escape
    # sequence and triggers a SyntaxWarning on recent Python versions.
    print(f"\nCats: {len(cat_ds)} \\ {len(cat_images)}")
    print(f"Dogs: {len(dog_ds)} \\ {len(dog_images)}")
    print(f"Total: {len(cat_ds) + len(dog_ds)}")


def clean_datasets():
    """Delete the ``DATASETS_DIR`` tree if it exists.

    Best-effort: a failure to delete is reported on stdout rather than
    raised, and a missing directory is reported as such.
    """
    if not os.path.exists(DATASETS_DIR):
        print("No datasets directory.")
        return

    try:
        shutil.rmtree(DATASETS_DIR)
        print("Successfully cleaned datasets")
    except OSError as e:
        # rmtree reports filesystem problems (permissions, files in use, ...)
        # as OSError; anything else is a programming error and should surface.
        print(f"Error deleting datasets: {e}")



# Rebuild the datasets from scratch. Order matters: clean first so the dataset
# folders start out empty, then make sure the unpacked images are available
# before extraction reads them.
clean_datasets()
download_data()
extract_datasets()

Successfully cleaned datasets
Found unzipped downloaded/PetImages

Cats: 12470 \ 12501
Dogs: 12463 \ 12501


In [44]:
# Re-count the files currently present in each dataset class folder.
cat_ds, dog_ds = (
    os.listdir(f"{DATASETS_DIR}/{label}") for label in ("Cat", "Dog")
)
print(f"Total: {sum(map(len, (cat_ds, dog_ds)))}")

Total: 24933
