In [65]:
import os
from zipfile import ZipFile
import random
import tensorflow as tf
from tensorflow import keras
from keras.optimizers import RMSprop
from keras.preprocessing.image import ImageDataGenerator
from keras.layers import Resizing, Rescaling
from shutil import copyfile
from skimage.transform import resize
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm

In [10]:
# extract the data from the zip file
local_zip = 'data/kagglecatsanddogs_5340.zip'

# The archive handle is deliberately not called `zip`: a top-level `with ... as zip`
# would rebind the builtin zip() for every cell that runs afterwards in this kernel.
with ZipFile(local_zip, 'r') as archive:
    archive.extractall("data/")

In [11]:
# number of entries in each raw class folder (cats first, then dogs)
for label in ("Cat", "Dog"):
    entries = os.listdir(f"data/PetImages/{label}/")
    print(len(entries))

12501
12501


In [34]:
# create folders for the resized images
create_folder_names = ["data/resized_images", "data/resized_images/cats", "data/resized_images/dogs"]

# Parents are listed before children, and makedirs also creates any missing
# parent, so the cell works on a fresh checkout and can be re-run safely.
# Real failures (permissions, a file in the way) are raised instead of being
# reported as "not created" by a catch-all except.
for folder in create_folder_names:
    already_present = os.path.isdir(folder)
    os.makedirs(folder, exist_ok=True)

    if already_present:
        print("Directory " + folder + " not created")
    else:
        print("Directory " + folder + " created")

Directory data/resized_images created
Directory data/resized_images/cats created
Directory data/resized_images/dogs created


In [67]:
# resize the images and save them in the resized_images folder
def resize_image(source, destination, width, height):
    """Resize every non-empty ``.jpg`` in ``source`` and save it to ``destination``.

    Args:
        source: Folder containing the original images.
        destination: Existing folder that receives the resized copies; each
            copy keeps the file name of its original.
        width: Target width in pixels.
        height: Target height in pixels.

    Files that are not ``.jpg`` or that are zero bytes long are skipped, and a
    line is printed for each skipped file.
    """
    image_paths = []

    for name in os.listdir(source):
        path = os.path.join(source, name)

        if not name.endswith(".jpg"):
            # e.g. Thumbs.db: not an image, so do not report it as "empty"
            print(f"{path} is not a .jpg file, skipped")
        elif not os.path.getsize(path):
            print(f"{path} is empty")
        else:
            image_paths.append(path)

    for path in tqdm(image_paths):
        img = plt.imread(path)
        # skimage expects output_shape as (rows, cols), i.e. (height, width)
        res = resize(img, (height, width))
        # cmap/vmin/vmax only affect 2-D (grayscale) arrays: without them imsave
        # would colour-map such images with the default colormap and stretch
        # their contrast. RGB(A) arrays ignore these arguments.
        plt.imsave(
            os.path.join(destination, os.path.basename(path)),
            res,
            cmap="gray",
            vmin=0,
            vmax=1,
        )

    print(f"{len(image_paths)} images resized")
        

# resize both classes to 150x150, each into its own output folder
for raw_dir, resized_dir in (
    ("data/PetImages/Cat", "data/resized_images/cats"),
    ("data/PetImages/Dog", "data/resized_images/dogs"),
):
    resize_image(raw_dir, resized_dir, 150, 150)

data/PetImages/Cat/666.jpg is empty
data/PetImages/Cat/Thumbs.db is empty


100%|██████████| 12499/12499 [04:15<00:00, 48.92it/s]


12499 images resized
data/PetImages/Dog/11702.jpg is empty
data/PetImages/Dog/Thumbs.db is empty


100%|██████████| 12499/12499 [04:03<00:00, 51.23it/s]

12499 images resized





In [66]:
# create folders for the train, validation and test sets
create_folder_names = [
    "data/cats-and-dogs",
    "data/cats-and-dogs/train",
    "data/cats-and-dogs/test",
    "data/cats-and-dogs/validation",
    "data/cats-and-dogs/train/cats",
    "data/cats-and-dogs/train/dogs",
    "data/cats-and-dogs/test/cats",
    "data/cats-and-dogs/test/dogs",
    "data/cats-and-dogs/validation/cats",
    "data/cats-and-dogs/validation/dogs"
]

# Parents are listed before children, and makedirs also creates any missing
# parent, so the cell can be re-run safely. Real failures (permissions, a file
# in the way) are raised instead of being reported as "not created" by a
# catch-all except.
for folder in tqdm(create_folder_names):
    already_present = os.path.isdir(folder)
    os.makedirs(folder, exist_ok=True)

    if already_present:
        print("Directory " + folder + " not created")
    else:
        print("Directory " + folder + " created")

100%|██████████| 10/10 [00:00<00:00, 2488.46it/s]

Directory data/cats-and-dogs created
Directory data/cats-and-dogs/train created
Directory data/cats-and-dogs/test created
Directory data/cats-and-dogs/validation created
Directory data/cats-and-dogs/train/cats created
Directory data/cats-and-dogs/train/dogs created
Directory data/cats-and-dogs/test/cats created
Directory data/cats-and-dogs/test/dogs created
Directory data/cats-and-dogs/validation/cats created
Directory data/cats-and-dogs/validation/dogs created





In [68]:
# split the data into train, validation and test sets
def split_data(source, train_dir, test_dir, val_dir, train_size, test_size, seed=None):
    """Randomly copy the non-empty files in ``source`` into three split folders.

    Args:
        source: Folder holding the files to split.
        train_dir: Existing folder that receives the training files.
        test_dir: Existing folder that receives the test files.
        val_dir: Existing folder that receives the validation files
            (everything left after the train and test files are taken).
        train_size: Fraction of the usable files used for training.
        test_size: Fraction of the usable files used for testing.
        seed: Optional seed for the shuffle. With the default ``None`` the
            module-level ``random`` state is used, so every call shuffles
            differently; pass an int to get the same split on every run.

    Raises:
        ValueError: If a fraction is negative or the two fractions sum to
            more than 1.

    Note:
        Files are copied without clearing the destination folders, so
        re-running with a different shuffle leaves files from the previous
        run in place and the splits can end up overlapping.
    """
    # Tolerance for float rounding in the fraction arithmetic: without it,
    # int() truncates e.g. 3 * (0.3 / (1 - 0.7)) to 2 instead of 3.
    eps = 1e-9

    if train_size < 0 or test_size < 0 or train_size + test_size > 1 + eps:
        raise ValueError(
            "train_size and test_size must be non-negative and sum to at most 1"
        )

    usable_files = []

    # Sorted so that a given seed always produces the same split; the order
    # returned by os.listdir is arbitrary.
    for name in sorted(os.listdir(source)):
        path = os.path.join(source, name)

        if os.path.getsize(path):
            usable_files.append(path)
        else:
            print(f"{path} is empty")

    rng = random if seed is None else random.Random(seed)
    shuffled = rng.sample(usable_files, len(usable_files))

    n_train = min(len(shuffled), int(len(shuffled) * train_size + eps))
    train_set = shuffled[:n_train]
    remaining = shuffled[n_train:]

    # Share of the non-training files that goes to the test set. When
    # everything is used for training there is nothing left to divide.
    if train_size < 1:
        test_share = test_size / (1 - train_size)
    else:
        test_share = 0.0

    n_test = min(len(remaining), int(len(remaining) * test_share + eps))
    test_set = remaining[:n_test]
    val_set = remaining[n_test:]

    for subset, target_dir in (
        (train_set, train_dir),
        (test_set, test_dir),
        (val_set, val_dir),
    ):
        for path in tqdm(subset):
            copyfile(path, os.path.join(target_dir, os.path.basename(path)))
        
# fractions of each class used for training and testing; the rest is validation
train_size, test_size = 0.8, 0.15

# cats: resized source folder followed by its three split destinations
cat_src_dir = "data/resized_images/cats"
train_cat_dir, test_cat_dir, val_cat_dir = (
    "data/cats-and-dogs/train/cats",
    "data/cats-and-dogs/test/cats",
    "data/cats-and-dogs/validation/cats",
)

# dogs: same layout as the cats above
dog_src_dir = "data/resized_images/dogs"
dog_train_dir, dog_test_dir, dog_val_dir = (
    "data/cats-and-dogs/train/dogs",
    "data/cats-and-dogs/test/dogs",
    "data/cats-and-dogs/validation/dogs",
)

# split the cats first, then the dogs, with the same fractions
for split_args in (
    (cat_src_dir, train_cat_dir, test_cat_dir, val_cat_dir),
    (dog_src_dir, dog_train_dir, dog_test_dir, dog_val_dir),
):
    split_data(*split_args, train_size, test_size)

100%|██████████| 9999/9999 [00:46<00:00, 216.48it/s]
100%|██████████| 1875/1875 [00:08<00:00, 217.82it/s]
100%|██████████| 625/625 [00:02<00:00, 218.36it/s]
100%|██████████| 9999/9999 [00:46<00:00, 215.05it/s]
100%|██████████| 1875/1875 [00:08<00:00, 215.60it/s]
100%|██████████| 625/625 [00:02<00:00, 220.49it/s]
