In [1]:
import os
from zipfile import ZipFile
import random
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.optimizers import RMSprop
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from keras.layers import Resizing, Rescaling
from shutil import copyfile
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
sess = tf.compat.v1.Session(config=tf.compat.v1.ConfigProto(log_device_placement=True))

Device mapping:
/job:localhost/replica:0/task:0/device:GPU:0 -> device: 0, name: NVIDIA RTX A6000, pci bus id: 0000:21:00.0, compute capability: 8.6



In [2]:
# extract the data from the zip file
local_zip = 'data/kagglecatsanddogs_5340.zip'

with ZipFile(local_zip, 'r') as zip:
    zip.extractall("data/")
os.remove("data/CDLA-Permissive-2.0.pdf")
os.remove("data/readme[1].txt")

In [3]:
print(len(os.listdir('data/PetImages/Cat/')))
print(len(os.listdir('data/PetImages/Dog/')))

12501
12501


In [6]:
# create folders for the resized images
create_folder_names = ["data/resized_images"]

for dir in create_folder_names:
    try:
        os.mkdir(dir)
        print("Directory " + dir + " created")
    except:
        print("Directory " + dir + " not created")

Directory data/resized_images created


In [8]:
# resize the images and save them in the resized_images folder
from keras.preprocessing.image import load_img, img_to_array, array_to_img

def resize_image(source, destination, width, height):
    all_files = []
    type = source.split("/")[-1]
    
    for file in os.listdir(source):
        file_path = source + "/" + file
    
        if os.path.getsize(file_path) and file.endswith(".jpg"):
            all_files.append(file_path)
        else:
            print(f"{file_path} is empty")
        
    for file in tqdm(all_files):       
        img = load_img(file, target_size=(width, height))
        img = img_to_array(img)
        img = array_to_img(img)
        img.save(destination + "/" + type.lower() + "_" + file.split("/")[-1])
        
    print(f"{len(all_files)} images resized")
        

resize_image("data/PetImages/Cat", "data/resized_images/", 150, 150)
resize_image("data/PetImages/Dog", "data/resized_images/", 150, 150)

data/PetImages/Cat/666.jpg is empty
data/PetImages/Cat/Thumbs.db is empty


100%|██████████| 12499/12499 [02:12<00:00, 94.52it/s] 


12499 images resized
data/PetImages/Dog/11702.jpg is empty
data/PetImages/Dog/Thumbs.db is empty


100%|██████████| 12499/12499 [02:33<00:00, 81.32it/s] 

12499 images resized





In [9]:
# create folders for the train, validation and test sets
create_folder_names = [
    "data/pets_images",
    "data/pets_images/train",
    "data/pets_images/test",
    "data/pets_images/validation",
]

for dir in create_folder_names:
    try:
        os.mkdir(dir)
        print("Directory " + dir + " created")
    except:
        print("Directory " + dir + " not created")

Directory data/pets_images created
Directory data/pets_images/train created
Directory data/pets_images/test created
Directory data/pets_images/validation created


In [10]:
# split the data into train, validation and test sets
def split_data(source, train_dir, test_dir, val_dir, train_size, test_size):
    all_filess = []
    
    for file in os.listdir(source):
        file_path = source + "/" + file
        
        if os.path.getsize(file_path):
            all_filess.append(file_path)
        else:
            print(f"{file_path} is empty")
    
    n_files = len(all_filess)
    split_point = int(n_files * train_size)
    
    shuffled = random.sample(all_filess, n_files)
    train_set = shuffled[:split_point]
    others = shuffled[split_point:]
    
    n_files = len(others)
    split_point = int(n_files * (test_size / (1 - train_size)))
    
    test_set = others[:split_point]
    val_set = others[split_point:]
    
    for file in tqdm(train_set):
        copyfile(file, train_dir + "/" + file.split("/")[-1])
    
    for file in tqdm(test_set):
        copyfile(file, test_dir + "/" + file.split("/")[-1])
        
    for file in tqdm(val_set):
        copyfile(file, val_dir + "/" + file.split("/")[-1])
        
src_dir = "data/resized_images/"
train_dir = "data/pets_images/train/"
test_dir = "data/pets_images/test/"
val_dir = "data/pets_images/validation/"

train_size = 0.7
test_size = 0.15

split_data(src_dir, train_dir, test_dir, val_dir, train_size, test_size)

100%|██████████| 17498/17498 [02:00<00:00, 145.53it/s]
100%|██████████| 3749/3749 [00:27<00:00, 135.62it/s]
100%|██████████| 3751/3751 [00:27<00:00, 137.08it/s]
