In [1]:
#import libraries
import h5py
import os
import shutil
import numpy as np
from sklearn.model_selection import train_test_split

In [2]:
# Mount Google Drive at /content/drive so the dataset files stored under
# /content/drive/MyDrive can be read from this Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
def load_dataset(data_dir='/content/drive/MyDrive/catvnoncat'):
    """Load the cat / non-cat dataset from its train and test HDF5 files.

    Args:
        data_dir: Directory that contains ``train_catvnoncat.h5`` and
            ``test_catvnoncat.h5``. Defaults to the Google Drive folder this
            notebook reads from.

    Returns:
        A tuple ``(train_set_x_orig, train_set_y_orig, test_set_x_orig,
        test_set_y_orig, classes)`` of NumPy arrays. Each label array is
        reshaped to ``(1, n)``, where ``n`` is the length of its first axis.
    """
    # The `with` blocks close each HDF5 handle as soon as its datasets have been
    # copied into in-memory NumPy arrays, so no file descriptor is leaked.
    with h5py.File(os.path.join(data_dir, 'train_catvnoncat.h5'), "r") as train_dataset:
        train_set_x_orig = np.array(train_dataset["train_set_x"][:])  # train set features
        train_set_y_orig = np.array(train_dataset["train_set_y"][:])  # train set labels

    with h5py.File(os.path.join(data_dir, 'test_catvnoncat.h5'), "r") as test_dataset:
        test_set_x_orig = np.array(test_dataset["test_set_x"][:])  # test set features
        test_set_y_orig = np.array(test_dataset["test_set_y"][:])  # test set labels
        classes = np.array(test_dataset["list_classes"][:])  # the list of classes

    # Turn the label arrays into row vectors of shape (1, n).
    train_set_y_orig = train_set_y_orig.reshape((1, train_set_y_orig.shape[0]))
    test_set_y_orig = test_set_y_orig.reshape((1, test_set_y_orig.shape[0]))

    return train_set_x_orig, train_set_y_orig, test_set_x_orig, test_set_y_orig, classes

In [4]:
# Read the original train / test arrays and the class names via load_dataset()
train_set_x_orig, train_set_y_orig, test_set_x_orig, test_set_y_orig, classes = load_dataset()

# Carve a validation set out of the training data: 80% stays for training,
# 20% is held out. The labels are passed transposed so that both arrays index
# examples along axis 0; the fixed random_state keeps the split reproducible.
train_x, val_x, train_y, val_y = train_test_split(
    train_set_x_orig,
    train_set_y_orig.T,
    test_size=0.2,
    random_state=1,
)

# Transpose the label splits back to row orientation
train_y, val_y = train_y.T, val_y.T

In [5]:
# Build a smaller "prototype" dataset (10% of each split by default) so that
# experiments can be iterated on quickly before using the full data.
def create_prototype_dataset(train_x, train_y, val_x, val_y, test_set_x_orig, test_set_y_orig, fraction=0.1):
    """Return the leading ``fraction`` of the train, validation and test splits.

    Feature arrays are sliced along axis 0 and label arrays along axis 1, so
    the i-th kept label still belongs to the i-th kept example.

    Args:
        train_x, val_x, test_set_x_orig: Feature arrays with examples on axis 0.
        train_y, val_y, test_set_y_orig: Label arrays with examples on axis 1.
        fraction: Share of each split to keep; must lie in ``[0, 1]``.

    Returns:
        A tuple ``(prototype_train_x, prototype_train_y, prototype_val_x,
        prototype_val_y, prototype_test_x, prototype_test_y)``.

    Raises:
        ValueError: If ``fraction`` is outside ``[0, 1]``.
    """
    # A negative fraction would become a negative slice bound (x[:-k]) and
    # silently keep most of the data; a fraction above 1 would silently keep
    # all of it. Reject both instead of returning a misleading "prototype".
    if not 0 <= fraction <= 1:
        raise ValueError(f"fraction must be between 0 and 1 (inclusive), got {fraction!r}")

    # Determine the number of examples to include in the prototype.
    # int() truncates, so a small split can end up with zero examples.
    num_train = int(train_x.shape[0] * fraction)
    num_val = int(val_x.shape[0] * fraction)
    num_test = int(test_set_x_orig.shape[0] * fraction)

    # Create prototype datasets by keeping the first num_* examples of each split
    prototype_train_x = train_x[:num_train]
    prototype_train_y = train_y[:, :num_train]
    prototype_val_x = val_x[:num_val]
    prototype_val_y = val_y[:, :num_val]
    prototype_test_x = test_set_x_orig[:num_test]
    prototype_test_y = test_set_y_orig[:, :num_test]

    return prototype_train_x, prototype_train_y, prototype_val_x, prototype_val_y, prototype_test_x, prototype_test_y

In [6]:
# Build the prototype splits using the function's default fraction
(
    proto_train_x,
    proto_train_y,
    proto_val_x,
    proto_val_y,
    proto_test_x,
    proto_test_y,
) = create_prototype_dataset(
    train_x,
    train_y,
    val_x,
    val_y,
    test_set_x_orig,
    test_set_y_orig,
)

# Report the shapes of the reduced splits as a quick sanity check
print(f"Prototype Training set: {proto_train_x.shape}, {proto_train_y.shape}")
print(f"Prototype Validation set: {proto_val_x.shape}, {proto_val_y.shape}")
print(f"Prototype Testing set: {proto_test_x.shape}, {proto_test_y.shape}")

Prototype Training set: (16, 64, 64, 3), (1, 16)
Prototype Validation set: (4, 64, 64, 3), (1, 4)
Prototype Testing set: (5, 64, 64, 3), (1, 5)
