In [1]:
from google.colab.patches import cv2_imshow
import cv2
import matplotlib.pyplot as plt
from keras.datasets import mnist, fashion_mnist
from sklearn.model_selection import train_test_split
import numpy as np
from tqdm import tqdm
import os, json
from sklearn.utils import shuffle

## MNIST

In [None]:
# RUNNING CONFIGURATION
RESCALED_IMAGE_HEIGHT = 4
RESCALED_IMAGE_WIDTH = 4
DESIRE_CLASSES = {3, 6}
PARENT_DIR = "/content/drive/MyDrive/UTSA Student/Google Colab/Spring 2024/EE5423 HW4ML/Project/dataset"
OUTPUT_DIR = "./MNIST-2"
OUTPUT_X_TRAIN_FILE = "./x_train.npy"
OUTPUT_Y_TRAIN_FILE = "./y_train.npy"
OUTPUT_X_TEST_FILE = "./x_test.npy"
OUTPUT_Y_TEST_FILE = "./y_test.npy"
OUTPUT_X_FILE = "./x.npy"
OUTPUT_Y_FILE = "./y.npy"
CONFIG_FILE = './config.json'

In [None]:
(x_train, y_train), (x_test, y_test) = mnist.load_data()

In [None]:
# Rescale training dataset
rescaled_x_train = []
for img in tqdm(x_train):
  resized_img = cv2.resize(img, (RESCALED_IMAGE_WIDTH, RESCALED_IMAGE_HEIGHT))
  rescaled_x_train.append(resized_img)

100%|██████████| 60000/60000 [00:00<00:00, 281259.38it/s]


In [None]:
# Rescale testing dataset
rescaled_x_test = []
for img in tqdm(x_test):
  resized_img = cv2.resize(img, (RESCALED_IMAGE_WIDTH, RESCALED_IMAGE_HEIGHT))
  rescaled_x_test.append(resized_img)

100%|██████████| 10000/10000 [00:00<00:00, 338802.24it/s]


In [None]:
# Extract desire classes
desire_x_train = []
desire_y_train = []
desire_x_test = []
desire_y_test = []

for img, label in zip(rescaled_x_train, y_train):
  if label in DESIRE_CLASSES:
    desire_x_train.append(img)
    desire_y_train.append(label)

for img, label in zip(rescaled_x_test, y_test):
  if label in DESIRE_CLASSES:
    desire_x_test.append(img)
    desire_y_test.append(label)

desire_x_train = np.array(desire_x_train)
desire_y_train = np.array(desire_y_train)
desire_x_test = np.array(desire_x_test)
desire_y_test = np.array(desire_y_test)

In [None]:
print(desire_x_train.shape)
print(desire_y_train.shape)
print(desire_x_test.shape)
print(desire_y_test.shape)

(12049, 4, 4)
(12049,)
(1968, 4, 4)
(1968,)


In [None]:
desire_x = np.concatenate((desire_x_train, desire_x_test), axis=0)
desire_y = np.concatenate((desire_y_train, desire_y_test), axis=0)

In [None]:
print(desire_x.shape)
print(desire_y.shape)

(14017, 4, 4)
(14017,)


In [None]:
12049 + 1968

14017

In [None]:
# Store to disk
full_output_dir = os.path.join(PARENT_DIR, OUTPUT_DIR)

In [None]:
if not os.path.exists(full_output_dir):
    print("Creating {} directory".format(full_output_dir))
    os.mkdir(full_output_dir)

In [None]:
full_output_x_train_file = os.path.join(full_output_dir, OUTPUT_X_TRAIN_FILE)
full_output_y_train_file = os.path.join(full_output_dir, OUTPUT_Y_TRAIN_FILE)
full_output_x_test_file = os.path.join(full_output_dir, OUTPUT_X_TEST_FILE)
full_output_y_test_file = os.path.join(full_output_dir, OUTPUT_Y_TEST_FILE)
full_output_x_file = os.path.join(full_output_dir, OUTPUT_X_FILE)
full_output_y_file = os.path.join(full_output_dir, OUTPUT_Y_FILE)
full_config_file = os.path.join(full_output_dir, CONFIG_FILE)

In [None]:
with open(full_output_x_train_file, 'wb') as f:
    np.save(f, desire_x_train)

with open(full_output_y_train_file, 'wb') as f:
    np.save(f, desire_y_train)

with open(full_output_x_test_file, 'wb') as f:
    np.save(f, desire_x_test)

with open(full_output_y_test_file, 'wb') as f:
    np.save(f, desire_y_test)

with open(full_output_x_file, 'wb') as f:
    np.save(f, desire_x)

with open(full_output_y_file, 'wb') as f:
    np.save(f, desire_y)

In [None]:
running_config = {
    "RESCALED_IMAGE_HEIGHT": RESCALED_IMAGE_HEIGHT,
    "RESCALED_IMAGE_WIDTH": RESCALED_IMAGE_WIDTH,
    "DESIRE_CLASSES": list(DESIRE_CLASSES),
}
with open(full_config_file, 'w') as f:
    json.dump(running_config, f, indent=4)

In [None]:
print("DONE !")

DONE !


## Fashion-MNIST

In [None]:
ESCALED_IMAGE_HEIGHT = 4
RESCALED_IMAGE_WIDTH = 4
DESIRE_CLASSES = {0, 3, 6}
PARENT_DIR = "/content/drive/MyDrive/UTSA Student/Spring 2024/EE5423 HW4ML/Project/dataset"
OUTPUT_DIR = "./Fashion-MNIST-3"
OUTPUT_X_TRAIN_FILE = "./x_train.npy"
OUTPUT_Y_TRAIN_FILE = "./y_train.npy"
OUTPUT_X_TEST_FILE = "./x_test.npy"
OUTPUT_Y_TEST_FILE = "./y_test.npy"
OUTPUT_X_FILE = "./x.npy"
OUTPUT_Y_FILE = "./y.npy"
CONFIG_FILE = './config.json'

In [None]:
(x_train, y_train), (x_test, y_test) = fashion_mnist.load_data()

In [None]:
print(x_train.shape)
print(y_train.shape)

print(x_test.shape)
print(y_test.shape)

(60000, 28, 28)
(60000,)
(10000, 28, 28)
(10000,)


In [None]:
# Rescale training dataset
rescaled_x_train = []
for img in tqdm(x_train):
  resized_img = cv2.resize(img, (RESCALED_IMAGE_WIDTH, RESCALED_IMAGE_HEIGHT))
  rescaled_x_train.append(resized_img)

100%|██████████| 60000/60000 [00:00<00:00, 354829.04it/s]


In [None]:
# Rescale testing dataset
rescaled_x_test = []
for img in tqdm(x_test):
  resized_img = cv2.resize(img, (RESCALED_IMAGE_WIDTH, RESCALED_IMAGE_HEIGHT))
  rescaled_x_test.append(resized_img)

100%|██████████| 10000/10000 [00:00<00:00, 325205.39it/s]


In [None]:
# Extract desire classes
desire_x_train = []
desire_y_train = []
desire_x_test = []
desire_y_test = []

for img, label in zip(rescaled_x_train, y_train):
  if label in DESIRE_CLASSES:
    desire_x_train.append(img)
    desire_y_train.append(label)

for img, label in zip(rescaled_x_test, y_test):
  if label in DESIRE_CLASSES:
    desire_x_test.append(img)
    desire_y_test.append(label)

desire_x_train = np.array(desire_x_train)
desire_y_train = np.array(desire_y_train)
desire_x_test = np.array(desire_x_test)
desire_y_test = np.array(desire_y_test)

In [None]:
print(desire_x_train.shape)
print(desire_y_train.shape)
print(desire_x_test.shape)
print(desire_y_test.shape)

(18000, 4, 4)
(18000,)
(3000, 4, 4)
(3000,)


In [None]:
desire_x = np.concatenate((desire_x_train, desire_x_test), axis=0)
desire_y = np.concatenate((desire_y_train, desire_y_test), axis=0)

In [None]:
print(desire_x.shape)
print(desire_y.shape)

(21000, 4, 4)
(21000,)


In [None]:
# Store to disk
full_output_dir = os.path.join(PARENT_DIR, OUTPUT_DIR)

In [None]:
if not os.path.exists(full_output_dir):
    print("Creating {} directory".format(full_output_dir))
    os.mkdir(full_output_dir)

Creating /content/drive/MyDrive/UTSA Student/Google Colab/Spring 2024/EE5423 HW4ML/Project/dataset/./Fashion-MNIST-3 directory


In [None]:
full_output_x_train_file = os.path.join(full_output_dir, OUTPUT_X_TRAIN_FILE)
full_output_y_train_file = os.path.join(full_output_dir, OUTPUT_Y_TRAIN_FILE)
full_output_x_test_file = os.path.join(full_output_dir, OUTPUT_X_TEST_FILE)
full_output_y_test_file = os.path.join(full_output_dir, OUTPUT_Y_TEST_FILE)
full_output_x_file = os.path.join(full_output_dir, OUTPUT_X_FILE)
full_output_y_file = os.path.join(full_output_dir, OUTPUT_Y_FILE)
full_config_file = os.path.join(full_output_dir, CONFIG_FILE)

In [None]:
with open(full_output_x_train_file, 'wb') as f:
    np.save(f, desire_x_train)

with open(full_output_y_train_file, 'wb') as f:
    np.save(f, desire_y_train)

with open(full_output_x_test_file, 'wb') as f:
    np.save(f, desire_x_test)

with open(full_output_y_test_file, 'wb') as f:
    np.save(f, desire_y_test)

with open(full_output_x_file, 'wb') as f:
    np.save(f, desire_x)

with open(full_output_y_file, 'wb') as f:
    np.save(f, desire_y)

In [None]:
running_config = {
    "RESCALED_IMAGE_HEIGHT": RESCALED_IMAGE_HEIGHT,
    "RESCALED_IMAGE_WIDTH": RESCALED_IMAGE_WIDTH,
    "DESIRE_CLASSES": list(DESIRE_CLASSES),
}
with open(full_config_file, 'w') as f:
    json.dump(running_config, f, indent=4)

In [None]:
print("DONE !")

DONE !


## Synthetic dataset

In [2]:
DIM = 16
NUM_SAMPLES = 100
TRAIN_RATIO = 0.9
CLASSES = {0, 1}
PARENT_DIR = "/content/drive/MyDrive/UTSA Student/Spring 2024/EE5423 HW4ML/Project/dataset"
OUTPUT_DIR = "./Syn-Dataset-{}".format(DIM)
D1_mean = 0
D1_var = 1
D2_mean = 0
D2_var = 2
OUTPUT_X_TRAIN_FILE = "./x_train.npy"
OUTPUT_Y_TRAIN_FILE = "./y_train.npy"
OUTPUT_X_TEST_FILE = "./x_test.npy"
OUTPUT_Y_TEST_FILE = "./y_test.npy"
OUTPUT_X_FILE = "./x.npy"
OUTPUT_Y_FILE = "./y.npy"
CONFIG_FILE = './config.json'
RANDOM_SEED = 2505

np.random.seed(RANDOM_SEED)

In [5]:
# Class 0

class0_first_half_x = np.random.normal(D1_mean, np.mean(D1_var), size=(NUM_SAMPLES//2, DIM//2))
class0_remain_half_x = np.random.normal(D2_mean, np.mean(D2_var), size=(NUM_SAMPLES//2, DIM//2))

print(class0_first_half_x.shape)
print(class0_remain_half_x.shape)

(50, 8)
(50, 8)


In [6]:
class0_x = np.concatenate((class0_first_half_x, class0_remain_half_x), axis=1)
class0_y = np.array([0]*(NUM_SAMPLES//2))

In [7]:
print(class0_x.shape)
print(class0_y.shape)

(50, 16)
(50,)


In [9]:
# Class 1
# Generate first half features

class1_first_half_x = np.random.normal(D2_mean, np.mean(D2_var), size=(NUM_SAMPLES//2, DIM//2))
class1_remain_half_x = np.random.normal(D1_mean, np.mean(D1_var), size=(NUM_SAMPLES//2, DIM//2))

print(class1_first_half_x.shape)
print(class1_remain_half_x.shape)

(50, 8)
(50, 8)


In [10]:
class1_x = np.concatenate((class1_first_half_x, class1_remain_half_x), axis=1)
class1_y = np.array([1]*(NUM_SAMPLES//2))

In [11]:
print(class1_x.shape)
print(class1_y.shape)

(50, 16)
(50,)


In [12]:
x = np.concatenate((class0_x, class1_x))
y = np.concatenate((class0_y, class1_y))

print(x[:10])
print(y[:10])

[[ 1.21032426  0.60249278 -0.20830931 -0.90614163 -0.05865025 -1.32363443
  -2.02769935 -1.53658472 -0.24951249  0.43126338 -1.53946236  2.20857346
  -1.31390724  1.72527745 -1.64179038 -0.34652552]
 [ 0.67559968 -0.48776958 -1.37611705 -0.65289745 -0.41104945  1.27969705
  -2.37110323  0.66894191 -2.7803578  -1.01478267  0.53503999 -1.05406139
   1.0753292  -0.63899467 -5.47589969  4.9561116 ]
 [ 0.88304837  1.71031227  0.81355539 -0.71032827  0.78415627 -1.23336737
  -1.91678898  1.13516251 -7.94314755 -1.40462375 -0.4601786   0.74063568
  -3.84521508 -3.74168286  1.65625867 -1.25053481]
 [ 0.49315781 -0.51778338 -0.58797732 -0.95854178  0.11954221 -1.09056223
   1.08273621  0.88647243 -0.06915037 -0.46835277  2.3775031  -0.26840777
   1.23769394  1.72740355  0.7674933  -2.88237631]
 [-0.80925663 -1.14782712 -2.07013024  0.36788244  0.74059655  1.75165628
  -0.50151525  3.61432787 -1.397523    1.13413044 -0.66037669  5.11713574
  -0.14358046  4.70090831  3.92804527  3.17646903]
 [ 0.

In [15]:
x, y = shuffle(x, y, random_state=RANDOM_SEED)

In [16]:
print(x.shape)
print(y.shape)

print(x[:10])
print(y[:10])

(100, 16)
(100,)
[[ 3.91416581 -3.27586677 -3.32958692 -1.60332008  0.44957739 -4.72089664
   2.27857668 -0.85447523  1.86337405 -0.71859439 -0.58377755  0.16093142
   2.08442781 -0.3929432  -0.03679235 -1.15387286]
 [ 0.19068718  0.25593233 -3.33131042  2.45604245 -0.15764768 -1.11056261
   1.6755934  -0.86877509 -0.5535919   2.50337699  1.05303353 -0.03653094
   0.42988401 -0.78930394  0.57847027  2.13980818]
 [-0.43955502  1.12821504  3.18561614 -0.19432549 -1.61004678  0.5974602
   3.035116   -1.13461407  1.03088021 -1.26729182 -1.70541416  1.43888353
  -0.79844542 -0.5455803  -1.03949367 -0.52038881]
 [-1.00259912 -0.89314872  0.10540034  2.22028628 -3.25466625 -0.65560342
  -0.52262418  2.10353837 -0.76055309  0.49731092 -1.8291888  -1.14264863
  -0.86471184  0.75519329 -0.6061135  -0.33685399]
 [ 0.67559968 -0.48776958 -1.37611705 -0.65289745 -0.41104945  1.27969705
  -2.37110323  0.66894191 -2.7803578  -1.01478267  0.53503999 -1.05406139
   1.0753292  -0.63899467 -5.47589969  4

In [17]:
# split into training and testing
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=1-TRAIN_RATIO, random_state=RANDOM_SEED)

In [18]:
print(x_train.shape)
print(y_train.shape)
print(x_test.shape)
print(y_test.shape)

(90, 16)
(90,)
(10, 16)
(10,)


In [19]:
# Store to disk
full_output_dir = os.path.join(PARENT_DIR, OUTPUT_DIR)

if not os.path.exists(full_output_dir):
    print("Creating {} directory".format(full_output_dir))
    os.mkdir(full_output_dir)

Creating /content/drive/MyDrive/UTSA Student/Spring 2024/EE5423 HW4ML/Project/dataset/./Syn-Dataset-16 directory


In [20]:
full_output_x_train_file = os.path.join(full_output_dir, OUTPUT_X_TRAIN_FILE)
full_output_y_train_file = os.path.join(full_output_dir, OUTPUT_Y_TRAIN_FILE)
full_output_x_test_file = os.path.join(full_output_dir, OUTPUT_X_TEST_FILE)
full_output_y_test_file = os.path.join(full_output_dir, OUTPUT_Y_TEST_FILE)
full_output_x_file = os.path.join(full_output_dir, OUTPUT_X_FILE)
full_output_y_file = os.path.join(full_output_dir, OUTPUT_Y_FILE)
full_config_file = os.path.join(full_output_dir, CONFIG_FILE)

In [21]:
with open(full_output_x_train_file, 'wb') as f:
    np.save(f, x_train)

with open(full_output_y_train_file, 'wb') as f:
    np.save(f, y_train)

with open(full_output_x_test_file, 'wb') as f:
    np.save(f, x_test)

with open(full_output_y_test_file, 'wb') as f:
    np.save(f, y_test)

with open(full_output_x_file, 'wb') as f:
    np.save(f, x)

with open(full_output_y_file, 'wb') as f:
    np.save(f, y)

In [23]:
running_config = {
    "CLASSES": list(CLASSES),
    "DIM": DIM,
    "NUM_SAMPLES": NUM_SAMPLES,
    "TRAIN_RATIO": TRAIN_RATIO,
    "RANDOM_SEED": RANDOM_SEED,
    "D1_mean": D1_mean,
    "D1_var": D1_var,
    "D2_mean": D2_mean,
    "D2_var": D2_var
}
with open(full_config_file, 'w') as f:
    json.dump(running_config, f, indent=4)