In [1]:
from google.colab.patches import cv2_imshow
import cv2
import matplotlib.pyplot as plt
from keras.datasets import mnist, fashion_mnist
from sklearn.model_selection import train_test_split
import numpy as np
from tqdm import tqdm
import os, json
from sklearn.utils import shuffle

## MNIST

In [28]:
# RUNNING CONFIGURATION (MNIST)

# Size of every image after downscaling, and the digit labels that are kept.
RESCALED_IMAGE_WIDTH = 2
RESCALED_IMAGE_HEIGHT = 2
DESIRE_CLASSES = {3, 6}

# Destination folder: OUTPUT_DIR is resolved under PARENT_DIR.
# (PARENT_DIR looks like a Google Drive mount point -- it must exist before saving.)
PARENT_DIR = "/content/drive/MyDrive/UTSA Student/Spring 2024/EE5423 HW4ML/Project/dataset"
OUTPUT_DIR = "MNIST-2"

# File names of the saved artifacts, relative to the destination folder.
OUTPUT_X_FILE = "./x.npy"
OUTPUT_Y_FILE = "./y.npy"
OUTPUT_X_TRAIN_FILE = "./x_train.npy"
OUTPUT_Y_TRAIN_FILE = "./y_train.npy"
OUTPUT_X_TEST_FILE = "./x_test.npy"
OUTPUT_Y_TEST_FILE = "./y_test.npy"
CONFIG_FILE = './config.json'

In [29]:
# Load MNIST; the nested (images, labels) pairs for train and test are
# flattened in order into the four arrays used below.
x_train, y_train, x_test, y_test = (
    array for split in mnist.load_data() for array in split
)

In [30]:
# Rescale training dataset
# cv2.resize takes the target size as (width, height).
# NOTE(review): no interpolation flag is passed, so OpenCV's default is used;
# cv2.INTER_AREA is the usual recommendation when shrinking -- consider it.
rescaled_x_train = [
    cv2.resize(img, (RESCALED_IMAGE_WIDTH, RESCALED_IMAGE_HEIGHT))
    for img in tqdm(x_train)
]

100%|██████████| 60000/60000 [00:00<00:00, 112719.81it/s]


In [31]:
# Rescale testing dataset
# cv2.resize takes the target size as (width, height).
# NOTE(review): no interpolation flag is passed, so OpenCV's default is used;
# cv2.INTER_AREA is the usual recommendation when shrinking -- consider it.
rescaled_x_test = [
    cv2.resize(img, (RESCALED_IMAGE_WIDTH, RESCALED_IMAGE_HEIGHT))
    for img in tqdm(x_test)
]

100%|██████████| 10000/10000 [00:00<00:00, 166998.62it/s]


In [32]:
# Extract desired classes
# Boolean masks mark the samples whose label is in DESIRE_CLASSES. np.isin does
# not treat a set as a collection of values, so the classes are passed as a list.
wanted_labels = list(DESIRE_CLASSES)
train_mask = np.isin(y_train, wanted_labels)
test_mask = np.isin(y_test, wanted_labels)

# Indexing the stacked image array keeps its (N, height, width) layout even when
# no sample matches, so concatenating train and test later cannot fail on an
# empty split (the previous list-append version produced a shape-(0,) array).
desire_x_train = np.asarray(rescaled_x_train)[train_mask]
desire_y_train = np.asarray(y_train)[train_mask]
desire_x_test = np.asarray(rescaled_x_test)[test_mask]
desire_y_test = np.asarray(y_test)[test_mask]

In [33]:
# Report the size of the filtered train and test splits (one shape per line).
print(
    desire_x_train.shape,
    desire_y_train.shape,
    desire_x_test.shape,
    desire_y_test.shape,
    sep="\n",
)

(12049, 2, 2)
(12049,)
(1968, 2, 2)
(1968,)


In [34]:
# Pool the filtered train and test splits into one full dataset
# (np.concatenate joins along the first axis by default).
desire_x = np.concatenate([desire_x_train, desire_x_test])
desire_y = np.concatenate([desire_y_train, desire_y_test])

In [35]:
# Report the size of the pooled dataset (images first, then labels).
print(desire_x.shape, desire_y.shape, sep="\n")

(14017, 2, 2)
(14017,)


In [36]:
# Sanity check: the pooled dataset should contain exactly this many samples.
# Computed from the splits instead of hard-coding the counts, so it stays
# correct when DESIRE_CLASSES changes.
len(desire_x_train) + len(desire_x_test)

14017

In [37]:
# Draw a random subset of 90 training samples.
# replace=False guarantees distinct samples: np.random.choice samples WITH
# replacement by default, which can silently put duplicate images in the subset.
# NOTE(review): no seed is set before this cell in the visible notebook, so the
# subset differs between runs.
random_indices = np.random.choice(len(desire_x_train), 90, replace=False)
desire_x_train = desire_x_train[random_indices]
desire_y_train = desire_y_train[random_indices]

In [38]:
# Draw a random subset of 10 test samples.
# replace=False guarantees distinct samples: np.random.choice samples WITH
# replacement by default, which can silently put duplicate images in the subset.
# NOTE(review): no seed is set before this cell in the visible notebook, so the
# subset differs between runs.
random_indices = np.random.choice(len(desire_x_test), 10, replace=False)
desire_x_test = desire_x_test[random_indices]
desire_y_test = desire_y_test[random_indices]

In [39]:
# Store to disk
# Destination folder for this dataset: OUTPUT_DIR resolved under PARENT_DIR.
full_output_dir = os.path.join(PARENT_DIR, OUTPUT_DIR)

In [40]:
# Create the output directory if it is missing.
# os.makedirs(..., exist_ok=True) is used instead of os.mkdir because os.mkdir
# raises when a parent folder does not exist yet, and it also raises if the
# directory appears between the existence check and the creation call.
if not os.path.exists(full_output_dir):
    print("Creating {} directory".format(full_output_dir))
os.makedirs(full_output_dir, exist_ok=True)

In [41]:
# Resolve every artifact file name against the output directory in one pass;
# the order of the targets matches the order of the names below.
(
    full_output_x_train_file,
    full_output_y_train_file,
    full_output_x_test_file,
    full_output_y_test_file,
    full_output_x_file,
    full_output_y_file,
    full_config_file,
) = (
    os.path.join(full_output_dir, file_name)
    for file_name in (
        OUTPUT_X_TRAIN_FILE,
        OUTPUT_Y_TRAIN_FILE,
        OUTPUT_X_TEST_FILE,
        OUTPUT_Y_TEST_FILE,
        OUTPUT_X_FILE,
        OUTPUT_Y_FILE,
        CONFIG_FILE,
    )
)

In [42]:
# Write each array to its .npy file: sub-sampled train/test splits first,
# then the full pooled dataset.
for npy_path, npy_array in (
    (full_output_x_train_file, desire_x_train),
    (full_output_y_train_file, desire_y_train),
    (full_output_x_test_file, desire_x_test),
    (full_output_y_test_file, desire_y_test),
    (full_output_x_file, desire_x),
    (full_output_y_file, desire_y),
):
    with open(npy_path, 'wb') as f:
        np.save(f, npy_array)

In [43]:
# Record the settings used for this run next to the saved arrays.
running_config = dict(
    RESCALED_IMAGE_HEIGHT=RESCALED_IMAGE_HEIGHT,
    RESCALED_IMAGE_WIDTH=RESCALED_IMAGE_WIDTH,
    DESIRE_CLASSES=list(DESIRE_CLASSES),  # a set is not JSON-serializable
)
with open(full_config_file, 'w') as f:
    f.write(json.dumps(running_config, indent=4))

In [44]:
# Marker printed once all cells of the MNIST section above have run.
print("DONE !")

DONE !


## Fashion-MNIST

In [77]:
# RUNNING CONFIGURATION (Fashion-MNIST)

# Size of every image after downscaling, and the class labels that are kept.
# (The height constant was previously misspelled ESCALED_IMAGE_HEIGHT, so this
# section silently relied on the value left behind by the MNIST section.)
RESCALED_IMAGE_HEIGHT = 2
RESCALED_IMAGE_WIDTH = 2
DESIRE_CLASSES = {0, 3, 6}

# Destination folder: OUTPUT_DIR is resolved under PARENT_DIR.
PARENT_DIR = "/content/drive/MyDrive/UTSA Student/Spring 2024/EE5423 HW4ML/Project/dataset"
OUTPUT_DIR = "./Fashion-MNIST-3"

# File names of the saved artifacts, relative to the destination folder.
OUTPUT_X_TRAIN_FILE = "./x_train.npy"
OUTPUT_Y_TRAIN_FILE = "./y_train.npy"
OUTPUT_X_TEST_FILE = "./x_test.npy"
OUTPUT_Y_TEST_FILE = "./y_test.npy"
OUTPUT_X_FILE = "./x.npy"
OUTPUT_Y_FILE = "./y.npy"
CONFIG_FILE = './config.json'

In [78]:
# Load Fashion-MNIST; the nested (images, labels) pairs for train and test are
# flattened in order into the four arrays used below.
x_train, y_train, x_test, y_test = (
    array for split in fashion_mnist.load_data() for array in split
)

In [79]:
# Report the raw dataset sizes: training split first, then test split.
print(x_train.shape, y_train.shape, sep="\n")
print(x_test.shape, y_test.shape, sep="\n")

(60000, 28, 28)
(60000,)
(10000, 28, 28)
(10000,)


In [80]:
# Rescale training dataset
# cv2.resize takes the target size as (width, height).
# NOTE(review): no interpolation flag is passed, so OpenCV's default is used;
# cv2.INTER_AREA is the usual recommendation when shrinking -- consider it.
rescaled_x_train = [
    cv2.resize(img, (RESCALED_IMAGE_WIDTH, RESCALED_IMAGE_HEIGHT))
    for img in tqdm(x_train)
]

100%|██████████| 60000/60000 [00:00<00:00, 340598.81it/s]


In [81]:
# Rescale testing dataset
# cv2.resize takes the target size as (width, height).
# NOTE(review): no interpolation flag is passed, so OpenCV's default is used;
# cv2.INTER_AREA is the usual recommendation when shrinking -- consider it.
rescaled_x_test = [
    cv2.resize(img, (RESCALED_IMAGE_WIDTH, RESCALED_IMAGE_HEIGHT))
    for img in tqdm(x_test)
]

100%|██████████| 10000/10000 [00:00<00:00, 339804.10it/s]


In [82]:
# Extract desired classes
# Boolean masks mark the samples whose label is in DESIRE_CLASSES. np.isin does
# not treat a set as a collection of values, so the classes are passed as a list.
wanted_labels = list(DESIRE_CLASSES)
train_mask = np.isin(y_train, wanted_labels)
test_mask = np.isin(y_test, wanted_labels)

# Indexing the stacked image array keeps its (N, height, width) layout even when
# no sample matches, so concatenating train and test later cannot fail on an
# empty split (the previous list-append version produced a shape-(0,) array).
desire_x_train = np.asarray(rescaled_x_train)[train_mask]
desire_y_train = np.asarray(y_train)[train_mask]
desire_x_test = np.asarray(rescaled_x_test)[test_mask]
desire_y_test = np.asarray(y_test)[test_mask]

In [83]:
# Report the size of the filtered train and test splits (one shape per line).
print(
    desire_x_train.shape,
    desire_y_train.shape,
    desire_x_test.shape,
    desire_y_test.shape,
    sep="\n",
)

(18000, 2, 2)
(18000,)
(3000, 2, 2)
(3000,)


In [84]:
# Pool the filtered train and test splits into one full dataset
# (np.concatenate joins along the first axis by default).
desire_x = np.concatenate([desire_x_train, desire_x_test])
desire_y = np.concatenate([desire_y_train, desire_y_test])

In [85]:
# Report the size of the pooled dataset (images first, then labels).
print(desire_x.shape, desire_y.shape, sep="\n")

(21000, 2, 2)
(21000,)


In [86]:
# Draw a random subset of 135 training samples.
# replace=False guarantees distinct samples: np.random.choice samples WITH
# replacement by default, which can silently put duplicate images in the subset.
# NOTE(review): no seed is set before this cell in the visible notebook, so the
# subset differs between runs.
random_indices = np.random.choice(len(desire_x_train), 135, replace=False)
desire_x_train = desire_x_train[random_indices]
desire_y_train = desire_y_train[random_indices]
print(len(desire_x_train))

135


In [87]:
# Draw a random subset of 15 test samples.
# replace=False guarantees distinct samples: np.random.choice samples WITH
# replacement by default, which can silently put duplicate images in the subset.
# NOTE(review): no seed is set before this cell in the visible notebook, so the
# subset differs between runs.
random_indices = np.random.choice(len(desire_x_test), 15, replace=False)
desire_x_test = desire_x_test[random_indices]
desire_y_test = desire_y_test[random_indices]

In [88]:
# Store to disk
# Destination folder for this dataset: OUTPUT_DIR resolved under PARENT_DIR.
full_output_dir = os.path.join(PARENT_DIR, OUTPUT_DIR)

In [89]:
# Create the output directory if it is missing.
# os.makedirs(..., exist_ok=True) is used instead of os.mkdir because os.mkdir
# raises when a parent folder does not exist yet, and it also raises if the
# directory appears between the existence check and the creation call.
if not os.path.exists(full_output_dir):
    print("Creating {} directory".format(full_output_dir))
os.makedirs(full_output_dir, exist_ok=True)

In [90]:
# Resolve every artifact file name against the output directory in one pass;
# the order of the targets matches the order of the names below.
(
    full_output_x_train_file,
    full_output_y_train_file,
    full_output_x_test_file,
    full_output_y_test_file,
    full_output_x_file,
    full_output_y_file,
    full_config_file,
) = (
    os.path.join(full_output_dir, file_name)
    for file_name in (
        OUTPUT_X_TRAIN_FILE,
        OUTPUT_Y_TRAIN_FILE,
        OUTPUT_X_TEST_FILE,
        OUTPUT_Y_TEST_FILE,
        OUTPUT_X_FILE,
        OUTPUT_Y_FILE,
        CONFIG_FILE,
    )
)

In [91]:
# Write each array to its .npy file: sub-sampled train/test splits first,
# then the full pooled dataset.
for npy_path, npy_array in (
    (full_output_x_train_file, desire_x_train),
    (full_output_y_train_file, desire_y_train),
    (full_output_x_test_file, desire_x_test),
    (full_output_y_test_file, desire_y_test),
    (full_output_x_file, desire_x),
    (full_output_y_file, desire_y),
):
    with open(npy_path, 'wb') as f:
        np.save(f, npy_array)

In [92]:
# Record the settings used for this run next to the saved arrays.
running_config = dict(
    RESCALED_IMAGE_HEIGHT=RESCALED_IMAGE_HEIGHT,
    RESCALED_IMAGE_WIDTH=RESCALED_IMAGE_WIDTH,
    DESIRE_CLASSES=list(DESIRE_CLASSES),  # a set is not JSON-serializable
)
with open(full_config_file, 'w') as f:
    f.write(json.dumps(running_config, indent=4))

In [93]:
# Marker printed once all cells of the Fashion-MNIST section above have run.
print("DONE !")

DONE !


## Synthetic dataset

In [95]:
# RUNNING CONFIGURATION (synthetic dataset)

# Dataset geometry: feature count, total sample count, train fraction, labels.
DIM = 4
NUM_SAMPLES = 100
TRAIN_RATIO = 0.9
CLASSES = {0, 1}

# Parameters of the two normal distributions the features are drawn from.
D1_mean, D1_var = 0, 1
D2_mean, D2_var = 0, 2

# Destination folder: OUTPUT_DIR (suffixed with DIM) is resolved under PARENT_DIR.
PARENT_DIR = "/content/drive/MyDrive/UTSA Student/Spring 2024/EE5423 HW4ML/Project/dataset"
OUTPUT_DIR = "./Syn-Dataset-{}".format(DIM)

# File names of the saved artifacts, relative to the destination folder.
OUTPUT_X_FILE = "./x.npy"
OUTPUT_Y_FILE = "./y.npy"
OUTPUT_X_TRAIN_FILE = "./x_train.npy"
OUTPUT_Y_TRAIN_FILE = "./y_train.npy"
OUTPUT_X_TEST_FILE = "./x_test.npy"
OUTPUT_Y_TEST_FILE = "./y_test.npy"
CONFIG_FILE = './config.json'

# Seed NumPy's global generator so the sampled data is reproducible.
RANDOM_SEED = 2505
np.random.seed(RANDOM_SEED)

In [96]:
# Class 0

class0_first_half_x = np.random.normal(D1_mean, np.mean(D1_var), size=(NUM_SAMPLES//2, DIM//2))
class0_remain_half_x = np.random.normal(D2_mean, np.mean(D2_var), size=(NUM_SAMPLES//2, DIM//2))

print(class0_first_half_x.shape)
print(class0_remain_half_x.shape)

(50, 2)
(50, 2)


In [97]:
class0_x = np.concatenate((class0_first_half_x, class0_remain_half_x), axis=1)
class0_y = np.array([0]*(NUM_SAMPLES//2))

In [98]:
print(class0_x.shape)
print(class0_y.shape)

(50, 4)
(50,)


In [99]:
# Class 1
# Generate first half features

class1_first_half_x = np.random.normal(D2_mean, np.mean(D2_var), size=(NUM_SAMPLES//2, DIM//2))
class1_remain_half_x = np.random.normal(D1_mean, np.mean(D1_var), size=(NUM_SAMPLES//2, DIM//2))

print(class1_first_half_x.shape)
print(class1_remain_half_x.shape)

(50, 2)
(50, 2)


In [100]:
# Class 1 samples: join the two feature halves column-wise and label every row 1.
class1_x = np.hstack((class1_first_half_x, class1_remain_half_x))
class1_y = np.full(NUM_SAMPLES//2, 1)

In [101]:
# Report the size of the class 1 features and labels.
print(class1_x.shape, class1_y.shape, sep="\n")

(50, 4)
(50,)


In [102]:
# Stack both classes into one dataset: class 0 rows first, then class 1 rows.
x = np.vstack((class0_x, class1_x))
y = np.hstack((class0_y, class1_y))

# Peek at the first ten samples and their labels.
print(x[:10], y[:10], sep="\n")

[[ 1.3760194  -0.34164502 -2.99322754 -5.54831045]
 [-1.04779321  0.37475254 -1.01940335 -1.6498197 ]
 [-0.24714573  0.53942525  0.98228518 -0.47499501]
 [-0.58598548  1.01515986  0.55491654 -4.71360473]
 [ 0.24018883  0.47826397  1.6886987   2.20410104]
 [-0.02145142  1.15984038 -0.46805269  0.40109993]
 [-1.6052683  -0.28907255  1.24568355  0.31908204]
 [-1.27098151 -1.92257003 -1.25753705  1.10510766]
 [-0.06811353 -1.20551905 -0.89380969 -2.34044295]
 [-1.09796332 -1.19904003 -1.45930105  1.33907779]]
[0 0 0 0 0 0 0 0 0 0]


In [103]:
# Shuffle samples and labels together (sklearn applies one consistent
# permutation to both arrays) using the fixed RANDOM_SEED.
# NOTE: this rebinds x and y, so re-running the cell permutes the
# already-shuffled data again instead of reproducing the same order.
x, y = shuffle(x, y, random_state=RANDOM_SEED)

In [104]:
# Report the dataset size, then show the first ten shuffled samples and labels.
print(x.shape, y.shape, sep="\n")
print(x[:10], y[:10], sep="\n")

(100, 4)
(100,)
[[-1.49724513 -1.20569773 -1.53699294  1.49344204]
 [-1.01156104 -0.46255788  2.8390121   1.68761376]
 [-1.06657732  1.30547118  0.37030833  0.50414792]
 [ 0.06495697  0.4007486   0.15955336  1.1453149 ]
 [-0.17173882 -2.06203603  0.25327173  0.12175931]
 [ 0.15741425  0.34132582  3.08268001 -1.54210186]
 [ 0.36019482 -3.61591349  0.08294541  0.38690188]
 [-0.06811353 -1.20551905 -0.89380969 -2.34044295]
 [-0.5839145   1.33036545 -1.28362822 -0.25362277]
 [-3.5742118  -4.12979452  0.9612043  -0.7227807 ]]
[0 0 1 0 1 0 1 0 1 1]


In [105]:
# Split into training and testing sets.
# The test fraction is the complement of TRAIN_RATIO; the seed fixes the split.
split_options = dict(test_size=1-TRAIN_RATIO, random_state=RANDOM_SEED)
x_train, x_test, y_train, y_test = train_test_split(x, y, **split_options)

In [106]:
# Report the size of the resulting train and test splits (one shape per line).
print(
    x_train.shape,
    y_train.shape,
    x_test.shape,
    y_test.shape,
    sep="\n",
)

(90, 4)
(90,)
(10, 4)
(10,)


In [107]:
# Store to disk
# Destination folder for this dataset: OUTPUT_DIR resolved under PARENT_DIR.
full_output_dir = os.path.join(PARENT_DIR, OUTPUT_DIR)

# Create the output directory if it is missing.
# os.makedirs(..., exist_ok=True) is used instead of os.mkdir because os.mkdir
# raises when a parent folder does not exist yet, and it also raises if the
# directory appears between the existence check and the creation call.
if not os.path.exists(full_output_dir):
    print("Creating {} directory".format(full_output_dir))
os.makedirs(full_output_dir, exist_ok=True)

In [108]:
# Resolve every artifact file name against the output directory in one pass;
# the order of the targets matches the order of the names below.
(
    full_output_x_train_file,
    full_output_y_train_file,
    full_output_x_test_file,
    full_output_y_test_file,
    full_output_x_file,
    full_output_y_file,
    full_config_file,
) = (
    os.path.join(full_output_dir, file_name)
    for file_name in (
        OUTPUT_X_TRAIN_FILE,
        OUTPUT_Y_TRAIN_FILE,
        OUTPUT_X_TEST_FILE,
        OUTPUT_Y_TEST_FILE,
        OUTPUT_X_FILE,
        OUTPUT_Y_FILE,
        CONFIG_FILE,
    )
)

In [109]:
# Write each array to its .npy file: train/test splits first, then the full
# shuffled dataset.
for npy_path, npy_array in (
    (full_output_x_train_file, x_train),
    (full_output_y_train_file, y_train),
    (full_output_x_test_file, x_test),
    (full_output_y_test_file, y_test),
    (full_output_x_file, x),
    (full_output_y_file, y),
):
    with open(npy_path, 'wb') as f:
        np.save(f, npy_array)

In [110]:
# Record the generation settings for this run next to the saved arrays.
running_config = dict(
    CLASSES=list(CLASSES),  # a set is not JSON-serializable
    DIM=DIM,
    NUM_SAMPLES=NUM_SAMPLES,
    TRAIN_RATIO=TRAIN_RATIO,
    RANDOM_SEED=RANDOM_SEED,
    D1_mean=D1_mean,
    D1_var=D1_var,
    D2_mean=D2_mean,
    D2_var=D2_var,
)
with open(full_config_file, 'w') as f:
    f.write(json.dumps(running_config, indent=4))