In [1]:
import os
import sys
import cv2
import os
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import tensorflow as tf

from typing import Iterable, Tuple, List
from sklearn.manifold import TSNE
from sklearn.model_selection import ParameterGrid
from sklearn.metrics import silhouette_score
from sklearn.cluster import KMeans

if not tf.config.list_physical_devices('GPU'):
    print('Warning: without a GPU the training will take a lo:ng time...')

IMAGE_SIZE = (256, 256, 3)
NUM_CLASSES = 125
SKETCH_ROOT = "/home/sysung98/MIDS/W281/final_project/data/rendered_256x256/256x256/sketch"
tx_000100000000_fp = SKETCH_ROOT + '/tx_000100000000'

categories = os.listdir(tx_000100000000_fp)
categories = [x for x in categories if x != '.DS_Store']
len(categories)

2023-04-14 12:19:43.043222: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-04-14 12:19:47.644695: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:982] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-04-14 12:19:47.889630: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:982] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-04-14 12:19:47.889717: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:982] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been bu

125

In [2]:
def get_image_dataset(ratios: List[float], augmentations: List[str] = None, color_mode: str = 'rgb',
                    seed: int = 1, batch_size: int = 32) -> List[tf.data.Dataset]:
    '''
    Get tensorflow dataset using generator to avoid RAM limitations
    Splits into train, val, test from all provided augmentations 
    '''

    # No augmentation by default
    augmentations = augmentations or ['tx_000000000000']

    assert sum(ratios) == 1 and len(ratios) == 3, 'Sum of 3 ratios must add to 1'

    datasets: List[Tuple[tf.data.Dataset, tf.data.Dataset, tf.data.Dataset]] = []
    for aug in augmentations:
        print(f'Reading images from augmentation {aug}...')
        train_ds = tf.keras.utils.image_dataset_from_directory(
            directory = os.path.join(SKETCH_ROOT, aug),
            image_size = IMAGE_SIZE[:2],
            label_mode='categorical',
            seed = seed,
            color_mode = color_mode,
            validation_split = (1 - ratios[0]),
            subset = 'training',
            batch_size=batch_size
        )

        val_ds = tf.keras.utils.image_dataset_from_directory(
            directory = os.path.join(SKETCH_ROOT, aug),
            image_size = IMAGE_SIZE[:2],
            label_mode='categorical',
            seed = seed,
            color_mode = 'rgb',
            validation_split = (1 - ratios[0]),
            subset = 'validation',
            batch_size=batch_size
        )

        val_batch_count = int(tf.data.experimental.cardinality(val_ds))
        test_ds = val_ds.take(int(val_batch_count * ratios[1]))
        val_ds = val_ds.skip(int(val_batch_count * ratios[1]))

        datasets.append([train_ds, val_ds, test_ds])
    
    combined = datasets.pop()
    for train, val, test in datasets:
        combined[0] = combined[0].concatenate(train)
        combined[1] = combined[1].concatenate(val)
        combined[2] = combined[2].concatenate(test)

    if batch_size != None:
        return [c.shuffle(batch_size) for c in combined]
    else:
        return combined

In [3]:
train_ds, val_ds, test_ds = get_image_dataset(
    [0.8, 0.1, 0.1], 
    augmentations=['tx_000100000000'],
    color_mode = 'grayscale',
    batch_size = None
)

Reading images from augmentation tx_000100000000...
Found 75481 files belonging to 125 classes.
Using 60385 files for training.


2023-04-14 12:19:53.361153: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:982] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-04-14 12:19:53.361309: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:982] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-04-14 12:19:53.361359: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:982] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-04-14 12:19:55.921204: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:982] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-04-14 12:19:55.921378: I tensorflow/compile

Found 75481 files belonging to 125 classes.
Using 15096 files for validation.


In [13]:
sample_train_ds = train_ds.take(tf.data.experimental.cardinality(train_ds) // 50)

X = []
true_y = []

ct = 0
for x, y in sample_train_ds.as_numpy_iterator():
    true_y.append(np.argmax(y==1))
    X.append(x.reshape((65536, )))

X = np.array(X)
true_y = np.array(true_y)

print(X.shape)
print(true_y.shape)

2023-04-14 13:01:35.577755: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype string and shape [60385]
	 [[{{node Placeholder/_0}}]]
2023-04-14 13:01:35.579211: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_4' with dtype int32 and shape [60385]
	 [[{{node Placeholder/_4}}]]


(1207, 65536)
(1207,)


In [14]:
# Define function to calculate silhouette score
def calc_silhouette(X_embedded, y, n_clusters):
    kmeans = KMeans(n_clusters=n_clusters, n_init='auto', random_state=424)
    labels = kmeans.fit_predict(X_embedded)
    score = silhouette_score(X_embedded, labels)
    return score

param_grid = {
    'perplexity': [50, 75, 100],
    'learning_rate': [5, 10, 20, 50]
}

param_combinations = ParameterGrid(param_grid)

for params in param_combinations:
    print(f"Testing params: {params}")
    tsne = TSNE(
        perplexity=params['perplexity'], 
        learning_rate=params['learning_rate'],
        n_iter=5000
    )
    X_embedded = tsne.fit_transform(X)

    silhouette_scores = calc_silhouette(X_embedded, y, 125)

    print(f'silhouette_score: {silhouette_scores}')
    print(f'num iters: {tsne.n_iter_}')
    print()

Testing params: {'learning_rate': 5, 'perplexity': 50}
silhouette_score: 0.36586230993270874
num iters: 149

Testing params: {'learning_rate': 5, 'perplexity': 75}
silhouette_score: 0.35680365562438965
num iters: 2399

Testing params: {'learning_rate': 5, 'perplexity': 100}
silhouette_score: 0.37026265263557434
num iters: 149

Testing params: {'learning_rate': 10, 'perplexity': 50}
silhouette_score: 0.3790605068206787
num iters: 2249

Testing params: {'learning_rate': 10, 'perplexity': 75}
silhouette_score: 0.3737393915653229
num iters: 1599

Testing params: {'learning_rate': 10, 'perplexity': 100}
silhouette_score: 0.3897615969181061
num iters: 2499

Testing params: {'learning_rate': 20, 'perplexity': 50}
silhouette_score: 0.36549368500709534
num iters: 1499

Testing params: {'learning_rate': 20, 'perplexity': 75}
silhouette_score: 0.3703679144382477
num iters: 2549

Testing params: {'learning_rate': 20, 'perplexity': 100}
silhouette_score: 0.3710382282733917
num iters: 1749

Testing 

In [None]:
# import matplotlib.pyplot as plt

# Y = tsne.fit_transform(
#     X
# )

# plt.scatter(Y[:, 0], Y[:, 1], c=true_y)
# plt.title('t-SNE visualization of data')
# plt.show()