# Train UMAP (9, 112, 224)

### Code for training, loading and projecting data with UMAP into 2D
We partly work and interpolate at (9, 112, 224) because the 2D UMAP projections proved to be more human-understandable.

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import umap

from glob import glob
from scipy.spatial.distance import cdist
from tqdm.notebook import tqdm
import pickle
import h5py

In [2]:
# Open the training set read-only. UMAP is only ever fitted on real samples
# that originate from the training set. The handle is closed in the next
# cell, once the images have been read.
h5f = h5py.File('./diversity_saves/train_set.h5', mode='r')

In [3]:
# Read the first 3000 images from the open file, then release the handle.
real_im = h5f['images'][0:3000]
h5f.close()

# Train UMAP

#### Training UMAP using training data. This step can be skipped if there is already a saved UMAP object.
Running this might lead to different results than those presented in the article. The training data was shuffled, and the UMAP dimensionality reduction is sensitive to changes in the input distribution.
(Varying the number/sampling of the images used in the training might be enough to get an embedding space similar to the one in the article; we were able to reproduce similar projections across many different UMAP training runs.)

In [None]:
# Fit UMAP on the first 1500 loaded training images, each flattened into a
# single vector of length 9*112*224.
trans = umap.UMAP(
    n_neighbors=20,
    random_state=10,
    min_dist=0.0,
).fit(
    real_im[:1500].reshape([1500, 9*112*224])
)

In [1]:
# Path the freshly trained UMAP object is pickled to.
f_name = "./diversity_saves/umap_1500.sav"

In [29]:
# Saving the trained UMAP object using pickle.
# The context manager guarantees the file is flushed and closed, even if
# pickling raises; a bare open() would leave the handle to the garbage collector.
with open(f_name, 'wb') as model_file:
    pickle.dump(trans, model_file)

# Load UMAP

In [3]:
# Path of the pickled UMAP object to load.
# NOTE(review): the training section saves to umap_1500.sav, while this cell
# points at umap_3000.sav — confirm which saved model is intended.
f_name = "./diversity_saves/umap_3000.sav"

In [4]:
# Load the pickled UMAP object; the context manager closes the file handle
# as soon as unpickling finishes (or fails).
# NOTE: pickle.load can execute arbitrary code — only load files you trust.
with open(f_name, 'rb') as model_file:
    loaded_model = pickle.load(model_file)

<class 'umap.umap_.UMAP'>


# Project UMAP

#### Projecting UMAP on a subset of unseen training samples and synthetic samples
Projection is done for 1000 real (from training) and 1000 synthetic images

In [31]:
# Loading other training samples: images 3000-3999, i.e. outside the first
# 3000 read at the top of the notebook.
# The context manager closes the HDF5 file even if the read raises.
with h5py.File('./diversity_saves/train_set.h5', 'r') as h5f:
    real_im_transform = h5f['images'][3000:4000]

In [32]:
# Project the 1000 real samples with the loaded UMAP model and store the
# resulting embedding on disk for reproducibility.
u_real = loaded_model.transform(
    real_im_transform.reshape([1000, 9*112*224])
)
np.save(file='./diversity_saves/umap_real_1k_large.npy', arr=u_real)

In [33]:
# Load the first 1000 synthetic images.
# The context manager closes the HDF5 file even if the read raises.
with h5py.File('./diversity_saves/synth_set.h5', 'r') as h5f:
    synth_im_transform = h5f['images'][:1000]

In [34]:
# Project the 1000 synthetic samples with the loaded UMAP model and store
# the resulting embedding on disk.
u_fake = loaded_model.transform(
    synth_im_transform.reshape([1000, 9*112*224])
)
np.save(file='./diversity_saves/umap_synth_1k_large.npy', arr=u_fake)

# Project Umap full data

#### Projecting umap on the whole training and synthetic dataset
Projection is done for 7832 real (from training) and 10000 synthetic images
Takes very long to run (around 8h for training + synthetic projection; 20 minutes per split of 1000 images)

In [None]:
# Projecting training data with UMAP

# The data is projected in chunks of 1000 samples: transforming the whole
# set at once would make the projection lag out and the progress untrackable.
chunk_size = 1000

# The context manager closes the HDF5 file once every chunk has been read,
# and also if a read or transform fails part-way through.
with h5py.File('./diversity_saves/train_set.h5', 'r') as h5f:
    len_train = len(h5f['images'])
    # Ceiling division, so a trailing partial chunk gets its own iteration.
    splits_1k = (len_train + chunk_size - 1) // chunk_size

    # One 2D embedding row per training image.
    umap_train_full = np.zeros((len_train, 2))

    for i in tqdm(range(splits_1k)):
        lower_bound = chunk_size*i
        # The last chunk may be shorter than chunk_size.
        upper_bound = min(lower_bound + chunk_size, len_train)
        temp_im_train = h5f['images'][lower_bound:upper_bound]
        # Flatten each image into a vector of length 9*112*224.
        temp_im_train = np.array(temp_im_train).reshape([len(temp_im_train), 9*112*224])

        umap_train_full[lower_bound:upper_bound, :] = loaded_model.transform(temp_im_train)

np.save('./diversity_saves/umap_train_full.npy', umap_train_full)

HBox(children=(FloatProgress(value=0.0, max=8.0), HTML(value='')))

In [None]:
# Projecting synthetic data with UMAP

# The data is projected in chunks of 1000 samples: transforming the whole
# set at once would make the projection lag out and the progress untrackable.
chunk_size = 1000

# The context manager closes the HDF5 file once every chunk has been read,
# and also if a read or transform fails part-way through.
with h5py.File('./diversity_saves/synth_set.h5', 'r') as h5f:
    len_synth = len(h5f['images'])
    # Ceiling division, so a trailing partial chunk gets its own iteration.
    # (The previous version incremented a misspelled, undefined name here and
    # raised NameError whenever the set size was not a multiple of 1000.)
    splits_1k = (len_synth + chunk_size - 1) // chunk_size

    # One 2D embedding row per synthetic image.
    umap_synth_full = np.zeros((len_synth, 2))

    for i in tqdm(range(splits_1k)):
        lower_bound = chunk_size*i
        # The last chunk may be shorter than chunk_size.
        upper_bound = min(lower_bound + chunk_size, len_synth)
        temp_im_synth = h5f['images'][lower_bound:upper_bound]
        # Flatten each image into a vector of length 9*112*224.
        temp_im_synth = np.array(temp_im_synth).reshape([len(temp_im_synth), 9*112*224])

        umap_synth_full[lower_bound:upper_bound, :] = loaded_model.transform(temp_im_synth)

np.save('./diversity_saves/umap_synth_full.npy', umap_synth_full)