# **Contents:**
This notebook loads several image datasets, passes each image through a pretrained ResNet50 (with its final classification layer removed) to obtain an embedding, and then reduces the dimensionality of those embeddings with UMAP for plotting in 2 and 3 dimensions.

# **Imports:**

In [None]:
!pip install umap-learn

print('Mounting google drive...')
from google.colab import drive
drive.mount('/content/drive')
%cd "/content/drive/MyDrive/Diss_GAN"

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting umap-learn
  Downloading umap-learn-0.5.3.tar.gz (88 kB)
[K     |████████████████████████████████| 88 kB 6.4 MB/s 
Collecting pynndescent>=0.5
  Downloading pynndescent-0.5.7.tar.gz (1.1 MB)
[K     |████████████████████████████████| 1.1 MB 37.8 MB/s 
Building wheels for collected packages: umap-learn, pynndescent
  Building wheel for umap-learn (setup.py) ... [?25l[?25hdone
  Created wheel for umap-learn: filename=umap_learn-0.5.3-py3-none-any.whl size=82829 sha256=ee6e67c8ff9511ccf578cd3c55b4f6fbca6843e55f1d9a044645d90861f6b554
  Stored in directory: /root/.cache/pip/wheels/b3/52/a5/1fd9e3e76a7ab34f134c07469cd6f16e27ef3a37aeff1fe821
  Building wheel for pynndescent (setup.py) ... [?25l[?25hdone
  Created wheel for pynndescent: filename=pynndescent-0.5.7-py3-none-any.whl size=54286 sha256=f1e1f8c24c4a91fbd5c2c0dfc01a02b393eb0df02b456726ef2a4cd95e2a918d
  Stored in directo

In [None]:
import torch
import glob
import torchvision.models as models
import cv2
import torchvision.transforms as transforms
from tqdm import tqdm
import numpy as np
from umap import UMAP
import plotly
import plotly.express as px
from PIL import Image
import os

# **Load Model for Creating Image Embeddings:**

In [None]:
# Load ResNet50 with ImageNet weights. The `weights=` argument replaces the
# deprecated `pretrained=True`; IMAGENET1K_V1 is the checkpoint that
# `pretrained=True` used to select.
model = models.resnet50(weights=models.ResNet50_Weights.IMAGENET1K_V1)

  f"The parameter '{pretrained_param}' is deprecated since 0.13 and will be removed in 0.15, "
Downloading: "https://download.pytorch.org/models/resnet50-0676ba61.pth" to /root/.cache/torch/hub/checkpoints/resnet50-0676ba61.pth


  0%|          | 0.00/97.8M [00:00<?, ?B/s]

In [None]:
# Replace the final fully connected layer of ResNet50 with an identity op, so
# the model returns the input of that layer (the image embedding) as its output:
model.fc = torch.nn.Identity()

# **Pass Image Datasets Through ResNet Model:**

In [None]:
# Glob patterns (relative to the current working directory) matching the files
# of each image dataset:
tt_data_loc = 'data/tt_crops/*'
niki_data_loc = 'data/niki/*'
tcga_data_loc = 'data/tcga/*'
gen_data_loc = 'data/gen/*'

In [None]:
# Create lists of image paths in each directory. glob returns matches in an
# arbitrary, filesystem-dependent order, so the lists are sorted to make the
# row order of the embeddings (and hence the UMAP result) repeatable:
tt_imgs = sorted(glob.glob(tt_data_loc))
niki_imgs = sorted(glob.glob(niki_data_loc))
tcga_imgs = sorted(glob.glob(tcga_data_loc))
gen_imgs = sorted(glob.glob(gen_data_loc))

# Display the first 5 image paths:
tt_imgs[0:5]

['data/tt_crops/61i_2.jpeg',
 'data/tt_crops/61j_2.jpeg',
 'data/tt_crops/62a_2.jpeg',
 'data/tt_crops/62b_2.jpeg',
 'data/tt_crops/62c_2.jpeg']

In [None]:
def reduce_dataset(dataset, model, device, ds_name):
    """
    Reduce each image in a dataset to the embedding vector produced by `model`.

    Every image is read from disk with OpenCV, converted from BGR to RGB,
    turned into a tensor and passed through the model as a batch of one.
    In this notebook the model is a ResNet50 whose final layer has been
    replaced by an identity, so each embedding is a 2,048-long vector.

    Args:
        dataset: Iterable of image file paths.
        model: Torch module used to embed the images. It is moved to `device`
            and put in evaluation mode as a side effect.
        device: Torch device (or device string such as 'cpu' / 'cuda:0') on
            which the model is run.
        ds_name: Dataset name shown in the progress bar description.

    Returns:
        List with one 1-D numpy array per image, in the order of `dataset`.

    Raises:
        ValueError: If an image file cannot be read by OpenCV.
    """
    # Define transform to convert image np arrays to Tensors.
    # NOTE(review): no ImageNet mean/std normalisation is applied here, which
    # pretrained torchvision weights normally expect. Left unchanged so that
    # embeddings stay comparable with earlier runs; confirm whether intended.
    transform = transforms.Compose([
        transforms.ToTensor()
    ])

    # Move the model to the requested device (instead of unconditionally to
    # CUDA, so the CPU fallback works) and put it in evaluation mode:
    model.to(device)
    model.eval()

    img_output = []

    # Inference only: disabling autograd avoids building a graph per image.
    with torch.no_grad():
        for img in tqdm(dataset, desc = '%s Image Processing' % ds_name):
            # Read image (cv2.imread returns None instead of raising on failure):
            image = cv2.imread(img)
            if image is None:
                raise ValueError('Could not read image file: %r' % (img,))

            # Convert BGR image to RGB image
            image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

            # Convert the image to a Torch tensor and add a batch dimension:
            img_tensor = transform(image).unsqueeze(0)

            # Assign data to the same device as the model:
            img_tensor = img_tensor.to(device)

            # Get output (one embedding per batch item) from model:
            mod_output = model(img_tensor)

            # Convert the single embedding to a numpy array and append to list:
            img_output.append(mod_output[0].cpu().numpy())

    return img_output


In [None]:
# Use the first CUDA GPU when one is available, otherwise fall back to the CPU:
cuda = torch.cuda.is_available()
device = "cuda:0" if cuda else 'cpu'

In [None]:
# Embed every dataset with the shared model. Each call is its own statement so
# that results already computed are kept if a later dataset fails.
tt_umap = reduce_dataset(
    dataset=tt_imgs, model=model, device=device, ds_name='Tharun Thompson')
niki_umap = reduce_dataset(
    dataset=niki_imgs, model=model, device=device, ds_name='Nikiforov')
tcga_umap = reduce_dataset(
    dataset=tcga_imgs, model=model, device=device, ds_name='TCGA')
gen_umap = reduce_dataset(
    dataset=gen_imgs, model=model, device=device, ds_name='GAN-Generated')

Tharun Thompson Image Processing: 100%|██████████| 1496/1496 [00:56<00:00, 26.33it/s]
Nikiforov Image Processing: 100%|██████████| 720/720 [04:17<00:00,  2.80it/s]
TCGA Image Processing: 100%|██████████| 517/517 [03:00<00:00,  2.87it/s]
GAN-Generated Image Processing: 100%|██████████| 1500/1500 [08:40<00:00,  2.88it/s]


In [None]:
# Concatenate the per-dataset embedding lists into one list
# (order: Tharun Thompson, Nikiforov, TCGA, GAN-generated):
data_comb = [*tt_umap, *niki_umap, *tcga_umap, *gen_umap]

# Stack the embeddings into a single array, one row per image:
umap_data = np.array(data_comb)
print(umap_data.shape)

(4233, 2048)


# **Create Dataset Labels for UMAP:**

In [None]:
# Build one label per input image of a dataset:
def create_labels(dataset, ds_name):
    """Return a list containing `ds_name` once for every item in `dataset`."""
    return [ds_name for _ in range(len(dataset))]

In [None]:
# One label list per dataset; the label text is what appears in the plot legends.
tt_lbls = create_labels(dataset=tt_imgs, ds_name='Source Data')
niki_lbls = create_labels(dataset=niki_imgs, ds_name='Nikiforov')
tcga_lbls = create_labels(dataset=tcga_imgs, ds_name='TCGA')
gen_lbls = create_labels(dataset=gen_imgs, ds_name='GAN-Generated')

In [None]:
# Concatenate the label lists in the same dataset order as the embeddings,
# then convert the result to a numpy array:
lbls_comb = [*tt_lbls, *niki_lbls, *tcga_lbls, *gen_lbls]

umap_lbls = np.array(lbls_comb)
print(umap_lbls.shape)

(4233,)


# **UMAP:**

## **2D Projection:**

In [None]:
# Initialize UMAP for the 2D projection. Only the random seed is set (all other
# parameters are left at their defaults) so that repeated runs give the same layout:
umap_2d = UMAP(random_state = 42)

In [None]:
# Fit the UMAP on the embeddings and get their 2D projections in one step.
# fit_transform returns the embedding learned during fitting, so a separate
# transform pass over the same data is not needed:
projections = umap_2d.fit_transform(umap_data)

In [None]:
# This figure shows the real-image datasets only; the GAN-generated samples are
# added in the following figure. The projections and labels cover all datasets,
# so the GAN rows are masked out here (otherwise they are drawn too, in a
# default colour that is missing from the colour map below).
real_mask = umap_lbls != 'GAN-Generated'

fig = px.scatter(
            projections[real_mask], x=0, y=1,
            color=umap_lbls[real_mask],
            labels={'color': 'Dataset'},
            width=1200,
            color_discrete_map={'Source Data': 'gold',
                                'Nikiforov': 'orchid',
                                'TCGA': 'green'}
            )
fig.update_layout(
    font_size = 13,
    # 'constant' keeps legend markers a fixed size regardless of marker size:
    legend = dict(font = dict(size = 14), itemsizing = 'constant'),
    legend_title_font_size=20,
    xaxis_title="UMAP 1",
    yaxis_title="UMAP 2"
)
fig.show()

### **Including GAN-Generated Images:**

In [None]:
# 2D scatter of every projected image, coloured by the dataset it came from
# (real datasets plus the GAN-generated samples).
fig = px.scatter(
    projections,
    x=0,
    y=1,
    color=umap_lbls,
    labels={'color': 'Dataset'},
    width=1200,
    color_discrete_map={
        'Source Data': 'gold',
        'Nikiforov': 'orchid',
        'TCGA': 'green',
        'GAN-Generated': 'royalblue',
    },
)

# Typography, legend styling and axis titles:
fig.update_layout(
    font_size=13,
    legend={'font': {'size': 14}, 'itemsizing': 'constant'},
    legend_title_font_size=20,
    xaxis_title="UMAP 1",
    yaxis_title="UMAP 2",
)
fig.show()

## **3D Projection:**

In [None]:
# Initialize UMAP for the 3D projection: three output components, random
# initialisation of the embedding, and a fixed seed for repeatable results:
umap_3d = UMAP(n_components=3, init='random', random_state=42)

In [None]:
# Fit the UMAP on the embeddings and get their 3D projections in one step.
# fit_transform returns the embedding learned during fitting, so a separate
# transform pass over the same data is not needed:
projections_3d = umap_3d.fit_transform(umap_data)

In [None]:
# This figure shows the real-image datasets only; the GAN-generated samples are
# added in the following figure. The projections and labels cover all datasets,
# so the GAN rows are masked out here (otherwise they are drawn too, in a
# default colour that is missing from the colour map below).
real_mask = umap_lbls != 'GAN-Generated'

fig = px.scatter_3d(
            projections_3d[real_mask], x=0, y=1, z=2,
            color=umap_lbls[real_mask],
            labels={'color': 'Dataset'},
            width=1200, height=1000,
            color_discrete_map={'Source Data': 'gold',
                                'Nikiforov': 'orchid',
                                'TCGA': 'green'}
            )
fig.update_layout(
    font_size = 13,
    legend = dict(font = dict(size = 14), itemsizing = 'constant'),
    legend_title_font_size=20,
    # In a 3D figure the axes belong to the scene, so their titles must be set
    # under `scene`; top-level xaxis_title / yaxis_title only affect 2D axes.
    scene = dict(
        xaxis_title="UMAP 1",
        yaxis_title="UMAP 2",
        zaxis_title="UMAP 3"
    )
)
# Small markers keep the dense 3D point cloud readable:
fig.update_traces(marker_size=2)
fig.show()

In [None]:
# Saving to html. The output folder is created first, because writing the file
# fails if the folder does not exist yet:
os.makedirs('images/umap', exist_ok=True)
plotly.offline.plot(fig, filename='images/umap/src_niki_tcga_3d.html')

'images/umap/src_niki_tcga_3d.html'

### **Including GAN-Generated Images:**

In [None]:
# 3D scatter of every projected image, coloured by the dataset it came from
# (real datasets plus the GAN-generated samples).
fig = px.scatter_3d(
            projections_3d, x=0, y=1, z=2,
            color=umap_lbls,
            labels={'color': 'Dataset'},
            width=1200, height=1000,
            color_discrete_map={'Source Data': 'gold',
                                'Nikiforov': 'orchid',
                                'TCGA': 'green',
                                'GAN-Generated': 'royalblue'}
            )
fig.update_layout(
    font_size = 13,
    legend = dict(font = dict(size = 14), itemsizing = 'constant'),
    legend_title_font_size=20,
    # In a 3D figure the axes belong to the scene, so their titles must be set
    # under `scene`; top-level xaxis_title / yaxis_title only affect 2D axes.
    scene = dict(
        xaxis_title="UMAP 1",
        yaxis_title="UMAP 2",
        zaxis_title="UMAP 3"
    )
)
# Small markers keep the dense 3D point cloud readable:
fig.update_traces(marker_size=2)
fig.show()

In [None]:
# Saving to html. The output folder is created first, because writing the file
# fails if the folder does not exist yet:
os.makedirs('images/umap', exist_ok=True)
plotly.offline.plot(fig, filename='images/umap/gan_gen_3d.html')

'images/umap/gan_gen_3d.html'