In [None]:
import tensorflow as tf

tfk = tf.keras
tfkl = tf.keras.layers
import numpy as np
import matplotlib.pyplot as plt
import os, glob
import tqdm
import re

# Root directory of the image set. The loading cell globs "<PATH_TO_DATA>/*/*.jpg",
# i.e. it expects the jpg files to sit exactly one sub-directory below this path.
PATH_TO_DATA = "data/final_28x28_proc"

In [None]:
def load_image_from_path(path):
    """
    Read a jpg file from disk and decode it into an RGB float image

    :param path: Path to image
    :return: 3D numpy array (3 colour channels) of float32 values
    """
    decoded = tf.image.decode_jpeg(tf.io.read_file(path), channels=3)
    # convert_image_dtype rescales the integer pixel values into the 0-1 range as well
    scaled = tf.image.convert_image_dtype(decoded, 'float32')

    # Hand back a plain numpy array rather than a TensorFlow tensor
    return scaled.numpy()

# glob returns its matches in arbitrary (filesystem-dependent) order; sorting makes the
# ordering of filelist - and therefore of all_data and everything indexed from it -
# reproducible between runs and machines.
filelist = sorted(glob.glob(os.path.join(PATH_TO_DATA, "*", "*.jpg")))

# Fail early with a readable message instead of an obscure error further down the notebook
if not filelist:
    raise FileNotFoundError(
        "No .jpg files found under {!r} - check PATH_TO_DATA".format(PATH_TO_DATA)
    )

# One decoded image per file, stacked along a new leading axis
all_data = np.array([load_image_from_path(f) for f in tqdm.tqdm(filelist, desc='Processing data')])

In [None]:
# Exercise: turn the colour images in `all_data` into single-channel images.
monochrome_data = '...' # write code here - either to take the mean, or pick out an individual component

# Pick five images at random (indices may repeat) and show them next to each other
randidx = np.random.randint(0, len(monochrome_data), size=5)
fig, axes = plt.subplots(1, 5)

for panel, img_index in zip(axes, randidx):
    panel.imshow(monochrome_data[img_index], cmap='Greys')
    panel.axis("off")

In [None]:
from sklearn.decomposition import PCA

# sklearn wants one row per sample, so collapse every non-leading axis into a single
# feature axis. The images are reshaped back to 2D later for plotting.
n_images = len(monochrome_data)
pca_data = monochrome_data.reshape(n_images, -1)
print("PCA input has shape {}".format(pca_data.shape))


In [None]:
# N most prominent components, for example
pca_model = PCA(n_components=20).fit(pca_data)

# Un-flatten every principal axis back into image form so that it can be displayed
eigengalaxies = pca_model.components_.reshape(pca_model.n_components, 28, 28)

# Round the number of columns UP so that every component gets a panel
# (a 3 x (20 // 3) grid only has 18 panels and would silently drop the last two).
n_rows = 3
n_cols = int(np.ceil(pca_model.n_components / n_rows))
fig, axes = plt.subplots(n_rows, n_cols)

# Switch off all axes first, so that any unused trailing panels stay blank
for ax in axes.ravel():
    ax.axis("off")

# Number the components from 1, matching the PC1 / PC2 labels used in the scatter plots
for idx, (ax, eigengalaxy) in enumerate(zip(axes.ravel(), eigengalaxies), start=1):
    ax.imshow(eigengalaxy, cmap='Greys')
    ax.set_title(f"PC {idx}")

plt.show()

# Plot the explained-variance fraction itself (not 1 - fraction), so that the curve
# agrees with the y-axis label.
component_numbers = np.arange(1, pca_model.n_components + 1)
plt.plot(component_numbers, pca_model.explained_variance_ratio_)
plt.xlabel("Principal component")
plt.ylabel("Fraction of explained variance")
plt.show()

### How do galaxies look with truncated principal components?

In [None]:
pca_result = PCA(n_components=20).fit(pca_data)
pca_coeffs = pca_result.transform(pca_data)

# Five random examples (indices may repeat); the same ones are shown in both rows below
randidx = np.random.randint(0, len(monochrome_data), size=5)

# First figure: the original images.
# (Only one subplots call here - a second one would leave a stray empty figure behind.)
fig, axes = plt.subplots(1, 5)

for ax, example in zip(axes, randidx):
    ax.imshow(monochrome_data[example], cmap='Greys')
    ax.axis("off")

plt.show()

# Take the components from the SAME fit that produced pca_coeffs: a separately fitted
# model is not guaranteed to return identical axes (e.g. with a randomised SVD solver),
# so mixing coefficients and components from two fits can give wrong reconstructions.
eigengalaxies = pca_result.components_.reshape(pca_result.n_components, 28, 28)

# Exercise: choose how many leading components to keep, then rebuild every image from
# just those coefficients and eigengalaxies (PCA centres the data - see pca_result.mean_).
n_components = '...'
truncated_imgs = '...'

# Second figure: the truncated reconstructions of the same five examples
fig, axes = plt.subplots(1, 5)

for ax, example in zip(axes, randidx):
    ax.imshow(truncated_imgs[example], cmap='Greys')
    ax.axis("off")

plt.show()


### How does the PCA latent space look?

In [None]:
# The morphology label is the last "_"-separated token of the file name (extension removed).
# os.path is used instead of splitting on "/" and slicing off 4 characters, so the parsing
# does not depend on the platform's path separator or on the length of the extension.
morphology = np.array([
    os.path.splitext(os.path.basename(f))[0].split("_")[-1]
    for f in filelist
])

def amend_morphology_str(morphstr):
    """
    Collapse a detailed morphology label into a coarser class

    :param morphstr: Morphology label string
    :return: "E", "SB", "Irr" or "S" for the first rule whose pattern is found
             in the label, otherwise the label unchanged
    """
    # Rules are tried top to bottom and the first hit wins, so the order matters:
    # "SB." and "Sm" must be tested before the catch-all spiral pattern "S.".
    merge_rules = (
        ("E.", "E"),     # merge all ellipticals into one class
        ("SB.", "SB"),   # merge all barred spirals into one class
        ("Sm", "Irr"),   # move transitional spirals into Irregular
        ("S.", "S"),     # all remaining spirals in the same class
    )

    for pattern, merged_label in merge_rules:
        if re.search(pattern, morphstr):
            return merged_label

    # No rule applied - keep the label as it is
    return morphstr
    
morphology_corr = np.array([amend_morphology_str(s) for s in morphology])

# Work out the (sorted) list of classes once and reuse it for the counts and the figure
morph_classes = np.unique(morphology_corr)

for m in morph_classes:
    print("{}: {} of {}".format(m, (morphology_corr == m).sum(), len(morphology_corr)))

# squeeze=False keeps the result a 2D array even if there is only a single class, so the
# indexing below always works; the first (and only) row is then taken.
fig, ax = plt.subplots(1, len(morph_classes), squeeze=False)
ax = ax[0]

print("Example morphology classes")
for i, m in enumerate(morph_classes):
    # Show one randomly chosen member of each class, in colour
    idx = np.random.choice(np.flatnonzero(morphology_corr == m))
    ax[i].imshow(all_data[idx])
    ax[i].set_title(m)
    ax[i].axis("off")

In [None]:
# Project every flattened image onto its two leading principal components
pca_coeffs = PCA(n_components=2).fit_transform(pca_data)
pc1, pc2 = pca_coeffs[:, 0], pca_coeffs[:, 1]

# One scatter call per morphology class, so that each class gets its own colour and legend entry
for group in ['E', 'Irr', 'S', 'SB', 'NA']:
    in_group = morphology_corr == group
    plt.scatter(pc1[in_group], pc2[in_group], s=1, label=group)

plt.xlabel("PC1")
plt.ylabel("PC2")
plt.legend()
plt.show()

### Does PCA divide the classes well?
* The plot below shows PC1 and PC2, and colours the points by their label.
* Try changing the groups present in the list below, and see which classes can be distinguished from each other. Does it make sense?

In [None]:
# Project every flattened image onto its two leading principal components
pca_coeffs = PCA(n_components=2).fit_transform(pca_data)
pc1, pc2 = pca_coeffs[:, 0], pca_coeffs[:, 1]

# Edit this list to compare different morphology classes against each other
for group in ['E', 'S']:
    in_group = morphology_corr == group
    plt.scatter(pc1[in_group], pc2[in_group], s=1, label=group)

plt.xlabel("PC1")
plt.ylabel("PC2")
plt.legend()
plt.show()