In this notebook we:
- Load image names from the data pngs saved in `1. Preprocess_images.ipynb`, removing any which have already had feature vectors found (in the S3 'miro-images-feature-vectors' bucket). This step was necessary since we ran this code over different sessions.
- Create a dataset, run dataloader and get feature vectors using the vgg16 pretrained model.
- The feature vector for each image is stored in "feature_vectors/A0000001"
- We then pull in the feature vectors found in the above step, scale them, take a sample of 5000, and use the elbow method to see how many principal components you can reduce to whilst keeping the explained variance close to 1. This value is about 100 components.
- We then save dimensionality reduced feature vectors to S3 for these 5000 images, choosing 2, 20, 80, 100, 500, 1000 components. Also saved in the 'miro-images-feature-vectors' bucket under the prefixes "reduced_feature_vectors_i_dims/A0000001" where i = 2, 20, 80, 100, 500, 1000.
- 120576 images had feature vectors and reduced feature vectors found.

Note:
- If using an instance with a GPU, this notebook will run using the GPU.

In [None]:
import os
from io import BytesIO
import json
import pickle

from PIL import Image
from tqdm import tqdm
import boto3
import numpy as np
import torch
from torchvision.models import vgg
import torchvision.transforms as transforms
from torch import nn, optim
from torch.utils.data import DataLoader, Dataset
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# Load the pretrained VGG16, move it to the selected device and put it in
# evaluation mode (for no dropout behaviour).
vgg16 = vgg.vgg16(pretrained=True)
vgg16 = vgg16.to(device).eval()

### 1. Load image names, removing any which have already had feature vectors found

In [None]:
# Collect the names (without extension) of all png images in the data folder.
images_dir = "../data/"
image_type = ".png"

# Match on the file extension itself rather than on a substring: a name such
# as "a.png.bak" contains ".png" but is not a png, and stripping its last
# extension would leave "a.png" instead of an image name.
image_names = [
    os.path.splitext(file_name)[0]
    for file_name in os.listdir(images_dir)
    if file_name.endswith(image_type)
]
len(image_names)

In [None]:
# List the objects already stored under the feature vector prefix in S3; the
# last path component of each key is kept as the name of an image that has
# been run.
feat_vect_file_dir = "feature_vectors"

s3 = boto3.resource("s3")
my_bucket = s3.Bucket("miro-images-feature-vectors")

stored_objects = my_bucket.objects.filter(Prefix=feat_vect_file_dir)
images_run = [os.path.basename(stored.key) for stored in stored_objects]

In [None]:
# Number of S3 objects found under the feature vector prefix
len(images_run)

In [None]:
# Keep only the images that still need feature vectors. The lookup goes
# through a set so each membership test is O(1); scanning the images_run list
# for every image name is quadratic in the number of images.
already_run = set(images_run)
image_names = [name for name in image_names if name not in already_run]
len(image_names)

#### Test showing an image

In [None]:
# Sanity check: open the first remaining image, print its type and display it
i = 0
example_path = "".join((images_dir, image_names[i], image_type))
img = Image.open(example_path)
print(type(img))
img

### 2. Create dataset, run dataloader and get feature vectors

In [None]:
class imagesDataset(Dataset):
    """Dataset of image files located at ``images_dir + image_name + image_type``.

    Args:
        image_names: Image names without the file extension.
        images_dir: Path prefix put in front of each image name. It is
            concatenated as-is, so it must end with a path separator.
        image_type: File extension including the dot, e.g. ".png".
        transforms: Callable applied to each opened PIL image. Defaults to
            ``transforms.ToTensor()``.
    """

    def __init__(
        self, image_names, images_dir, image_type, transforms=transforms.ToTensor()
    ):
        self.transforms = transforms
        self.image_names = image_names
        self.images_dir = images_dir
        self.image_type = image_type
        # Position in the dataset -> image name
        self.index_to_id = dict(enumerate(self.image_names))

    def __len__(self):
        """Return the number of images in the dataset."""
        return len(self.image_names)

    def __getitem__(self, index):
        """Return ``(image_name, transformed_image)`` for position ``index``."""
        image_id = self.index_to_id[index]
        image_path = self.images_dir + image_id + self.image_type

        # Close the file handle as soon as the pixels are read, and force
        # three channels: a png may be grayscale, palette based or RGBA, which
        # gives tensors whose channel count matches neither a 3-channel
        # mean/std normalisation nor the other images in the batch.
        with Image.open(image_path) as im:
            rgb_image = im.convert("RGB")

        return image_id, self.transforms(rgb_image)

In [None]:
# Preprocessing applied to every image: resize, convert to a tensor, then
# normalise each of the three channels with the mean / std given below.
# The min size, as noted in the PyTorch pretrained models doc, is 224 px.
min_img_size = (224, 224)

resize_step = transforms.Resize(min_img_size)
to_tensor_step = transforms.ToTensor()
normalise_step = transforms.Normalize(
    mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]
)
transform_pipeline = transforms.Compose(
    [resize_step, to_tensor_step, normalise_step]
)

# Truncate the classifier head to its first four modules, so that the output
# will be a feature vector. NOTE: no copy is made - vgg16_short and vgg16 are
# the same (now truncated) model object.
vgg16.classifier = vgg16.classifier[:4]
vgg16_short = vgg16

In [None]:
# Shuffled batches of 32 (image_name, image tensor) pairs over the images
# listed in image_names.
images_to_run = imagesDataset(
    image_names=image_names,
    images_dir=images_dir,
    image_type=image_type,
    transforms=transform_pipeline,
)
our_dataloader = DataLoader(dataset=images_to_run, batch_size=32, shuffle=True)

### 3. Save feature vectors to S3

In [None]:
# Run every batch through the truncated model and store each image's feature
# vector in S3 as raw bytes under "feature_vectors/<image_name>".
s3 = boto3.client("s3")

# torch.no_grad(): this is inference only, so tracking gradients would just
# waste memory. The loop variable is deliberately not called image_names, as
# that would replace the full list of image names with the last batch.
with torch.no_grad():
    for batch_names, images in tqdm(our_dataloader):
        feature_vectors = vgg16_short(images.to(device)).to("cpu")
        for image_name, feature_vector in zip(batch_names, feature_vectors):
            s3.put_object(
                Bucket="miro-images-feature-vectors",
                Key="feature_vectors/" + image_name,
                # Raw bytes in the tensor's dtype; the same dtype has to be
                # used when these objects are decoded again.
                Body=feature_vector.detach().numpy().tobytes(),
            )

## Test

In [None]:
# Small loader over the first two entries of images_run, used to eyeball the
# model output.
test_dataset = imagesDataset(
    image_names=images_run[0:2],
    images_dir=images_dir,
    image_type=image_type,
    transforms=transform_pipeline,
)
our_dataloader_test = DataLoader(dataset=test_dataset, batch_size=32, shuffle=True)

In [None]:
# Print the name, the first ten values and the first 100 serialised bytes of
# each test feature vector.
# torch.no_grad(): inference only, no gradient tracking needed. The loop
# variable is not called image_names so that the full list of image names is
# not replaced by the test batch.
with torch.no_grad():
    for batch_names, images in tqdm(our_dataloader_test):
        feature_vectors = vgg16_short(images.to(device)).to("cpu")
        for image_name, feature_vector in zip(batch_names, feature_vectors):
            print(image_name)
            print(feature_vector[0:10])
            print(feature_vector.detach().numpy().tobytes()[0:100])

## 4. Dimensionality reduction for a sample of 5000
How small can the feature vectors be without losing interesting information?

### 4a. Pull in the feature vectors found in the above step

In [None]:
# https://alexwlchan.net/2017/07/listing-s3-keys/
def get_all_s3_keys(bucket, client=None):
    """Get a list of all keys in an S3 bucket.

    Args:
        bucket: Name of the bucket to list.
        client: Object with a ``list_objects_v2`` method, e.g. a boto3 S3
            client. Defaults to the module-level ``s3`` client.

    Returns:
        List of every key in the bucket, in the order the pages return them.
        Empty if the listing contains no objects.
    """
    if client is None:
        client = s3

    keys = []
    kwargs = {"Bucket": bucket}
    while True:
        resp = client.list_objects_v2(**kwargs)
        # "Contents" is missing from the response when nothing is listed, so
        # an empty bucket gives an empty list instead of a KeyError.
        keys.extend(obj["Key"] for obj in resp.get("Contents", []))

        # The listing is paginated: a continuation token is only present
        # while there are more pages to fetch.
        next_token = resp.get("NextContinuationToken")
        if next_token is None:
            break
        kwargs["ContinuationToken"] = next_token

    return keys

In [None]:
# List every key in the bucket, then keep the ones whose first path component
# is the feature vector folder.
s3 = boto3.client("s3")
bucket_name = "miro-images-feature-vectors"
folder_name = "feature_vectors"

keys = [
    key
    for key in get_all_s3_keys(bucket_name)
    if key.partition("/")[0] == folder_name
]

len(keys)

In [None]:
# Show the first key as an example
keys[0]

In [None]:
# Sample (at most) 5000 distinct keys. np.random.choice samples with
# replacement by default, which returns duplicate keys; as the downloaded
# vectors are stored in a dict keyed by key, duplicates shrink the sample.
keys = np.random.choice(keys, size=min(5000, len(keys)), replace=False)

In [None]:
# Download the sampled feature vectors and decode them back into arrays.
feature_vectors = {}
for key in tqdm(keys):
    obj = s3.get_object(Bucket=bucket_name, Key=key)
    read_obj = obj["Body"].read()

    # The vectors were uploaded as the raw bytes of the model output, which
    # is float32 (torch's default dtype), so they must be decoded as float32.
    # np.float was an alias of the 64-bit builtin float (and has since been
    # removed from NumPy): it read every pair of float32 values as one
    # meaningless float64, halving the vector length.
    feature_vectors[key] = np.frombuffer(read_obj, dtype=np.float32)

In [None]:
# Split the dict into two parallel lists: S3 keys and their feature vectors
feature_vectors_names = list(feature_vectors)
feature_vectors_list = [feature_vectors[name] for name in feature_vectors_names]

In [None]:
# Number of distinct feature vectors that were downloaded
len(feature_vectors_list)

### 4b. How many dimensions can we reduce to?

In [None]:
# Shape of the data that goes into the PCA, (n_samples, n_features). It is
# read from feature_vectors_list because data_rescaled is only created in the
# next cell, so referencing it here raises a NameError when the notebook is
# run from top to bottom. Scaling does not change the shape.
np.shape(feature_vectors_list)

In [None]:
# Standardise every feature, fit a full PCA and plot the cumulative explained
# variance to see how many components are needed.
scaler = StandardScaler()
data_rescaled = scaler.fit_transform(feature_vectors_list)
corner_x = 100

# Fitting the PCA algorithm with our Data
pca = PCA().fit(data_rescaled)  # (n_samples, n_features)
variance_vals = np.cumsum(pca.explained_variance_ratio_)
# variance_vals[k - 1] is the variance explained by the first k components,
# so the curve is drawn against 1..n rather than the array index 0..n-1 and
# the marker for corner_x components reads index corner_x - 1.
n_components = np.arange(1, len(variance_vals) + 1)

# Plotting the Cumulative Summation of the Explained Variance
plt.figure()
plt.plot([0, 1000], [1, 1], "r--")
plt.plot(n_components, variance_vals)
plt.plot(corner_x, variance_vals[corner_x - 1], "x")
plt.xlabel("Number of Components")
# The values are cumulative ratios between 0 and 1, not percentages
plt.ylabel("Cumulative Explained Variance (fraction)")
plt.title("Feature Vectors Explained Variance")
plt.xlim(0, 1000)
plt.show()

In [None]:
# Draw the cumulative explained variance plot again and write it to disk
# instead of showing it.
fig = plt.figure()
plt.plot([0, 1000], [1, 1], "r--")
# variance_vals[k - 1] is the variance explained by the first k components,
# so the curve is drawn against 1..n rather than the array index 0..n-1 and
# the marker for corner_x components reads index corner_x - 1.
plt.plot(np.arange(1, len(variance_vals) + 1), variance_vals)
plt.plot(corner_x, variance_vals[corner_x - 1], "x")
plt.xlabel("Number of Components")
# The values are cumulative ratios between 0 and 1, not percentages
plt.ylabel("Cumulative Explained Variance (fraction)")
plt.title("Feature Vectors Explained Variance")
plt.xlim(0, 1000)
plt.savefig("../feat_vec_var.png")
plt.close(fig)

### 4c. Save dimensionality reduced feature vectors to S3

In [None]:
# Number of principal components to keep; this value is also used in the S3
# prefix when the reduced vectors are saved.
corner_x = 500
# Fit the PCA on the standardised vectors and project them onto the
# corner_x components in one step.
pca = PCA(n_components=corner_x)
transformed_feature_vectors = pca.fit_transform(data_rescaled)

In [None]:
# Upload each reduced vector as raw bytes, named after the last path component
# of the key its original feature vector was read from.
s3 = boto3.client("s3")
reduced_key_template = "reduced_feature_vectors_{}_dims/{}"
for i, transformed_data in tqdm(enumerate(transformed_feature_vectors)):
    source_key = feature_vectors_names[i]
    image_name = os.path.basename(source_key)
    reduced_key = reduced_key_template.format(corner_x, image_name)
    s3.put_object(
        Bucket="miro-images-feature-vectors",
        Key=reduced_key,
        Body=transformed_data.tobytes(),
    )