In this notebook we
- Take all 120588 images from the S3 bucket `wellcomecollection-miro-images-public`.
- Read each image, shrink it so that neither its width nor its height exceeds 224 pixels, and convert any image that is not already RGB (e.g. black and white images) to RGB.
- These images are then saved in batches to `storage/data`, with the prefix 'processed_images_batch_'.
- We also take a look at how big the saved image files are. Some of them turned out to be so big that it was better not to keep the images in batches, but rather to save each image in a separate png file, which makes reading easier. We therefore read the previously saved batches of processed images and save each image as a png, e.g. A0000001.png
- Due to some image errors 4 of the batches didn't work, so the images from these are saved individually in pngs at the end.
- `storage/data` now contains 120576 images as pngs

Note: 11 images didn't save. Their names are saved in "../data/images_not_saved.npy" and are:
['B0008000/B0008543.jpg','B0008000/B0008573.jpg','B0009000/B0009632.jpg','B0010000/B0010992.jpg','L0038000/L0038247.jpg',
 'L0083000/L0083878.jpg','L0086000/L0086135.jpg','L0086000/L0086136.jpg','Large Files/L0078598.jpg','Large Files/L0080109.jpg','Large Files/L0080110.jpg']


In [None]:
import os
from io import BytesIO
import json
import pickle

from PIL import Image
from tqdm import tqdm
import boto3
import numpy as np
import matplotlib.pyplot as plt

## 1. Get image names from S3

In [None]:
# Name of the S3 bucket that holds the source images.
bucket_name = "wellcomecollection-miro-images-public"

In [None]:
# Request temporary credentials by assuming an IAM role through STS.
sts = boto3.client("sts")
assumed_role_object = sts.assume_role(
    RoleArn="arn:aws:iam::760097843905:role/calm-assumable_read_role",
    RoleSessionName="AssumeRoleSession1",
)
# The "Credentials" entry of the response holds the access key id, secret
# access key and session token of the assumed role.
credentials = assumed_role_object["Credentials"]

In [None]:
# S3 resource authenticated with the temporary credentials of the assumed
# role; every later S3 read in this notebook goes through it.
s3_fetch = boto3.resource(
    "s3",
    aws_access_key_id=credentials["AccessKeyId"],
    aws_secret_access_key=credentials["SecretAccessKey"],
    aws_session_token=credentials["SessionToken"],
)

In [None]:
bucket = s3_fetch.Bucket(bucket_name)
# With Delimiter="/" the listing groups keys by their first path component,
# so the top-level "folders" come back under the "CommonPrefixes" entry.
# NOTE(review): list_objects returns at most 1000 entries per call and this
# response is not paginated - enough for the ~219 folders seen here, but
# confirm if the bucket grows.
bucket_info = bucket.meta.client.list_objects(Bucket=bucket.name, Delimiter="/")

In [None]:
# Get all folder names. "CommonPrefixes" is absent from the response when the
# bucket has no top-level folders, so fall back to an empty list rather than
# failing while iterating over None.
folder_names = [f["Prefix"] for f in bucket_info.get("CommonPrefixes") or []]
print("{} image folders".format(len(folder_names)))  # 219

# Get all file dirs (S3 object keys) from all folders. Takes a minute or so.
# NOTE: only keys that live under one of the top-level folders are collected;
# objects stored directly at the bucket root are not listed here.
print("Getting all file dir names for all images...")
file_dirs = []
for folder_name in tqdm(folder_names):
    file_dirs.extend(s.key for s in bucket.objects.filter(Prefix=folder_name))
print("{} image files".format(len(file_dirs)))  # 120589

##  2. Preprocess images

In [None]:
def get_image(file_dir, bucket_name, size=(224, 224)):
    """Download one image from S3 and return it as a small RGB PIL image.

    Args:
        file_dir: Key of the image object inside the bucket.
        bucket_name: Name of the S3 bucket to read from.
        size: Maximum (width, height) of the returned image. Defaults to
            (224, 224), the size used throughout this notebook.

    Returns:
        A PIL image in "RGB" mode that fits within ``size``.

    Errors raised by the S3 download or by PIL while opening/decoding the
    image are not caught here; callers decide how to handle them.
    """
    obj = s3_fetch.Object(bucket_name, file_dir)
    image_bytes = obj.get()["Body"].read()
    im = Image.open(BytesIO(image_bytes))

    # thumbnail() works in place: it only ever shrinks the image and keeps
    # its aspect ratio, so the result is at most `size` in each dimension.
    im.thumbnail(size)

    # Everything that is not already RGB (greyscale, palette, CMYK, ...) is
    # converted so that all saved images share the same mode.
    if im.mode != "RGB":
        im = im.convert("RGB")

    return im

In [None]:
# https://chrisalbon.com/python/data_wrangling/break_list_into_chunks_of_equal_size/
def chunks(l, n):
    """Lazily yield consecutive slices of ``l`` holding at most ``n`` items.

    Every slice has exactly ``n`` items except possibly the last one, which
    holds whatever remains. An empty ``l`` yields nothing.
    """
    # Start offsets of the slices: 0, n, 2n, ... up to the length of l.
    starts = range(0, len(l), n)
    yield from (l[start : start + n] for start in starts)

In [None]:
# Number of image keys per batch.
batch_size = 1000
# Materialise the batches as a list: chunks() returns a generator, which can
# only be iterated once, but the batches are looped over both when processing
# (step 2) and again when collecting the failed batches (step 4). The list
# only holds the key strings, so it is small.
batches = list(chunks(file_dirs, batch_size))

In [None]:
# Work out which batches already have a pickle in the data folder, so that a
# re-run of the processing loop can skip them.
filename_prefix = "processed_images_batch_"
filenames = os.listdir("../data/")
# Only files named "<prefix><number>.pkl" count as completed batches; matching
# on the start and end of the name (instead of a substring test) keeps
# unrelated files that merely contain the prefix from breaking the int() parse.
batch_numbers_completed = [
    int(filename[len(filename_prefix) : -len(".pkl")])
    for filename in filenames
    if filename.startswith(filename_prefix) and filename.endswith(".pkl")
]

print(
    "{} batches completed out of {}".format(
        len(batch_numbers_completed),
        # Ceiling division: a final, partly filled batch still counts as a
        # batch, whereas round() drops it when it is less than half full.
        (len(file_dirs) + batch_size - 1) // batch_size,
    )
)

In [None]:
# It takes a long time and sometimes there are errors, so I will process and save in batches

for i, batch in tqdm(enumerate(batches)):
    # Skip batches that already have a pickle on disk from an earlier run.
    if i in batch_numbers_completed:
        continue
    print(i)
    try:
        # Map image name (file name without folder or extension) to the
        # processed image for every key in this batch.
        batch_images = {
            os.path.splitext(os.path.basename(file_dir))[0]: get_image(
                file_dir, bucket_name
            )
            for file_dir in batch
        }
        with open(
            "../data/processed_images_batch_{}.pkl".format(i), "wb"
        ) as handle:
            pickle.dump(batch_images, handle)
    except Exception as error:
        # A single bad image fails the whole batch; report it and carry on
        # with the next one. Catching Exception (not a bare except) means an
        # interrupt from the keyboard still stops this long-running loop.
        print("Issue with batch {}: {!r}".format(i, error))

### (Optional) How big are the image files?

In [None]:
# Folder holding the saved batches, and the text that identifies a batch file.
file_dir = "../data/"
file_name_start = "processed_images_batch"

# Every entry of the data folder whose name contains the batch marker.
image_file_names = [
    name for name in os.listdir(file_dir) if file_name_start in name
]

In [None]:
# Size on disk, in bytes, of each batch file, keyed by file name.
sizes = {
    name: os.path.getsize(file_dir + name) for name in tqdm(image_file_names)
}

In [None]:
# Histogram (100 bins) of the file sizes collected in `sizes`.
plt.hist(list(sizes.values()), bins=100)
plt.show()

In [None]:
# Size in bytes of the largest file recorded in `sizes`.
max(sizes.values())

In [None]:
# Combined size in bytes of all files recorded in `sizes`.
sum(sizes.values())  # 34GB

## 3. The batch pickles are too big to load and keep in memory

Instead, I save each image individually as a png, which means I can access them one at a time.

This is more useful since I only need one at a time to get the feature vectors, and I 
don't need to plot them all in one go (it'd be a mess anyway).

For the pathways I only need a few of the images at once.

In [None]:
# Folder holding the data, and the text that identifies a batch file.
file_dir = "../data/"
file_name_start = "processed_images_batch"

# Full listing of the data folder; also used to tell which pngs already exist.
data_file_names = os.listdir(file_dir)
# Keep only the pickled batches: the name must mention both ".pkl" and the
# batch marker.
batch_image_file_names = [
    name
    for name in data_file_names
    if ".pkl" in name and file_name_start in name
]

In [None]:
# Names already present in the data folder, as a set so that each membership
# test below is O(1) instead of a scan of the whole directory listing.
existing_file_names = set(data_file_names)

# Loop over the pickled batches found in this step (batch_image_file_names),
# not over a listing left behind by the optional file-size section.
for image_file_name in tqdm(batch_image_file_names):
    with open(file_dir + image_file_name, "rb") as handle:
        image_batch = pickle.load(handle)
    # The batch is fully loaded, so the pickle file is closed before the
    # individual pngs are written.
    for image_name, image in image_batch.items():
        png_name = image_name + ".png"
        # Do not rewrite images that were already saved by an earlier run.
        if png_name not in existing_file_names:
            image.save(file_dir + png_name)

## 4. Get the images for the files in the 4 batches that didn't work in step 2

Find the image names that didn't get saved in batches and individually save them as pngs.

Need to run the first part of step 3 first, since it defines `file_dir`, which is used below when saving the pngs.

In [None]:
# Batch numbers that have no pickle on disk.
# The number of batches is the ceiling of len(file_dirs) / batch_size: a
# final, partly filled batch is still a batch, whereas round() would drop it
# whenever it is less than half full.
# NOTE: batch_numbers_completed reflects the data folder at the time it was
# computed; recompute it after running step 2 so that freshly written batches
# are not reported as incomplete.
incomplete_batch_numbers = [
    i
    for i in range((len(file_dirs) + batch_size - 1) // batch_size)
    if i not in batch_numbers_completed
]
incomplete_batch_numbers

In [None]:
# Collect the keys of every image that belongs to an incomplete batch.
bad_batch_image_file_names = []
# Iterate over freshly created chunks rather than the shared `batches`
# iterable: a generator can only be consumed once, so reusing one that step 2
# has already looped over would yield nothing here.
for i, batch in enumerate(chunks(file_dirs, batch_size)):
    if i in incomplete_batch_numbers:
        print(i)
        bad_batch_image_file_names.extend(batch)

In [None]:
# Number of image keys that belong to the incomplete batches.
len(bad_batch_image_file_names)

In [None]:
# Retry the images of the failed batches one by one, so that a single broken
# image no longer takes the rest of its batch down with it.
bad_images = []  # keys that could not be fetched, processed or saved
good_images = []  # keys that were saved as a png
for image_dir in tqdm(bad_batch_image_file_names):
    try:
        image = get_image(image_dir, bucket_name)
        image_name = os.path.splitext(os.path.basename(image_dir))[0]
        image.save(file_dir + image_name + ".png")
    except Exception as error:
        # Catch Exception (not a bare except) so that an interrupt from the
        # keyboard still stops the loop, and report why the image failed.
        print("Could not save {}: {!r}".format(image_dir, error))
        bad_images.append(image_dir)
    else:
        good_images.append(image_dir)

In [None]:
# Number of images saved successfully, then the number that failed, one per line.
print(len(good_images), len(bad_images), sep="\n")

In [None]:
# Persist the keys of the images that could not be saved. np.save appends the
# ".npy" extension when the file name lacks it, so this writes
# ../data/images_not_saved.npy.
np.save("../data/images_not_saved", np.array(bad_images))

In [None]:
# Display the keys of the images that could not be fetched or saved.
bad_images