Code based on: https://github.com/sorenbouma/keras-oneshot/blob/master/load_data.py

This script preprocesses the Omniglot dataset (training and test sets, stored as PNG files in subfolders) into a NumPy array of shape (n_classes, n_examples, height, width), together with a dictionary that maps each alphabet to the first and last (character) class index it covers, so it can be used to find out which alphabet a particular class belongs to. The array and the dictionary are then stored together in pickle files.

In [0]:
import numpy as np
from imageio import imread
import pickle
import os


In [0]:
# Mount Google Drive so the data stored there can be used from Colab
# (see the data download link in Practical 4a.1).
from google.colab import drive
# Create the local directory used as the mount point (`!` is the notebook's shell escape).
!mkdir drive
# Mount Drive at ./drive.
drive.mount('drive')

In [0]:
# List the contents of the Drive folder to check that the mount succeeded.
!ls "drive/My Drive/"

In [0]:
"""Script to preprocess the omniglot dataset and pickle it into an array that's easy to index by character type"""
# Root folder of the omniglot data inside the mounted Drive directory.
data_path = os.path.join(*("drive", "My Drive", "data_DL_practical", "omniglot"))

# Image trees of the two splits, both located directly under data_path.
train_path, test_path = (
    os.path.join(data_path, split_name)
    for split_name in ("omniglot_train", "omniglot_test")
)

# Module-level alphabet dictionary, initially empty.
lang_dict = dict()

def _sorted_entries(directory, keep):
    """Return the non-hidden entries of *directory* accepted by *keep*, sorted by name.

    ``os.listdir`` yields names in arbitrary order, so sorting makes the
    resulting class indices reproducible. Hidden entries (names starting with
    ".", e.g. notebook checkpoint folders) are ignored.
    """
    return sorted(
        name
        for name in os.listdir(directory)
        if not name.startswith(".") and keep(os.path.join(directory, name))
    )


def loadimgs(path, n=0, reader=None):
    """Load a directory tree of images laid out as ``path/alphabet/letter/image``.

    Args:
        path: Root directory containing one sub-directory per alphabet, each
            of which contains one sub-directory per letter (category).
        n: Class index assigned to the first category; later categories are
            numbered consecutively from there.
        reader: Callable taking an image file path and returning an array.
            Defaults to the module-level ``imread``.

    Returns:
        A tuple ``(X, y, lang_dict)`` where
        ``X`` is the stack of all categories (one row per category, each row
        being the stack of that category's images),
        ``y`` is a column vector holding the class index of every loaded image,
        and ``lang_dict`` maps each alphabet name to
        ``[first_class_index, last_class_index]`` (the last index stays
        ``None`` for an alphabet without any usable category).

    Raises:
        ValueError: If no category could be loaded, or if the categories
            cannot be stacked into one array (e.g. differing image counts).
    """
    if reader is None:
        reader = imread

    X = []
    y = []
    lang_dict = {}
    curr_y = n
    # we load every alphabet separately so we can isolate them later
    for alphabet in _sorted_entries(path, os.path.isdir):
        print("loading alphabet: " + alphabet)
        lang_dict[alphabet] = [curr_y, None]
        alphabet_path = os.path.join(path, alphabet)
        # every letter/category has its own row in the array, so load separately
        for letter in _sorted_entries(alphabet_path, os.path.isdir):
            letter_path = os.path.join(alphabet_path, letter)
            category_images = [
                reader(os.path.join(letter_path, filename))
                for filename in _sorted_entries(letter_path, os.path.isfile)
            ]
            try:
                stacked = np.stack(category_images)
            except ValueError as e:
                # Empty category or images of differing shape: skip it without
                # consuming a class index, so X rows, y labels and the ranges
                # in lang_dict stay aligned.
                print(e)
                print("error - skipping category:", letter_path)
                continue
            X.append(stacked)
            y.extend([curr_y] * len(category_images))
            curr_y += 1
            lang_dict[alphabet][1] = curr_y - 1

    if not X:
        raise ValueError("no images could be loaded from " + repr(path))
    y = np.vstack(y)
    X = np.stack(X)
    return X, y, lang_dict

# Preprocess each split in turn (train first, then test) and pickle the image
# array together with the alphabet dictionary next to the raw data.
for _split_dir, _pickle_name in (
    (train_path, "omniglot_train.p"),
    (test_path, "omniglot_test.p"),
):
    X, y, c = loadimgs(_split_dir)
    with open(os.path.join(data_path, _pickle_name), "wb") as f:
        pickle.dump((X, c), f)