In [1]:
from keras.datasets import cifar10
import os
import cv2 as cv

Using TensorFlow backend.


# 1. Load Dataset

In [2]:
# load_data() hands back two (images, labels) pairs: the train split first,
# then the test split.
train_split, test_split = cifar10.load_data()
X_train, Y_train = train_split
X_test, Y_test = test_split

In [3]:
# Class names listed in class-index order: position in the tuple is the
# integer class index used as the dictionary key.
labels = dict(enumerate((
    'airplane',
    'automobile',
    'bird',
    'cat',
    'deer',
    'dog',
    'frog',
    'horse',
    'ship',
    'truck',
)))

Extracting image information, such as the filename and the label.

In [4]:
def get_info(data, label_names=None):
    """ Extract image information from CIFAR-10 class labels.

    The extracted information is:
    * the filename, which is of the form (image_number)-(label).png,
    * the label.

    Args:
        data:
            CIFAR-10 class indices, one row per image, with the class index
            as the first element of each row (e.g. the ``Y_train`` array
            returned by ``cifar10.load_data()``). Any sequence of such rows
            is accepted, including an empty one.
        label_names:
            optional mapping from class index to class name. When omitted,
            the notebook-level ``labels`` dictionary is used.

    Returns:
        list of dictionaries containing the filename and the label for each image.
        The image number in the filename is the position of the row in data.
    """
    if label_names is None:
        label_names = labels

    img_info = []
    for index, row in enumerate(data):
        # int() turns numpy integer scalars into plain ints so the lookup
        # behaves the same for arrays and for ordinary Python sequences.
        label = label_names[int(row[0])]
        img_info.append({"filename": '{}-{}.png'.format(index, label), "label": label})

    return img_info

In [5]:
# Build the filename/label records for both splits from their label arrays.
train, test = get_info(Y_train), get_info(Y_test)

# 2. Download Dataset

Create directories to host the train and test datasets.

In [7]:
PARENT_DIR = input("Enter path of parent directory: ")
TRAIN_DIR = os.path.join(PARENT_DIR, 'train')
TEST_DIR = os.path.join(PARENT_DIR, 'test')

# The parent directory is handled first; the train and test directories live
# inside it, so they are only attempted once the parent is available.
for directory in (PARENT_DIR, TRAIN_DIR, TEST_DIR):
    if os.path.isdir(directory):
        # Already present: nothing to create, nothing to report.
        continue

    try:
        # makedirs (unlike mkdir) also creates missing intermediate
        # directories, so a nested path such as ../static/cifar10 works even
        # when ../static does not exist yet.
        os.makedirs(directory, exist_ok=True)
        print(f"Directory {directory} was created successfully.")
    except OSError as ex:
        print(f"Directory {directory} cannot be created.\n{ex}")
        if directory == PARENT_DIR:
            # Without the parent there is no point in trying its children.
            break

Enter path of parent directory: ../static/cifar10
Directory ../static/cifar10 was created successfully.
Directory ../static/cifar10\train was created successfully.
Directory ../static/cifar10\test was created successfully.


Save the train and test images to disk, one image file per record.

In [13]:
def save_images(data, data_info, directory):
    """ Write images to disk, one file per entry of data_info.

    The number before the first "-" in each filename is used as the position
    of the corresponding image in data.

    Args:
        data:
            images, indexable by position. Three-channel images are assumed
            to be in RGB order (the order in which Keras serves CIFAR-10).
        data_info:
            list of dictionaries with a "filename" key, such as the list
            returned by get_info.
        directory:
            path of the directory in which the files are written. It must
            already exist.

    Raises:
        OSError: if OpenCV reports that an image could not be written.
    """
    for img in data_info:
        filename = img["filename"]
        img_index = int(filename.split("-", 1)[0])

        path = os.path.join(directory, filename)

        image = data[img_index]
        # cv.imwrite interprets 3-channel arrays as BGR. Without this
        # conversion the red and blue channels of the saved file are swapped.
        if getattr(image, "ndim", 0) == 3 and image.shape[2] == 3:
            image = cv.cvtColor(image, cv.COLOR_RGB2BGR)

        # imwrite signals failure (e.g. a missing directory) through its
        # return value; surface it instead of silently producing no file.
        if not cv.imwrite(path, image):
            raise OSError(f"Image {path} cannot be written.")

ERROR! Session/line number was not unique in database. History logging moved to new session 912


In [14]:
# Write each split into its own directory: train first, then test.
for split_images, split_info, split_dir in (
    (X_train, train, TRAIN_DIR),
    (X_test, test, TEST_DIR),
):
    save_images(split_images, split_info, split_dir)

# 3. Search Engine Files

Create directory to host the files (queries.txt, qrels.txt).

In [15]:
PARENT_DIR = '../data/'

# Only attempt creation when the directory is missing; an existing directory
# is left alone and nothing is printed.
parent_is_missing = not os.path.isdir(PARENT_DIR)
if parent_is_missing:
    try:
        os.mkdir(PARENT_DIR)
    except OSError as ex:
        print(f"Directory {PARENT_DIR} cannot be created.\n{ex}")
    else:
        print(f"Directory {PARENT_DIR} was created successfully.")

Directory ../data/ was created successfully.


## 3.1 Queries

The queries file (queries.txt) consists of one record per line, with the following space-separated fields: query_id, filename.

In [17]:
FILE_NAME = 'queries.txt'
queries_path = os.path.join(PARENT_DIR, FILE_NAME)

# One line per test image: "<query_id> <filename>", where the query id is the
# part of the filename that precedes the first "-".
with open(queries_path, 'w') as f:
    for img in test:
        image_name = img["filename"]
        dash_pos = image_name.find("-")
        f.write(f"{image_name[:dash_pos]} {image_name}\n")

## 3.2 Relevance File

The document-query relevance file (qrels.txt) consists of one record per line, with the following space-separated fields: query_id, iteration, doc_id, relevance.

In [19]:
FILE_NAME = 'qrels.txt'

iteration = "0"
relevance = "1"

# Group the train document ids by label once, keeping the order of train.
# Each query can then look up its relevant documents directly instead of
# rescanning the whole train list.
doc_ids_by_label = {}
for img in train:
    doc_id = img["filename"].partition("-")[0]
    doc_ids_by_label.setdefault(img["label"], []).append(doc_id)

with open(os.path.join(PARENT_DIR, FILE_NAME), 'w') as f:
    for query in test:
        query_id = query["filename"].partition("-")[0]

        # relevant images are those that have the same label as the query;
        # a label with no train images simply produces no records
        for doc_id in doc_ids_by_label.get(query["label"], ()):
            record = "{} {} {} {}\n".format(query_id, iteration, doc_id, relevance)

            f.write(record)