In [1]:
# Standard library imports,
import os
import gzip

# Third-party imports,
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import cv2

# Torch imports,
import torch
import torchvision
from torch.utils.data import Dataset
from torchvision import transforms

## Custom Dataset

In [2]:
class MNIST(Dataset):
    """In-memory dataset of pre-normalised image tensors and their labels.

    Every image in ``X`` is passed once, at construction time, through
    ``ToTensor`` followed by ``Normalize(0.1307, 0.3081)``; the results are
    stacked into one tensor. ``Y`` is converted with ``torch.tensor``.

    NOTE(review): per the torchvision documentation, ``ToTensor`` rescales
    pixel values to [0, 1] only for uint8 arrays / PIL images; other dtypes
    pass through unscaled. The mean/std used here look like [0, 1]-range
    statistics, so confirm that callers supply uint8 images.

    Args:
        X: iterable of images accepted by ``ToTensor`` (presumably
            H x W x C NumPy arrays -- verify against the caller).
        Y: labels, in any form ``torch.tensor`` accepts.
    """

    def __init__(self, X, Y):
        # Pipeline applied to every image: tensor conversion, then
        # standardisation with a single-channel mean and std.
        self.norm_trans = transforms.Compose(
            [transforms.ToTensor(), transforms.Normalize(0.1307, 0.3081)]
        )

        # Labels first, then the (more expensive) image conversion.
        self.Y = torch.tensor(Y)
        self.X = self.normalise(X)

    def __len__(self):
        """Return the number of stored images."""
        return len(self.X)

    def __getitem__(self, index):
        """Return the ``(image, label)`` pair stored at ``index``."""
        image, label = self.X[index], self.Y[index]
        return image, label

    def normalise(self, X):
        """Apply ``self.norm_trans`` to each image in ``X`` and stack the results.

        Returns a single tensor whose first dimension indexes the images.
        """
        return torch.stack([self.norm_trans(image) for image in X])

## Preprocessing Data

In [3]:
# Magic numbers that open gzip-compressed MNIST IDX files (big-endian int32).
IDX_IMAGE_MAGIC = 2051
IDX_LABEL_MAGIC = 2049


def load_idx_images(path):
    """Read a gzip-compressed IDX3 image file into a uint8 array.

    The image count and dimensions are taken from the 16-byte file header
    instead of being hard-coded.

    Args:
        path: location of the ``*-images-idx3-ubyte.gz`` file.

    Returns:
        Writable ``np.uint8`` array of shape ``(count, rows, cols, 1)``.
        The dtype is deliberately left as uint8: torchvision's ``ToTensor``
        only rescales pixels to [0, 1] for uint8 input, and the dataset's
        normalisation statistics assume that range.

    Raises:
        ValueError: if the magic number is wrong or the file is truncated.
    """
    with gzip.open(path, "rb") as bytestream:
        header = bytestream.read(16)
        magic, count, rows, cols = (
            int.from_bytes(header[i:i + 4], "big") for i in range(0, 16, 4)
        )
        if magic != IDX_IMAGE_MAGIC:
            raise ValueError(f"{path}: not an IDX3 image file (magic number {magic})")
        n_bytes = count * rows * cols
        pixels = bytestream.read(n_bytes)

    if len(pixels) != n_bytes:
        raise ValueError(f"{path}: expected {n_bytes} pixel bytes, got {len(pixels)}")

    # frombuffer yields a read-only view of the bytes object; copy so the
    # array handed to torch is writable.
    return np.frombuffer(pixels, dtype=np.uint8).reshape(count, rows, cols, 1).copy()


def load_idx_labels(path):
    """Read a gzip-compressed IDX1 label file into an int64 array.

    Args:
        path: location of the ``*-labels-idx1-ubyte.gz`` file.

    Returns:
        ``np.int64`` array of shape ``(count,)``.

    Raises:
        ValueError: if the magic number is wrong or the file is truncated.
    """
    with gzip.open(path, "rb") as bytestream:
        header = bytestream.read(8)
        magic = int.from_bytes(header[0:4], "big")
        count = int.from_bytes(header[4:8], "big")
        if magic != IDX_LABEL_MAGIC:
            raise ValueError(f"{path}: not an IDX1 label file (magic number {magic})")
        raw_labels = bytestream.read(count)

    if len(raw_labels) != count:
        raise ValueError(f"{path}: expected {count} label bytes, got {len(raw_labels)}")

    return np.frombuffer(raw_labels, dtype=np.uint8).astype(np.int64)


def preprocess_split(data_dir, images_file, labels_file, output_path):
    """Build an ``MNIST`` dataset from one image/label file pair and save it.

    All large intermediates are locals, so they are released when the
    function returns and only one split is held in memory at a time.

    Raises:
        ValueError: if the image and label counts disagree.
    """
    images = load_idx_images(os.path.join(data_dir, images_file))
    labels = load_idx_labels(os.path.join(data_dir, labels_file))
    if len(images) != len(labels):
        raise ValueError(
            f"{images_file} holds {len(images)} images but "
            f"{labels_file} holds {len(labels)} labels"
        )

    # NOTE(review): torch.save pickles the whole MNIST object, so whoever
    # loads the file needs the class importable -- confirm downstream loaders.
    torch.save(MNIST(images, labels), output_path)


if __name__ == "__main__":

    # Data lives in a sibling "Datasets/MNIST" folder of the working directory.
    parent_dir = os.path.dirname(os.getcwd())
    data_dir = os.path.join(parent_dir, "Datasets", "MNIST")

    # Extracting and saving the training dataset,
    preprocess_split(
        data_dir,
        "train-images-idx3-ubyte.gz",
        "train-labels-idx1-ubyte.gz",
        "training.pt",
    )

    # Extracting and saving the test dataset,
    preprocess_split(
        data_dir,
        "t10k-images-idx3-ubyte.gz",
        "t10k-labels-idx1-ubyte.gz",
        "test.pt",
    )