### Dataset Split Description

The MNIST Dataset already has a train and a test set. The preexisting split will be used to ensure that there are no data leakage issues.

### Dataset

A Folder Dataset will be initialized using Pytorches Folder Dataset Class.

In [1]:
##define libraries
import os, gc
import numpy as np
import pandas as pd
from tqdm import tqdm
import torch
from torchvision import tv_tensors
import torchvision.transforms.functional as fn
from torchvision.io import read_image
from torch.utils.data import Dataset
from torchvision import datasets
from torch.utils.data import DataLoader
from torchvision.transforms import ToTensor
import matplotlib.pyplot as plt

In [2]:
##define paths
basepath = "C:/Users/vanth/OneDrive/Desktop/JHUClasses/data/numbers_mnist/"
trainpath = basepath + 'train/'
valpath = basepath + 'val/'

In [3]:
class CustomImageDataset(Dataset):
    def __init__(self, labelspath, imgspath, transform=None, target_transform=None):
        ##use dictionary to keep track of images
        data_dict = {}
        
        ##get the image file full paths
        imgfiles = sorted(os.listdir(imgspath))
        data_dict['imgpath'] = [imgspath + file for file in imgfiles]

        ##get the label file full paths
        labelfiles = sorted(os.listdir(labelspath))
        data_dict['labelpath'] = [labelspath + file for file in labelfiles]

        ##debug
        # print(data_dict['labelpath'])
        
        ##get labels and bboxes
        labels, bboxes = [], []
        for labelpath in tqdm(data_dict['labelpath'], 'Reading Labels'):
            with open(labelpath, 'r') as f:
                lines = f.readlines()
            
            ##a list of labels and bbox coordinates
            labels.append([line[0] for line in lines])
            bboxes.append([line[2:] for line in lines])

        ##remove strings from bbox coordinates
        for i, boxes in enumerate(bboxes):
            ##convert bbox strings to float coordinates
            boxes = [np.array(i.replace('\n', '').split(' ')).astype(float) for i in boxes]
            bboxes[i] = boxes
        print(boxes)
        
        data_dict['labels'] = labels
        data_dict['bboxes'] = bboxes
                
        ##convert dict to df
        self.df = pd.DataFrame(data_dict)
        
        ##define transforms
        self.transform = transform
        self.target_transform = target_transform

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        ##open image
        img_path = self.df.iloc[idx]['imgpath']
        image = fn.resize(read_image(img_path), size=[256, 256])
        print(image.shape)

        ##retrieve labels
        labels = torch.tensor(np.array(self.df.iloc[idx]['labels']).astype(int))
        print("Len Labels: ", labels.shape)
        
        ##retrieve bboxes
        bboxes = torch.tensor(np.array(self.df.iloc[idx]['bboxes']).astype(np.float64))
        print("Len Bboxes: ", bboxes.shape)
        
        ##transform image if applicable
        if self.transform:
            image = self.transform(image)
        if self.target_transform:
            label = self.target_transform(labels)
        return image, labels
        

### Transforms

In addition to the previously mentioned augmentations, the dataset will undergo runtime transformations such as normalization and random rotation. The rotations will have to be minor, since overly rotating an image can cause some letters and numbers to be confused, such as 6s and 9s.

In [4]:
from torchvision.transforms import v2
transforms = v2.Compose([
    v2.Normalize(mean=[0.485, 0.456], std=[0.229, 0.224]),])



In [5]:
%%time

##initialize datasets
train_data = CustomImageDataset(trainpath + 'labels/', trainpath + 'images/')
val_data = CustomImageDataset(valpath + 'labels/', valpath + 'images/')

gc.collect()

Reading Labels: 100%|█████████████████████████████████████████████████████████| 80000/80000 [00:03<00:00, 26088.84it/s]


[array([0.73046875, 0.1953125 , 0.109375  , 0.109375  ]), array([0.0546875 , 0.23828125, 0.109375  , 0.109375  ]), array([0.6015625, 0.15625  , 0.109375 , 0.109375 ]), array([0.25390625, 0.47265625, 0.109375  , 0.109375  ]), array([0.75      , 0.46484375, 0.109375  , 0.109375  ])]


Reading Labels: 100%|█████████████████████████████████████████████████████████| 12000/12000 [00:00<00:00, 25234.75it/s]


[array([0.28125  , 0.1796875, 0.109375 , 0.109375 ]), array([0.15234375, 0.93359375, 0.109375  , 0.109375  ]), array([0.42578125, 0.28515625, 0.109375  , 0.109375  ])]
CPU times: total: 1.88 s
Wall time: 5.16 s


0

In [6]:
a = np.array(['1', '2']).astype(int)
torch.tensor(a)

tensor([1, 2], dtype=torch.int32)

### Dataloaders

The dataloader will be one that is compatible with Pytorch's dataset folder. The dataloader is where the transforms will defined.

In [7]:
%%time 

##define dataloaders
train_loader = DataLoader(train_data, batch_size=8, shuffle=True)
val_loader = DataLoader(val_data, batch_size=8, shuffle=True)

gc.collect()

CPU times: total: 15.6 ms
Wall time: 76.7 ms


0

In [8]:
%%time

##sanity check
# Display image and label.
train_features, train_labels = next(iter(train_loader))
print(f"Feature batch shape: {train_features.size()}")
print(f"Labels batch shape: {train_labels.size()}")
# print(f"Labels batch shape: {train_bboxes.size()}")
img = train_features[0].squeeze()
label = train_labels[0]
plt.imshow(img, cmap="gray")
plt.show()
print(f"Label: {label}")

gc.collect()

torch.Size([1, 256, 256])
Len Labels:  torch.Size([6])
Len Bboxes:  torch.Size([6, 4])
torch.Size([1, 256, 256])
Len Labels:  torch.Size([6])
Len Bboxes:  torch.Size([6, 4])
torch.Size([1, 256, 256])
Len Labels:  torch.Size([2])
Len Bboxes:  torch.Size([2, 4])
torch.Size([1, 256, 256])
Len Labels:  torch.Size([1])
Len Bboxes:  torch.Size([1, 4])
torch.Size([1, 256, 256])
Len Labels:  torch.Size([4])
Len Bboxes:  torch.Size([4, 4])
torch.Size([1, 256, 256])
Len Labels:  torch.Size([0])
Len Bboxes:  torch.Size([0])
torch.Size([1, 256, 256])
Len Labels:  torch.Size([5])
Len Bboxes:  torch.Size([5, 4])
torch.Size([1, 256, 256])
Len Labels:  torch.Size([4])
Len Bboxes:  torch.Size([4, 4])




RuntimeError: stack expects each tensor to be equal size, but got [6] at entry 0 and [2] at entry 2