In [1]:
import torch
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Ignore warnings
import warnings
warnings.filterwarnings("ignore")

torch.__version__

'1.8.0'

# Dataset and DataLoader Class

PyTorch provides two data primitives: `torch.utils.data.DataLoader` and `torch.utils.data.Dataset` that allow you to use pre-loaded datasets as well as your own data.

## Dataset class and loading dataset

`Dataset` stores the samples and their corresponding labels.

In [2]:
from torch.utils.data import Dataset
#Seed the random number generator so the same random numbers are produced on every run
torch.manual_seed(1)

<torch._C.Generator at 0x7f52dd6e11f8>

`torch.utils.data.Dataset` is an abstract class representing a dataset. A custom dataset should inherit from `Dataset` and override:
1. \_\_init\_\_(): The \_\_init\_\_ function is run once when instantiating the Dataset object. We initialize the directory containing the images, the annotations file, and both transforms
2. \_\_len\_\_(): The \_\_len\_\_ function returns the number of samples in our dataset.
3. \_\_getitem\_\_(): The \_\_getitem\_\_ function loads and returns a sample from the dataset at the given index `index`.

In [4]:
#Class to load sample dummy dataset
class toyDataset(Dataset):
    """Toy dataset of random two-feature samples that all carry the label 1.

    Args:
        length (int): Number of samples to generate.
        transform (callable, optional): Applied to the feature tensor of a
            sample each time it is fetched.
        target_transform (callable, optional): Applied to the label tensor of
            a sample each time it is fetched.
    """

    #Constructor with default values
    def __init__(self, length=10, transform=None, target_transform=None):
        self.len = length
        #x :: input features (even values in [0, 200], stored as float64)
        self.x = 2 * torch.randint(0, 101, (length, 2), dtype=torch.float64)
        #y :: target labels (all ones)
        self.y = torch.ones(length, 1)
        #Whether data features need to be transformed (like, normalization, etc)
        self.transform = transform
        #Whether data labels need to be transformed (like, generating one-hot vectors, etc)
        self.target_transform = target_transform

    #Method overriding to return the total number of instances
    def __len__(self):
        return self.len

    #Method overriding to return data samples
    def __getitem__(self, index):
        """Return the ``(features, label)`` pair stored at ``index``.

        The pair is rebuilt from separate variables because tuples are
        immutable: assigning into ``sample[0]`` raises ``TypeError``.
        """
        features, label = self.x[index], self.y[index]
        if self.transform is not None:
            features = self.transform(features)
        #Labels go through their own transform, not the feature transform
        if self.target_transform is not None:
            label = self.target_transform(label)
        return features, label

In [5]:
#Creating instance of toyDataset and accessing example instances
data = toyDataset()
for i in range(5):
    print(data[i])

(tensor([46., 50.], dtype=torch.float64), tensor([1.]))
(tensor([ 74., 200.], dtype=torch.float64), tensor([1.]))
(tensor([200., 190.], dtype=torch.float64), tensor([1.]))
(tensor([144.,  28.], dtype=torch.float64), tensor([1.]))
(tensor([ 98., 168.], dtype=torch.float64), tensor([1.]))


### Transform
Most of the time, we need to apply some kind of transformation to the dataset, such as normalising the data or setting the image size. Thus, there is a need to write some pre-processing code. <br>
It is ideal to implement transforms as callable classes rather than functions.

In [None]:
class transform_my_data(object):
    """Callable transform that multiplies the features by a fixed factor."""

    def __init__(self, tranformation_params):
        """
            Constructor
            tranformation_params :: factor the features are multiplied by
        """
        self.tranformation_params = tranformation_params

    def __call__(self, sample):
        """
            Executor:
            Necessary tranformation
            to each instance of data.

            sample :: either an (x, y) pair or a bare feature tensor.
            Returns (scaled x, y) for a pair, or the scaled tensor otherwise.
        """
        # Multiply out of place: indexing a tensor returns a view, so an
        # in-place `*=` would silently overwrite the data the sample came from.
        if isinstance(sample, (tuple, list)):
            x, y = sample
            return x * self.tranformation_params, y
        return sample * self.tranformation_params

In [None]:
class normalise_my_data(object):
    """Callable transform that divides the features by a fixed count."""

    def __init__(self, total_instances):
        """
            Constructor
            total_instances :: value the features are divided by
        """
        self.total_instances = total_instances

    def __call__(self, sample):
        """
            Executor:
            Necessary tranformation
            to each instance of data.

            sample :: either an (x, y) pair or a bare feature tensor.
            Returns (divided x, y) for a pair, or the divided tensor otherwise.
        """
        # Divide out of place: indexing a tensor returns a view, so an
        # in-place `/=` would silently overwrite the data the sample came from.
        if isinstance(sample, (tuple, list)):
            x, y = sample
            return x / self.total_instances, y
        return sample / self.total_instances

By creating an instance of a transform and passing it through the `transform` parameter of our dataset's constructor, we can apply the transformation in our dataset.

In [None]:
print(len(data))

In [None]:
transform = transform_my_data(0.2)
normalise = normalise_my_data(len(data))

In [None]:
transformed_dataset = toyDataset(transform=transform)

In [None]:
normalised_dataset = toyDataset(transform=normalise)

In [None]:
# Use a loop to print the first 5 elements of each dataset

for i in range(5):
    print(data[i])
    print(transformed_dataset[i])
    print(normalised_dataset[i])
    print("")

### Composing multiple transform

In [None]:
from torchvision import transforms

In [None]:
data_transform = transforms.Compose([transform, normalise])
data_transform

In [None]:
data_transform(data[0])

The Compose object will apply each transform sequentially, in the order given.

In [None]:
dataset = toyDataset(transform=data_transform)

In [None]:
for i in range(5):
    print(dataset[i])

### Dealing with real dataset

So far,
1. the dataset was not real and was small, so we created all of it in `__init__()`. This must not be done for real datasets, as it would load the entire dataset at once and consume a large amount of memory.
2. we have iterated through the dataset with a plain for loop, which misses features such as batching, shuffling, and loading the data with multiple worker processes. Hence, we will use a `DataLoader` (an iterable).

In [None]:
from matplotlib.pyplot import imshow
import matplotlib.pylab as plt
from PIL import Image
import os

In [None]:
from torch.utils.data import Dataset, DataLoader
torch.manual_seed(0)

In [None]:
# Directory holding the sample images and the CSV index that lists them
directory = "./../../data/fmnist-sample/"
csv_file = "index.csv"
# Pass the parts separately so os.path.join inserts a separator when needed;
# joining a pre-concatenated string only works if directory ends with one.
csv_path = os.path.join(directory, csv_file)

In [None]:
data_name = pd.read_csv(csv_path)
data_name.head()

In [None]:
#Filename, Label/class
data_name.iloc[0,1], data_name.iloc[0,0]

In [None]:
# Column 1 of the index holds the image file name, column 0 its label
image_name = data_name.iloc[0,1]
# Join the parts separately so a missing trailing separator cannot break the path
image_path = os.path.join(directory, image_name)
# The context manager closes the image file once the figure has been drawn
with Image.open(image_path) as image:
    plt.imshow(image,cmap='gray', vmin=0, vmax=255)
    plt.title(data_name.iloc[0, 0])
    plt.show()

In [None]:
class fashionDataset(Dataset):
    """Image dataset described by a CSV index.

    Column 0 of the CSV is read as the label and column 1 as the image file
    name; both the CSV and the images are resolved against ``root_dir``.
    """

    def __init__(self, csv_file, root_dir, transform=None):
        """
        Args:
            csv_file (string): Name of the csv file with annotations,
                resolved against root_dir.
            root_dir (string): Directory with all the images.
            transform (callable, optional): Optional transform to be applied
                on the image of a sample.
        """
        self.root_dir = root_dir
        # Join the parts separately so root_dir works with or without a
        # trailing separator.
        csv_path = os.path.join(root_dir, csv_file)
        self.csv_file = pd.read_csv(csv_path)
        self.transform = transform

    def __len__(self):
        """Number of rows in the CSV index."""
        return len(self.csv_file)

    def __getitem__(self, idx):
        """
            to fetch instances of dataset
            idx :: index

            Returns the (image, label) pair for row idx; the image is passed
            through self.transform when one was given.
        """
        #Loading the image
        img_name = os.path.join(self.root_dir, self.csv_file.iloc[idx, 1])
        image = Image.open(img_name)
        # Image.open is lazy and keeps the file open; load() reads the pixel
        # data now so the handle is not held for the lifetime of the sample.
        image.load()

        #Loading the label
        label = self.csv_file.iloc[idx, 0]

        #Applying transformation
        if self.transform is not None:
            image = self.transform(image)

        return image, label

In [None]:
#Creating object of dataset
fdata = fashionDataset(csv_file, directory)

In [None]:
#fetching length of total instances
len(fdata)

In [None]:
#Accessing a particular instance of the dataset
img = fdata[100]

plt.imshow(img[0],cmap='gray', vmin=0, vmax=255)
plt.title(img[1])
plt.show()

As part of its transforms module, torchvision provides several built-in transformations for images, such as CenterCrop, ColorJitter, Pad, ToTensor, etc. Click [here](https://pytorch.org/docs/stable/torchvision/transforms.html) for more.

In [None]:
img_transformation = transforms.Compose([transforms.CenterCrop(20), transforms.ToTensor()])
newFData = fashionDataset(csv_file=csv_file, root_dir=directory, transform=img_transformation)

In [None]:
#function to load the image for display
def show(sample, shape=(28,28)):
    """Plot one (image tensor, label) sample in grayscale, titled with its label.

    Args:
        sample: Indexable pair; item 0 is the image tensor, item 1 the label.
        shape: Two-dimensional shape the image data is reshaped to for plotting.
    """
    # Convert the tensor to a NumPy array laid out as a 2-D picture
    pixels = sample[0].numpy().reshape(shape)
    plt.imshow(pixels, cmap='gray')
    caption = sample[1]
    plt.title(caption)
    plt.show()

In [None]:
#Accessing a particular instance of the dataset
img = newFData[100]
show(img, shape=(20,20))

In [None]:
for i in range(5):
    show(newFData[i], shape=(20,20))

### Iterating over dataset using DataLoader

`DataLoader` wraps an iterable around the `Dataset` to enable easy access to the samples.

In [None]:
from torch.utils.data import DataLoader

# Batches of 5 samples, drawn in shuffled order and loaded by 5 worker processes
dataloader = DataLoader(newFData, batch_size=5, shuffle=True, num_workers=5)

# Display every sample of the first three batches (i_batch 0, 1 and 2)
for i_batch, sample_batched in enumerate(dataloader):
    # sample_batched[0] holds the images of the batch, sample_batched[1] the labels
    for i in range(sample_batched[0].shape[0]):
        sample = (sample_batched[0][i], sample_batched[1][i])
        show(sample, shape=(20,20))
    # Stop early: three batches are enough for the demonstration
    if i_batch == 2:
        break

### Working with torchvision/torch pre-built datasets

Click [here](https://pytorch.org/docs/stable/torchvision/datasets.html) to explore different datasets.

Common parameters across torchvision datasets

1. `root:` the path where the data is stored.
2. `train:` specifies training or test data.
3. `download:` downloads the dataset if not available at root
4. `transform:` feature transformation
5. `target_transform:` label transformation

In [None]:
import torchvision.datasets as dsets

#importing the pre-built mnist dataset
mnist_dataset = dsets.MNIST(root='./../data/',
                           train=False, # If True, creates dataset from training.pt, otherwise from test.pt.
                           download=True,
                           transform = transforms.ToTensor())

Each element of the dataset object contains a tuple.

In [None]:
mnist_dataset[0][0].shape

In [None]:
mnist_dataset[0][1]

In [None]:
show(mnist_dataset[0])

### Working with torchtext pre-built dataset

In [None]:
from torchtext.datasets import IMDB

In [None]:
train_iter = IMDB(root='./../data/IMDB', split='train')
test_iter = IMDB(root='./../data/IMDB', split='test')

In [None]:
# Collect the training iterator into a list of (label, line) pairs so that
# samples can be counted and accessed by index
train_data = []
for label, line in train_iter:
    train_data.append((label, line))

In [None]:
# Collect the test iterator into a list of (label, line) pairs
test_data = []
for label, line in test_iter:
    test_data.append((label, line))

In [None]:
sample_idx = torch.randint(len(train_data), size=(1,)).item()
label, line = train_data[sample_idx]
print(sample_idx, '\t', line, '\t', label)