# Datasets in PyTorch

PyTorch supports two different types of datasets:

* map-style datasets,

* iterable-style datasets.

## Map-style datasets
A map-style dataset is one that implements the `__getitem__()` and `__len__()` protocols, and represents a map from (possibly non-integral) indices/keys to data samples.

For example, such a dataset, when accessed with `dataset[idx]`, could read the `idx`-th image and its corresponding label from a folder on the disk.

In [1]:
import torch
from torch.utils.data import Dataset

In [6]:
# Toy data: a (sample_size x number_features) matrix filled with the
# consecutive values 0, 1, 2, ... in row-major order.
sample_size, number_features = 5, 3

# Scaling the integer range by 1.0 promotes it to a floating-point tensor.
features = 1.0 * torch.arange(sample_size * number_features).reshape(
    sample_size, number_features
)
print(features, features[0], sep="\n")

tensor([[ 0.,  1.,  2.],
        [ 3.,  4.,  5.],
        [ 6.,  7.,  8.],
        [ 9., 10., 11.],
        [12., 13., 14.]])
tensor([0., 1., 2.])


In [7]:
# One random integer label per sample, drawn with low=0 and high=3.
label_shape = (sample_size,)
label = torch.randint(0, 3, label_shape)
print(label)
label[0]

tensor([0, 2, 2, 0, 2])


tensor(0)

In [9]:
# Size of the label tensor along its first dimension.
label.size(0)

5

In [12]:
class CustomDataset(Dataset):
    """Map-style dataset that pairs each feature row with its label.

    ``dataset[i]`` returns the tuple ``(features[i], labels[i])`` and
    ``len(dataset)`` is the size of ``features`` along its first dimension.

    Args:
        features: Indexable object exposing ``shape``; its first dimension
            enumerates the samples.
        labels: Indexable object exposing ``shape``; it must have the same
            first-dimension size as ``features``.

    Raises:
        ValueError: If ``features`` and ``labels`` differ in size along
            their first dimension.
    """

    def __init__(self, features, labels):
        super().__init__()
        # Validate with an explicit exception instead of ``assert`` so the
        # check still runs under ``python -O``, and do it before storing any
        # state so a failed construction leaves nothing half-initialised.
        num_features = features.shape[0]
        num_labels = labels.shape[0]
        if num_features != num_labels:
            raise ValueError(
                "features and labels must have the same number of samples, "
                "got {} and {}".format(num_features, num_labels)
            )
        self.features = features
        self.labels = labels

    def __getitem__(self, i):
        """Return the ``(feature, label)`` pair stored at index ``i``."""
        return self.features[i], self.labels[i]

    def __len__(self):
        """Return the number of samples (first dimension of ``features``)."""
        return self.features.shape[0]
    

In [13]:
# Wrap the feature matrix and label vector in the map-style dataset.
dataset = CustomDataset(features=features, labels=label)

In [14]:
# len() dispatches to CustomDataset.__len__.
dataset_length = len(dataset)
print(dataset_length)

5


In [16]:
# Indexing dispatches to CustomDataset.__getitem__, which returns a
# (feature, label) pair for the requested position.
sampel_feature, sample_label = dataset[0]
print(sampel_feature, sample_label, sep="\n")

tensor([0., 1., 2.])
tensor(0)


In [17]:
# Fetch the (feature, label) pair at index 1.
sampel_feature, sample_label = dataset[1]
print(sampel_feature, sample_label, sep="\n")

tensor([3., 4., 5.])
tensor(2)
