In [1]:
import numpy as np
import torch
from torch.utils.data import TensorDataset, DataLoader

In [2]:
obs = 100
features = 20
data =np.random.randn(obs, features)

In [3]:
data

array([[-0.24418729,  0.6854535 , -0.91058804, ..., -0.80364591,
        -0.65806535,  0.45685279],
       [-0.54783622,  0.12852351,  1.32109651, ...,  1.24891264,
         0.58118708, -1.19595368],
       [-0.09019966,  1.0480449 , -0.71543625, ..., -0.75019219,
         1.96801555, -0.78251175],
       ...,
       [ 0.07931463, -1.32270665,  1.48855617, ..., -0.02503276,
         1.95273595,  2.23360995],
       [-0.92971847,  0.0633094 ,  0.86460596, ..., -0.41261984,
         0.50410715, -1.07780622],
       [ 0.05239653, -1.63789854, -1.37620943, ..., -0.24415884,
         1.66564548, -0.56063445]])

In [4]:
torch_data = torch.tensor(data)

In [5]:
print(data.shape)
print(torch_data.shape)

(100, 20)
torch.Size([100, 20])


In [6]:
print(data.dtype)
print(torch_data.dtype)

float64
torch.float64


In [7]:
# Sometimes the pytorch and numpy has the same methods

In [8]:
print(torch.tensor(data).float().dtype)
print(torch.tensor(data).long().dtype)

torch.float32
torch.int64


## Tensor dataset

In [9]:
# You need to first convert the data into tensor and then use tensor dataset
dataset = TensorDataset(torch_data)

In [10]:
dataset.tensors

(tensor([[-0.2442,  0.6855, -0.9106,  ..., -0.8036, -0.6581,  0.4569],
         [-0.5478,  0.1285,  1.3211,  ...,  1.2489,  0.5812, -1.1960],
         [-0.0902,  1.0480, -0.7154,  ..., -0.7502,  1.9680, -0.7825],
         ...,
         [ 0.0793, -1.3227,  1.4886,  ..., -0.0250,  1.9527,  2.2336],
         [-0.9297,  0.0633,  0.8646,  ..., -0.4126,  0.5041, -1.0778],
         [ 0.0524, -1.6379, -1.3762,  ..., -0.2442,  1.6656, -0.5606]],
        dtype=torch.float64),)

In [11]:
len(dataset)

100

In [12]:
len(dataset.tensors)

1

In [13]:
dataset.tensors[0]

tensor([[-0.2442,  0.6855, -0.9106,  ..., -0.8036, -0.6581,  0.4569],
        [-0.5478,  0.1285,  1.3211,  ...,  1.2489,  0.5812, -1.1960],
        [-0.0902,  1.0480, -0.7154,  ..., -0.7502,  1.9680, -0.7825],
        ...,
        [ 0.0793, -1.3227,  1.4886,  ..., -0.0250,  1.9527,  2.2336],
        [-0.9297,  0.0633,  0.8646,  ..., -0.4126,  0.5041, -1.0778],
        [ 0.0524, -1.6379, -1.3762,  ..., -0.2442,  1.6656, -0.5606]],
       dtype=torch.float64)

In [14]:
dataset.tensors[0].shape

torch.Size([100, 20])

In [15]:
# adding labels to the dataset
labels = torch.ceil(torch.linspace(0, 4, obs))

In [16]:
labels

tensor([0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2.,
        2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 3., 3., 3., 3.,
        3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3.,
        3., 3., 3., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4.,
        4., 4., 4., 4., 4., 4., 4., 4., 4., 4.])

In [17]:
labels = labels.reshape(len(labels), 1)

In [18]:
dataset = TensorDataset(torch_data, labels)

In [19]:
len(dataset)

100

In [20]:
dataset[10]

(tensor([ 1.4071, -0.3454,  0.0166, -0.1237, -0.9819,  0.0603, -1.0729, -1.0792,
         -1.0838,  0.7402, -0.5326, -0.8233,  1.6424,  0.2813, -0.3783,  0.3989,
         -1.1332, -0.9233, -0.5074, -0.4731], dtype=torch.float64),
 tensor([1.]))

In [21]:
len(dataset.tensors)

2

In [22]:
dataset.tensors[0].shape

torch.Size([100, 20])

In [23]:
dataset.tensors[1].shape

torch.Size([100, 1])

# DataLoaders

In [31]:
batch = 25
dataloader = DataLoader(dataset, batch_size=batch, shuffle=True)
# dataloader gets reshuffled everytime you call it

In [32]:
dataloader.dataset.tensors[0].size()

torch.Size([100, 20])

In [33]:
# Only when you iterate over the dataloader, the methods of the dataloader will be called
for i, j in dataloader:
  print(i.shape, j.shape)

torch.Size([25, 20]) torch.Size([25, 1])
torch.Size([25, 20]) torch.Size([25, 1])
torch.Size([25, 20]) torch.Size([25, 1])
torch.Size([25, 20]) torch.Size([25, 1])


In [34]:
for i, j in dataloader:
  print(j.T)

tensor([[1., 4., 3., 3., 2., 3., 1., 4., 4., 3., 2., 3., 4., 3., 4., 4., 1., 3.,
         1., 1., 4., 4., 4., 1., 3.]])
tensor([[2., 1., 1., 3., 2., 1., 2., 4., 4., 1., 2., 1., 2., 4., 1., 2., 1., 2.,
         3., 2., 1., 2., 2., 3., 4.]])
tensor([[3., 4., 4., 3., 1., 1., 3., 1., 3., 1., 2., 2., 3., 3., 1., 1., 3., 3.,
         4., 2., 4., 2., 3., 1., 3.]])
tensor([[2., 2., 4., 1., 2., 2., 0., 3., 3., 2., 4., 1., 4., 3., 2., 4., 2., 4.,
         4., 1., 4., 2., 3., 4., 2.]])


In [36]:
#  To get only one batch
i, j = next(iter(dataloader))
print(i.shape, j.shape)
print(j.T)

torch.Size([25, 20]) torch.Size([25, 1])
tensor([[3., 1., 1., 1., 1., 3., 3., 3., 1., 2., 3., 2., 2., 4., 1., 2., 4., 4.,
         2., 3., 1., 1., 3., 3., 4.]])
