# Building input pipelines in PyTorch

## Creating a PyTorch DataLoader from existing tensors

In [1]:
# Any iterable can drive a for-loop; this range lazily produces 0 through 4.
my_iterator = range(5)

for value in my_iterator:
    print(value)

0
1
2
3
4


In [2]:
import torch
import numpy as np
from torch.utils.data import DataLoader

# Six consecutive floats, 0.0 through 5.0, serve as a toy dataset.
t = torch.arange(6, dtype=torch.float32)

# No batch size or shuffling is requested here, so the loader's defaults apply.
data_loader = DataLoader(dataset=t)

In [3]:
# Walk through the loader once and show every element it yields.
for element in data_loader:
    print(element)
    

tensor([0.])
tensor([1.])
tensor([2.])
tensor([3.])
tensor([4.])
tensor([5.])


In [4]:
# Instead of iterating over single elements, we can iterate over batches.
# shuffle=True asks the loader to reorder the elements; drop_last=False keeps
# a trailing partial batch (none occurs here: 6 elements split evenly into 3s).
data_loader = DataLoader(t, batch_size=3, shuffle=True, drop_last=False)

for batch in data_loader:
    print('batch: ', batch)

batch:  tensor([5., 3., 0.])
batch:  tensor([1., 4., 2.])


### Combining two tensors into a joint dataset

In [5]:
from torch.utils.data import Dataset

class JointDataset(Dataset):
    """Map-style dataset that pairs two equally long indexable collections.

    Indexing with ``idx`` returns the tuple ``(x[idx], y[idx])``.
    """

    def __init__(self, x, y):
        """Store both collections after checking that they line up.

        Args:
            x: Indexable collection of inputs that supports ``len()``.
            y: Indexable collection of targets with the same length as ``x``.

        Raises:
            ValueError: If ``x`` and ``y`` differ in length.
        """
        # Fail early: a mismatch would otherwise surface as a late IndexError
        # or as silently ignored trailing entries of the longer collection.
        if len(x) != len(y):
            raise ValueError(
                f'x and y must have the same length, got {len(x)} and {len(y)}'
            )
        self.x = x
        self.y = y

    def __len__(self):
        """Return the number of (x, y) pairs."""
        return len(self.x)

    def __getitem__(self, idx):
        """Return the pair ``(x[idx], y[idx])``."""
        return self.x[idx], self.y[idx]

In [6]:
torch.manual_seed(1)

# Four random feature rows of width 3, with the labels 0 through 3.
t_x = torch.rand(4, 3, dtype=torch.float32)
t_y = torch.arange(4)
print(t_x)
print(t_y)

joint_dataset = JointDataset(t_x, t_y)

# Indexing returns a single (features, label) pair.
x, y = joint_dataset[3]
print(x, y)

# Iterating visits the pairs one index at a time.
for features, label in joint_dataset:
    print('  x: ', features,
          '  y: ', label)



tensor([[0.7576, 0.2793, 0.4031],
        [0.7347, 0.0293, 0.7999],
        [0.3971, 0.7544, 0.5695],
        [0.4388, 0.6387, 0.5247]])
tensor([0, 1, 2, 3])
tensor([0.4388, 0.6387, 0.5247]) tensor(3)
  x:  tensor([0.7576, 0.2793, 0.4031])   y:  tensor(0)
  x:  tensor([0.7347, 0.0293, 0.7999])   y:  tensor(1)
  x:  tensor([0.3971, 0.7544, 0.5695])   y:  tensor(2)
  x:  tensor([0.4388, 0.6387, 0.5247])   y:  tensor(3)


#### Use built-in class

In [7]:
# The built-in TensorDataset can be used directly to pair the two tensors.
from torch.utils.data import TensorDataset

joint_dataset = TensorDataset(t_x, t_y)

for features, label in joint_dataset:
    print('  x: ', features,
          '  y: ', label)

  x:  tensor([0.7576, 0.2793, 0.4031])   y:  tensor(0)
  x:  tensor([0.7347, 0.0293, 0.7999])   y:  tensor(1)
  x:  tensor([0.3971, 0.7544, 0.5695])   y:  tensor(2)
  x:  tensor([0.4388, 0.6387, 0.5247])   y:  tensor(3)


### Shuffle, batch, and repeat

In [8]:
torch.manual_seed(1)
data_loader = DataLoader(dataset=joint_dataset, batch_size=2, shuffle=True)

# A single pass over the shuffled data, two examples per batch.
for i, (x_batch, y_batch) in enumerate(data_loader, start=1):
    print(f'batch {i}:', 'x:', x_batch,
          '\n         y:', y_batch)

# Re-iterating the same loader once per epoch.
for epoch in range(2):
    print(f'epoch {epoch+1}')
    for i, (x_batch, y_batch) in enumerate(data_loader, start=1):
        print(f'batch {i}:', 'x:', x_batch,
              '\n         y:', y_batch)

batch 1: x: tensor([[0.3971, 0.7544, 0.5695],
        [0.7576, 0.2793, 0.4031]]) 
         y: tensor([2, 0])
batch 2: x: tensor([[0.7347, 0.0293, 0.7999],
        [0.4388, 0.6387, 0.5247]]) 
         y: tensor([1, 3])
epoch 1
batch 1: x: tensor([[0.7576, 0.2793, 0.4031],
        [0.3971, 0.7544, 0.5695]]) 
         y: tensor([0, 2])
batch 2: x: tensor([[0.7347, 0.0293, 0.7999],
        [0.4388, 0.6387, 0.5247]]) 
         y: tensor([1, 3])
epoch 2
batch 1: x: tensor([[0.4388, 0.6387, 0.5247],
        [0.3971, 0.7544, 0.5695]]) 
         y: tensor([3, 2])
batch 2: x: tensor([[0.7576, 0.2793, 0.4031],
        [0.7347, 0.0293, 0.7999]]) 
         y: tensor([0, 1])


## Linear regression dataset
Recall the model 
$$y = wx + b + \epsilon, \quad x\sim \mathcal{N}(0,2^2) \text{ and } \epsilon \sim \mathcal{N}(0,0.6^2)$$
We can create this data using a dataset object

In [10]:


# Ground-truth slope and intercept of the line y = w*x + b.
w = -1.5
b = 3

# NOTE(review): no seed is set in this cell, so the sampled values depend on
# the random-generator state left behind by whatever ran before it.
N = 50
x = 2 * torch.randn(N, dtype=torch.float)        # inputs: standard normal scaled by 2
noise = 0.6 * torch.randn(N, dtype=torch.float)  # noise: standard normal scaled by 0.6

y = w*x + b + noise

# Pair inputs with targets and peek at the first three examples.
joint_dataset = TensorDataset(x, y)
print(joint_dataset[:3])


(tensor([-2.7265, -1.9664,  3.0225]), tensor([ 6.7774,  6.7625, -1.3927]))
