In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, Dataset, DataLoader

from torch.utils.data import Subset
import torchvision
from torchvision import datasets, models, transforms, utils
from torch.autograd import Variable
import torch.optim as optim
from torch.optim import lr_scheduler
import torch.backends.cudnn as cudnn
cuda_enabled = torch.cuda.is_available()
# Device configuration
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
#cuda_enabled = False
CUDA_VISIBLE_DEVICES=0,1,2,3
cudnn.benchmark = True

from skimage import io, transform
import matplotlib.pyplot as plt

import os
import pandas as pd
import numpy as np
import glob
from __future__ import print_function, division
import datetime
import sys
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split

from ast import literal_eval

# Ignore warnings
import warnings
warnings.filterwarnings("ignore")

plt.ion()   # interactive mode

<matplotlib.pyplot._IonContext at 0x7fb262714190>


#### Bidirectional Recurrent Neural Network

In [256]:
# Bidirectional recurrent neural network (many-to-one)
# class BiRNN(nn.Module):
#     def __init__(self, input_size, hidden_size, num_layers, num_classes):
#         super(BiRNN, self).__init__()
#         self.hidden_size = hidden_size
#         self.num_layers = num_layers
#         self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True, bidirectional=True)
#         self.fc = nn.Linear(hidden_size*2, num_classes)  # 2 for bidirection
    
#     def forward(self, x):
#         # Set initial states
#         h0 = torch.zeros(self.num_layers*2, x.size(0), self.hidden_size).to(device) # 2 for bidirection 
#         c0 = torch.zeros(self.num_layers*2, x.size(0), self.hidden_size).to(device)
        
#         # Forward propagate LSTM
#         out, _ = self.lstm(x, (h0, c0))  # out: tensor of shape (batch_size, seq_length, hidden_size*2)
        
#         # Decode the hidden state of the last time step
#         out = self.fc(out[:, -1, :])
#         return out

In [113]:
# Bidirectional recurrent neural network (many-to-one)
class BiRNN(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, num_classes):
        super(BiRNN, self).__init__()
        self.is_training = False
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.num_classes = num_classes
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True, bidirectional=True)
        self.fc = nn.Dropout(p=0.5, inplace=False)
        self.fc = nn.Linear(hidden_size*2, num_classes)
        # (-1, sequence_length * input_size)
        # self.linear = nn.Linear(3*244*244, num_classes)
        # self.linear = nn.Linear(144, 256)
        self.linear = nn.Linear(self.hidden_size*2, self.num_classes)
        # self.linear = nn.Linear(hidden_size*2, num_classes)
    
    def forward(self, x):
        # Set initial states
        # print(x)
        h0 = torch.zeros(self.num_layers*2, x.size(0), self.hidden_size).to(device) # 2 for bidirection 
        # print(h0)
        c0 = torch.zeros(self.num_layers*2, x.size(0), self.hidden_size).to(device)
        # print(c0)
        # if cuda_enabled:
        #     h0 = h0.cuda()  # 2 for bidirection
        #     c0 = c0.cuda()
        # Forward propagate LSTM
        out, _ = self.lstm(x, (h0, c0))  # out: tensor of shape (batch_size, seq_length, hidden_size*2)
        
        # Decode hidden state of last time step
        if self.is_training:
            out = self.fc(out[:, -1, :])
        else:
            out = out[:, -1, :]
        # out = F.log_softmax(self.linear(out), dim=1)
        return out



#### Body and Eye Tracking Dataset

Dataset class
-------------

``torch.utils.data.Dataset`` is an abstract class representing a
dataset.
Your custom dataset should inherit ``Dataset`` and override the following
methods:

-  ``__len__`` so that ``len(dataset)`` returns the size of the dataset.
-  ``__getitem__`` to support the indexing such that ``dataset[i]`` can
   be used to get $i$\ th sample.

Let's create a dataset class for our face landmarks dataset. We will
read the csv in ``__init__`` but leave the reading of images to
``__getitem__``. This is memory efficient because all the images are not
stored in the memory at once but read as required.

Sample of our dataset will be a dict
``{'image': image, 'landmarks': landmarks}``. Our dataset will take an
optional argument ``transform`` so that any required processing can be
applied on the sample. We will see the usefulness of ``transform`` in the
next section.




In [317]:
class EyeLandmarksDataset(Dataset):
    """Face Landmarks dataset."""

    def __init__(self, csv_file, transform=None):#root_dir = img_dir
        """
        Args:
            csv_file (string): Path to the csv file with annotations.
            root_dir (string): Directory with all the images.
            transform (callable, optional): Optional transform to be applied
                on a sample.
        """
        self.landmarks_frame = pd.read_csv(csv_file)
        self.transform = transform

    def __len__(self):
        return len(self.landmarks_frame)#same as image label

    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()

        # name = os.path.join(self.root_dir, self.landmarks_frame.iloc[idx, 0])
        name = self.landmarks_frame.iloc[idx, 0]
        labels = np.zeros(4)
        for i in range(4):
            labels[i]=name
        labels = torch.Tensor(labels).long()
        landmarks = self.landmarks_frame.iloc[idx, 1:]
        marks = []
        for mark in range(len(landmarks)):
            arr = literal_eval(landmarks[mark])[0]
            arr1 = []
            for i in range(len(arr)):
                for j in range(len(arr[i])):
                    # print(arr[i][j])
                    arr1.append(arr[i][j])
            marks.append(np.array(arr1))  
            # return
                          
        marks = torch.Tensor(marks)
        
        sample = {'label': labels, 'marks': marks}
        if self.transform:
            sample = self.transform(sample)
        return sample

In [318]:
class BodyLandmarksDataset(Dataset):
    """Face Landmarks dataset."""

    def __init__(self, csv_file, transform=None):#root_dir = img_dir
        """
        Args:
            csv_file (string): Path to the csv file with annotations.
            root_dir (string): Directory with all the images.
            transform (callable, optional): Optional transform to be applied
                on a sample.
        """
        self.landmarks_frame = pd.read_csv(csv_file)
        self.transform = transform

    def __len__(self):
        return len(self.landmarks_frame)#same as image label

    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()

        # name = os.path.join(self.root_dir, self.landmarks_frame.iloc[idx, 0])
        name = self.landmarks_frame.iloc[idx, 0]
        labels = np.zeros(4)
        for i in range(4):
            labels[i]=name
        labels = torch.Tensor(labels).long()
        landmarks = self.landmarks_frame.iloc[idx, 1:]
        marks = []
        for mark in range(len(landmarks)):
            arr=literal_eval(landmarks[mark])
            # print(arr)
            # arr = literal_eval(landmarks[mark])[0]
            # print(arr)
            arr = np.array(arr)
            # print(arr)
            marks.append(arr)
            # return
            
                          
        marks = torch.Tensor(marks)
        
        sample = {'label': labels, 'marks': marks}
        if self.transform:
            sample = self.transform(sample)
        return sample

In [319]:
eyedataset = EyeLandmarksDataset(csv_file='CSVs/EyeTracking.csv')
eyedataset.__len__(), eyedataset.__getitem__(0)['label'].shape, eyedataset.__getitem__(0)['marks'].shape

(224, torch.Size([4]), torch.Size([125, 8]))

In [320]:
bodydataset = BodyLandmarksDataset(csv_file='CSVs/BodyTracking.csv')
bodydataset.__len__(), bodydataset.__getitem__(0)['label'].shape, bodydataset.__getitem__(0)['marks'].shape

(224, torch.Size([4]), torch.Size([181, 36]))

In [321]:
eyedataset.__getitem__(5)['marks'][:5]

tensor([[16., 24., 26., 26., 55., 28., 23., 23.],
        [12., 22., 25., 25., 52., 25., 24., 24.],
        [14., 24., 25., 25., 54., 28., 24., 24.],
        [15., 24., 25., 25., 54., 28., 24., 24.],
        [13., 23., 25., 25., 53., 28., 23., 23.]])

In [322]:
bodydataset.__getitem__(5)['marks'][:5]

tensor([[ -1.,  -1., 306., 360., 222., 360.,  -1.,  -1.,  -1.,  -1., 417., 360.,
         417., 532., 389., 594.,  -1.,  -1.,  -1.,  -1.,  -1.,  -1.,  -1.,  -1.,
          -1.,  -1.,  -1.,  -1.,  -1.,  -1.,  -1.,  -1.,  -1.,  -1.,  -1.,  -1.],
        [ -1.,  -1., 306., 360., 222., 360.,  -1.,  -1.,  -1.,  -1., 417., 360.,
         417., 532., 389., 579.,  -1.,  -1.,  -1.,  -1.,  -1.,  -1.,  -1.,  -1.,
          -1.,  -1.,  -1.,  -1.,  -1.,  -1.,  -1.,  -1.,  -1.,  -1., 361., 266.],
        [ -1.,  -1., 306., 360., 222., 360.,  -1.,  -1.,  -1.,  -1., 417., 360.,
         417., 532., 389., 579.,  -1.,  -1.,  -1.,  -1.,  -1.,  -1.,  -1.,  -1.,
          -1.,  -1.,  -1.,  -1.,  -1.,  -1.,  -1.,  -1.,  -1.,  -1.,  -1.,  -1.],
        [ -1.,  -1., 306., 360., 222., 360.,  -1.,  -1.,  -1.,  -1., 417., 360.,
         417., 532., 389., 594.,  -1.,  -1.,  -1.,  -1.,  -1.,  -1.,  -1.,  -1.,
          -1.,  -1.,  -1.,  -1.,  -1.,  -1.,  -1.,  -1.,  -1.,  -1.,  -1.,  -1.],
        [306., 219., 306

In [323]:
#NN Data

def train_total_dataset(dataset, total_split=0.30): #Training set be 70% of the dataset
    train_idx, total_idx = train_test_split(list(range(len(dataset))), test_size=total_split)
    datasets = {}
    datasets['train'] = Subset(dataset, train_idx)
    datasets['total_split'] = Subset(dataset, total_idx)
    return datasets
def test_val_dataset(dataset, total_split=0.5): #Testing and Validation set be 15% (50% of the remaining 30% of the dataset) 
    test_idx, val_idx = train_test_split(list(range(len(dataset))), test_size=total_split)
    datasets = {}
    datasets['test'] = Subset(dataset, test_idx)
    datasets['validation'] = Subset(dataset, val_idx)
    return datasets

# hyperparams for the network
batch_size = 8

print('Starting Eye NN dataset -------------------------------------')
eye_datasets = train_total_dataset(eyedataset)
eye_train = eye_datasets['train']
eye_datasets_split = test_val_dataset(eye_datasets['total_split'])
eye_test = eye_datasets_split['test']
eye_validation = eye_datasets_split['validation']
# The original dataset is available in the Subset class
print(len(eye_datasets['train'].dataset), eye_datasets['train'].dataset)
print(len(eye_train), eye_train)
print(len(eye_test), eye_test)
print(len(eye_validation), eye_validation)

print('Starting Eye NN loader -------------------------------------')
eye_train_loader = DataLoader(dataset=eye_train, batch_size=batch_size, shuffle=True)
eye_test_loader = DataLoader(dataset=eye_test, batch_size=batch_size, shuffle=False)
eye_validation_loader = DataLoader(dataset=eye_validation, batch_size=batch_size, shuffle=False)
print(len(eye_train_loader), eye_train_loader)
print(len(eye_test_loader), eye_test_loader)
print(len(eye_validation_loader), eye_validation_loader)
print('Finish  -------------------------------------')

print('Starting Body NN dataset -------------------------------------')
body_datasets = train_total_dataset(bodydataset)
body_train = body_datasets['train']
body_datasets_split = test_val_dataset(body_datasets['total_split'])
body_test = body_datasets_split['test']
body_validation = body_datasets_split['validation']
# The original dataset is available in the Subset class
print(len(body_datasets['train'].dataset), body_datasets['train'].dataset)
print(len(body_train), body_train)
print(len(body_test), body_test)
print(len(body_validation), body_validation)

print('Starting Body NN loader -------------------------------------')
body_train_loader = DataLoader(dataset=body_train, batch_size=batch_size, shuffle=True)
body_test_loader = DataLoader(dataset=body_test, batch_size=batch_size, shuffle=False)
body_validation_loader = DataLoader(dataset=body_validation, batch_size=batch_size, shuffle=False)
print(len(body_train_loader), body_train_loader)
print(len(body_test_loader), body_test_loader)
print(len(body_validation_loader), body_validation_loader)
print('Finish  -------------------------------------')

Starting Eye NN dataset -------------------------------------
224 <__main__.EyeLandmarksDataset object at 0x7fb263e52190>
156 <torch.utils.data.dataset.Subset object at 0x7fb26a946a60>
34 <torch.utils.data.dataset.Subset object at 0x7fb26a946790>
34 <torch.utils.data.dataset.Subset object at 0x7fb26a946f70>
Starting Eye NN loader -------------------------------------
20 <torch.utils.data.dataloader.DataLoader object at 0x7fb26a05b730>
5 <torch.utils.data.dataloader.DataLoader object at 0x7fb26914a430>
5 <torch.utils.data.dataloader.DataLoader object at 0x7fb2691470d0>
Finish  -------------------------------------
Starting Body NN dataset -------------------------------------
224 <__main__.BodyLandmarksDataset object at 0x7fb26bb373d0>
156 <torch.utils.data.dataset.Subset object at 0x7fb2699aa580>
34 <torch.utils.data.dataset.Subset object at 0x7fb2699aa7c0>
34 <torch.utils.data.dataset.Subset object at 0x7fb2699aab80>
Starting Body NN loader -------------------------------------
20 <to

In [324]:
eye_train

<torch.utils.data.dataset.Subset at 0x7fb26a946a60>

#### Eye Tracking LSTM

In [329]:
#Init NN
input_size = 2
sequence_length = 125
hidden_size = 64
num_layers = 1
num_classes = 2  # TODO: Determine this from the data
learning_rate = 0.0001
num_epochs = 100

# The network
eye_model = BiRNN(input_size, hidden_size, num_layers, num_classes).to(device)
eye_model.is_training = True

# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(eye_model.parameters(), lr=learning_rate)

epoch_loss = 5000000000.

In [327]:
for i, (sample) in enumerate(eye_train_loader):
    labels, marks = sample['label'], sample['marks']
    # marks = marks.flatten()
    print(labels[0])
    print(marks[0][:5])
    # print(sample)
    break

tensor([1, 1, 1, 1])
tensor([[-1., -1., -1., -1., -1., -1., -1., -1.],
        [-1., -1., -1., -1., -1., -1., -1., -1.],
        [-1., -1., -1., -1., -1., -1., -1., -1.],
        [-1., -1., -1., -1., -1., -1., -1., -1.],
        [-1., -1., -1., -1., -1., -1., -1., -1.]])


In [330]:
for epoch in range(num_epochs):
    loss_total = 0.
    iteration_count = 0.
    for i, sample in enumerate(eye_train):
        iteration_count += 1.
        labels, marks = sample['label'], sample['marks'] 
        marks = Variable(marks.view(-1, sequence_length, input_size))
        labels = Variable(labels)
        if cuda_enabled:
            marks = marks.cuda()
            labels = labels.cuda()

        # Forward + Backward + Optimize
        optimizer.zero_grad()
        outputs = eye_model(marks)

        loss = criterion(outputs, labels)
        loss_total += loss.item()
        loss.backward()
        optimizer.step()

        if (i + 1) % 10 == 0:
            print('Epoch [%d/%d], Step [%d/%d], Loss: %.4f'
                  % (epoch + 1, num_epochs, i + 1, len(eye_train) // batch_size, loss.item()))
    current_epoch_loss = loss_total / iteration_count
    print('Epoch %d; loss = %0.4f' % (epoch, current_epoch_loss))
    epoch_loss = current_epoch_loss


Epoch [1/100], Step [10/19], Loss: 0.7329
Epoch [1/100], Step [20/19], Loss: 0.6933
Epoch [1/100], Step [30/19], Loss: 0.7716
Epoch [1/100], Step [40/19], Loss: 0.7849
Epoch [1/100], Step [50/19], Loss: 0.6933
Epoch [1/100], Step [60/19], Loss: 0.6950
Epoch [1/100], Step [70/19], Loss: 0.7063
Epoch [1/100], Step [80/19], Loss: 0.6934
Epoch [1/100], Step [90/19], Loss: 0.6960
Epoch [1/100], Step [100/19], Loss: 0.6700
Epoch [1/100], Step [110/19], Loss: 0.5971
Epoch [1/100], Step [120/19], Loss: 0.7401
Epoch [1/100], Step [130/19], Loss: 0.6588
Epoch [1/100], Step [140/19], Loss: 0.7064
Epoch [1/100], Step [150/19], Loss: 0.6081
Epoch 0; loss = 0.6886
Epoch [2/100], Step [10/19], Loss: 0.6262
Epoch [2/100], Step [20/19], Loss: 0.6469
Epoch [2/100], Step [30/19], Loss: 0.6434
Epoch [2/100], Step [40/19], Loss: 0.5082
Epoch [2/100], Step [50/19], Loss: 0.6540
Epoch [2/100], Step [60/19], Loss: 0.7923
Epoch [2/100], Step [70/19], Loss: 0.7050
Epoch [2/100], Step [80/19], Loss: 0.7334
Epoch

In [332]:
# Save the Model
torch.save(eye_model.state_dict(), 'ShouldIDrive_eye_tracking.pkl')

#### Body Tracking LSTM

In [88]:
#Init NN
input_size = 2
sequence_length = 121
hidden_size = 64
num_layers = 1
num_classes = 2  # TODO: Determine this from the data
learning_rate = 0.0001
num_epochs = 300

# The network
model = BiRNN(input_size, hidden_size, num_layers, num_classes).to(device)
model.is_training = True

# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

epoch_loss = 5000000000.

In [99]:
for i, (sample) in enumerate(body_train_loader):
    labels, marks = sample['label'], sample['marks']
    print(labels[0])
    print(marks[0][:5])
    break

tensor([1, 1, 1, 1])
tensor([[806., 313.,  -1.,  -1., 500., 626.,  -1.,  -1.,  -1.,  -1.,  -1.,  -1.,
          -1.,  -1.,  -1.,  -1.,  -1.,  -1.,  -1.,  -1.,  -1.,  -1.,  -1.,  -1.,
          -1.,  -1.,  -1.,  -1., 723., 281.,  -1.,  -1., 640., 344.,  -1.,  -1.],
        [806., 328.,  -1.,  -1., 500., 626.,  -1.,  -1.,  -1.,  -1.,  -1.,  -1.,
          -1.,  -1.,  -1.,  -1.,  -1.,  -1.,  -1.,  -1.,  -1.,  -1.,  -1.,  -1.,
          -1.,  -1.,  -1.,  -1., 723., 281.,  -1.,  -1., 640., 344.,  -1.,  -1.],
        [806., 313.,  -1.,  -1., 500., 626.,  -1.,  -1.,  -1.,  -1.,  -1.,  -1.,
          -1.,  -1.,  -1.,  -1.,  -1.,  -1.,  -1.,  -1.,  -1.,  -1.,  -1.,  -1.,
          -1.,  -1.,  -1.,  -1., 723., 281., 834., 281., 640., 344.,  -1.,  -1.],
        [806., 313.,  -1.,  -1., 500., 626.,  -1.,  -1.,  -1.,  -1.,  -1.,  -1.,
          -1.,  -1.,  -1.,  -1.,  -1.,  -1.,  -1.,  -1.,  -1.,  -1.,  -1.,  -1.,
          -1.,  -1.,  -1.,  -1., 723., 281., 834., 281., 640., 344.,  -1.,  -1.],
   

In [109]:
# Train the model
# total_step = len(train_loader)
for epoch in range(num_epochs):
    loss_total = 0.
    iteration_count = 0.
    for i, sample in enumerate(eye_train_loader):
        iteration_count += 1.
        labels, marks = sample['label'], sample['marks'] 
        marks = marks.reshape(-1, sequence_length, input_size).to(device)
        labels = labels.to(device)
        if cuda_enabled:
            marks = marks.cuda()
            labels = labels.cuda()

        # Forward + Backward + Optimize
        optimizer.zero_grad()
        outputs = model(marks)
        
#         loss = criterion(outputs, labels)
#         loss_total += loss.data[0]
#         loss.backward()
#         optimizer.step()
        
#         if (i+1) % 10 == 0:
#             print ('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}' 
#                    % (epoch + 1, num_epochs, i + 1, len(train) // batch_size, loss.data[0]))
#                    # .format(epoch+1, num_epochs, i+1, total_step, loss.item()))
#     current_epoch_loss = loss_total / iteration_count
#     print('Epoch %d; loss = %0.4f' % (epoch, current_epoch_loss))

#     epoch_loss = current_epoch_loss

# timing['training'] = datetime.datetime.now() - timing['training']

RuntimeError: shape '[-1, 121, 2]' is invalid for input of size 8000

In [None]:
# # Test the Model
model.is_training = False
timing['testing'] = datetime.datetime.now()
print('Testing -----------------------------------------------')
correct = 0.0
total = 0.0
predicted_list = []
label_list = []
for mfcc, labels in test_loader:#test_loader
    mfcc = Variable(mfcc.view(-1, sequence_length, input_size))
    if cuda_enabled:
        mfcc = mfcc.cuda()

    outputs = rnn(mfcc)
    _, predicted = torch.max(outputs.data, 1)
    total += labels.size(0)
    for p, l in zip(predicted, labels):
        predicted_list.append(p)
        label_list.append(l)
        if p == l:
            correct += 1.0

timing['testing'] = datetime.datetime.now() - timing['testing']

In [52]:
# Save the Model
torch.save(model.state_dict(), 'ShouldIDrive_body_tracking.pkl')

In [53]:
# Analyze NN
print('Timing (feature extraction, training, timing)')
print('=============================================')
print(timing['features'])
print(timing['training'])
print(timing['testing'])
print('')
print('=============================================')
print('')
print('Confusion Matrix')
print('================')
# print(train.get_encoder().classes_)
print(confusion_matrix(label_list, predicted_list))
print('=============================================')
print('Accuracy = %0.4f' % (accuracy_score(label_list, predicted_list)))
print('=============================================')

In [7]:
# # # Train NN
# for epoch in range(num_epochs):
#     loss_total = 0.
#     iteration_count = 0.
#     for i, (mfcc, labels) in enumerate(train_loader):
#         iteration_count += 1.
#         mfcc = Variable(mfcc.view(-1, sequence_length, input_size))
#         labels = Variable(labels)
#         if cuda_enabled:
#             mfcc = mfcc.cuda()
#             labels = labels.cuda()

#         # Forward + Backward + Optimize
#         optimizer.zero_grad()
#         outputs = model(mfcc)

#         loss = criterion(outputs, labels)
#         loss_total += loss.data[0]
#         loss.backward()
#         optimizer.step()

#         if (i + 1) % 10 == 0:
#             print('Epoch [%d/%d], Step [%d/%d], Loss: %.4f'
#                   % (epoch + 1, num_epochs, i + 1, len(train) // batch_size, loss.data[0]))
#     current_epoch_loss = loss_total / iteration_count
#     print('Epoch %d; loss = %0.4f' % (epoch, current_epoch_loss))

#     epoch_loss = current_epoch_loss

#     timing['training'] = datetime.datetime.now() - timing['training']

In [7]:
# Test the model
# with torch.no_grad():
#     model.is_training = False
#     correct = 0
#     total = 0
#     for images, labels in test_loader:
#         images = images.reshape(-1, sequence_length, input_size).to(device)
#         labels = labels.to(device)
#         outputs = model(images)
#         _, predicted = torch.max(outputs.data, 1)
#         total += labels.size(0)
#         correct += (predicted == labels).sum().item()

#     print('Test Accuracy of the model on the 10000 test images: {} %'.format(100 * correct / total)) 

Test Accuracy of the model on the 10000 test images: 97.39 %


Transforms
----------

One issue we can see from the above is that the samples are not of the
same size. Most neural networks expect the images of a fixed size.
Therefore, we will need to write some preprocessing code.
Let's create three transforms:

-  ``Rescale``: to scale the image
-  ``RandomCrop``: to crop from image randomly. This is data
   augmentation.
-  ``ToTensor``: to convert the numpy images to torch images (we need to
   swap axes).

We will write them as callable classes instead of simple functions so
that parameters of the transform need not be passed everytime it's
called. For this, we just need to implement ``__call__`` method and
if required, ``__init__`` method. We can then use a transform like this:

::

    tsfm = Transform(params)
    transformed_sample = tsfm(sample)

Observe below how these transforms had to be applied both on the image and
landmarks.




In [4]:
class Rescale(object):
    """Rescale the image in a sample to a given size.

    Args:
        output_size (tuple or int): Desired output size. If tuple, output is
            matched to output_size. If int, smaller of image edges is matched
            to output_size keeping aspect ratio the same.
    """

    def __init__(self, output_size):
        assert isinstance(output_size, (int, tuple))
        self.output_size = output_size

    def __call__(self, sample):
        image, landmarks = sample['image'], sample['landmarks']

        h, w = image.shape[:2]
        if isinstance(self.output_size, int):
            if h > w:
                new_h, new_w = self.output_size * h / w, self.output_size
            else:
                new_h, new_w = self.output_size, self.output_size * w / h
        else:
            new_h, new_w = self.output_size

        new_h, new_w = int(new_h), int(new_w)

        img = transform.resize(image, (new_h, new_w))

        # h and w are swapped for landmarks because for images,
        # x and y axes are axis 1 and 0 respectively
        landmarks = landmarks * [new_w / w, new_h / h]

        return {'image': img, 'landmarks': landmarks}


class RandomCrop(object):
    """Crop randomly the image in a sample.

    Args:
        output_size (tuple or int): Desired output size. If int, square crop
            is made.
    """

    def __init__(self, output_size):
        assert isinstance(output_size, (int, tuple))
        if isinstance(output_size, int):
            self.output_size = (output_size, output_size)
        else:
            assert len(output_size) == 2
            self.output_size = output_size

    def __call__(self, sample):
        image, landmarks = sample['image'], sample['landmarks']

        h, w = image.shape[:2]
        new_h, new_w = self.output_size

        top = np.random.randint(0, h - new_h)
        left = np.random.randint(0, w - new_w)

        image = image[top: top + new_h,
                      left: left + new_w]

        landmarks = landmarks - [left, top]

        return {'image': image, 'landmarks': landmarks}


class ToTensor(object):
    """Convert ndarrays in sample to Tensors."""

    def __call__(self, sample):
        image, landmarks = sample['image'], sample['landmarks']

        # swap color axis because
        # numpy image: H x W x C
        # torch image: C x H x W
        image = image.transpose((2, 0, 1))
        return {'image': torch.from_numpy(image).float(),
                'landmarks': torch.from_numpy(landmarks).float()}

In [234]:
# # Display image and label.
# sample = next(iter(train_loader))
# images, labels = sample['image'], sample['landmarks']
# # train_labels
# print(f"Images batch shape: {images.size()}")
# # print(images[0].shape)
# print(f"Labels batch shape: {labels.size()}")
# img = images[0].squeeze()
# # print(images)
# print(images.squeeze().shape)
# label = labels[0]
# plt.imshow(img[0], cmap="gray")
# plt.show()
# print(f"Label: {label}")

In [168]:
# Helper function to show a batch
# def show_landmarks_batch(sample_batched):
#     """Show image with landmarks for a batch of samples."""
#     images_batch, landmarks_batch = \
#             sample_batched['image'], sample_batched['landmarks']
#     batch_size = len(images_batch)
#     im_size = images_batch.size(2)
#     grid_border_size = 2

#     grid = utils.make_grid(images_batch)
#     plt.imshow(grid.numpy().transpose((1, 2, 0)))

#     for i in range(batch_size):
#         plt.scatter(landmarks_batch[i, :, 0].numpy() + i * im_size + (i + 1) * grid_border_size,
#                     landmarks_batch[i, :, 1].numpy() + grid_border_size,
#                     s=10, marker='.', c='r')

#         plt.title('Batch from dataloader')

# # if __name__ == '__main__':
# for i_batch, sample_batched in enumerate(train_loader):
#     print(i_batch, sample_batched['image'].size(),
#           sample_batched['landmarks'].size())

#     # observe 4th batch and stop.
#     if i_batch == 3:
#         plt.figure()
#         show_landmarks_batch(sample_batched)
#         plt.axis('off')
#         plt.ioff()
#         plt.show()
#         break