## Data Preprocessing Part

In [37]:
# imports
import pandas as pd
from PIL import Image
from torchvision.models.detection import fasterrcnn_resnet50_fpn
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor
from torch.utils.data import DataLoader, Dataset
import torchvision.transforms as transforms
import torch
import torchvision
import torch.optim as optim
import os
from tqdm import tqdm
from torchvision.ops import box_iou

In [19]:
# check if cuda is available
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(device)

cuda


In [39]:
# load in data

train_annotation = pd.read_csv('data/train/_annotations.csv')
val_annotation = pd.read_csv('data/valid/_annotations.csv')
test_annotation = pd.read_csv('data/test/_annotations.csv')

# print the number of samples of the data
print("Number of training, validation and testing samples")
print(len(train_annotation))
print(len(val_annotation))
print(len(test_annotation))

# check the dimension of image is the same
print("min and max width and height of training images: ", train_annotation['width'].min(), train_annotation['width'].max(), train_annotation['height'].min(), train_annotation['height'].max())
print("min and max width and height of validation images: ", val_annotation['width'].min(), val_annotation['width'].max(), val_annotation['height'].min(), val_annotation['height'].max())
print("min and max width and height of testing images: ", test_annotation['width'].min(), test_annotation['width'].max(), test_annotation['height'].min(), test_annotation['height'].max())

Number of training, validation and testing samples
1512
144
72
min and max width and height of training images:  512 512 512 512
min and max width and height of validation images:  512 512 512 512
min and max width and height of testing images:  512 512 512 512


In [21]:
# function map the class name to an integer
def charToVal(c):
    return ord(c) - 64

def valToChar(v):
    return chr(v + 64)

## Dataset Class

In [22]:
# image dimension
img_width = 512
img_height = 512

In [75]:
# create the dataset class

class ASLDataset(Dataset):
    """
    Custom ASL dataset needed for training
    """
    def __init__(self, annotation, img_dir, transforms=None):
        # initialize the annotation, paths to images, and transforms
        self.annotation = annotation
        self.img_dir = img_dir
        self.transforms = transforms

    def __len__(self):
        # return the length of the dataset
        return len(self.annotation)
    
    def __getitem__(self, idx):
        filename = self.annotation.at[idx, 'filename']
        
        class_name = self.annotation.at[idx, 'class']
        xmin = self.annotation.at[idx, 'xmin']
        xmax = self.annotation.at[idx, 'xmax']
        ymin = self.annotation.at[idx, 'ymin']
        ymax = self.annotation.at[idx, 'ymax']

        # read in the image
        path = os.path.join(self.img_dir, filename)
        img = Image.open(path).convert("RGB")

        if self.transforms is not None:
            img = self.transforms(img)

        # target
        target = {}
        target['boxes'] = torch.as_tensor([[xmin, ymin, xmax, ymax]], dtype=torch.float32)
        target['labels'] = torch.as_tensor([charToVal(class_name)], dtype=torch.int64)

        return img, target

In [76]:
# define the transforms
data_transform = transforms.Compose([
    transforms.ToTensor(),
])

# create the dataset
train_dataset = ASLDataset(train_annotation, 'data/train', data_transform)
val_dataset = ASLDataset(val_annotation, 'data/valid', data_transform)
test_dataset = ASLDataset(test_annotation, 'data/test', data_transform)

## Model Training Part

In [77]:
def find_best_iou(pred_boxes, target_box):
    best_iou = 0
    best_index = -1
    for i, box in enumerate(pred_boxes):
        curr_iou = box_iou(target_box, box)
        if curr_iou >= best_iou:
            best_iou = curr_iou
            best_index = i
    return best_iou, best_index

In [78]:
# collate function
def collate_fn(batch):
    images, targets = zip(*batch)
    images = list(images)
    targets = list(targets)
    return images, targets

In [84]:
# initialize dataloaders
batch_size = 4
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=0, collate_fn=collate_fn)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=True, num_workers=0, collate_fn=collate_fn)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True, num_workers=0, collate_fn=collate_fn)

In [85]:
# check the dataloader
images, targets = next(iter(train_dataloader))
print(images)
print(targets)

[tensor([[[0.1725, 0.1725, 0.1725,  ..., 0.1647, 0.1647, 0.1647],
         [0.1725, 0.1725, 0.1725,  ..., 0.1647, 0.1686, 0.1686],
         [0.1765, 0.1765, 0.1765,  ..., 0.1725, 0.1725, 0.1725],
         ...,
         [0.1686, 0.1686, 0.1686,  ..., 0.7961, 0.7961, 0.7961],
         [0.1686, 0.1686, 0.1686,  ..., 0.8078, 0.8039, 0.8000],
         [0.1686, 0.1686, 0.1686,  ..., 0.8196, 0.8118, 0.8039]],

        [[0.1725, 0.1725, 0.1725,  ..., 0.1686, 0.1686, 0.1686],
         [0.1686, 0.1686, 0.1686,  ..., 0.1686, 0.1686, 0.1686],
         [0.1608, 0.1608, 0.1608,  ..., 0.1686, 0.1686, 0.1686],
         ...,
         [0.1686, 0.1686, 0.1686,  ..., 0.5529, 0.5529, 0.5529],
         [0.1686, 0.1686, 0.1686,  ..., 0.5647, 0.5608, 0.5569],
         [0.1686, 0.1686, 0.1686,  ..., 0.5765, 0.5686, 0.5608]],

        [[0.1647, 0.1647, 0.1647,  ..., 0.1843, 0.1843, 0.1765],
         [0.1608, 0.1608, 0.1608,  ..., 0.1765, 0.1765, 0.1765],
         [0.1569, 0.1569, 0.1569,  ..., 0.1608, 0.1608, 0

In [86]:
# define the model
model = fasterrcnn_resnet50_fpn(pretrained=True)
num_classes = 27
in_features = model.roi_heads.box_predictor.cls_score.in_features
model.roi_heads.box_predictor = FastRCNNPredictor(in_features, num_classes)

In [87]:
# training preparation
model.to(device)

optimizer = optim.Adam(model.parameters(), lr=0.001)
epochs = 20

H = {}
H['train_loss'] = []
H['mean_ious'] = []

In [88]:
for epoch in range(epochs):
    model.train()
    train_loss = 0.0
    for images, targets in tqdm(train_dataloader):
        optimizer.zero_grad()
        images = list(image.to(device) for image in images)
        
        
        targets = [{k: v.to(device) for k, v in t.items()} for t in targets]

        #print(len(images))
        #print(len(targets))
        loss_dict = model(images, targets)
        losses = sum(loss for loss in loss_dict.values())

        losses.backward()
        optimizer.step()

        train_loss += losses.item()
    print(f"Epoch {epoch + 1} Loss: {train_loss/len(train_dataloader)}")
    H["training_loss"].append(train_loss/len(train_dataloader))

    model.eval()
    mean_ious = 0.0
    with torch.no_grad():
        for images, targets in tqdm(val_dataloader):
            images = list(image.to(device) for image in images)
            targets = [{k: v.to(device) for k, v in t.items()} for t in targets]

            outputs = model(images)

            ious = []
            iou_idxs = []

            for i, output in enumerate(outputs):
                target_box = targets[i]['boxes'][0].cpu().numpy()
                pred_boxes = output['boxes'].cpu().numpy()

                best_iou, best_idx = find_best_iou(pred_boxes, target_box)
                ious.append(best_iou)
                iou_idxs.append(best_idx)
            mean_ious += sum(ious)/len(ious)
    print(f"Epoch {epoch + 1} Mean IOU: {mean_ious/len(val_dataloader)}")

 23%|██▎       | 87/378 [02:53<09:44,  2.01s/it]