## Data Preprocessing Part

In [57]:
# imports
import sys
print(sys.executable)
import os
import pandas as pd
import numpy as np
from PIL import Image
from torchvision.models.detection import fasterrcnn_resnet50_fpn
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor
from torch.utils.data import DataLoader, Dataset
import torchvision.transforms as transforms
import torch
import torchvision
import torch.optim as optim
from tqdm import tqdm
from torchvision.ops import box_iou


c:\Users\frank\projects\ASL_539_Project\.venv\Scripts\python.exe


In [58]:
# check if cuda is available
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

print(torch.__version__)
print(device)
if device.type == 'cuda':
    print(torch.version.cuda)
    print(torch.cuda.current_device())
    print(torch.cuda.get_device_name(torch.cuda.current_device()))
    print(torch._C._cuda_getCompiledVersion())


2.1.0+cu121
cuda
12.1
0
NVIDIA GeForce RTX 3070 Ti
12010


In [59]:
# load in data

train_annotation = pd.read_csv('data/train/_annotations.csv')
val_annotation = pd.read_csv('data/valid/_annotations.csv')
test_annotation = pd.read_csv('data/test/_annotations.csv')

# print the number of samples of the data
print("Number of training, validation and testing samples")
print(len(train_annotation))
print(len(val_annotation))
print(len(test_annotation))

# check the dimension of image is the same
print("min and max width and height of training images: ", train_annotation['width'].min(), train_annotation['width'].max(), train_annotation['height'].min(), train_annotation['height'].max())
print("min and max width and height of validation images: ", val_annotation['width'].min(), val_annotation['width'].max(), val_annotation['height'].min(), val_annotation['height'].max())
print("min and max width and height of testing images: ", test_annotation['width'].min(), test_annotation['width'].max(), test_annotation['height'].min(), test_annotation['height'].max())

Number of training, validation and testing samples
1512
144
72
min and max width and height of training images:  512 512 512 512
min and max width and height of validation images:  512 512 512 512
min and max width and height of testing images:  512 512 512 512


In [60]:
# function map the class name to an integer
def charToVal(c):
    return ord(c) - 64

def valToChar(v):
    return chr(v + 64)

## Dataset Class

In [61]:
# image dimension
img_width = 512
img_height = 512

In [62]:
# create the dataset class

class ASLDataset(Dataset):
    """
    Custom ASL dataset needed for training
    """
    def __init__(self, annotation, img_dir, transforms=None):
        # initialize the annotation, paths to images, and transforms
        self.annotation = annotation
        self.img_dir = img_dir
        self.transforms = transforms

    def __len__(self):
        # return the length of the dataset
        return len(self.annotation)
    
    def __getitem__(self, idx):
        filename = self.annotation.at[idx, 'filename']
        
        class_name = self.annotation.at[idx, 'class']
        xmin = self.annotation.at[idx, 'xmin']
        xmax = self.annotation.at[idx, 'xmax']
        ymin = self.annotation.at[idx, 'ymin']
        ymax = self.annotation.at[idx, 'ymax']

        # read in the image
        path = os.path.join(self.img_dir, filename)
        img = Image.open(path).convert("RGB")

        if self.transforms is not None:
            img = self.transforms(img)

        # target
        target = {}
        target['boxes'] = torch.as_tensor([[xmin, ymin, xmax, ymax]], dtype=torch.float32)
        target['labels'] = torch.as_tensor([charToVal(class_name)], dtype=torch.int64)

        return img, target

In [63]:
# define the transforms
data_transform = transforms.Compose([
    transforms.ToTensor(),
])

# create the dataset
train_dataset = ASLDataset(train_annotation, 'data/train', data_transform)
val_dataset = ASLDataset(val_annotation, 'data/valid', data_transform)
test_dataset = ASLDataset(test_annotation, 'data/test', data_transform)

## Model Training Part

In [64]:
def find_best_iou(pred_boxes, target_box):
    best_iou = 0
    best_index = -1
    if(pred_boxes.shape[0] == 0):
        return 0, 0
    #print(pred_boxes.shape, target_box.shape)
    ious = box_iou(torch.from_numpy(target_box), torch.from_numpy(pred_boxes))
    #print(ious)
    best_iou = torch.max(ious[0])
    best_index = torch.argmax(ious[0])


    return best_iou, best_index

#test find best iou

target_box = np.array([[37., 65., 395., 512.], [32., 15., 263., 326.,]])
pred_boxes = np.array([[75.232, 161.32132, 401.234, 486.213]])

best_iou, best_idx = find_best_iou(pred_boxes, target_box)
print(best_iou, best_idx)

tensor(0.6411, dtype=torch.float64) tensor(0)


In [65]:
# collate function
def collate_fn(batch):
    images, targets = zip(*batch)
    images = list(images)
    targets = list(targets)
    return images, targets

In [66]:
# initialize dataloaders
batch_size = 2#setting to one for now to decrease time to test TODO: change back to 4
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=0, collate_fn=collate_fn)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=True, num_workers=0, collate_fn=collate_fn)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True, num_workers=0, collate_fn=collate_fn)

In [67]:
# check the dataloader
images, targets = next(iter(train_dataloader))
print(images)
print(targets)

[tensor([[[0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
         [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
         [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
         ...,
         [0.0000, 0.0000, 0.0000,  ..., 0.3569, 0.3608, 0.3569],
         [0.0000, 0.0000, 0.0000,  ..., 0.0471, 0.0431, 0.0314],
         [0.0000, 0.0000, 0.0000,  ..., 0.0157, 0.0157, 0.0157]],

        [[0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
         [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
         [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
         ...,
         [0.0000, 0.0000, 0.0000,  ..., 0.3255, 0.3216, 0.3255],
         [0.0000, 0.0000, 0.0000,  ..., 0.0275, 0.0118, 0.0118],
         [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000]],

        [[0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
         [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
         [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0

In [68]:
# define the model
model = fasterrcnn_resnet50_fpn(pretrained=True)
num_classes = 27
in_features = model.roi_heads.box_predictor.cls_score.in_features
model.roi_heads.box_predictor = FastRCNNPredictor(in_features, num_classes)



In [69]:
# training preparation
model.to(device)

optimizer = optim.Adam(model.parameters(), lr=0.001)
epochs = 20

H = {}
H['train_loss'] = []
H['mean_ious'] = []
H['val_loss'] = []

In [70]:
best_loss = 100.0
for epoch in range(epochs):
    model.train()
    train_loss = 0.0
    for images, targets in tqdm(train_dataloader):
        optimizer.zero_grad()
        images = list(image.to(device) for image in images)
        targets = [{k: v.to(device) for k, v in t.items()} for t in targets]

        #print(len(images))
        #print(len(targets))
        loss_dict = model(images, targets)
        losses = sum(loss for loss in loss_dict.values())

        losses.backward()
        optimizer.step()

        train_loss += losses.item()
    print(f"Epoch {epoch + 1} Training Loss: {train_loss/len(train_dataloader)}")
    H["train_loss"].append(train_loss/len(train_dataloader))

    model.eval()
    val_loss = 0.0

    mean_ious = 0.0
    with torch.no_grad():
        for images, targets in tqdm(val_dataloader):
            images = list(image.to(device) for image in images)
            targets = [{k: v.to(device) for k, v in t.items()} for t in targets]

            loss_dict = model(images, targets)
            print(loss_dict)
            losses = sum(loss for loss in loss_dict[0].values())
            val_loss += losses.item()

            outputs = model(images)


            ious = []
            iou_idxs = []
            for i, output in enumerate(outputs):
                target_box = targets[i]['boxes'][0].cpu().numpy()
                target_box = target_box[None, :]
                pred_boxes = output['boxes'].cpu().numpy()

                best_iou, best_idx = find_best_iou(pred_boxes, target_box)
                ious.append(best_iou)
                iou_idxs.append(best_idx)
            mean_ious += sum(ious)/len(ious)
    tempvalidloss = val_loss/len(val_dataloader) 

    #loss would randomly spike at some points, this will hopefully fix it the hard way
    if tempvalidloss < best_loss:
        best_loss = tempvalidloss
        torch.save(model.state_dict(), 'models/best-model-parameters.pt')
    if tempvalidloss > 2 * best_loss and tempvalidloss > 5:
        print("previous state loaded instead")
        model.load_state_dict(torch.load('models/best-model-parameters.pt'))
    print(f"Epoch {epoch + 1} Validation Loss: {tempvalidloss}")
    H["val_loss"].append(tempvalidloss)
    print(f"Epoch {epoch + 1} Mean IOU: {mean_ious/len(val_dataloader)}")
    H["mean_ious"].append(mean_ious/len(val_dataloader))


  0%|          | 0/756 [00:00<?, ?it/s]

  0%|          | 0/756 [00:01<?, ?it/s]


KeyboardInterrupt: 

In [None]:
# save model

torch.save(model.state_dict(), 'models/trained_model.pth')

In [None]:
#testing
from sklearn.metrics import confusion_matrix, accuracy_score

model.eval()
mean_ious = 0.0
with torch.no_grad():
    for images, targets in tqdm(val_dataloader):
        images = list(image.to(device) for image in images)
        targets = [{k: v.to(device) for k, v in t.items()} for t in targets]

        loss_dict = model(images, targets)
        losses = sum(loss for loss in loss_dict.values())
        val_loss += losses.item()

        outputs = model(images)


        ious = []
        iou_idxs = []
        for i, output in enumerate(outputs):
            target_box = targets[i]['boxes'][0].cpu().numpy()
            target_box = target_box[None, :]
            pred_boxes = output['boxes'].cpu().numpy()

            best_iou, best_idx = find_best_iou(pred_boxes, target_box)
            ious.append(best_iou)
            iou_idxs.append(best_idx)
        mean_ious += sum(ious)/len(ious)
tempvalidloss = val_loss/len(val_dataloader) 
print(f"Epoch {epoch + 1} Validation Loss: {tempvalidloss}")
print(f"Epoch {epoch + 1} Mean IOU: {mean_ious/len(val_dataloader)}")


  0%|          | 0/18 [00:00<?, ?it/s]


AttributeError: 'list' object has no attribute 'values'