# Download data file

To download the dataset, run the following script from the repository's root directory:
> python3 download.py

Courtesy of TACO toolkit\
[Source](https://github.com/pedropro/TACO)

# Preview the dataset

Exploration of the dataset images and the number of image classes. Refer to:
> demo.ipynb

Courtesy of TACO toolkit\
[Source](https://github.com/pedropro/TACO)

# Train Test Split

Run the following script to generate N random train, validation and test subsets.\
Set the dataset directory, the test percentage, the validation percentage, and the number of train-test-split trials:
> python3 split_dataset.py --dataset_dir ../data --test_percentage 10 --val_percentage 10 --nr_trials 1

Courtesy of TACO toolkit\
[Source](https://github.com/pedropro/TACO)

# Import Libraries

In [1]:
import os
import torch
import torch.utils.data
import torchvision
from PIL import Image
from pycocotools.coco import COCO
import torchvision.transforms.functional as F

# Dataloader requires custom Dataset class
from dataclass import myOwnDataset
from collate import collate_fn

# Pre-processing data for training

Define the transformation that converts images to tensors for training.\
Resizing the images for consistency is optional and is a possible step for future tuning.

In [2]:
# Faster R-CNN accepts images of varying sizes, so no resize step is included here
def get_transform():
    """Build the preprocessing pipeline applied to each image.

    Returns:
        A ``torchvision.transforms.Compose`` whose only step is ``ToTensor()``.
    """
    to_tensor = torchvision.transforms.ToTensor()
    return torchvision.transforms.Compose([to_tensor])

Set file path

In [3]:
# root folder that holds the dataset
dataset_path = '../data'

# annotation JSON files for the train, validation and test subsets
train_file_path = f'{dataset_path}/train/annotations_0_train.json'
val_file_path = f'{dataset_path}/val/annotations_0_val.json'
test_file_path = f'{dataset_path}/test/annotations_0_test.json'

Verify file path

In [4]:
# Echo the assembled training-annotation path as this cell's output
train_file_path

'../data/train/annotations_0_train.json'

The DataLoader wraps the Dataset in an iterable and handles batching, shuffling, and related tasks.

In [5]:
# number of images per training batch
train_batch_size = 4

# dataset built from the training annotations, with the tensor transform applied
my_dataset = myOwnDataset(
    root=dataset_path,
    annotation=train_file_path,
    transforms=get_transform(),
)

# loader that shuffles the dataset and assembles batches through the custom collate_fn
data_loader = torch.utils.data.DataLoader(
    my_dataset,
    batch_size=train_batch_size,
    shuffle=True,
    num_workers=6,
    collate_fn=collate_fn,
)


loading annotations into memory...
Done (t=0.03s)
creating index...
index created!


Check DataLoader
Let’s check whether our DataLoader pulls images and annotations iteratively.

In [6]:
# pick the compute device: the CUDA GPU when one is available, the CPU otherwise
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# The DataLoader can be iterated batch by batch over the Dataset.
# Uncomment the loop below to print the annotations of every batch;
# it takes some time to run for 1200 images.

# for imgs, annotations in data_loader:
#     imgs = list(img.to(device) for img in imgs)
#     annotations = [{k: v.to(device) for k, v in t.items()} for t in annotations]
#     print(annotations)

[{'boxes': tensor([[ 342., 2916.,  412., 3049.],
        [1288., 1590., 1523., 1788.],
        [  93., 1702.,  157., 1826.]]), 'labels': tensor([37, 15, 37]), 'image_id': tensor([1463]), 'area': tensor([ 4331.5000, 27895.5000,  4697.5000]), 'iscrowd': tensor([0, 0, 0])}, {'boxes': tensor([[ 686.,   21., 1555., 1053.],
        [ 686.,   19., 1549.,  744.],
        [   0.,  923., 2097., 3259.]]), 'labels': tensor([21, 28, 35]), 'image_id': tensor([852]), 'area': tensor([ 666909.5000,  491755.5000, 3047088.5000]), 'iscrowd': tensor([0, 0, 0])}, {'boxes': tensor([[ 323.,  704., 2175., 1965.],
        [ 286.,  555., 2297., 2647.]]), 'labels': tensor([15, 37]), 'image_id': tensor([823]), 'area': tensor([1812929.5000, 1251678.0000]), 'iscrowd': tensor([0, 0])}, {'boxes': tensor([[1299., 1332., 1397., 1401.]]), 'labels': tensor([22]), 'image_id': tensor([1221]), 'area': tensor([4695.]), 'iscrowd': tensor([0])}]
[{'boxes': tensor([[ 969., 1910., 1665., 2559.]]), 'labels': tensor([58]), 'image_i

[{'boxes': tensor([[ 797., 1128., 1411., 1740.]]), 'labels': tensor([46]), 'image_id': tensor([1130]), 'area': tensor([329836.5000]), 'iscrowd': tensor([0])}, {'boxes': tensor([[ 996., 2026., 1329., 2185.],
        [1106., 1220., 1321., 1399.],
        [1090., 1816., 1235., 2036.],
        [1374., 1944., 1467., 2000.]]), 'labels': tensor([37, 37, 58, 58]), 'image_id': tensor([954]), 'area': tensor([43937.5000, 22246.5000,  9989.5000,  2927.5000]), 'iscrowd': tensor([0, 0, 0, 0])}, {'boxes': tensor([[1269., 2918., 1403., 3030.],
        [ 926., 2028., 1199., 2246.],
        [1569., 2641., 1633., 2717.],
        [2105.,  103., 2157.,  214.],
        [ 437.,  505.,  550.,  583.],
        [1662., 2403., 1920., 2666.],
        [2379., 1151., 2405., 1246.]]), 'labels': tensor([60, 30, 59, 60, 59, 59, 60]), 'image_id': tensor([1429]), 'area': tensor([ 5939.0000, 22855.5000,  2516.5000,  4487.5000,  5503.5000, 28980.5000,
         2123.5000]), 'iscrowd': tensor([0, 0, 0, 0, 0, 0, 0])}, {'boxes

[{'boxes': tensor([[1076., 1559., 1348., 1838.],
        [1083., 1594., 1132., 1650.]]), 'labels': tensor([13, 51]), 'image_id': tensor([1309]), 'area': tensor([49065.5000,  1700.0000]), 'iscrowd': tensor([0, 0])}, {'boxes': tensor([[1650., 1465., 1805., 1582.],
        [1613., 1409., 1643., 1670.],
        [1595.,  466., 1710.,  683.]]), 'labels': tensor([58, 30, 30]), 'image_id': tensor([1285]), 'area': tensor([11343.0000,  4516.0000, 20194.5000]), 'iscrowd': tensor([0, 0, 0])}, {'boxes': tensor([[1055.,  839., 1425., 1159.],
        [1546., 2846., 1974., 3167.]]), 'labels': tensor([60, 60]), 'image_id': tensor([490]), 'area': tensor([47785.0000, 50472.5000]), 'iscrowd': tensor([0, 0])}, {'boxes': tensor([[ 205., 1936.,  510., 2356.],
        [3019., 1647., 3761., 1761.]]), 'labels': tensor([56, 56]), 'image_id': tensor([376]), 'area': tensor([10457.5000, 11271.5000]), 'iscrowd': tensor([0, 0])}]
[{'boxes': tensor([[1279., 1541., 1580., 1739.]]), 'labels': tensor([40]), 'image_id': t

[{'boxes': tensor([[1099., 1403., 1520., 1568.],
        [1126., 1414., 1348., 1618.],
        [1756., 3201., 1829., 3264.]]), 'labels': tensor([56, 28, 59]), 'image_id': tensor([1036]), 'area': tensor([ 7278.5000, 34272.0000,  2536.5000]), 'iscrowd': tensor([0, 0, 0])}, {'boxes': tensor([[1245.,  956., 1594., 1362.]]), 'labels': tensor([15]), 'image_id': tensor([812]), 'area': tensor([42696.]), 'iscrowd': tensor([0])}, {'boxes': tensor([[ 320., 1490., 2659., 3459.]]), 'labels': tensor([58]), 'image_id': tensor([464]), 'area': tensor([2903887.5000]), 'iscrowd': tensor([0])}, {'boxes': tensor([[ 395., 1032., 4153., 2074.]]), 'labels': tensor([7]), 'image_id': tensor([468]), 'area': tensor([3044175.]), 'iscrowd': tensor([0])}]
[{'boxes': tensor([[ 483., 1509., 2341., 3327.],
        [2137., 3136., 2342., 3327.]]), 'labels': tensor([6, 8]), 'image_id': tensor([524]), 'area': tensor([1198214.,   16828.]), 'iscrowd': tensor([0, 0])}, {'boxes': tensor([[ 956., 2559., 1670., 3379.],
        [

[{'boxes': tensor([[1205., 1905., 1416., 2176.],
        [1776., 1222., 1814., 1232.]]), 'labels': tensor([21, 60]), 'image_id': tensor([56]), 'area': tensor([29272.,   280.]), 'iscrowd': tensor([0, 0])}, {'boxes': tensor([[ 930., 1688., 1645., 3241.]]), 'labels': tensor([17]), 'image_id': tensor([449]), 'area': tensor([669607.5000]), 'iscrowd': tensor([0])}, {'boxes': tensor([[ 398.,  593., 1230., 1349.]]), 'labels': tensor([21]), 'image_id': tensor([684]), 'area': tensor([469191.5000]), 'iscrowd': tensor([0])}, {'boxes': tensor([[1932., 1198., 2363., 1535.],
        [ 740.,  828.,  814.,  889.]]), 'labels': tensor([40, 58]), 'image_id': tensor([1287]), 'area': tensor([66675.,  2703.]), 'iscrowd': tensor([0, 0])}]
[{'boxes': tensor([[ 259., 1791.,  990., 2372.],
        [ 486., 1423.,  941., 1736.],
        [ 434., 1217.,  834., 1561.],
        [ 516.,  681., 1034., 1248.],
        [ 838.,  865., 1350., 1448.],
        [1026.,  967., 1670., 1402.],
        [ 867., 1201., 1835., 2317.]

[{'boxes': tensor([[ 710., 1504.,  854., 1606.],
        [ 676., 1669.,  806., 1792.],
        [ 580., 1993.,  669., 2109.],
        [1094., 1408., 1107., 1437.],
        [ 833., 1420.,  865., 1432.],
        [ 601., 2262.,  619., 2297.],
        [ 564., 2224.,  608., 2255.],
        [ 570., 2296.,  597., 2333.],
        [1055.,  216., 1081.,  229.]]), 'labels': tensor([15, 15, 40, 60, 60, 60, 60, 60, 60]), 'image_id': tensor([1486]), 'area': tensor([8767.0000, 9429.0000, 7492.5000,  349.5000,  187.0000,  570.0000,
         826.5000,  660.5000,  318.5000]), 'iscrowd': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0])}, {'boxes': tensor([[1361., 1377., 1886., 1979.],
        [2378., 4791., 2463., 5214.]]), 'labels': tensor([ 8, 59]), 'image_id': tensor([342]), 'area': tensor([242326.0000,  26482.5000]), 'iscrowd': tensor([0, 0])}, {'boxes': tensor([[2139.,  942., 2238., 1221.]]), 'labels': tensor([37]), 'image_id': tensor([1263]), 'area': tensor([22828.5000]), 'iscrowd': tensor([0])}, {'boxes': tenso

[{'boxes': tensor([[ 750., 1540., 1067., 1871.]]), 'labels': tensor([30]), 'image_id': tensor([908]), 'area': tensor([4440.5000]), 'iscrowd': tensor([0])}, {'boxes': tensor([[ 884., 2770., 1307., 3304.]]), 'labels': tensor([33]), 'image_id': tensor([609]), 'area': tensor([51695.5000]), 'iscrowd': tensor([0])}, {'boxes': tensor([[ 779.,  651., 1571., 1346.],
        [ 745., 2445., 1621., 3258.]]), 'labels': tensor([39, 18]), 'image_id': tensor([210]), 'area': tensor([326176.5000, 333698.0000]), 'iscrowd': tensor([0, 0])}, {'boxes': tensor([[1151., 1902., 1714., 2801.],
        [1254.,  857., 1452., 1156.],
        [1037.,  894., 1224., 1013.]]), 'labels': tensor([40, 37, 37]), 'image_id': tensor([568]), 'area': tensor([295807.5000,  40527.5000,  12468.5000]), 'iscrowd': tensor([0, 0, 0])}]
[{'boxes': tensor([[1055., 1724., 1396., 2385.],
        [ 506., 1954.,  544., 2012.],
        [ 192., 2686.,  238., 2772.]]), 'labels': tensor([ 6, 60, 60]), 'image_id': tensor([1045]), 'area': tenso

[{'boxes': tensor([[ 604., 1052.,  791., 1372.]]), 'labels': tensor([13]), 'image_id': tensor([31]), 'area': tensor([27762.]), 'iscrowd': tensor([0])}, {'boxes': tensor([[ 873.3333, 2352.9048, 1127.3334, 2609.9048],
        [1532.3334, 1806.4762, 1641.3334, 1905.4762]]), 'labels': tensor([37, 37]), 'image_id': tensor([1430]), 'area': tensor([47839.0234,  4895.0000]), 'iscrowd': tensor([0, 0])}, {'boxes': tensor([[1694., 1094., 2476., 1680.],
        [ 848., 1462.,  872., 1484.]]), 'labels': tensor([30, 60]), 'image_id': tensor([959]), 'area': tensor([2.4259e+05, 2.3400e+02]), 'iscrowd': tensor([0, 0])}, {'boxes': tensor([[ 274., 1261., 2431., 3289.]]), 'labels': tensor([37]), 'image_id': tensor([374]), 'area': tensor([2697243.]), 'iscrowd': tensor([0])}]
[{'boxes': tensor([[1611.,  490., 2464., 1359.]]), 'labels': tensor([16]), 'image_id': tensor([1332]), 'area': tensor([319780.]), 'iscrowd': tensor([0])}, {'boxes': tensor([[ 787., 1695., 1058., 1922.]]), 'labels': tensor([37]), 'image

[{'boxes': tensor([[1773., 1325., 2014., 1880.]]), 'labels': tensor([52]), 'image_id': tensor([934]), 'area': tensor([28500.]), 'iscrowd': tensor([0])}, {'boxes': tensor([[1119., 1640., 1339., 1884.],
        [ 989., 1556., 1422., 1943.]]), 'labels': tensor([32, 15]), 'image_id': tensor([189]), 'area': tensor([35714.0000, 93893.5000]), 'iscrowd': tensor([0, 0])}, {'boxes': tensor([[ 821., 1428., 2340., 3774.]]), 'labels': tensor([40]), 'image_id': tensor([441]), 'area': tensor([2672527.]), 'iscrowd': tensor([0])}, {'boxes': tensor([[ 994.,  579., 1602.,  894.],
        [2478., 1995., 2745., 2340.]]), 'labels': tensor([6, 6]), 'image_id': tensor([1081]), 'area': tensor([98612.5000, 48975.5000]), 'iscrowd': tensor([0, 0])}]
[{'boxes': tensor([[ 835.8000, 1831.8000, 1024.8000, 2454.8000],
        [ 885.0000, 2377.0000,  987.0000, 2454.0000]]), 'labels': tensor([6, 8]), 'image_id': tensor([1385]), 'area': tensor([100530.5000,   6560.0000]), 'iscrowd': tensor([0, 0])}, {'boxes': tensor([[12

[{'boxes': tensor([[1110., 1024., 1503., 1430.],
        [1499., 2005., 1984., 2502.],
        [1838., 2314., 2128., 2572.]]), 'labels': tensor([47, 47, 59]), 'image_id': tensor([844]), 'area': tensor([113615., 143818.,  29176.]), 'iscrowd': tensor([0, 0, 0])}, {'boxes': tensor([[1329.,  720., 1679.,  887.],
        [1352.,  757., 1378.,  824.]]), 'labels': tensor([13, 51]), 'image_id': tensor([761]), 'area': tensor([30381.5000,  1181.5000]), 'iscrowd': tensor([0, 0])}, {'boxes': tensor([[ 985., 1188., 1479., 1575.]]), 'labels': tensor([40]), 'image_id': tensor([1086]), 'area': tensor([103979.]), 'iscrowd': tensor([0])}, {'boxes': tensor([[ 775., 1350.,  800., 1371.],
        [ 973., 1626., 1058., 1734.]]), 'labels': tensor([8, 6]), 'image_id': tensor([1479]), 'area': tensor([ 345.5000, 6901.0000]), 'iscrowd': tensor([0, 0])}]
[{'boxes': tensor([[1272., 1706., 1668., 2223.],
        [1489., 2992., 1607., 3092.]]), 'labels': tensor([22, 59]), 'image_id': tensor([350]), 'area': tensor([1

[{'boxes': tensor([[1002., 1141., 2057., 1671.]]), 'labels': tensor([56]), 'image_id': tensor([967]), 'area': tensor([28073.]), 'iscrowd': tensor([0])}, {'boxes': tensor([[1084., 1784., 1331., 2152.],
        [1139.,  924., 1456., 1144.]]), 'labels': tensor([13, 13]), 'image_id': tensor([710]), 'area': tensor([65083.5000, 45490.5000]), 'iscrowd': tensor([0, 0])}, {'boxes': tensor([[1071.3333, 1921.1818, 1372.5739, 2199.0000]]), 'labels': tensor([49]), 'image_id': tensor([228]), 'area': tensor([43138.8750]), 'iscrowd': tensor([0])}, {'boxes': tensor([[ 818., 1260.,  926., 1348.]]), 'labels': tensor([9]), 'image_id': tensor([1201]), 'area': tensor([7475.]), 'iscrowd': tensor([0])}]
[{'boxes': tensor([[ 973., 2258., 1491., 2769.]]), 'labels': tensor([8]), 'image_id': tensor([286]), 'area': tensor([198913.5000]), 'iscrowd': tensor([0])}, {'boxes': tensor([[1837., 2158., 3539., 2819.]]), 'labels': tensor([30]), 'image_id': tensor([554]), 'area': tensor([498157.]), 'iscrowd': tensor([0])}, {

[{'boxes': tensor([[1.3950e+03, 1.2810e+03, 2.1480e+03, 1.8870e+03],
        [3.4500e+02, 1.8680e+03, 1.2500e+03, 2.5220e+03],
        [2.7100e+02, 1.4290e+03, 9.4000e+02, 1.9960e+03],
        [0.0000e+00, 1.5440e+03, 3.5700e+02, 2.1280e+03],
        [1.0000e+00, 1.9510e+03, 3.3700e+02, 2.3930e+03]]), 'labels': tensor([13, 13, 40, 37, 15]), 'image_id': tensor([715]), 'area': tensor([258465.0000, 388417.5000, 160247.0000,  95678.5000,  83614.5000]), 'iscrowd': tensor([0, 0, 0, 0, 0])}, {'boxes': tensor([[ 643.,  551.,  759.,  602.],
        [ 257., 1386.,  373., 1414.],
        [1224.,  994., 1630., 1194.],
        [1224., 1159., 1257., 1196.],
        [1217.,  429., 1356.,  488.],
        [   7.,  689.,  109.,  771.]]), 'labels': tensor([59, 30,  6,  8, 59, 37]), 'image_id': tensor([103]), 'area': tensor([ 2815.5000,  1307.0000, 35171.0000,   763.0000,  4823.0000,  5881.5000]), 'iscrowd': tensor([0, 0, 0, 0, 0, 0])}, {'boxes': tensor([[ 795., 1102.,  947., 1405.],
        [ 933., 1058.

[{'boxes': tensor([[ 879., 1475., 1690., 1940.],
        [ 314., 1752.,  388., 1788.]]), 'labels': tensor([18, 60]), 'image_id': tensor([997]), 'area': tensor([283111.,   1188.]), 'iscrowd': tensor([0, 0])}, {'boxes': tensor([[1374., 1308., 1802., 1669.],
        [1862., 1467., 2351., 1968.],
        [1864., 1465., 2366., 1873.],
        [1416., 1356., 1899., 1777.]]), 'labels': tensor([28, 46, 37, 59]), 'image_id': tensor([694]), 'area': tensor([119093., 186076., 152140.,  68518.]), 'iscrowd': tensor([0, 0, 0, 0])}, {'boxes': tensor([[ 657.,   15., 1210.,  598.],
        [ 850., 1772., 1274., 2190.]]), 'labels': tensor([38, 28]), 'image_id': tensor([77]), 'area': tensor([ 71303.0000, 134279.5000]), 'iscrowd': tensor([0, 0])}, {'boxes': tensor([[ 992.,  926., 1075., 1003.],
        [ 805., 1839., 1114., 2408.],
        [ 691., 2114., 1162., 2372.],
        [1116.,  784., 1140.,  864.],
        [1908., 1048., 1940., 1126.],
        [1396., 3050., 1424., 3124.],
        [1730., 2934., 17

[{'boxes': tensor([[2010.,  938., 2279., 1350.],
        [ 945., 1276., 1058., 1463.],
        [ 720., 1219.,  838., 1312.],
        [1211., 1659., 1368., 1810.],
        [1330., 1632., 1377., 1661.],
        [1506., 1506., 1703., 1770.],
        [ 659., 1477.,  752., 1534.]]), 'labels': tensor([37, 30, 59, 30, 30, 30, 30]), 'image_id': tensor([1280]), 'area': tensor([21431.5000, 12201.5000,  8490.0000,  4692.5000,   633.5000, 21520.0000,
         2639.0000]), 'iscrowd': tensor([0, 0, 0, 0, 0, 0, 0])}, {'boxes': tensor([[1163., 2064., 1536., 2552.]]), 'labels': tensor([22]), 'image_id': tensor([911]), 'area': tensor([70258.]), 'iscrowd': tensor([0])}, {'boxes': tensor([[2781., 2216., 2858., 2296.],
        [2720., 2220., 2783., 2288.],
        [ 891., 2433.,  992., 2533.],
        [ 679., 2591.,  781., 2695.],
        [ 738., 2684.,  847., 2793.],
        [ 598., 1024., 1454., 1808.],
        [2525., 3113., 2566., 3161.],
        [ 333., 1448.,  480., 1522.]]), 'labels': tensor([30, 30

[{'boxes': tensor([[1615.,  732., 1840.,  898.],
        [1222., 1300., 2031., 1492.],
        [ 394.,  900.,  893., 1301.]]), 'labels': tensor([58, 37,  1]), 'image_id': tensor([976]), 'area': tensor([19814.0000, 92940.5000, 82750.0000]), 'iscrowd': tensor([0, 0, 0])}, {'boxes': tensor([[1.1510e+03, 7.1800e+02, 1.4480e+03, 8.8000e+02],
        [2.2920e+03, 8.6800e+02, 2.4040e+03, 9.4300e+02],
        [2.2920e+03, 9.2000e+02, 2.3140e+03, 9.4400e+02],
        [1.0000e+00, 7.8800e+02, 7.4000e+01, 8.2600e+02]]), 'labels': tensor([41,  6,  8, 59]), 'image_id': tensor([1227]), 'area': tensor([32756.0000,  4897.0000,   405.5000,  2089.5000]), 'iscrowd': tensor([0, 0, 0, 0])}, {'boxes': tensor([[ 723.,  688., 2048., 2532.]]), 'labels': tensor([44]), 'image_id': tensor([825]), 'area': tensor([1856797.]), 'iscrowd': tensor([0])}, {'boxes': tensor([[ 549.,  955., 1114., 1442.]]), 'labels': tensor([1]), 'image_id': tensor([1090]), 'area': tensor([164776.]), 'iscrowd': tensor([0])}]
[{'boxes': ten

[{'boxes': tensor([[ 664.,  619.,  903.,  889.],
        [1179.,  662., 1465.,  927.],
        [ 753.,  722.,  834.,  835.],
        [1318.,  677., 1383.,  775.]]), 'labels': tensor([11, 11, 51, 51]), 'image_id': tensor([33]), 'area': tensor([50284.0000, 54620.0000,  4381.0000,  3012.5000]), 'iscrowd': tensor([0, 0, 0, 0])}, {'boxes': tensor([[ 684., 1423., 1994., 2011.]]), 'labels': tensor([37]), 'image_id': tensor([529]), 'area': tensor([453026.5000]), 'iscrowd': tensor([0])}, {'boxes': tensor([[ 532., 1120., 1466., 1575.],
        [1419., 2199., 2272., 2845.]]), 'labels': tensor([37, 58]), 'image_id': tensor([270]), 'area': tensor([226412., 275552.]), 'iscrowd': tensor([0, 0])}, {'boxes': tensor([[1039., 1287., 2120., 2787.]]), 'labels': tensor([22]), 'image_id': tensor([373]), 'area': tensor([1092646.5000]), 'iscrowd': tensor([0])}]
[{'boxes': tensor([[ 864., 1320., 1963., 2318.],
        [1084., 3356., 1941., 4159.]]), 'labels': tensor([5, 5]), 'image_id': tensor([435]), 'area': t

[{'boxes': tensor([[1355.,  907., 1597., 1104.]]), 'labels': tensor([59]), 'image_id': tensor([738]), 'area': tensor([5903.]), 'iscrowd': tensor([0])}, {'boxes': tensor([[1033., 2507., 1786., 3231.]]), 'labels': tensor([40]), 'image_id': tensor([287]), 'area': tensor([271009.]), 'iscrowd': tensor([0])}, {'boxes': tensor([[ 330.,  769., 1490., 1879.],
        [ 732., 1191., 2154., 2574.],
        [ 848., 1983., 2124., 3138.],
        [ 941., 2269., 1690., 2887.],
        [ 806., 1591., 1632., 2083.],
        [  22., 1565., 1036., 3174.],
        [ 454., 1859.,  855., 2794.]]), 'labels': tensor([43, 40, 39, 34, 15, 19, 37]), 'image_id': tensor([689]), 'area': tensor([652220.0000, 961491.0000, 481815.5000, 290151.0000,  54788.5000,
        980115.0000, 213815.5000]), 'iscrowd': tensor([0, 0, 0, 0, 0, 0, 0])}, {'boxes': tensor([[ 942.,  630., 3183., 2836.]]), 'labels': tensor([46]), 'image_id': tensor([499]), 'area': tensor([3890205.5000]), 'iscrowd': tensor([0])}]
[{'boxes': tensor([[1756

[{'boxes': tensor([[ 958., 2139., 1624., 2635.]]), 'labels': tensor([37]), 'image_id': tensor([1418]), 'area': tensor([185691.]), 'iscrowd': tensor([0])}, {'boxes': tensor([[1409., 1558., 1547., 1686.],
        [1978., 1146., 2171., 1230.],
        [ 793., 2266.,  901., 2338.]]), 'labels': tensor([40, 37, 37]), 'image_id': tensor([1226]), 'area': tensor([8985.0000, 8792.5000, 4724.0000]), 'iscrowd': tensor([0, 0, 0])}, {'boxes': tensor([[1146., 2174., 1871., 2693.]]), 'labels': tensor([13]), 'image_id': tensor([319]), 'area': tensor([193975.]), 'iscrowd': tensor([0])}, {'boxes': tensor([[3.9800e+02, 4.1000e+02, 1.5320e+03, 1.4080e+03],
        [1.3360e+03, 4.1000e+02, 1.5320e+03, 6.2000e+02],
        [0.0000e+00, 1.0000e+00, 3.3800e+02, 6.6500e+02],
        [0.0000e+00, 3.6900e+02, 4.8700e+02, 6.8800e+02]]), 'labels': tensor([ 6,  8, 37, 46]), 'image_id': tensor([13]), 'area': tensor([437552.5000,  14820.0000, 159217.0000,  55172.0000]), 'iscrowd': tensor([0, 0, 0, 0])}]
[{'boxes': ten

[{'boxes': tensor([[ 800., 2520., 1196., 2671.],
        [1176., 2532., 1196., 2593.]]), 'labels': tensor([6, 8]), 'image_id': tensor([198]), 'area': tensor([40387.,   818.]), 'iscrowd': tensor([0, 0])}, {'boxes': tensor([[1429.5000, 2413.0000, 1620.5000, 2551.0000]]), 'labels': tensor([6]), 'image_id': tensor([113]), 'area': tensor([13775.5000]), 'iscrowd': tensor([0])}, {'boxes': tensor([[2107., 1414., 2344., 1597.],
        [1708.,   92., 1763.,  192.]]), 'labels': tensor([13, 37]), 'image_id': tensor([1002]), 'area': tensor([27579.,  3658.]), 'iscrowd': tensor([0, 0])}, {'boxes': tensor([[ 454., 1145., 1197., 1526.],
        [  25.,  444., 2114., 2564.]]), 'labels': tensor([50, 19]), 'image_id': tensor([787]), 'area': tensor([  67606.5000, 2084558.0000]), 'iscrowd': tensor([0, 0])}]
[{'boxes': tensor([[1944., 3468., 2387., 3840.],
        [ 987., 2341., 1369., 2747.],
        [1222., 2028., 1482., 2098.],
        [2334., 3619., 2431., 3756.]]), 'labels': tensor([15, 21, 37,  1]), '

[{'boxes': tensor([[1132.9500, 1990.8000, 1643.2500, 2734.2000]]), 'labels': tensor([40]), 'image_id': tensor([248]), 'area': tensor([343000.9688]), 'iscrowd': tensor([0])}, {'boxes': tensor([[ 912., 1235., 1318., 1879.],
        [1144., 1472., 1479., 1863.],
        [1510.,  730., 1634.,  907.],
        [1757., 2179., 1819., 2320.]]), 'labels': tensor([41, 41, 30, 59]), 'image_id': tensor([107]), 'area': tensor([128737.0000,  62131.5000,  14330.5000,   4889.0000]), 'iscrowd': tensor([0, 0, 0, 0])}, {'boxes': tensor([[ 623., 1655.,  978., 2052.]]), 'labels': tensor([37]), 'image_id': tensor([1445]), 'area': tensor([60376.]), 'iscrowd': tensor([0])}, {'boxes': tensor([[ 700., 1524.,  953., 1746.]]), 'labels': tensor([37]), 'image_id': tensor([169]), 'area': tensor([33598.5000]), 'iscrowd': tensor([0])}]
[{'boxes': tensor([[ 919., 1627., 1224., 1805.]]), 'labels': tensor([13]), 'image_id': tensor([1256]), 'area': tensor([46588.5000]), 'iscrowd': tensor([0])}, {'boxes': tensor([[ 475., 16

[{'boxes': tensor([[1517., 1255., 1639., 1386.]]), 'labels': tensor([8]), 'image_id': tensor([960]), 'area': tensor([12421.5000]), 'iscrowd': tensor([0])}, {'boxes': tensor([[ 951., 1516., 1188., 1672.],
        [2202., 1638., 2458., 1739.],
        [ 360., 1917.,  465., 1978.],
        [1316., 1943., 1425., 2041.]]), 'labels': tensor([ 6,  6, 59, 59]), 'image_id': tensor([1145]), 'area': tensor([20410.5000, 18159.0000,  3151.0000,  4456.5000]), 'iscrowd': tensor([0, 0, 0, 0])}, {'boxes': tensor([[1445.,  874., 1567., 1346.],
        [1546.,  182., 1633.,  366.]]), 'labels': tensor([56, 59]), 'image_id': tensor([1225]), 'area': tensor([7076.5000, 1796.5000]), 'iscrowd': tensor([0, 0])}, {'boxes': tensor([[ 836., 1331., 1564., 2112.],
        [   0., 2358.,  114., 2466.]]), 'labels': tensor([41, 59]), 'image_id': tensor([1129]), 'area': tensor([328952.,   6522.]), 'iscrowd': tensor([0, 0])}]
[{'boxes': tensor([[1329.,  964., 1935., 2687.],
        [1293.,  989., 2854., 1558.]]), 'labels

[{'boxes': tensor([[1852., 3080., 1947., 3170.],
        [1655., 1930., 1746., 2017.]]), 'labels': tensor([30,  8]), 'image_id': tensor([891]), 'area': tensor([4455.5000, 6130.5000]), 'iscrowd': tensor([0, 0])}, {'boxes': tensor([[ 879., 1411., 2422., 2332.]]), 'labels': tensor([37]), 'image_id': tensor([1167]), 'area': tensor([880609.]), 'iscrowd': tensor([0])}, {'boxes': tensor([[9.7529e+02, 3.4430e+03, 1.3473e+03, 4.0450e+03],
        [1.0480e+03, 9.7905e+02, 1.1950e+03, 1.0780e+03],
        [1.1960e+03, 3.0476e+00, 1.7990e+03, 5.7505e+02]]), 'labels': tensor([13,  8,  5]), 'image_id': tensor([427]), 'area': tensor([125548.5312,   8505.5000, 169470.5000]), 'iscrowd': tensor([0, 0, 0])}, {'boxes': tensor([[1.0000e+00, 2.1300e+02, 1.1430e+03, 3.7950e+03],
        [1.1260e+03, 1.3630e+03, 1.8220e+03, 2.5400e+03]]), 'labels': tensor([58, 18]), 'image_id': tensor([205]), 'area': tensor([2005818.5000,  623638.5000]), 'iscrowd': tensor([0, 0])}]
[{'boxes': tensor([[ 570., 1226., 2034., 250

[{'boxes': tensor([[1774.,  884., 2032., 1374.],
        [1778., 1338., 2105., 1631.],
        [3034., 2035., 3125., 2113.],
        [2732., 1889., 2806., 1950.],
        [2923., 1788., 2982., 1827.]]), 'labels': tensor([ 6, 58, 58, 58, 58]), 'image_id': tensor([1283]), 'area': tensor([69731.5000, 50302.0000,  4194.0000,  2972.5000,   981.0000]), 'iscrowd': tensor([0, 0, 0, 0, 0])}, {'boxes': tensor([[1302., 1804., 1665., 2216.]]), 'labels': tensor([40]), 'image_id': tensor([246]), 'area': tensor([57408.]), 'iscrowd': tensor([0])}, {'boxes': tensor([[1850.,  571., 2164., 1134.]]), 'labels': tensor([15]), 'image_id': tensor([988]), 'area': tensor([109736.5000]), 'iscrowd': tensor([0])}, {'boxes': tensor([[1169.,  286., 2664., 2206.]]), 'labels': tensor([40]), 'image_id': tensor([357]), 'area': tensor([1149914.5000]), 'iscrowd': tensor([0])}]
[{'boxes': tensor([[1324., 2115., 1829., 2445.]]), 'labels': tensor([40]), 'image_id': tensor([585]), 'area': tensor([45429.5000]), 'iscrowd': tens

[{'boxes': tensor([[ 661.,  438., 1305., 1141.],
        [1736., 1834., 1945., 2030.],
        [1717., 2027., 1929., 2355.],
        [1065., 1188., 1799., 1988.],
        [1833., 3042., 2674., 4137.]]), 'labels': tensor([22, 37, 37, 19, 41]), 'image_id': tensor([388]), 'area': tensor([229560.0000,  25236.5000,  33624.0000, 352105.5000, 674374.0000]), 'iscrowd': tensor([0, 0, 0, 0, 0])}, {'boxes': tensor([[ 981., 1718., 2433., 2895.],
        [2004., 2194., 2165., 2334.]]), 'labels': tensor([43, 58]), 'image_id': tensor([656]), 'area': tensor([1464271.0000,   13521.5000]), 'iscrowd': tensor([0, 0])}, {'boxes': tensor([[ 583., 1544.,  714., 1597.]]), 'labels': tensor([6]), 'image_id': tensor([124]), 'area': tensor([4624.]), 'iscrowd': tensor([0])}, {'boxes': tensor([[ 714., 1093.,  948., 1330.]]), 'labels': tensor([21]), 'image_id': tensor([17]), 'area': tensor([32486.]), 'iscrowd': tensor([0])}]
[{'boxes': tensor([[1040., 1226., 1157., 1343.],
        [ 963., 1282., 1093., 1329.]]), 'la

[{'boxes': tensor([[ 826.,  679., 2322., 1670.]]), 'labels': tensor([40]), 'image_id': tensor([871]), 'area': tensor([1399000.5000]), 'iscrowd': tensor([0])}, {'boxes': tensor([[ 374., 1542.,  643., 1882.],
        [2279., 1151., 2414., 1332.],
        [2941., 1158., 3744., 1793.]]), 'labels': tensor([58, 15, 58]), 'image_id': tensor([454]), 'area': tensor([ 69593.0000,  14383.5000, 183740.0000]), 'iscrowd': tensor([0, 0, 0])}, {'boxes': tensor([[ 433., 1314., 1129., 1926.]]), 'labels': tensor([37]), 'image_id': tensor([922]), 'area': tensor([190575.]), 'iscrowd': tensor([0])}, {'boxes': tensor([[ 928., 1567., 1975., 3040.]]), 'labels': tensor([37]), 'image_id': tensor([565]), 'area': tensor([929940.8125]), 'iscrowd': tensor([0])}]
[{'boxes': tensor([[6.6900e+02, 1.8000e+01, 1.2880e+03, 4.7500e+02],
        [2.0270e+03, 5.7500e+02, 2.3700e+03, 8.2500e+02],
        [1.0000e+00, 4.2200e+02, 2.9880e+03, 4.1020e+03]]), 'labels': tensor([34, 40, 41]), 'image_id': tensor([340]), 'area': tens

[{'boxes': tensor([[ 852., 1713., 2058., 2836.]]), 'labels': tensor([7]), 'image_id': tensor([540]), 'area': tensor([452806.5000]), 'iscrowd': tensor([0])}, {'boxes': tensor([[1145., 1472., 1373., 1603.],
        [2057.,  269., 2099.,  323.]]), 'labels': tensor([30,  8]), 'image_id': tensor([1297]), 'area': tensor([10483.5000,  1701.5000]), 'iscrowd': tensor([0, 0])}, {'boxes': tensor([[1567., 1375., 1829., 1497.],
        [ 841., 1834., 1267., 2149.],
        [1611., 1676., 1877., 1825.],
        [1320., 1558., 1359., 1577.],
        [1367., 1616., 1402., 1646.],
        [1527., 1610., 1659., 1639.],
        [1559., 1589., 1599., 1606.],
        [1504., 1390., 1586., 1485.]]), 'labels': tensor([ 5, 55, 40, 30, 30, 30, 30, 30]), 'image_id': tensor([962]), 'area': tensor([26115.5000, 73133.5000, 23010.0000,   331.0000,   603.5000,  2044.0000,
          472.5000,  2466.0000]), 'iscrowd': tensor([0, 0, 0, 0, 0, 0, 0, 0])}, {'boxes': tensor([[ 940.,  837., 1432., 1548.]]), 'labels': tensor

[{'boxes': tensor([[3050.8120,  322.0000, 3460.9805,  525.0000],
        [1862.6219, 2358.0000, 2501.5637, 2889.0000],
        [1135.0000,  494.0000, 1422.0000,  661.0000]]), 'labels': tensor([19, 19, 59]), 'image_id': tensor([240]), 'area': tensor([39447.1484, 74082.4844, 12864.0000]), 'iscrowd': tensor([0, 0, 0])}, {'boxes': tensor([[ 803., 2011., 1130., 2402.]]), 'labels': tensor([13]), 'image_id': tensor([1387]), 'area': tensor([78314.5000]), 'iscrowd': tensor([0])}, {'boxes': tensor([[1510., 2313., 1832., 2957.],
        [1524., 2315., 1601., 2403.]]), 'labels': tensor([6, 8]), 'image_id': tensor([1251]), 'area': tensor([107507.,   4007.]), 'iscrowd': tensor([0, 0])}, {'boxes': tensor([[1190.,  863., 1540., 2952.],
        [ 870., 1238., 1849., 3354.]]), 'labels': tensor([56, 37]), 'image_id': tensor([442]), 'area': tensor([ 154814.5000, 1453023.0000]), 'iscrowd': tensor([0, 0])}]
[{'boxes': tensor([[1751.,  916., 3274., 2400.]]), 'labels': tensor([40]), 'image_id': tensor([452]),

[{'boxes': tensor([[2088., 1079., 2404., 1535.],
        [2098., 1078., 2146., 1109.],
        [2550., 1593., 2631., 1632.]]), 'labels': tensor([ 6,  8, 40]), 'image_id': tensor([1369]), 'area': tensor([70617.5000,   664.5000,  1714.0000]), 'iscrowd': tensor([0, 0, 0])}, {'boxes': tensor([[1163.,  979., 1383., 1291.],
        [1986., 1447., 2067., 1516.],
        [1644., 1051., 1682., 1131.],
        [1684.,  400., 1786.,  491.]]), 'labels': tensor([37, 60, 60, 37]), 'image_id': tensor([1411]), 'area': tensor([49871.5000,  2692.0000,  1832.5000,  5276.0000]), 'iscrowd': tensor([0, 0, 0, 0])}, {'boxes': tensor([[ 857.,  565., 3148., 2022.]]), 'labels': tensor([37]), 'image_id': tensor([306]), 'area': tensor([1734631.5000]), 'iscrowd': tensor([0])}, {'boxes': tensor([[ 797.,  729., 2044., 2944.]]), 'labels': tensor([37]), 'image_id': tensor([265]), 'area': tensor([1470078.]), 'iscrowd': tensor([0])}]
[{'boxes': tensor([[  61.,  503., 2275., 2893.]]), 'labels': tensor([5]), 'image_id': te

[{'boxes': tensor([[1720., 1503., 1776., 1559.],
        [1751., 1434., 1807., 1491.],
        [1629., 1383., 1671., 1419.],
        [2313., 1157., 2345., 1212.],
        [ 624., 1432.,  683., 1467.],
        [ 187., 2311.,  211., 2387.],
        [1681., 1701., 1724., 1742.],
        [2519., 1294., 2550., 1363.],
        [1567., 1210., 1625., 1263.]]), 'labels': tensor([51,  9, 51, 60, 60, 60, 51, 59,  9]), 'image_id': tensor([1454]), 'area': tensor([1559.5000, 2051.0000,  975.5000,  861.5000, 1080.5000, 1491.5000,
        1281.5000, 1125.0000, 1879.0000]), 'iscrowd': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0])}, {'boxes': tensor([[ 626.0000,  955.5000, 3750.0000, 3095.0000]]), 'labels': tensor([40]), 'image_id': tensor([379]), 'area': tensor([3693554.7500]), 'iscrowd': tensor([0])}, {'boxes': tensor([[2165., 1445., 2272., 1547.]]), 'labels': tensor([8]), 'image_id': tensor([749]), 'area': tensor([8414.5000]), 'iscrowd': tensor([0])}, {'boxes': tensor([[ 582.,  949.,  810., 1165.]]), 'labels':

Instantiate the Model.\
Set epochs and resume training if continuing from a specified checkpoint.

In [7]:
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor

def get_model_instance_segmentation(num_classes):
    """Build a Faster R-CNN (ResNet-50 + FPN) detector with a new box-prediction head.

    Despite the function name, the model created here is torchvision's
    ``fasterrcnn_resnet50_fpn`` object detector; only its box predictor is
    swapped out so that it emits ``num_classes`` scores.

    Args:
        num_classes: number of outputs of the new head. The caller in this
            notebook passes the class count plus one for background.

    Returns:
        The pre-trained detector with ``roi_heads.box_predictor`` replaced by
        a fresh ``FastRCNNPredictor``.
    """
    # start from torchvision's pre-trained detector
    detector = torchvision.models.detection.fasterrcnn_resnet50_fpn(pretrained=True)

    # the replacement head must accept the same feature width as the stock classifier
    head_in_features = detector.roi_heads.box_predictor.cls_score.in_features
    detector.roi_heads.box_predictor = FastRCNNPredictor(head_in_features, num_classes)

    return detector
    
# 60 classes + 1 background
num_classes = 61
num_epochs = 5
model = get_model_instance_segmentation(num_classes)

# move model to the right device
model.to(device)

# only parameters that require gradients are handed to the optimizer
params = [p for p in model.parameters() if p.requires_grad]
optimizer = torch.optim.SGD(params, lr=0.005, momentum=0.9, weight_decay=0.0005)

# number of batches per epoch (used for progress reporting)
len_dataloader = len(data_loader)

# checkpoint to resume from; if the file is absent, training starts from the
# pre-trained weights instead of failing with FileNotFoundError
PATH = 'model_25_34.pt'

# defaults used when there is nothing to resume from
checkpoint = None
epoch = 0
loss = None

if os.path.isfile(PATH):
    # NOTE(review): torch.load unpickles the file -- only load checkpoints you trust
    checkpoint = torch.load(PATH, map_location=device)
    model.load_state_dict(checkpoint['model_state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    epoch = checkpoint['epoch']
    loss = checkpoint['loss']
    print(f'Resumed from checkpoint {PATH} (epoch {epoch}, loss {loss})')
else:
    print(f'Checkpoint {PATH} not found; starting from the pre-trained model')

# set model to training mode
model.train()

FasterRCNN(
  (transform): GeneralizedRCNNTransform(
      Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
      Resize(min_size=(800,), max_size=1333, mode='bilinear')
  )
  (backbone): BackboneWithFPN(
    (body): IntermediateLayerGetter(
      (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
      (bn1): FrozenBatchNorm2d(64, eps=0.0)
      (relu): ReLU(inplace=True)
      (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
      (layer1): Sequential(
        (0): Bottleneck(
          (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn1): FrozenBatchNorm2d(64, eps=0.0)
          (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
          (bn2): FrozenBatchNorm2d(64, eps=0.0)
          (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn3): FrozenBatchNorm2d(256, eps=0.0)
          (relu): ReLU(

Model Training\
Test run to confirm no runtime errors\
Interrupt to stop run

In [9]:
# Train for `num_epochs` epochs and write a checkpoint after every epoch.
for epoch in range(num_epochs):
    model.train()
    # running sum of per-batch losses, kept as a plain Python float
    total_loss = 0.0
    for i, (imgs, annotations) in enumerate(data_loader, start=1):
        # move the batch (list of images, list of target dicts) to the training device
        imgs = [img.to(device) for img in imgs]
        annotations = [{k: v.to(device) for k, v in t.items()} for t in annotations]

        # the model's output is treated as a dict of loss terms; they are summed
        # into one scalar to backpropagate
        loss_dict = model(imgs, annotations)
        losses = sum(loss for loss in loss_dict.values())

        optimizer.zero_grad()
        losses.backward()
        optimizer.step()

        # .item() converts the loss to a Python float. Accumulating the tensor
        # itself would keep every iteration's autograd graph alive for the
        # whole epoch and steadily grow memory usage.
        batch_loss = losses.item()
        print(f'Epoch: {epoch}, Iteration: {i}/{len_dataloader}, Loss: {batch_loss}')
        total_loss += batch_loss

    ave_loss = total_loss / len(data_loader)
    print(f'Epoch: {epoch}, Avg loss: {ave_loss}')

    # save the model after each epoch so progress survives a crash
    PATH = 'model_35_39_' + str(epoch) + '_.pt'
    torch.save({
            'epoch': epoch+1,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'loss': ave_loss,
            }, PATH)

Epoch: 0, Iteration: 1/300, Loss: 0.07377630472183228
Epoch: 0, Iteration: 2/300, Loss: 0.046626150608062744


KeyboardInterrupt: 

In [12]:
# Write the final training state to disk, using the same keys that the
# resume cell reads back ('epoch', 'model_state_dict', 'optimizer_state_dict', 'loss').
EPOCH = num_epochs
PATH = 'model_35_39.pt'
LOSS = 0.11  # hard-coded here rather than taken from the training loop

final_state = {
    'epoch': EPOCH,
    'model_state_dict': model.state_dict(),
    'optimizer_state_dict': optimizer.state_dict(),
    'loss': LOSS,
}
torch.save(final_state, PATH)