<a href="https://colab.research.google.com/github/vidurp/notebooks/blob/main/PASCAL_VOC_Annotation_Parsing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Parse PASCAL Annotation Format v1.0

Notebook that implements code to parse annotation files for the PASCAL VOC dataset.

http://host.robots.ox.ac.uk/pascal/VOC/

In [101]:
# Example annotation text file in PASCAL Annotation v1.00 format.
# It is kept as an in-notebook string and printed so the layout of the
# fields (filename, image size, label, bounding box) can be inspected.

String = '''# PASCAL Annotation Version 1.00

Image filename : "VOC2005_1/PNGImages/ETHZ_motorbike-testset/motorbikes005.png"
Image size (X x Y x C) : 640 x 480 x 3
Database : "The VOC2005 Dataset 1 Database (ETHZ)"
Objects with ground truth : 1 { "PASmotorbikeSide" }

# Note that there might be other objects in the image
# for which ground truth data has not been provided.

# Top left pixel co-ordinates : (1, 1)

# Details for object 1 ("PASmotorbikeSide")
Original label for object 1 "PASmotorbikeSide" : "motorbikeSide"
Bounding box for object 1 "PASmotorbikeSide" (Xmin, Ymin) - (Xmax, Ymax) : (206, 242) - (427, 365)
'''
print(String)

# PASCAL Annotation Version 1.00

Image filename : "VOC2005_1/PNGImages/ETHZ_motorbike-testset/motorbikes005.png"
Image size (X x Y x C) : 640 x 480 x 3
Database : "The VOC2005 Dataset 1 Database (ETHZ)"
Objects with ground truth : 1 { "PASmotorbikeSide" }

# Note that there might be other objects in the image
# for which ground truth data has not been provided.

# Top left pixel co-ordinates : (1, 1)

# Details for object 1 ("PASmotorbikeSide")
Original label for object 1 "PASmotorbikeSide" : "motorbikeSide"
Bounding box for object 1 "PASmotorbikeSide" (Xmin, Ymin) - (Xmax, Ymax) : (206, 242) - (427, 365)



In [102]:
import re

def ParsePascalString( str ):
    """
    Parses the text of one PASCAL Annotation v1.00 file into a flat dictionary.

    Every line is split at its first ':' into a field description and a value.
    Three kinds of field are read:
      'filename' in the description -> 'FilePath' (first quoted string in the value)
      'Original' in the description -> 'class'    (first quoted string in the value)
      'Bounding' in the description -> 'xmin', 'ymin', 'xmax', 'ymax'
                                       (first four integers in the value)

    If the text describes several objects, only the last label and the last
    bounding box are kept.

    Args:
      str - full text of one annotation file (the name shadows the builtin but
            is kept so that existing keyword callers keep working)

    Returns:
      dict with keys 'FilePath', 'class', 'xmin', 'ymin', 'xmax', 'ymax'.
      The coordinates are returned as digit strings, exactly as written in
      the annotation text.

    Raises:
      ValueError - if the filename, the label or the bounding box is missing
    """
    QuotedPattern = r'(?<=["\']).*?(?=["\'])'
    FileName = None
    Label = None
    bbox = None

    for line in str.split('\n'):
        # Split on the first ':' only, so a value that itself contains a colon
        # (for example a drive letter in a path) is not truncated.
        Field, Separator, Value = line.partition(':')
        if not Separator:
            # Blank lines and comments without a value carry no data.
            continue

        # Match the keyword against the field description rather than the
        # whole line, so text inside a value cannot be mistaken for a field.
        if 'filename' in Field:
            Quoted = re.findall(QuotedPattern, Value)
            if Quoted:
                FileName = Quoted[0]
        elif 'Original' in Field:
            Quoted = re.findall(QuotedPattern, Value)
            if Quoted:
                Label = Quoted[0]
        elif 'Bounding' in Field:
            Numbers = re.findall(r'\d+', Value)
            if len(Numbers) >= 4:
                bbox = Numbers[:4]

    Missing = [ Name
                for Name, Found in ( ('filename', FileName),
                                     ('label', Label),
                                     ('bounding box', bbox) )
                if Found is None ]
    if Missing:
        raise ValueError('PASCAL annotation is missing: ' + ', '.join(Missing))

    Dict = {
        'FilePath' : FileName,
        'class' : Label,
        'xmin': bbox[0],
        'ymin': bbox[1],
        'xmax': bbox[2],
        'ymax': bbox[3]
    }
    return ( Dict )

In [103]:

# Parse the example annotation defined above and show the resulting dictionary.
print(ParsePascalString(String))

{'FilePath': 'VOC2005_1/PNGImages/ETHZ_motorbike-testset/motorbikes005.png', 'class': 'motorbikeSide', 'xmin': '206', 'ymin': '242', 'xmax': '427', 'ymax': '365'}


# Get the dataset

In [53]:
!wget http://host.robots.ox.ac.uk/pascal/VOC/download/voc2005_1.tar.gz

--2025-01-12 01:43:38--  http://host.robots.ox.ac.uk/pascal/VOC/download/voc2005_1.tar.gz
Resolving host.robots.ox.ac.uk (host.robots.ox.ac.uk)... 129.67.94.152
Connecting to host.robots.ox.ac.uk (host.robots.ox.ac.uk)|129.67.94.152|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 487614356 (465M) [application/x-gzip]
Saving to: ‘voc2005_1.tar.gz’


2025-01-12 01:43:56 (25.7 MB/s) - ‘voc2005_1.tar.gz’ saved [487614356/487614356]



In [54]:
import tarfile
with tarfile.open('voc2005_1.tar.gz', 'r:gz') as tar:
    # The archive was downloaded over plain HTTP, so treat it as untrusted:
    # the 'data' extraction filter rejects members that would be written
    # outside the destination directory. Older Python versions without
    # extraction filters fall back to the plain call.
    if hasattr(tarfile, 'data_filter'):
        tar.extractall(path='.', filter='data')
    else:
        tar.extractall(path='.')


In [104]:
import json
import os
def CreateJSONFromPascalDataSet( RootFilePath, JSONFileName ):
    """
    Creates a JSON File from a Tree Structure of PASCAL VOC image data
    PASCAL - Pattern Analysis, Statistical Modeling & Computational Learning
             built by University of Oxford. The root directory is expected in the
             following format
              VOC
              +->Annotations
                    +->Class1
                    +->Class2
              +->GTMasks
                    +->Class1
                    +->Class2
              +->PNGImages
                    +->Class1
                    +->Class2

    Every file found under RootFilePath is read as text and passed to
    ParsePascalString. The parsed dictionaries are stored under consecutive
    integer indices starting at 0 (written as string keys by the json module).
    Directories and files are visited in sorted order so the index given to
    each annotation is the same on every run.

    Args:
      RootFilePath - directory tree to walk for annotation files
      JSONFileName - JSON File to save

    Returns:
       None
    """
    JsonData = {}
    Idx = 0
    for Root, Dirs, Files in os.walk( RootFilePath ):
        # Sorting Dirs in place steers the order in which os.walk descends.
        Dirs.sort()
        for File in sorted( Files ):
            # The with-statement closes the file; no explicit close() needed.
            with open( os.path.join( Root, File ), 'r' ) as TextFile:
                Text = TextFile.read()
            JsonData[ Idx ] = ParsePascalString( Text )
            Idx = Idx + 1

    # Open the output only after every annotation has been parsed, so that a
    # failure above does not leave an empty or truncated JSON file behind.
    with open( JSONFileName, 'w' ) as outfile:
        json.dump( JsonData, outfile )




In [100]:
# The archive is extracted into the current directory, so use a path relative
# to it instead of a hard-coded absolute Colab location.
CreateJSONFromPascalDataSet('VOC2005_1/Annotations','test.json')