In [2]:
import xml.etree.ElementTree as ET

# Parse the metadata XML from disk and keep a handle on its root element
tree = ET.parse('CREMP_OCTO_RawData_Table.xml')
root = tree.getroot()

# Show every direct child of the root as "<tag> <attribute dict>"
for child in root:
    print(f"{child.tag} {child.attrib}")

idinfo {}
dataqual {}
eainfo {}
distinfo {}
metainfo {}


In [4]:
# Two-level overview: each top-level section followed by its direct children.
for child in root:
    # Collect the section's lines first, then emit them in one print call.
    report = [f"Tag: {child.tag}"]
    for elem in child:
        report.append(f"  Subtag: {elem.tag}, Text: {elem.text}")
    print("\n".join(report))

Tag: idinfo
  Subtag: citation, Text: 

  Subtag: descript, Text: 

  Subtag: timeperd, Text: 

  Subtag: status, Text: 

  Subtag: spdom, Text: 

  Subtag: keywords, Text: 

  Subtag: accconst, Text: Available without restriction
  Subtag: useconst, Text: FWC-FWRI must be credited. This is not a survey data set and should not be utilized as such. These data are not to be used for navigation.
  Subtag: ptcontac, Text: 

  Subtag: datacred, Text: Florida Fish and Wildlife Conservation Commission - Fish and Wildlife Research Institute and the CREMP Team and Collaborators. The Coral Reef Evaluation and Monitoring Project (CREMP) in the Florida Keys is managed by the Corals Research Program and the Fish and Wildlife Research Institute and funded by the Environmental Protection Agency as part of the Florida Keys National Marine Sanctuary Water Quality Protection Program.
  Subtag: secinfo, Text: 

  Subtag: native, Text: ESRI ArcGIS 10.0.5.4400
Tag: dataqual
  Subtag: logic, Text: These dat

In [6]:
def print_element(element, level=0):
    """Print `element` and all of its descendants, one line per element.

    Each line has the form ``Tag: <tag>, Text: <text>`` and is indented by two
    spaces per nesting level. The text is stripped of surrounding whitespace;
    an element whose text is None or empty is shown with the literal ``None``.

    Args:
        element: Element to print; iterating it must yield its child elements.
        level (int): Nesting depth used for indentation (0 for the start node).
    """
    raw = element.text
    if raw:
        shown = raw.strip()
    else:
        shown = 'None'
    pad = level * "  "
    print(f"{pad}Tag: {element.tag}, Text: {shown}")

    # Children are printed depth-first, one level deeper than this element.
    deeper = level + 1
    for subelement in element:
        print_element(subelement, deeper)

# Dump the whole document, starting at the root with no indentation
print_element(root, 0)

Tag: metadata, Text: 
  Tag: idinfo, Text: 
    Tag: citation, Text: 
      Tag: citeinfo, Text: 
        Tag: origin, Text: Florida Fish and Wildlife Conservation Commission-Fish and Wildlife Research Institute
        Tag: pubdate, Text: 20230719
        Tag: title, Text: CREMP OCTO Colonies Table
        Tag: geoform, Text: vector digital data
        Tag: pubinfo, Text: 
          Tag: pubplace, Text: St. Petersburg, FL
          Tag: publish, Text: Florida Fish and Wildlife Conservation Commission-Fish and Wildlife Research Institute
        Tag: onlink, Text: https://myfwc.com/research/habitat/coral/cremp/
        Tag: onlink, Text: https://geodata.myfwc.com
    Tag: descript, Text: 
      Tag: abstract, Text: The primary goal of the Coral Reef Evaluation and Monitoring Project (CREMP) is to measure the status and trends of these communities to assist managers in understanding, protecting, and restoring the living marine resources of the Florida Keys National Marine Sanctuary. Da

In [8]:
# Navigate to the detailed attribute section
detailed = root.find('.//detailed')

# List to hold all attributes as (label, definition) tuples
attributes = []

# find() returns None when the document has no <detailed> element; fall back to
# an empty list so this cell reports zero columns instead of raising
# AttributeError on None.findall.
attr_nodes = detailed.findall('attr') if detailed is not None else []

# Loop over each attribute
for attr in attr_nodes:
    # Look each child up once; a missing child yields None for that field.
    label_node = attr.find('attrlabl')
    definition_node = attr.find('attrdef')
    attr_name = label_node.text if label_node is not None else None
    attr_description = definition_node.text if definition_node is not None else None
    attributes.append((attr_name, attr_description))

# Print all extracted columns
for name, description in attributes:
    print(f"Column: {name}\nDescription: {description}\n")


Column: OBJECTID
Description: Internal feature number.

Column: Year
Description: Sample Year

Column: Date
Description: None

Column: Subregion
Description: CREMP monitoring is separated into three project regions with different funding sources.  Each project region has designated subregions

Column: Habitat
Description: CREMP monitoring is separated in several habitat areas.  These are different depending on the project region.

Column: SiteID
Description: A unique site identifier for all CREMP sites.  These values do not overlap between project regions.

Column: Site_name
Description: A site name.  Some of these are assigned by FWRI, whereas others, particularly in the Florida Keys and Dry Tortugas, are official site names found on charts.

Column: StationID
Description: A unique identifier for all CREMP stations.  CREMP follows a nested design where each site has 2-4 survey stations.  This value is the Siteid plus the station number for that site.

Column: SPP_Code
Description: A f

In [10]:
# --- Extract Temporal Range ---
# Begin and end dates are looked up in that order and unpacked together.
start_date, end_date = map(
    root.findtext,
    (
        './/timeperd/timeinfo/rngdates/begdate',
        './/timeperd/timeinfo/rngdates/enddate',
    ),
)

print(f"Temporal Range: {start_date} to {end_date}")

Temporal Range: 19960101 to 20210101


In [12]:
# --- Extract Geographic Bounding Box ---
# One lookup per edge, unpacked in west/east/north/south order.
west, east, north, south = map(
    root.findtext,
    (
        './/spdom/bounding/westbc',
        './/spdom/bounding/eastbc',
        './/spdom/bounding/northbc',
        './/spdom/bounding/southbc',
    ),
)

print(f"Geographic Bounds:\n  West: {west}, East: {east}, North: {north}, South: {south}")

Geographic Bounds:
  West: -83.004273, East: -80.000655, North: 27.181928, South: 24.411863


In [14]:
# --- Extract Data Quality Information ---
# Paths for the logical-consistency and completeness statements, in that order.
dataqual_paths = ('.//dataqual/logic', './/dataqual/complete')
logic_consistency, completeness = (root.findtext(path) for path in dataqual_paths)

print(f"Data Quality:\n  Logic Consistency: {logic_consistency}\n  Completeness: {completeness}")

Data Quality:
  Logic Consistency: These data appear to be logically consistent
  Completeness: These data appear to be complete through the survey period.


In [16]:
# --- Extract Access Constraints ---
# Access constraint first, then use constraint; each is None if the tag is absent.
access_constraint, usage_constraint = map(root.findtext, ('.//accconst', './/useconst'))

print(f"Access:\n  {access_constraint}\n  {usage_constraint}")

Access:
  Available without restriction
  FWC-FWRI must be credited. This is not a survey data set and should not be utilized as such. These data are not to be used for navigation.


In [18]:
# --- Extract Download Links ---
# Gather the text of every <onlink> element, in document order.
download_links = []
for onlink in root.iterfind('.//onlink'):
    download_links.append(onlink.text)

print("Download Links:")
for link in download_links:
    print(f"  {link}")

Download Links:
  https://myfwc.com/research/habitat/coral/cremp/
  https://geodata.myfwc.com


In [20]:
# --- Extract Contact Information ---
# E-mail and voice number of the point of contact; None when a tag is missing.
email, phone = map(
    root.findtext,
    ('.//ptcontac/cntinfo/cntemail', './/ptcontac/cntinfo/cntvoice'),
)

print(f"Contact:\n  Email: {email}\n  Phone: {phone}")

Contact:
  Email: GISLibrarian@MyFWC.com
  Phone: 727-896-8626


In [22]:
from lxml import etree

def parse_cremp_metadata(xml_file):
    """
    Parses the CREMP OCTO RawData XML metadata into a structured Python dictionary.

    Every text value in the result is converted to a plain built-in ``str``.
    lxml returns "smart string" objects for XPath text results; those keep a
    reference to the element they came from and would keep the parsed tree
    alive for as long as the returned dictionary exists.

    Args:
        xml_file (str): Path to the XML file.

    Returns:
        dict: Dictionary with these keys:
            'Columns': list of {'Column': ..., 'Description': ...} dicts, one
                per ``detailed/attr`` element; a missing label or definition
                is None.
            'TemporalRange': {'StartDate', 'EndDate'}.
            'BoundingBox': {'West', 'East', 'North', 'South'}.
            'DataQuality': {'LogicConsistency', 'Completeness'}.
            'Access': {'AccessConstraint', 'UseConstraint'}.
            'DownloadLinks': list of the text of every ``onlink`` element.
            'ContactInfo': {'Email', 'Phone'}.
            The dict-valued sections are read with XPath ``string()``, so an
            element that is absent yields '' rather than None.
    """
    root = etree.parse(xml_file).getroot()

    def first_text(node, path):
        """Return the first XPath match under `node` as a plain str, or None."""
        matches = node.xpath(path)
        return str(matches[0]) if matches else None

    def string_values(paths):
        """Map each key to the XPath string() value of its path under the root."""
        return {key: str(root.xpath(f'string({path})')) for key, path in paths.items()}

    # --- Extract columns and descriptions ---
    columns = [
        {
            'Column': first_text(attr, './attrlabl/text()'),
            'Description': first_text(attr, './attrdef/text()'),
        }
        for attr in root.xpath('.//detailed/attr')
    ]

    # --- Extract Temporal Range ---
    temporal_range = string_values({
        'StartDate': './/timeperd/timeinfo/rngdates/begdate',
        'EndDate': './/timeperd/timeinfo/rngdates/enddate',
    })

    # --- Extract Geographic Bounding Box ---
    bounding_box = string_values({
        'West': './/spdom/bounding/westbc',
        'East': './/spdom/bounding/eastbc',
        'North': './/spdom/bounding/northbc',
        'South': './/spdom/bounding/southbc',
    })

    # --- Extract Data Quality ---
    data_quality = string_values({
        'LogicConsistency': './/dataqual/logic',
        'Completeness': './/dataqual/complete',
    })

    # --- Extract Access Information ---
    access = string_values({
        'AccessConstraint': './/accconst',
        'UseConstraint': './/useconst',
    })

    # --- Extract Download Links ---
    # str() detaches each link from the tree (see the docstring).
    download_links = [str(link) for link in root.xpath('.//onlink/text()')]

    # --- Extract Contact Information ---
    contact_info = string_values({
        'Email': './/ptcontac/cntinfo/cntemail',
        'Phone': './/ptcontac/cntinfo/cntvoice',
    })

    # --- Bundle everything into a dictionary ---
    return {
        'Columns': columns,
        'TemporalRange': temporal_range,
        'BoundingBox': bounding_box,
        'DataQuality': data_quality,
        'Access': access,
        'DownloadLinks': download_links,
        'ContactInfo': contact_info,
    }

In [24]:
# Build the metadata dictionary from the XML file
metadata = parse_cremp_metadata('CREMP_OCTO_RawData_Table.xml')

# Print each section in turn: columns with descriptions, temporal range,
# bounding box, data quality, access constraints, download links, contact info.
for section in (
    'Columns',
    'TemporalRange',
    'BoundingBox',
    'DataQuality',
    'Access',
    'DownloadLinks',
    'ContactInfo',
):
    print(metadata[section])

[{'Column': 'OBJECTID', 'Description': 'Internal feature number.'}, {'Column': 'Year', 'Description': 'Sample Year'}, {'Column': 'Date', 'Description': None}, {'Column': 'Subregion', 'Description': 'CREMP monitoring is separated into three project regions with different funding sources.  Each project region has designated subregions'}, {'Column': 'Habitat', 'Description': 'CREMP monitoring is separated in several habitat areas.  These are different depending on the project region.'}, {'Column': 'SiteID', 'Description': 'A unique site identifier for all CREMP sites.  These values do not overlap between project regions.'}, {'Column': 'Site_name', 'Description': 'A site name.  Some of these are assigned by FWRI, whereas others, particularly in the Florida Keys and Dry Tortugas, are official site names found on charts.'}, {'Column': 'StationID', 'Description': 'A unique identifier for all CREMP stations.  CREMP follows a nested design where each site has 2-4 survey stations.  This value is