In [None]:
# Import required libraries

import pdal
import json
from glob import glob
import os
import h3
from shapely.geometry import Polygon
import geopandas as gpd
import pandas as pd

In [None]:
# Function definitions

def parse_coords(file_path, reader_type):
    """
    Reads a .las/.laz or ASCII xyz file using PDAL and returns arrays containing
    latitude and longitude values

    Parameters:
    file_path (str): Path to the data file
    reader_type (str): the PDAL reader appropriate for the file; use "las" or "text"

    Returns:
    tuple: (lats, lons) taken from the first array PDAL produces. For "las" these
        are the 'Y' and 'X' fields; for "text" the 'LATITUDE' and 'LONGITUDE' fields

    Raises:
    ValueError: if reader_type is not "las" or "text"
    """

    # Field names holding (latitude, longitude) for each supported reader.
    # NOTE(review): for "las" this presumes X/Y are stored as geographic
    # degrees rather than projected coordinates -- confirm against the data.
    coord_fields = {
        "las": ("Y", "X"),
        "text": ("LATITUDE", "LONGITUDE"),
    }

    # Fail fast: without this check an unsupported reader would run the whole
    # pipeline and only then die with an UnboundLocalError on the return line.
    if reader_type not in coord_fields:
        raise ValueError(
            f"unsupported reader_type {reader_type!r}; expected one of {sorted(coord_fields)}"
        )
    lat_field, lon_field = coord_fields[reader_type]

    # Define a single-stage PDAL pipeline that just reads the file
    pipeline = {
        "pipeline": [
            {
                "type": f"readers.{reader_type}",
                "filename": file_path
            }
        ]
    }

    # Create and execute the PDAL pipeline
    pipeline_manager = pdal.Pipeline(json.dumps(pipeline))
    pipeline_manager.execute()

    # The point data is the first array exposed by the pipeline
    points = pipeline_manager.arrays[0]

    return points[lat_field], points[lon_field]

def create_h3_hex(h3_res, lats, lons):
    """
    Creates polygon geometries for the H3 hexagons that contain the input points

    Parameters:
    h3_res (int): Desired H3 level (resolution)
    lats (array-like): latitude coordinates
    lons (array-like): longitude coordinates, paired element-wise with lats

    Returns:
    List of shapely Polygons, one per distinct H3 cell, ordered by H3 index
    """

    # Convert every point to its H3 index; the set collapses the many points
    # that fall inside the same cell
    h3_indices = {h3.geo_to_h3(lat, lon, h3_res) for lat, lon in zip(lats, lons)}

    # Iterate in sorted order: set iteration order is not stable between
    # interpreter runs, which would shuffle the records of the output file
    # every time the notebook is re-run.
    # geo_json=True requests the boundary in the GeoJSON-style form that is
    # handed straight to shapely's Polygon.
    h3_polygons = [
        Polygon(h3.h3_to_geo_boundary(h, geo_json=True))
        for h in sorted(h3_indices)
    ]

    return h3_polygons

def gdf2poly(h3_polys, surveyid, outfile):
    """
    Writes H3 polygons to a vector file and returns the GeoDataFrame that was written

    Parameters:
    h3_polys (list): polygon geometries, e.g. the output of create_h3_hex
    surveyid (str): value stored in the 'survey_id' column of every record
    outfile (str): Path of the vector file to write; missing parent
        directories are created

    Returns:
    GeoDataFrame with 'survey_id' and 'geometry' columns, CRS EPSG:4326
    """
    # The writer does not create folders, so make sure the target one exists.
    # dirname is '' for a bare filename, in which case there is nothing to create.
    out_dir = os.path.dirname(outfile)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    # Convert H3 polygons to a geodataframe and write it to disk
    gdf = gpd.GeoDataFrame({'survey_id': surveyid, 'geometry': h3_polys}, crs="EPSG:4326")
    gdf.to_file(outfile)

    # Callers bind the result (hexpoly = gdf2poly(...)), so hand back the frame
    return gdf
    
def json_extract_platformID_date(jsonfile):
    """
    Reads a metadata JSON file and returns its platform ID and submission time code

    Parameters:
    jsonfile (str): Path to the JSON metadata file

    Returns:
    tuple: (data['platform']['uniqueID'], data['submissionInfo']['timeCode'])

    Raises:
    KeyError: if any of those keys is missing from the file
    """
    # Read as UTF-8 explicitly: open() otherwise falls back to the locale's
    # default encoding, which can mis-decode non-ASCII metadata on some systems
    with open(jsonfile, 'r', encoding='utf-8') as f:
        data = json.load(f)

    platform_info = data['platform']
    temporal_info = data['submissionInfo']

    return platform_info['uniqueID'], temporal_info['timeCode']

### Processing 

In [None]:
# User definition of variables

# NOTE(review): absolute, machine-specific path -- point this at your own data folder
inpath = r"/mnt/c/Users/mike/glos_data/scratch"
# Outputs are written next to the inputs
outpath = inpath
# Survey name: used as the output filename prefix and as the survey_id attribute
survname = "test"
# Input files to process. Sorted because glob returns matches in arbitrary
# order, and a stable order keeps the processing log reproducible between runs
lof = sorted(glob(os.path.join(inpath, "*.xyz")))

In [None]:
# Iterate through list of files defined above and create H3 hexagon outputs

for f in lof:
    print(f"processing file {f}")
    lats, lons = parse_coords(f, "text")

    # Include the input file's name in every output name. Naming outputs by
    # survey and level alone made each input file overwrite the previous
    # file's results, leaving only the last file to be aggregated.
    file_stem = os.path.splitext(os.path.basename(f))[0]

    # One output per H3 level 6-10; names keep the "_h3-<level>.geojson"
    # ending that the aggregation step globs for
    for hex_level in range(6, 11):
        output = os.path.join(outpath, f"{survname}_{file_stem}_h3-{hex_level}.geojson")
        h3_hex = create_h3_hex(hex_level, lats, lons)
        gdf2poly(h3_hex, survname, output)
    

In [None]:
# Aggregate the H3 hexagon vectors of a survey that spans multiple files and remove duplicate records
# A final aggregated vector file is written to disk

for i in range(6, 11):
    # Only pick up the per-file H3 outputs ("..._h3-<level>.geojson"); a bare
    # "*<level>.geojson" pattern would also match any unrelated GeoJSON file
    # in the folder whose name happens to end in that digit
    gjson_lof = sorted(glob(os.path.join(outpath, f"*_h3-{i}.geojson")))

    # pd.concat raises on an empty list, so skip levels with nothing to merge
    if not gjson_lof:
        print(f"H3 level {i}: no files found in {outpath}, skipping")
        continue
    print(f"H3 level {i}: aggregating {len(gjson_lof)} file(s)")

    # Read each geojson file into a geodataframe
    gjson_gdf = [gpd.read_file(j) for j in gjson_lof]

    # Create aggregated geodataframe
    comb_gdf = gpd.GeoDataFrame(pd.concat(gjson_gdf, ignore_index=True))

    # Deduplicate based on all columns
    deduplicated_gdf = comb_gdf.drop_duplicates().reset_index(drop=True)

    # Write to disk as GeoJSON and shapefile; the level is zero-padded to two
    # digits (level06 ... level10)
    out_base = os.path.join(outpath, f"level{i:02d}_dedup")
    deduplicated_gdf.to_file(out_base + ".geojson")
    deduplicated_gdf.to_file(out_base + ".shp")

#### Optional processing: query CSB metadata and pull distinguishing attributes

In [None]:
# User definition of variables

# NOTE(review): absolute, machine-specific path -- point this at your own data folder
inpath = r"/mnt/c/Users/mike/glos_data/ncei_csb/datafiles/"

# Each survey lives in its own sub-folder of inpath. os.listdir also returns
# loose files and the "geojson" folder that the processing cell writes its
# results into, so keep only the survey folders; sorted for a stable order.
lod = sorted(
    d for d in os.listdir(inpath)
    if os.path.isdir(os.path.join(inpath, d)) and d != "geojson"
)

# Preview a few entries rather than dumping the whole list
lod[:5]

In [None]:
# The same survey name is used for every CSB record
survname = "IHO DCDB Crowd-Sourced Bathymetry"

# Results go into <inpath>/geojson; create it up front because writing a
# vector file into a missing folder fails
geojson_dir = os.path.join(inpath, "geojson")
os.makedirs(geojson_dir, exist_ok=True)

for i in lod:
    jsonf = os.path.join(inpath, i, "metadata.json")
    surveyf = os.path.join(inpath, i, "formatted", "data.xyz")

    # Entries of inpath that are not survey folders (for example the geojson
    # output folder itself) have no metadata file -- skip them instead of crashing
    if not os.path.isfile(jsonf):
        print(f"skipping {i}: no metadata.json found")
        continue

    # NOTE(review): the platform ID / time code are read but not yet attached
    # to the output records -- confirm how they are meant to be used
    metadata = json_extract_platformID_date(jsonf)

    # data.xyz is an ASCII file, so use the "text" reader; parse_coords
    # requires the reader type and raised a TypeError when it was omitted
    coords = parse_coords(surveyf, "text")

    for hex_level in range(6, 11):
        output = os.path.join(geojson_dir, f"{i}_h3-{hex_level}.geojson")
        h3_hex = create_h3_hex(hex_level, coords[0], coords[1])
        hexpoly = gdf2poly(h3_hex, survname, output)