# 2. LAZ to Raster Processing - Sierra Nevada Giant Forest

This notebook processes LAZ point cloud files into raster products (DEM, DSM, CHM) using PDAL for rasterization, rioxarray for the CHM raster math, and GDAL for merging and COG conversion.

**Input:** LAZ files downloaded in `1_download_data.ipynb`

**Output:**
- DEM (Digital Elevation Model) - Ground surface
- DSM (Digital Surface Model) - Top of canopy
- CHM (Canopy Height Model) = DSM - DEM
- Cloud-Optimized GeoTIFFs (COG)

## Setup

In [None]:
import os
import json
import subprocess
from pathlib import Path
from datetime import datetime
import numpy as np

# Check for required libraries
try:
    import pdal
    print(f"PDAL version: {pdal.info.version}")
except ImportError:
    print("PDAL not installed. Install with: conda install -c conda-forge pdal python-pdal")

try:
    import rioxarray as rxr
    print("rioxarray available")
except ImportError:
    print("rioxarray not installed. Install with: conda install -c conda-forge rioxarray")

try:
    from osgeo import gdal
    print(f"GDAL version: {gdal.VersionInfo()}")
except ImportError:
    print("GDAL not installed.")

In [None]:
# Directory layout: everything hangs off DATA_ROOT in the user's home.
DATA_ROOT = Path.home().joinpath("data-store", "data", "output", "sierra-nevada")
RAW_DIR = DATA_ROOT / "raw"
LAZ_DIR = RAW_DIR / "laz"
PROCESSED_DIR = DATA_ROOT / "processed"
DEM_DIR, DSM_DIR, CHM_DIR, COG_DIR = (
    PROCESSED_DIR / product for product in ("dem", "dsm", "chm", "cog")
)

# Create every output directory up front (no-op when already present)
for out_dir in (DEM_DIR, DSM_DIR, CHM_DIR, COG_DIR):
    out_dir.mkdir(parents=True, exist_ok=True)

# Processing parameters
RESOLUTION = 0.5  # meters (matches BCI for comparison)
CRS = "EPSG:32611"  # UTM Zone 11N

# Echo the configuration so the notebook output records what was used
for label, location in (
    ("LAZ input", LAZ_DIR),
    ("DEM output", DEM_DIR),
    ("DSM output", DSM_DIR),
    ("CHM output", CHM_DIR),
):
    print(f"{label}: {location}")
print(f"Resolution: {RESOLUTION}m")

## Inventory LAZ Files

In [None]:
# Gather the LAZ tiles in name order
laz_files = sorted(LAZ_DIR.glob('*.laz'))
print(f"Found {len(laz_files)} LAZ files")

# Add up the on-disk size of every tile
total_size = 0
for tile in laz_files:
    total_size += tile.stat().st_size
print(f"Total size: {total_size / (1024**3):.2f} GB")

# Show at most the first ten tiles, then a count of the rest
if not laz_files:
    print("\n⚠ No LAZ files found. Run 1_download_data.ipynb first.")
else:
    print("\nLAZ files:")
    preview = laz_files[:10]
    for tile in preview:
        size_mb = tile.stat().st_size / (1024**2)
        print(f"  {tile.name}: {size_mb:.1f} MB")
    hidden = len(laz_files) - len(preview)
    if hidden > 0:
        print(f"  ... and {hidden} more")

## Inspect LAZ Metadata

Examine one LAZ file to understand the point cloud structure.

In [None]:
def get_laz_info(laz_path):
    """
    Return the PDAL pipeline metadata for a single LAZ file.

    Builds a one-stage pipeline containing only the file path, executes
    it, and returns ``pipeline.metadata``.

    Note: ``execute()`` runs the whole pipeline, so this is not a
    header-only read. For a quick look at a large file, prefer
    ``pdal info --summary`` as used in the inspection cell.
    """
    spec = {"pipeline": [str(laz_path)]}
    reader = pdal.Pipeline(json.dumps(spec))

    # Metadata is only available once the pipeline has been executed
    reader.execute()
    return reader.metadata

if laz_files:
    sample_file = laz_files[0]
    print(f"Inspecting: {sample_file.name}\n")

    try:
        # Quick stats using the `pdal info` command-line tool
        result = subprocess.run(
            ['pdal', 'info', '--summary', str(sample_file)],
            capture_output=True,
            text=True
        )
        if result.returncode == 0:
            info = json.loads(result.stdout)
            summary = info.get('summary', {})

            # Only apply the thousands separator to a real number: the ','
            # format spec raises ValueError on the 'N/A' placeholder string.
            num_points = summary.get('num_points')
            if isinstance(num_points, (int, float)):
                print(f"Point count: {num_points:,}")
            else:
                print("Point count: N/A")

            bounds = summary.get('bounds', {})
            if bounds:
                print("\nBounds:")
                print(f"  X: {bounds.get('minx', 0):.2f} to {bounds.get('maxx', 0):.2f}")
                print(f"  Y: {bounds.get('miny', 0):.2f} to {bounds.get('maxy', 0):.2f}")
                print(f"  Z: {bounds.get('minz', 0):.2f} to {bounds.get('maxz', 0):.2f}")

            srs = summary.get('srs', {})
            if srs:
                # WKT strings are long; show only the first 100 characters
                print(f"\nCRS: {srs.get('wkt', 'Unknown')[:100]}...")
        else:
            print(f"Error: {result.stderr}")
    except Exception as e:
        # Covers a missing `pdal` executable and unparsable JSON output
        print(f"Error inspecting file: {e}")

In [None]:
# Get classification info

# Labels for the LAS classification codes this notebook reports by name;
# any other code is printed as "Class <n>".
LAS_CLASS_NAMES = {
    0: "Never classified",
    1: "Unassigned",
    2: "Ground",
    3: "Low Vegetation",
    4: "Medium Vegetation",
    5: "High Vegetation",
    6: "Building",
    7: "Low Point (noise)",
    9: "Water",
    17: "Bridge Deck",
    18: "High Noise"
}

if laz_files:
    sample_file = laz_files[0]

    try:
        # `--enumerate Classification` is what makes PDAL emit the per-value
        # "counts" list; plain `--stats` only reports min/max/mean etc.
        result = subprocess.run(
            ['pdal', 'info', '--stats', '--enumerate', 'Classification',
             str(sample_file)],
            capture_output=True,
            text=True
        )
        if result.returncode != 0:
            print(f"Error: {result.stderr}")
        else:
            info = json.loads(result.stdout)
            stats = info.get('stats', {}).get('statistic', [])
            # Defensive: tolerate a single entry serialised as a bare object
            if isinstance(stats, dict):
                stats = [stats]

            # Find Classification stats
            class_stat = next(
                (s for s in stats if s.get('name') == 'Classification'),
                None
            )
            if class_stat is None:
                print("No Classification statistics reported.")
            else:
                print("Classification statistics:")
                counts = class_stat.get('counts', [])
                if isinstance(counts, str):
                    counts = [counts]
                if not counts:
                    print("  (no per-class counts reported)")
                for count_str in counts:
                    # Format: "class_value/count", e.g. "2.000000/12345"
                    parts = count_str.split('/')
                    if len(parts) != 2:
                        continue
                    class_val = int(float(parts[0]))
                    count = int(parts[1])
                    class_name = LAS_CLASS_NAMES.get(class_val, f"Class {class_val}")
                    print(f"  {class_val}: {class_name}: {count:,}")
    except Exception as e:
        print(f"Error: {e}")

## Process LAZ to DEM (Ground Surface)

Extract ground-classified points and create a Digital Elevation Model.

In [None]:
def process_laz_to_dem(laz_path, output_path, resolution=0.5):
    """
    Process a LAZ file to DEM using ground-classified points.

    Uses PDAL pipeline:
    1. Read LAZ file
    2. Filter to ground points (classification 2)
    3. Flag statistical outliers, then drop the flagged points
    4. Write to GeoTIFF using IDW interpolation

    Args:
        laz_path: Path to the input LAZ file.
        output_path: Path of the GeoTIFF to write.
        resolution: Output cell size, in the units of the point cloud.

    Returns:
        The point count returned by ``pipeline.execute()``.
    """
    stages = [
        {
            "type": "readers.las",
            "filename": str(laz_path)
        },
        {
            "type": "filters.range",
            "limits": "Classification[2:2]"  # Ground only
        },
        {
            # filters.outlier does not remove anything: it re-labels
            # outliers as Classification 7 (noise).
            "type": "filters.outlier",
            "method": "statistical",
            "mean_k": 12,
            "multiplier": 2.2
        },
        {
            # Drop the points just flagged as noise so they cannot leak
            # into the interpolated surface.
            "type": "filters.range",
            "limits": "Classification![7:7]"
        },
        {
            "type": "writers.gdal",
            "filename": str(output_path),
            "gdaldriver": "GTiff",
            "output_type": "idw",  # Inverse distance weighting
            "resolution": resolution,
            "data_type": "float32",
            "gdalopts": "COMPRESS=DEFLATE,TILED=YES"
        }
    ]

    pipeline = pdal.Pipeline(json.dumps({"pipeline": stages}))
    return pipeline.execute()

# Smoke-test the DEM pipeline on the first tile only
if laz_files:
    test_laz = laz_files[0]
    test_dem = DEM_DIR.joinpath(test_laz.stem + "_dem.tif")

    print(f"Processing: {test_laz.name}")
    print(f"Output: {test_dem}")

    try:
        count = process_laz_to_dem(test_laz, test_dem, RESOLUTION)
        print(f"\n✓ Processed {count:,} ground points")

        # Report the size of whatever was written
        if test_dem.exists():
            n_bytes = test_dem.stat().st_size
            print(f"Output size: {n_bytes / (1024**2):.1f} MB")
    except Exception as e:
        print(f"Error: {e}")

## Process LAZ to DSM (Top of Canopy)

Extract first returns to create a Digital Surface Model.

In [None]:
def process_laz_to_dsm(laz_path, output_path, resolution=0.5):
    """
    Process a LAZ file to DSM using first returns.

    Uses PDAL pipeline:
    1. Read LAZ file
    2. Filter to first returns
    3. Flag statistical outliers, then drop noise-classified points
    4. Write to GeoTIFF using max value (highest point per cell)

    Args:
        laz_path: Path to the input LAZ file.
        output_path: Path of the GeoTIFF to write.
        resolution: Output cell size, in the units of the point cloud.

    Returns:
        The point count returned by ``pipeline.execute()``.
    """
    stages = [
        {
            "type": "readers.las",
            "filename": str(laz_path)
        },
        {
            "type": "filters.returns",
            "groups": "first,only"  # First and only returns
        },
        {
            # filters.outlier does not remove anything: it re-labels
            # outliers as Classification 7 (noise).
            "type": "filters.outlier",
            "method": "statistical",
            "mean_k": 12,
            "multiplier": 2.2
        },
        {
            # Drop class 7: the outliers flagged above plus any low-noise
            # points already classified in the source file.
            "type": "filters.range",
            "limits": "Classification![7:7]"
        },
        {
            # Drop class 18 (high noise) as well; a "max" surface is
            # especially sensitive to spurious high points. Kept as a
            # separate stage so the two exclusions are applied in sequence.
            "type": "filters.range",
            "limits": "Classification![18:18]"
        },
        {
            "type": "writers.gdal",
            "filename": str(output_path),
            "gdaldriver": "GTiff",
            "output_type": "max",  # Maximum height per cell
            "resolution": resolution,
            "data_type": "float32",
            "gdalopts": "COMPRESS=DEFLATE,TILED=YES"
        }
    ]

    pipeline = pdal.Pipeline(json.dumps({"pipeline": stages}))
    return pipeline.execute()

# Smoke-test the DSM pipeline on the first tile only
if laz_files:
    test_laz = laz_files[0]
    test_dsm = DSM_DIR.joinpath(test_laz.stem + "_dsm.tif")

    print(f"Processing: {test_laz.name}")
    print(f"Output: {test_dsm}")

    try:
        count = process_laz_to_dsm(test_laz, test_dsm, RESOLUTION)
        print(f"\n✓ Processed {count:,} first return points")

        # Report the size of whatever was written
        if test_dsm.exists():
            n_bytes = test_dsm.stat().st_size
            print(f"Output size: {n_bytes / (1024**2):.1f} MB")
    except Exception as e:
        print(f"Error: {e}")

## Compute CHM (Canopy Height Model)

CHM = DSM - DEM

In [None]:
def compute_chm(dem_path, dsm_path, output_path):
    """
    Compute CHM = DSM - DEM using rioxarray.

    The DEM is first resampled onto the DSM grid when the two rasters do
    not share identical shape and coordinates. Negative heights are
    clamped to 0, while cells with no data in either input stay nodata.

    Args:
        dem_path: Path to the DEM GeoTIFF.
        dsm_path: Path to the DSM GeoTIFF.
        output_path: Path of the CHM GeoTIFF to write.

    Returns:
        The CHM as an in-memory DataArray (nodata cells are NaN).
    """
    # Load rasters; masked=True turns the files' nodata cells into NaN
    dem = rxr.open_rasterio(dem_path, masked=True).squeeze()
    dsm = rxr.open_rasterio(dsm_path, masked=True).squeeze()

    # The two rasters are built from different point subsets, so their
    # extents/origins may differ. xarray arithmetic only keeps exactly
    # matching coordinates, so put the DEM on the DSM grid first.
    same_grid = (
        dem.shape == dsm.shape
        and bool((dem["x"].values == dsm["x"].values).all())
        and bool((dem["y"].values == dsm["y"].values).all())
    )
    if not same_grid:
        dem = dem.rio.reproject_match(dsm)
        # Reuse the DSM coordinates verbatim to avoid float round-off
        # mismatches after resampling.
        dem = dem.assign_coords({"x": dsm["x"], "y": dsm["y"]})

    # Compute CHM
    chm = dsm - dem

    # Clip negative values (artifacts). Comparisons with NaN are False, so
    # test "not negative" to leave nodata cells as NaN instead of 0.
    chm = chm.where(~(chm < 0), 0)

    # Record an explicit nodata value so NaN cells are tagged as nodata
    # in the written file.
    chm = chm.rio.write_nodata(-9999.0, encoded=True)

    # Save
    chm.rio.to_raster(
        output_path,
        driver="GTiff",
        dtype="float32",
        compress="deflate",
        tiled=True
    )

    return chm

# Smoke-test the CHM computation on the first tile only
if laz_files:
    test_laz = laz_files[0]
    tile_stem = test_laz.stem
    test_dem = DEM_DIR / f"{tile_stem}_dem.tif"
    test_dsm = DSM_DIR / f"{tile_stem}_dsm.tif"
    test_chm = CHM_DIR / f"{tile_stem}_chm.tif"

    # Both inputs must already have been produced by the cells above
    if not (test_dem.exists() and test_dsm.exists()):
        print("DEM and/or DSM not yet generated. Run cells above first.")
    else:
        print(f"Computing CHM: {test_chm.name}")

        try:
            chm = compute_chm(test_dem, test_dsm, test_chm)

            print("\n✓ CHM computed")
            print(f"  Min height: {float(chm.min()):.2f} m")
            print(f"  Max height: {float(chm.max()):.2f} m")
            print(f"  Mean height: {float(chm.mean()):.2f} m")

            if test_chm.exists():
                n_bytes = test_chm.stat().st_size
                print(f"  Output size: {n_bytes / (1024**2):.1f} MB")
        except Exception as e:
            print(f"Error: {e}")

## Batch Process All LAZ Files

In [None]:
def process_all_laz_files(laz_files, dem_dir, dsm_dir, chm_dir, resolution=0.5):
    """
    Produce DEM, DSM and CHM rasters for every LAZ tile.

    Outputs that already exist on disk are left untouched and reported as
    'exists'. A failure on one tile is printed, recorded under the
    'error' key of that tile's result, and does not stop the batch.

    Returns:
        A list with one dict per tile holding the keys 'laz',
        'dem_status', 'dsm_status', 'chm_status' and, on failure, 'error'.
    """
    results = []

    for position, laz_path in enumerate(laz_files, start=1):
        print(f"\n[{position}/{len(laz_files)}] Processing {laz_path.name}")

        stem = laz_path.stem
        dem_path = dem_dir / f"{stem}_dem.tif"
        dsm_path = dsm_dir / f"{stem}_dsm.tif"
        chm_path = chm_dir / f"{stem}_chm.tif"

        # Every product starts as 'skipped' until proven otherwise
        result = {'laz': laz_path.name}
        result.update(
            dict.fromkeys(('dem_status', 'dsm_status', 'chm_status'), 'skipped')
        )

        try:
            # DEM
            if dem_path.exists():
                result['dem_status'] = 'exists'
            else:
                print("  Creating DEM...")
                process_laz_to_dem(laz_path, dem_path, resolution)
                result['dem_status'] = 'created'

            # DSM
            if dsm_path.exists():
                result['dsm_status'] = 'exists'
            else:
                print("  Creating DSM...")
                process_laz_to_dsm(laz_path, dsm_path, resolution)
                result['dsm_status'] = 'created'

            # CHM needs both inputs on disk
            if chm_path.exists():
                result['chm_status'] = 'exists'
            elif dem_path.exists() and dsm_path.exists():
                print("  Computing CHM...")
                compute_chm(dem_path, dsm_path, chm_path)
                result['chm_status'] = 'created'

            print("  ✓ Complete")

        except Exception as e:
            print(f"  ✗ Error: {e}")
            result['error'] = str(e)

        results.append(result)

    return results

In [None]:
# Batch run over every LAZ tile (disabled by default: uncomment to run)
# WARNING: This may take a long time depending on the number and size of files

if not laz_files:
    print("No LAZ files found. Run 1_download_data.ipynb first.")
else:
    print(f"Processing {len(laz_files)} LAZ files...")
    print(f"Resolution: {RESOLUTION}m")
    print("=" * 60)

    # Uncomment the line below to process all files
    # results = process_all_laz_files(laz_files, DEM_DIR, DSM_DIR, CHM_DIR, RESOLUTION)

    print("\nTo process all files, uncomment the line above.")

## Merge Tiles into Single Rasters

Merge individual tile rasters into whole-area DEM, DSM, and CHM.

In [None]:
def merge_rasters(input_dir, output_path, pattern="*.tif"):
    """
    Merge multiple raster tiles into a single file using GDAL.

    Builds a VRT over the matching tiles with ``gdalbuildvrt`` and then
    materialises it as a tiled, DEFLATE-compressed GeoTIFF with
    ``gdal_translate``. The scratch file list and VRT are removed whether
    or not the merge succeeds.

    Args:
        input_dir: Directory (Path) containing the tiles.
        output_path: Path of the merged GeoTIFF to write.
        pattern: Glob pattern selecting the tiles inside ``input_dir``.

    Returns:
        ``output_path`` on success; ``None`` when no tile matches or a
        GDAL step fails (the reason is printed).
    """
    # Sort so the overlap order between tiles is the same on every run
    input_files = sorted(input_dir.glob(pattern))

    if not input_files:
        print(f"No files matching {pattern} in {input_dir}")
        return None

    print(f"Merging {len(input_files)} files...")

    file_list_path = input_dir / "_merge_list.txt"
    vrt_path = output_path.with_suffix('.vrt')

    # (error label, command) pairs, run in order
    steps = (
        ("VRT build error", [
            'gdalbuildvrt',
            '-input_file_list', str(file_list_path),
            str(vrt_path)
        ]),
        ("Translation error", [
            'gdal_translate',
            '-of', 'GTiff',
            '-co', 'COMPRESS=DEFLATE',
            '-co', 'TILED=YES',
            '-co', 'BIGTIFF=YES',
            str(vrt_path),
            str(output_path)
        ]),
    )

    try:
        # Create file list for GDAL (one path per line)
        file_list_path.write_text(''.join(f"{fp}\n" for fp in input_files))

        for label, cmd in steps:
            try:
                result = subprocess.run(cmd, capture_output=True, text=True)
            except OSError as exc:
                # e.g. the GDAL command-line tools are not on PATH
                print(f"{label}: {exc}")
                return None
            if result.returncode != 0:
                print(f"{label}: {result.stderr}")
                return None
    finally:
        # Cleanup scratch files on success and on every failure path
        for scratch in (file_list_path, vrt_path):
            if scratch.exists():
                scratch.unlink()

    print(f"✓ Merged to {output_path}")
    return output_path

In [None]:
# Mosaic the per-tile DEMs into one raster, if any tiles exist
dem_files = list(DEM_DIR.glob('*_dem.tif'))
if not dem_files:
    print("No DEM files to merge yet.")
else:
    merged_dem = PROCESSED_DIR.joinpath("giant_forest_dem.tif")
    merge_rasters(DEM_DIR, merged_dem, '*_dem.tif')

In [None]:
# Mosaic the per-tile DSMs into one raster, if any tiles exist
dsm_files = list(DSM_DIR.glob('*_dsm.tif'))
if not dsm_files:
    print("No DSM files to merge yet.")
else:
    merged_dsm = PROCESSED_DIR.joinpath("giant_forest_dsm.tif")
    merge_rasters(DSM_DIR, merged_dsm, '*_dsm.tif')

In [None]:
# Mosaic the per-tile CHMs into one raster, if any tiles exist
chm_files = list(CHM_DIR.glob('*_chm.tif'))
if not chm_files:
    print("No CHM files to merge yet.")
else:
    merged_chm = PROCESSED_DIR.joinpath("giant_forest_chm.tif")
    merge_rasters(CHM_DIR, merged_chm, '*_chm.tif')

## Convert to Cloud-Optimized GeoTIFF (COG)

In [None]:
def convert_to_cog(input_path, output_path):
    """
    Convert a GeoTIFF to Cloud-Optimized GeoTIFF.

    Tries ``gdal_translate -of COG`` first. If that command fails or
    cannot be launched at all, falls back to ``rio_cogeo`` when it is
    installed.

    Args:
        input_path: Path to the source GeoTIFF.
        output_path: Path of the COG to write.

    Returns:
        ``output_path`` on success, or ``None`` when GDAL failed and
        ``rio_cogeo`` is not importable (the GDAL error is printed).
    """
    cmd = [
        'gdal_translate',
        '-of', 'COG',
        '-co', 'COMPRESS=DEFLATE',
        '-co', 'OVERVIEWS=AUTO',
        '-co', 'BIGTIFF=YES',
        str(input_path),
        str(output_path)
    ]

    try:
        result = subprocess.run(cmd, capture_output=True, text=True)
        gdal_ok = result.returncode == 0
        gdal_error = result.stderr
    except OSError as exc:
        # gdal_translate is missing or not executable: still try the fallback
        gdal_ok = False
        gdal_error = str(exc)

    if not gdal_ok:
        # Try rio-cogeo as fallback
        try:
            from rio_cogeo.cogeo import cog_translate
            from rio_cogeo.profiles import cog_profiles

            cog_translate(
                str(input_path),
                str(output_path),
                cog_profiles.get("deflate"),
                overview_level=5
            )
        except ImportError:
            print(f"COG conversion failed: {gdal_error}")
            return None

    print(f"✓ COG created: {output_path}")
    return output_path

# Turn each merged raster that exists into a COG in COG_DIR
merged_dem = PROCESSED_DIR / "giant_forest_dem.tif"
merged_dsm = PROCESSED_DIR / "giant_forest_dsm.tif"
merged_chm = PROCESSED_DIR / "giant_forest_chm.tif"

for merged_raster, cog_name in (
    (merged_dem, "giant_forest_dem_cog.tif"),
    (merged_dsm, "giant_forest_dsm_cog.tif"),
    (merged_chm, "giant_forest_chm_cog.tif"),
):
    if merged_raster.exists():
        convert_to_cog(merged_raster, COG_DIR / cog_name)

## Processing Summary

In [None]:
print("Processing Summary")
print("=" * 60)

# Number of GeoTIFFs currently present in each product directory
dem_count, dsm_count, chm_count, cog_count = (
    sum(1 for _ in folder.glob('*.tif'))
    for folder in (DEM_DIR, DSM_DIR, CHM_DIR, COG_DIR)
)

print("\nTile Products:")
for label, n_tiles in (("DEM", dem_count), ("DSM", dsm_count), ("CHM", chm_count)):
    print(f"  {label} tiles: {n_tiles}")

print("\nMerged Products:")
for name in ("DEM", "DSM", "CHM"):
    path = PROCESSED_DIR / f"giant_forest_{name.lower()}.tif"
    if not path.exists():
        print(f"  {name}: Not created yet")
        continue
    size_gb = path.stat().st_size / (1024**3)
    print(f"  {name}: {path.name} ({size_gb:.2f} GB)")

print("\nCOG Products:")
for cog in COG_DIR.glob('*.tif'):
    print(f"  {cog.name} ({cog.stat().st_size / (1024**3):.2f} GB)")

if cog_count == 0:
    print("  No COGs created yet")

## Save Processing Metadata

In [None]:
# Write a JSON record of this run next to the processed outputs
processing_metadata = {
    'project': 'Sierra Nevada Giant Forest LiDAR',
    'processing_date': datetime.now().isoformat(),
    'resolution_m': RESOLUTION,
    'crs': CRS,
    'input': {
        'laz_files': len(laz_files),
        'laz_dir': str(LAZ_DIR)
    },
    'output': {
        'dem_tiles': dem_count,
        'dsm_tiles': dsm_count,
        'chm_tiles': chm_count,
        'cog_files': cog_count
    },
    # Path of each merged raster, or None when it has not been created
    'products': {
        f"merged_{product}": (
            str(PROCESSED_DIR / f"giant_forest_{product}.tif")
            if (PROCESSED_DIR / f"giant_forest_{product}.tif").exists()
            else None
        )
        for product in ("dem", "dsm", "chm")
    }
}

metadata_path = PROCESSED_DIR / "processing_metadata.json"
with metadata_path.open('w') as handle:
    json.dump(processing_metadata, handle, indent=2)

print(f"Saved processing metadata to {metadata_path}")

## Next Steps

After processing is complete:

1. **Run `3_chm_exploration.ipynb`** to analyze the generated CHM
2. **Run `4_fractal_analysis.ipynb`** to compute fractal dimensions

See `PLAN.md` for the complete project workflow.