This notebook walks through the data preprocessing steps needed prior to DSM refinement. In particular:

- Crop all raster extents to the same bounds
- Reproject and resample all rasters to the same CRS and resolution
- Create a no-data mask file
- Calculate NDVI from HLS data and write out a new raster

In [None]:
# GIS imports
import rasterio
from rasterio.windows import from_bounds
from rasterio.warp import Resampling, reproject, transform_bounds
from shapely.geometry import box

# misc imports
from tqdm import tqdm
from pathlib import Path
import numpy as np

Our input dataset comprises two ortho images, the initial DSM generated from these images (in our case, using [ASP](https://stereopipeline.readthedocs.io/en/latest/introduction.html)), coincident NDVI values (derived from [HLS data](https://hls.gsfc.nasa.gov/)), and finally triangulation errors and nodata values obtained from the ASP processing of the ortho images.

Users will need to specify the path to the rasters as well as the specific filenames below:

In [None]:
# --- User configuration: where the rasters live and what they are called ---

# Folder holding the source rasters
input_data_path = Path('/mnt/1.0_TB_VOLUME/karthikv/DeepDEM/data/baker_csm_stack/original_rasters/')

# Preprocessed rasters are written to a sibling folder of the input folder
output_data_path = input_data_path.parent.joinpath('processed_rasters')
output_data_path.mkdir(exist_ok=True)

# File names (relative to input_data_path) of each input raster
ortho_left_file = 'final_ortho_left_1.0m_holes_filled.tif'    # left ortho image
ortho_right_file = 'final_ortho_right_1.0m_holes_filled.tif'  # right ortho image
initial_dsm_file = 'try_pc_align_to_lidar_15m_maxdisp_rotationallowed-1.0m-DEM_holes_filled.tif'  # initial DSM
hls_red_band_file = 'HLS.L30.T10UEV.2015254T185445.v2.0.B04.tif'  # HLS red band
hls_nir_band_file = 'HLS.L30.T10UEV.2015254T185445.v2.0.B05.tif'  # HLS NIR band
triangulation_error_file = 'try_pc_align_to_lidar_15m_maxdisp_rotationallowed-1.0m-IntersectionErr.tif'  # triangulation errors
# NOTE(review): this name contains a literal '*'; it is passed to rasterio.open()
# unchanged further down, so confirm it is the real file name and not a glob pattern.
lidar_data_file = 'mosaic_full128_USGS_LPC_WA_MtBaker_2015_*_LAS_2017_32610_first_filt_v1.3_1.0m-DEM_holes_filled.tif'  # ground truth LIDAR


# Short key -> file name. The key becomes part of the output name ("final_<key>.tif").
input_file_dict = dict(
    ortho_left=ortho_left_file,
    ortho_right=ortho_right_file,
    asp_dsm=initial_dsm_file,
    hls_red=hls_red_band_file,
    hls_nir=hls_nir_band_file,
    triangulation_error=triangulation_error_file,
    lidar_data=lidar_data_file,
)

In [None]:
# Scratch folder for intermediate files, created inside the input folder
tmp_folder_path = input_data_path.joinpath('tmp')
tmp_folder_path.mkdir(exist_ok=True)

In [None]:
# Find the area covered by every input raster: express each raster's bounds in
# the reference CRS, turn them into shapely boxes, and intersect the boxes.

# The left ortho image defines the reference CRS and the reference pixel size
with rasterio.open(input_data_path / ortho_left_file) as ds:
    reference_crs = ds.crs
    reference_x_resolution = abs(ds.transform[0])
    reference_y_resolution = abs(ds.transform[4])

shapes = []
for filename in input_file_dict.values():
    with rasterio.open(input_data_path / filename) as ds:
        # bounds of this raster, transformed from its own CRS to the reference CRS
        shapes.append(box(*transform_bounds(ds.crs, reference_crs, *ds.bounds)))

intersection_shape = shapes[0]
for s in shapes[1:]:
    intersection_shape = intersection_shape.intersection(s)

# Rasters that do not overlap (or only touch along an edge or corner) leave
# nothing to crop to; stop here with a clear message instead of failing later.
if intersection_shape.is_empty or intersection_shape.area == 0:
    raise ValueError(
        f"The input rasters have no common overlapping area in {reference_crs}; "
        "check the input files"
    )

print(f"Bounds of intersection area across rasters in {reference_crs}: ", intersection_shape.bounds)

In [None]:
# We now resample every raster onto one shared grid and write out new files

def write_output_rasters(key_val):
    """Warp one input raster onto the shared output grid and write it to disk.

    The shared grid spans ``intersection_shape.bounds`` in ``reference_crs``;
    its pixel count is derived from ``reference_x_resolution`` and
    ``reference_y_resolution``. Every raster is resampled onto that grid with
    nearest-neighbour resampling, whether or not it is already in the
    reference CRS, so all outputs share the same shape and georeferencing.
    Only band 1 of the input is processed.

    Args:
        key_val: ``(key, filename)`` pair, e.g. an item of
            ``input_file_dict.items()``. ``filename`` is resolved against
            ``input_data_path`` and the result is written to
            ``output_data_path / f"final_{key}.tif"``.

    Returns:
        None. A tiled, deflate-compressed GeoTIFF is written and its name is
        printed.
    """
    key, value = key_val
    output_file = output_data_path / f"final_{key}.tif"

    # Shared output grid. Round (rather than truncate) the pixel counts so that
    # floating point noise in the bounds cannot drop a row or column.
    left, bottom, right, top = intersection_shape.bounds
    width = max(1, int(round((right - left) / reference_x_resolution)))
    height = max(1, int(round((top - bottom) / reference_y_resolution)))
    dst_transform = rasterio.transform.from_bounds(left, bottom, right, top, width, height)

    with rasterio.open(input_data_path / value) as ds:
        profile = ds.profile

        # Allocate the destination in the source dtype so values are not
        # converted to float64 and back on the way to the output file.
        dst_raster = np.zeros((height, width), dtype=ds.dtypes[0])

        # Passing the band itself lets rasterio take the source transform and
        # CRS straight from the dataset. Passing the no-data value marks
        # destination pixels without source coverage as no-data.
        reproject(
            source=rasterio.band(ds, 1),
            destination=dst_raster,
            src_nodata=ds.nodata,
            dst_transform=dst_transform,
            dst_crs=reference_crs,
            dst_nodata=ds.nodata,
            resampling=Resampling.nearest,
        )

    profile.update({
        'count': 1,  # only band 1 is written
        'width': width,
        'height': height,
        'transform': dst_transform,
        'crs': reference_crs,
        'blockxsize': 256,
        'blockysize': 256,
        'compress': 'deflate',
        'tiled': True,
    })

    if key == 'triangulation_error':
        # No-data pixels of the triangulation error raster are rewritten as 1,
        # and 1 is declared as the new no-data value.
        nodata = profile['nodata']
        if nodata is not None:
            # NaN never compares equal to itself, so it needs np.isnan
            is_nodata = np.isnan(dst_raster) if np.isnan(nodata) else dst_raster == nodata
            dst_raster[is_nodata] = 1
        profile.update({'nodata': 1})

    with rasterio.open(output_file, 'w', **profile) as dst:
        dst.write(dst_raster, 1)

    print(f"Written out: {output_file.name}")

# Process every input raster in turn, with a progress bar
for key_val in tqdm(list(input_file_dict.items())):
    write_output_rasters(key_val)
        

Let's calculate NDVI values from the HLS data and write them out

In [None]:
# Processed HLS bands needed for the NDVI calculation
nir_file = output_data_path / 'final_hls_nir.tif'
red_file = output_data_path / 'final_hls_red.tif'

# Raise explicitly rather than assert, so the check still runs under `python -O`
if not nir_file.exists():
    raise FileNotFoundError(f"Error, processed HLS NIR band file missing: {nir_file}")
if not red_file.exists():
    raise FileNotFoundError(f"Error, processed HLS red band file missing: {red_file}")

In [None]:
# NDVI = (NIR - red) / (NIR + red), computed from the processed HLS bands

with rasterio.open(nir_file) as ds:
    profile = ds.profile  # reused as the profile of the NDVI raster
    nodata_value = ds.profile['nodata']
    nir_img = ds.read(1)
    # Mask no-data pixels, then promote to float so the band sum and difference
    # below cannot wrap around in an integer dtype
    nir_img = np.ma.masked_where(nir_img == nodata_value, nir_img).astype(np.float64)

with rasterio.open(red_file) as ds:
    nodata_value = ds.profile['nodata']
    red_img = ds.read(1)
    red_img = np.ma.masked_where(red_img == nodata_value, red_img).astype(np.float64)

if nir_img.shape != red_img.shape:
    raise ValueError(
        f"HLS NIR and red rasters have different shapes: {nir_img.shape} vs {red_img.shape}"
    )

# Masked-array division also masks pixels whose denominator is zero
ndvi = (nir_img - red_img)/(nir_img + red_img)

# set bad values to zero: first the masked pixels (no-data in either band, or
# a zero denominator), then anything outside the valid NDVI range [-1, 1]
ndvi = ndvi.filled(0)
ndvi = np.where(((ndvi<-1) | (ndvi>1)), 0, ndvi)

# ndvi can take float values; every other profile entry (including the
# no-data value) is carried over from the NIR band
profile.update({'dtype':ndvi.dtype})

with rasterio.open(output_data_path/"final_ndvi.tif", 'w', **profile) as ds:
    ds.write(ndvi, 1)

Lastly, let's write out a no-data mask

In [None]:
# Combine the no-data regions of every processed raster into a single mask:
# 0 where any raster has no data, 1 where all rasters have valid data.

any_nodata = None  # boolean array, True where at least one raster has no data
profile = None     # profile of the first raster read, reused for the mask file

# sorted() makes the choice of the profile source deterministic
for file in sorted(output_data_path.glob('*.tif')):

    # ignore contents of a mask file if it already exists
    if file.name == 'final_nodata_mask.tif':
        continue

    with rasterio.open(file) as src:
        if profile is None:
            profile = src.profile # for later use

        # read_masks() returns 0 for no-data pixels; unlike an equality test
        # against the no-data value, this also works when that value is NaN
        invalid = src.read_masks(1) == 0

    if any_nodata is None:
        any_nodata = invalid
    elif invalid.shape != any_nodata.shape:
        raise ValueError(
            f"{file.name} has shape {invalid.shape}, expected {any_nodata.shape}"
        )
    else:
        any_nodata |= invalid

if any_nodata is None:
    raise FileNotFoundError(f"No processed rasters found in {output_data_path}")

# nodata regions are marked as 0, valid data regions are marked 1.
# uint8 is enough for a 0/1 mask and is supported by every GeoTIFF reader.
nodata_mask = np.where(any_nodata, 0, 1).astype(np.uint8)
profile.update({'dtype':'uint8', 'nodata':None})

with rasterio.open(output_data_path/'final_nodata_mask.tif', 'w', **profile) as dst:
    dst.write(nodata_mask, 1)

In [None]:
# Every input raster yields one output file; the NDVI raster and the nodata mask add two more.
# With the default inputs that is 9 files (2 ortho images, 1 DSM, 2 HLS images, 1 NDVI,
# triangulation error, nodata mask, LIDAR DEM)
expected_file_count = len(input_file_dict) + 2
written_file_count = len(list(output_data_path.glob('*.tif')))
assert written_file_count == expected_file_count, (
    'One or more files were not processed! '
    f'Expected {expected_file_count} .tif files in {output_data_path}, found {written_file_count}'
)