This notebook walks through the data preprocessing steps needed prior to DSM refinement. In particular:

- Crop all raster extents to the same bounds
- Reproject and resample all rasters to the same CRS and resolution
- Create a no-data mask file
- Calculate NDVI from HLS data and write out a new raster

In [None]:
# GIS imports
import rasterio
from rasterio.windows import from_bounds
from rasterio.warp import Resampling, reproject, transform_bounds
from shapely.geometry import box

# misc imports
from tqdm import tqdm
from pathlib import Path
import numpy as np
from skimage import exposure

Our input dataset comprises two ortho images, the initial DSM generated from these images (in our case, using [ASP](https://stereopipeline.readthedocs.io/en/latest/introduction.html)), coincident NDVI values (derived from [HLS data](https://hls.gsfc.nasa.gov/)), and the triangulation errors and nodata values obtained from the ASP processing of the ortho images.

Users will need to specify the path to the rasters as well as the specific filenames below:

In [None]:
# user specified filepaths and filenames
input_data_path = Path('/mnt/working/karthikv/DeepDEM/data/mt_baker/WV01_20150911_1020010042D39D00_1020010043455300/original_rasters') # specify folder path containing source rasters
common_data_path = Path('/mnt/working/karthikv/DeepDEM/data/common_files') # folder holding the HLS bands and the LIDAR raster referenced below
output_data_path = input_data_path.parent / 'processed_rasters' # preprocessed data will be written to this location
output_data_path.mkdir(exist_ok=True)

# specify filenames for the ortho images, initial DSM, HLS data, triangulation errors, and the ground truth LIDAR data
ortho_channel1_file = '1020010042D39D00_left_ortho_1.0m_final_aligned_to_REFDEM.tif'
ortho_channel2_file = '1020010043455300_right_ortho_1.0m_final_aligned_to_REFDEM.tif'
initial_dsm_file = '20150911_2050_1020010042D39D00_1020010043455300_1.0m-DEM_trans_dx+2.40m_dy+1.51m_dz-0.36m__ASP_wt_avg_extrapolation_search_rad25_num_pass_3.tif'
hls_red_band_file = common_data_path / 'HLS.L30.T10UEV.2015254T185445.v2.0.B04.tif'
hls_nir_band_file = common_data_path / 'HLS.L30.T10UEV.2015254T185445.v2.0.B05.tif'
triangulation_error_file = '20150911_2050_1020010042D39D00_1020010043455300_1.0m-IntersectionErr_trans_dx+2.40m_dy+1.51m_dz+0.00m_final_extent.tif'
# Built from common_data_path (like the HLS bands) so that changing the common
# folder above is enough to relocate every shared file.
lidar_data_file = common_data_path / 'merged_dsm.tif'


# We define a dictionary using the user defined values provided above.
# Each key is used in the output file name (final_<key>.tif). Values are either
# file names relative to input_data_path or absolute paths; joining an absolute
# path onto input_data_path yields the absolute path unchanged.
input_file_dict = {
    'ortho_channel1': ortho_channel1_file,
    'ortho_channel2': ortho_channel2_file,
    'asp_dsm': initial_dsm_file,
    'hls_red': hls_red_band_file,
    'hls_nir': hls_nir_band_file,
    'triangulation_error': triangulation_error_file,
    'lidar_dtm_data': lidar_data_file,
}

In [None]:
# Scratch directory, created inside the input folder, for temporary files
tmp_folder_path = input_data_path.joinpath('tmp')
tmp_folder_path.mkdir(exist_ok=True)

In [None]:
# Open files, create shapely polygons from bounds, intersect them and find common overlap
# Then perform a windowed read of the files, and write out the cropped rasters

# The first ortho image defines the reference CRS and pixel size
with rasterio.open(input_data_path / ortho_channel1_file) as ds:
    reference_crs = ds.crs
    reference_x_resolution = abs(ds.transform[0])
    reference_y_resolution = abs(ds.transform[4])

# Bounding box of every input raster, expressed in the reference CRS
shapes = []
for value in input_file_dict.values():
    with rasterio.open(input_data_path / value) as ds:
        shapes.append(box(*transform_bounds(ds.crs, reference_crs, *ds.bounds)))

# Fold the remaining footprints into the first one
intersection_shape = shapes[0]
for s in shapes[1:]:
    intersection_shape = intersection_shape.intersection(s)

# Fail here with a clear message rather than later, when the (empty) bounds
# tuple is unpacked into window/transform helpers.
if intersection_shape.is_empty or intersection_shape.area == 0:
    raise ValueError("Input rasters have no common overlapping area; cannot crop to a shared extent")

print(f"Bounds of intersection area across rasters in {reference_crs}: ", intersection_shape.bounds)

In [None]:
# We now perform windowed reads of the rasters and write out new files

def write_output_rasters(key_val):
    """Crop one input raster to the common overlap and write ``final_<key>.tif``.

    A raster whose CRS or pixel size differs from the reference ortho image is
    warped (nearest neighbour) onto the reference grid; any other raster is
    cropped with a windowed read. The output is written tiled (256x256) with
    deflate compression.

    Relies on the notebook-level names ``intersection_shape``, ``reference_crs``,
    ``reference_x_resolution``, ``reference_y_resolution``, ``input_data_path``
    and ``output_data_path``.

    Parameters
    ----------
    key_val : tuple
        ``(key, value)`` item of ``input_file_dict``: ``key`` names the output
        file, ``value`` is the source file name or path.

    Returns
    -------
    numpy.ndarray
        2D integer array holding 1 where the cropped raster equals the source
        nodata value and 0 elsewhere.
    """
    window_read_bounds = intersection_shape.bounds

    key, value = key_val
    output_file = output_data_path / f"final_{key}.tif"

    with rasterio.open(input_data_path / value) as ds:
        tmp_profile = ds.profile

        # A raster in the reference CRS but with a different pixel size must be
        # resampled too, otherwise its cropped shape differs from the others.
        needs_warp = (
            ds.crs != reference_crs
            or not np.isclose(abs(ds.transform[0]), reference_x_resolution)
            or not np.isclose(abs(ds.transform[4]), reference_y_resolution)
        )

        if needs_warp:
            # Read only the relevant area, then reproject + resample it onto
            # the reference grid.
            tmp_window_bounds = transform_bounds(reference_crs, ds.crs, *window_read_bounds)
            img = ds.read(1, window=from_bounds(*tmp_window_bounds, transform=ds.transform))

            # transform of the windowed read, expressed in the source CRS
            src_transform = rasterio.transform.from_bounds(*tmp_window_bounds, img.shape[-1], img.shape[-2])

            # size and transform of the output grid in the reference CRS
            width = int(np.round((window_read_bounds[2] - window_read_bounds[0]) / reference_x_resolution))
            height = int(np.round((window_read_bounds[3] - window_read_bounds[1]) / reference_y_resolution))
            dst_transform = rasterio.transform.from_bounds(*window_read_bounds, width, height)

            # The destination already spans exactly window_read_bounds in the
            # source dtype, so no temporary file round trip is needed.
            dst_raster = np.zeros((height, width), dtype=img.dtype)

            # Passing the nodata value makes destination pixels that the source
            # does not cover come out as nodata instead of a valid-looking 0.
            reproject(
                source=img,
                destination=dst_raster,
                src_transform=src_transform,
                src_crs=ds.crs,
                src_nodata=ds.nodata,
                dst_transform=dst_transform,
                dst_crs=reference_crs,
                dst_nodata=ds.nodata,
                resampling=Resampling.nearest,
            )
        else:
            dst_raster = ds.read(1, window=from_bounds(*window_read_bounds, transform=ds.transform))

    transform = rasterio.transform.from_bounds(*window_read_bounds, width=dst_raster.shape[-1], height=dst_raster.shape[-2])

    tmp_profile.update({
        'width': dst_raster.shape[-1],
        'height': dst_raster.shape[-2],
        'transform': transform,
        'crs': reference_crs,
        'blockxsize': 256,
        'blockysize': 256,
        'compress': 'deflate',
        'tiled': True,
    })

    # Locate nodata pixels; a NaN nodata value never compares equal with ==
    nodata_value = tmp_profile['nodata']
    if nodata_value is None:
        is_nodata = np.zeros(dst_raster.shape, dtype=bool)
    elif np.isnan(nodata_value):
        is_nodata = np.isnan(dst_raster)
    else:
        is_nodata = dst_raster == nodata_value
    nodata_mask = np.where(is_nodata, 1, 0)

    # the triangulation error raster is written with -1 as its nodata value
    if key == 'triangulation_error':
        dst_raster = np.where(is_nodata, -1, dst_raster)
        tmp_profile.update({'nodata': -1})

    with rasterio.open(output_file, 'w', **tmp_profile) as dst:
        dst.write(dst_raster, 1)

    print(f"Written out: {output_file.name}")

    return nodata_mask

# Process every input raster in turn, keeping the nodata mask each call returns
nodata_mask_list = [
    write_output_rasters(item) for item in tqdm(list(input_file_dict.items()))
]
        

In [None]:
# Aggregate no data values
#
# The combined mask starts as all ones and is set to 0 wherever any of the
# per-raster masks flags a pixel (per-raster masks use 1 to mark nodata).
first_mask = nodata_mask_list[0]
nodata_mask = np.ones((1, first_mask.shape[0], first_mask.shape[1]), dtype=np.uint8)
for raster_mask in nodata_mask_list:
    flagged = raster_mask == 1
    nodata_mask = np.where(flagged, 0, nodata_mask)

Let's calculate NDVI values from the HLS data and write them out

In [None]:
nir_file = output_data_path / 'final_hls_nir.tif'
red_file = output_data_path / 'final_hls_red.tif'

# Explicit checks rather than assert statements, which are skipped when Python
# runs with -O.
if not nir_file.exists():
    raise FileNotFoundError("Error, processed HLS NIR band file missing")
if not red_file.exists():
    raise FileNotFoundError("Error, processed HLS red band file missing")

In [None]:
# Read both bands as float64 so the arithmetic below cannot wrap around if the
# bands are stored in an integer dtype.
with rasterio.open(nir_file) as ds:
    profile = ds.profile
    nir_nodata = ds.profile['nodata']
    nir_img = ds.read(1).astype(np.float64)

with rasterio.open(red_file) as ds:
    red_nodata = ds.profile['nodata']
    red_img = ds.read(1).astype(np.float64)

# Pixels for which NDVI cannot be computed: nodata in either band, or a zero
# denominator.
invalid = np.zeros(nir_img.shape, dtype=bool)
if nir_nodata is not None:
    invalid |= nir_img == nir_nodata
if red_nodata is not None:
    invalid |= red_img == red_nodata

denominator = nir_img + red_img
invalid |= denominator == 0

# Divide only where valid; every other pixel keeps the initial value of 0
ndvi = np.zeros(nir_img.shape, dtype=np.float64)
np.divide(nir_img - red_img, denominator, out=ndvi, where=~invalid)

# NDVI outside [-1, 1] is also treated as bad data
invalid |= (ndvi < -1) | (ndvi > 1)

# append to nodata mask
nodata_mask = np.where(invalid, 0, nodata_mask)

# set bad NDVI values to zero. This won't change training/inference since
# nodata mask will govern loss calculations
ndvi[invalid] = 0

# ndvi can take float values
profile.update({'dtype': str(ndvi.dtype)})

with rasterio.open(output_data_path/"final_ndvi.tif", 'w', **profile) as ds:
    ds.write(ndvi, 1)

Let's write out the calculated no-data mask

In [None]:
# Reuse the profile of an already written output raster, changing only the
# dtype (to match the mask array) and clearing the nodata value.
with rasterio.open(output_data_path / 'final_ortho_channel1.tif') as template:
    profile = template.profile

profile['dtype'] = str(nodata_mask.dtype)
profile['nodata'] = None

with rasterio.open(output_data_path/'final_nodata_mask.tif', 'w', **profile) as dst:
    dst.write(nodata_mask)

Apply adaptive histogram equalization to the ortho images. This improves image contrast and scales intensities to the range [0, 1].

In [None]:
filename = ['final_ortho_channel1.tif', 'final_ortho_channel2.tif']

# NOTE: each file is read and then overwritten in place, so running this cell
# again equalizes the already equalized images.
for f in filename:
    with rasterio.open(output_data_path / f) as ds:
        profile = ds.profile
        img = ds.read()
        # 1 at valid pixels, 0 at no-data pixels. It has its own name so the
        # notebook-level `nodata_mask` is not overwritten, and it is float32 so
        # the multiplication below does not promote the image to float64.
        valid_mask = np.where(img == profile['nodata'], 0, 1).astype(np.float32)
        # multiply mask to ensure no-data pixels are zeroed out
        img = exposure.equalize_adapthist(img).astype(np.float32) * valid_mask

    profile.update({'dtype': str(img.dtype)})

    with rasterio.open(output_data_path / f, 'w', **profile) as ds:
        ds.write(img)

In [None]:
# Delete processed HLS NIR and Red images since NDVI file has been written out

for key in ['hls_red', 'hls_nir']:
    # The .aux.xml sidecar may not have been created, so remove only the files
    # that are actually present.
    for suffix in ('.tif', '.tif.aux.xml'):
        output_file = output_data_path / f"final_{key}{suffix}"
        if output_file.exists():
            output_file.unlink()