Load sample areas with raw data from file.

In [9]:
import numpy as np
import nfhelpers as nf
import scipy.stats as st

# Load the training and hold-out sample areas and combine them so that features
# can be extracted for both sets in a single pass.
#
# The arrays hold Python objects (the elements are accessed through their
# ``rasters`` and ``features`` attributes further down), which NumPy stores via
# pickle. NumPy >= 1.16.3 refuses to load such files unless
# ``allow_pickle=True`` is passed explicitly.
# NOTE: unpickling can execute arbitrary code -- only load files you trust.
sampleAreas_training = np.load("numpy_objects/training_data_set.npy", allow_pickle=True)
sampleAreas_testing = np.load("numpy_objects/holdout_data_set.npy", allow_pickle=True)

# Concatenating object arrays copies references, not the objects themselves, so
# changes made to an element of ``sampleAreas`` are also visible through the
# training / testing arrays.
sampleAreas = np.concatenate((sampleAreas_training, sampleAreas_testing))

---------
Extract features.

In [32]:
import tqdm
from skimage import filters
from skimage import img_as_float
from skimage.feature import peak_local_max
import math
import matplotlib.pyplot as plt
import scipy.ndimage as ndi
import scipy.spatial as spt
import astropy.stats as ast
import cv2

# Extract features for every sample area, raster and radius and store them in
# ``sampleArea.features`` under keys of the form "<feature>.<radius>".
#
# * "height_smoothed" rasters yield tree-top, nearest-neighbour, Ripley's K,
#   crown-segmentation and canopy-gap features.
# * Every other raster yields summary statistics (plus height percentiles for
#   the "height" raster).
#
# The raster data of each sample area is dropped once its features are stored.
pbar = tqdm.tqdm_notebook(total=len(sampleAreas))

# Feature-name prefix for each raster that gets plain summary statistics.
short_name = {
    "height": "H",
    "crown_coverage": "CC",
    "basal_area": "BA",
    "mean_diameter": "MD",
    "biomass": "BM",
    "volume": "V",
    "upper_height": "UH",
    "mean_height": "MH"
}

# Each pixel is 0.25 m, i.e. four pixels per metre.
PIXELS_PER_METER = 4

# Fixed canopy threshold: custom value proven to be better for this data
# (used in place of an Otsu threshold).
CANOPY_THRESHOLD = 10.2

# Summary statistics computed for every non-smoothed raster, in the order in
# which the features are stored.
SUMMARY_STATISTICS = {
    "mean": np.mean,
    "median": np.median,
    "max": np.amax,
    "min": np.amin,
    "sd": np.std,
    "skew": st.skew,
    "kurt": st.kurtosis,
    "cv": st.variation,
}

# Structuring element for the morphological opening of the gap masks. It does
# not depend on the raster, so it is built once.
# NOTE(review): assumes nf.create_circular_mask is a pure function -- confirm.
gap_kernel = nf.create_circular_mask(4, 4, (1.5, 1.5), 2).astype(np.uint8)

for sampleArea in sampleAreas:
    for raster_name in sampleArea.rasters:
        for radius in sampleArea.rasters[raster_name]:
            data = sampleArea.rasters[raster_name][radius]

            if raster_name == "height_smoothed":
                # Blank out every pixel that is not above the canopy threshold
                # (NaN pixels compare as False and are blanked as well).
                threshold_mask = data > CANOPY_THRESHOLD
                thresholded_data = np.array(data)
                thresholded_data[~threshold_mask] = np.nan

                # Extract tree tops as local maxima of the thresholded heights
                data_floats = img_as_float(thresholded_data)
                tree_tops = peak_local_max(np.nan_to_num(data_floats), min_distance=3) # 3 = 0.75 m
                sampleArea.features[f"tops.number.{radius}"] = len(tree_tops)

                if len(tree_tops) > 1:
                    # Tree tops nearest neighbors: k=2 returns each point itself
                    # (distance 0) followed by its nearest neighbor.
                    kd_tree = spt.KDTree(tree_tops)
                    distances, _ = kd_tree.query(tree_tops, k=2)
                    # Drop the distance to self and convert pixels to meters
                    distances = distances[:, 1:] / PIXELS_PER_METER
                    sampleArea.features[f"nndist.mean.{radius}"] = np.mean(distances)
                    sampleArea.features[f"nndist.sd.{radius}"] = np.std(distances)

                    # Number of tree tops within 5 and within 10 meters
                    for meters in (5, 10):
                        neighbors = kd_tree.query_ball_point(tree_tops, meters * PIXELS_PER_METER)
                        # Every point is reported as its own neighbor, hence -1
                        neighbor_counts = [len(found) - 1 for found in neighbors]
                        sampleArea.features[f"n{meters}m.mean.{radius}"] = np.mean(neighbor_counts)
                        sampleArea.features[f"n{meters}m.sd.{radius}"] = np.std(neighbor_counts)

                    # Ripley's K, evaluated at 0, 1, ..., 10 m (given in pixels)
                    rk_est = ast.RipleysKEstimator(
                        area=math.pi * math.pow(radius * PIXELS_PER_METER, 2),
                        x_max=np.inf, y_max=np.inf, x_min=0, y_min=0,
                    )
                    rk_est_vals = rk_est(
                        data=tree_tops,
                        radii=range(0, 10 * PIXELS_PER_METER + 1, PIXELS_PER_METER),
                        mode='ripley',
                    )
                    for k, k_val in enumerate(rk_est_vals):
                        sampleArea.features[f"K.{k}.{radius}"] = k_val
                else:
                    # Fewer than two tree tops: no neighbor statistics exist.
                    # Currently uses -1 as none value for nearest neighbor distance
                    sampleArea.features[f"nndist.mean.{radius}"] = -1
                    sampleArea.features[f"nndist.sd.{radius}"] = -1
                    sampleArea.features[f"n5m.mean.{radius}"] = 0
                    sampleArea.features[f"n5m.sd.{radius}"] = 0
                    sampleArea.features[f"n10m.mean.{radius}"] = 0
                    sampleArea.features[f"n10m.sd.{radius}"] = 0
                    for k in range(10 + 1):
                        sampleArea.features[f"K.{k}.{radius}"] = 0

                # Tree crown segmentation (marker-based watershed)
                if len(tree_tops) > 0:
                    # Markers: 1 = pixels not above the threshold, 0 = pixels
                    # left for the watershed to assign, 2.. = one seed per
                    # tree top.
                    ws_markers = np.ones(data.shape, dtype=np.int32)
                    ws_markers[threshold_mask] = 0
                    ws_markers[tree_tops[:, 0], tree_tops[:, 1]] = np.arange(2, len(tree_tops) + 2)

                    # Zero the blanked (NaN) pixels explicitly: casting NaN
                    # straight to uint8 is undefined and triggers a warning.
                    # NOTE(review): values above 255 do not fit into uint8 --
                    # confirm the raster's value range or rescale before casting.
                    img = np.stack((np.nan_to_num(thresholded_data),) * 3, axis=-1).astype(np.uint8)
                    segments = cv2.watershed(img, ws_markers)

                    labels, pixel_counts = np.unique(segments, return_counts=True)
                    # Drop label -1 (lines between segments) if present
                    pixel_counts = pixel_counts[labels != -1]
                    # Drop the largest remaining segment, taken to be the
                    # surrounding one (first label wins on ties).
                    # NOTE(review): assumes no crown is larger than the
                    # surrounding segment -- confirm.
                    if pixel_counts.size:
                        pixel_counts = np.delete(pixel_counts, np.argmax(pixel_counts))
                    # Segments of one or two pixels are not counted as crowns
                    crown_areas = pixel_counts[pixel_counts > 2]
                    n_crowns = len(crown_areas)
                    sampleArea.features[f"crown.area.mean.{radius}"] = np.mean(crown_areas) if n_crowns else 0
                    sampleArea.features[f"crown.area.sd.{radius}"] = np.std(crown_areas) if n_crowns else 0
                    sampleArea.features[f"crown.area.sum.{radius}"] = np.sum(crown_areas) if n_crowns else 0
                else:
                    sampleArea.features[f"crown.area.mean.{radius}"] = 0
                    sampleArea.features[f"crown.area.sd.{radius}"] = 0
                    sampleArea.features[f"crown.area.sum.{radius}"] = 0

                # Gaps: NaN pixels get the value 10 so that they fall below
                # both gap thresholds.
                tempdata = np.array(data)
                tempdata[np.isnan(tempdata)] = 10

                # Gaps over 3 m and over 5 m (thresholds 30 and 50 are in decimeters)
                for gap_height_m in (3, 5):
                    low_mask = ~(tempdata > gap_height_m * 10)
                    # Open (discriminate/make smaller) the gaps
                    opened = cv2.morphologyEx(low_mask.astype(np.uint8), cv2.MORPH_OPEN, gap_kernel)
                    labeled, _ = ndi.label(opened.astype(bool)) # Select contained gaps
                    # Non-gap pixels are merged into label 1 and the count of
                    # label 1 is then discarded, so the first labelled
                    # component is treated as background: removes gaps
                    # stretching out of the area.
                    labeled[opened == 0] = 1
                    _, label_sizes = np.unique(labeled, return_counts=True)
                    gaps = label_sizes[1:]
                    n_gaps = len(gaps)
                    gap_name = f"gap{gap_height_m}m"
                    sampleArea.features[f"{gap_name}.number.{radius}"] = n_gaps
                    sampleArea.features[f"{gap_name}.mean.{radius}"] = np.mean(gaps) if n_gaps else 0
                    sampleArea.features[f"{gap_name}.sd.{radius}"] = np.std(gaps) if n_gaps else 0
                    sampleArea.features[f"{gap_name}.sum.{radius}"] = np.sum(gaps) if n_gaps else 0

            else:

                raster_short_name = short_name[raster_name]

                for stat_name, statistic in SUMMARY_STATISTICS.items():
                    sampleArea.features[f"{raster_short_name}.{stat_name}.{radius}"] = statistic(data)

                if raster_name == "height":
                    # Height percentiles (10, 20, ..., 100) of the values of
                    # at least 2 m; value "20" is in decimeters.
                    heights = np.asarray(data)
                    heights_over_2m = heights[heights >= 20]
                    has_heights = heights_over_2m.size > 0
                    for percentile in range(10, 100 + 1, 10):
                        if has_heights:
                            sampleArea.features[f"{raster_short_name}.{percentile}.{radius}"] = np.percentile(heights_over_2m, percentile)
                        else:
                            sampleArea.features[f"{raster_short_name}.{percentile}.{radius}"] = None

    # Clear raster data from object
    sampleArea.rasters = {}
    pbar.update(1)

# Finalise the progress bar now that every sample area has been processed.
pbar.close()

print("done!")

HBox(children=(IntProgress(value=0, max=2882), HTML(value='')))



done!


In [33]:
# Persist both data sets, now carrying the extracted features.
# ``np.save`` appends ".npy" to a file name that lacks the extension.
for target_path, data_set in (
    ("numpy_objects/training_data_set_with_features", sampleAreas_training),
    ("numpy_objects/testing_data_set_with_features", sampleAreas_testing),
):
    np.save(target_path, data_set)