# Learning to Predict Land Cover from Agency Cover Typing
In this notebook, we'll gather samples of stands and their corresponding aerial imagery and some other layers to build a model that predicts land cover types:

* 1 = Water
* 2 = Forest
* 3 = Field
* 4 = Barren/Non-vegetated
* 5 = Developed

For each stand in the hand-drawn stand layers we load, we will extract a histogram of values including spectral and geographic/topographic features.

In [None]:
# Mount Google Drive at /content/drive; the data paths used throughout this
# notebook live under /content/drive/Shareddrives/.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
# System library installed ahead of the Python packages below
# (presumably required by `rtree` -- confirm).
! apt-get install -qq libspatialindex-dev 

Selecting previously unselected package libspatialindex4v5:amd64.
(Reading database ... 146425 files and directories currently installed.)
Preparing to unpack .../libspatialindex4v5_1.8.5-5_amd64.deb ...
Unpacking libspatialindex4v5:amd64 (1.8.5-5) ...
Selecting previously unselected package libspatialindex-c4v5:amd64.
Preparing to unpack .../libspatialindex-c4v5_1.8.5-5_amd64.deb ...
Unpacking libspatialindex-c4v5:amd64 (1.8.5-5) ...
Selecting previously unselected package libspatialindex-dev:amd64.
Preparing to unpack .../libspatialindex-dev_1.8.5-5_amd64.deb ...
Unpacking libspatialindex-dev:amd64 (1.8.5-5) ...
Setting up libspatialindex4v5:amd64 (1.8.5-5) ...
Setting up libspatialindex-c4v5:amd64 (1.8.5-5) ...
Setting up libspatialindex-dev:amd64 (1.8.5-5) ...
Processing triggers for libc-bin (2.27-3ubuntu1.3) ...
/sbin/ldconfig.real: /usr/local/lib/python3.6/dist-packages/ideep4py/lib/libmkldnn.so.0 is not a symbolic link



In [None]:
# Geospatial stack used below. NOTE(review): scikit-image is pinned to a
# release candidate, presumably for the `extra_properties` argument of
# `regionprops_table` used in `summarize_labels` -- confirm before unpinning.
! pip install -q geopandas rasterio rtree scikit-image==0.18.rc0

[K     |████████████████████████████████| 972kB 5.4MB/s 
[K     |████████████████████████████████| 19.1MB 1.3MB/s 
[K     |████████████████████████████████| 1.0MB 46.9MB/s 
[K     |████████████████████████████████| 35.2MB 149kB/s 
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone
[K     |████████████████████████████████| 6.5MB 52.0MB/s 
[K     |████████████████████████████████| 14.8MB 263kB/s 
[?25h  Building wheel for scikit-image (PEP 517) ... [?25l[?25hdone
[31mERROR: albumentations 0.1.12 has requirement imgaug<0.2.7,>=0.2.5, but you'll have imgaug 0.2.9 which is incompatible.[0m


In [None]:
# Show which scikit-image version the kernel actually imported.
import skimage
skimage.__version__

'0.18.0rc0'

In [None]:
import os
import glob
import numpy as np
from collections import Counter
import geopandas as gpd
from shapely.geometry import box
import pandas as pd
import rasterio
from rasterio import windows, transform
from rasterio.plot import reshape_as_image, reshape_as_raster, show
from rasterio import features
from matplotlib import pyplot as plt
import pandas as pd

from skimage.measure import regionprops_table

from tqdm.notebook import tqdm

## The Workflow
We'll begin by identifying all the stand layers which have been assigned a `COVER_TYPE` attribute based on the classification found in the original dataset by the managing agency. 

We'll then implement several data loading and processing functions to read the stack of raster and vector layers from disk and summarize each layer using a 16-bin histogram.

In [None]:
FILE_TYPE = 'stands_self_classified'
ROOT_DIR = '/content/drive/Shareddrives/stand_mapping/data/interim/training_tiles'

# Collect every GeoJSON stand layer found under each state's directory.
# Matching on the file extension (rather than a substring anywhere in the
# name) keeps sidecar or backup files such as `*.geojson.bak` out of the list.
paths = []
for state_name in ['oregon', 'washington']:
    state_dir = os.path.join(ROOT_DIR, state_name, FILE_TYPE)
    for root, dirnames, files in os.walk(state_dir):
        paths.extend(os.path.join(root, f) for f in files
                     if f.endswith('.geojson'))

len(paths)

1447

In [None]:
def parse_stand_path(path_to_file):
    """Extracts tile metadata from the path of a stand layer file.

    The filename is split on underscores: the first token is read as the
    integer cell id, the third token as the agency, and the last token (up to
    its first '.') as the integer year.

    Parameters
    ----------
    path_to_file : str
      path to a stand layer, e.g. '.../oregon/.../125655_stands_blm_2011.geojson'

    Returns
    -------
    dirname, cell_id, state_name, year, agency : tuple
      `state_name` is 'oregon' or 'washington' when that word occurs in the
      directory part of the path ('oregon' is checked first), otherwise None.
    """
    dirname, basename = os.path.split(path_to_file)
    tokens = basename.split('_')

    cell_id = int(tokens[0])
    year = int(tokens[-1].split('.')[0])
    agency = tokens[2]

    # first known state mentioned in the directory, or None if neither is
    state_name = next(
        (state for state in ('oregon', 'washington') if state in dirname),
        None)

    return dirname, cell_id, state_name, year, agency

In [None]:
# Parse the metadata out of every stand-layer path and tabulate it, keeping
# the originating path alongside the parsed fields.
stands_info = list(map(parse_stand_path, paths))
stands_df = pd.DataFrame(
    stands_info,
    columns=['DIRNAME', 'CELL_ID', 'STATE_NAME', 'YEAR', 'AGENCY'],
).assign(PATH=paths)
stands_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1447 entries, 0 to 1446
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   DIRNAME     1447 non-null   object
 1   CELL_ID     1447 non-null   int64 
 2   STATE_NAME  1447 non-null   object
 3   YEAR        1447 non-null   int64 
 4   AGENCY      1447 non-null   object
 5   PATH        1447 non-null   object
dtypes: int64(2), object(4)
memory usage: 68.0+ KB


In [None]:
# Distinct agencies represented among the stand layers.
stands_df['AGENCY'].unique()

array(['willamette-usfs', 'malheur-usfs', 'umatilla-usfs', 'mthood-usfs',
       'blm', 'gp-usfs', 'dnr'], dtype=object)

In [None]:
def get_yearly_path(layer_type, cell_id, state_name, stands_year):
    """Builds the path to the yearly raster tile closest in time to a stand layer.

    Parameters
    ----------
    layer_type : str
      one of 'naip', 'landsat-leaf-on', 'landsat-leaf-off', 'landtrendr'
    cell_id : int
      identifier of the tile, used as the filename prefix
    state_name : str
      'oregon' or 'washington'; selects which years are considered available
    stands_year : int
      year of the stand layer; the available year with the smallest absolute
      difference is used (the earlier year wins a tie)

    Returns
    -------
    path_to_file : str
      path to '{cell_id}_{layer_type}_{best_year}.tif'; existence is not checked

    Raises
    ------
    ValueError
      if `layer_type` or `state_name` is not recognized
    """
    valid_layers = ['naip', 'landsat-leaf-on', 'landsat-leaf-off',
                    'landtrendr']
    if layer_type not in valid_layers:
        raise ValueError("Unrecognized layer type, use: "
                         "'naip', 'landsat-leaf-on', 'landsat-leaf-off', "
                         "'landtrendr'")

    years_by_state = {
        'washington': [2009, 2011, 2015, 2017],
        'oregon': [2009, 2011, 2014, 2016],
    }
    # Previously an unknown state left the year list undefined and surfaced
    # as an UnboundLocalError further down.
    if state_name not in years_by_state:
        raise ValueError("Unrecognized state name, use: "
                         "'oregon', 'washington'")
    YEARS = np.array(years_by_state[state_name])
    best_year = YEARS[np.argmin(abs(YEARS - stands_year))]

    # both Landsat variants are stored in a shared 'landsat' directory
    if layer_type[0:7] == 'landsat':
        layer_name = 'landsat'
    else:
        layer_name = layer_type
    dirname = ''.join([
        '/content/drive/Shareddrives/stand_mapping/data/',
        f'processed/training_tiles/{state_name}/{layer_name}/{best_year}'
        ])
    fname = f'{cell_id}_{layer_type}_{best_year}.tif'
    path_to_file = os.path.join(dirname, fname)
    return path_to_file

def get_fixed_layer_path(layer_type, cell_id, state_name, distance=False):
    """Builds the path to a raster tile that does not vary by year.

    Parameters
    ----------
    layer_type : str
      one of 'hydro', 'roads', 'buildings', 'dem', 'tpi300', 'tpi2000', 'slope'
    cell_id : int
      identifier of the tile, used as the filename prefix
    state_name : str
      name of the state directory the tile is stored under
    distance : bool
      if True, return the path of the '{cell_id}_{layer_type}_distance.tif'
      variant instead of '{cell_id}_{layer_type}.tif'

    Returns
    -------
    path_to_file : str
      path to the requested raster; existence is not checked

    Raises
    ------
    ValueError
      if `layer_type` is not recognized
    """
    if layer_type not in ['hydro', 'roads', 'buildings', 'dem',
                          'tpi300', 'tpi2000',  'slope']:
        raise ValueError("Unrecognized layer type, use: "
                         "'hydro', 'roads', 'buildings', 'tpi300', "
                         "'tpi2000', 'slope', 'dem'")
    # both TPI variants are stored in a shared 'tpi' directory
    if layer_type[0:3] == 'tpi':
        layer_name = 'tpi'
    else:
        layer_name = layer_type
    dirname = ''.join(['/content/drive/Shareddrives/stand_mapping/data/',
                      f'processed/training_tiles/{state_name}/{layer_name}'])
    dist_name = f'{cell_id}_{layer_type}_distance.tif'
    bin_name = f'{cell_id}_{layer_type}.tif'
    fname = dist_name if distance else bin_name
    path_to_file = os.path.join(dirname, fname)

    return path_to_file

In [None]:
def load_data(stand_path, verbose=False):
    """Loads the raster stack and stand polygons for one training tile.

    Every raster is read over the same window: the NAIP tile with 5 pixels
    trimmed from each edge. Layers are stacked along the last axis in this
    order: all NAIP bands, the first six Landsat leaf-on bands, TPI-300,
    TPI-2000, and the distance rasters for hydro, roads and buildings.

    Parameters
    ----------
    stand_path : str
      path to a stand layer; its name determines the tile, state and year
      (see `parse_stand_path`)
    verbose : bool
      if True, print the starting band index, filename and shape of each
      layer as it is loaded

    Returns
    -------
    stacked : arr
      (height, width, channels) array of all layers
    stands : GeoDataFrame
      stand polygons clipped to `bbox`, with null and empty geometries removed
    trf : Affine
      affine transform of the trimmed window, i.e. pixel (0, 0) of `stacked`
      maps to the upper-left corner of `bbox`
    bbox : tuple
      bounds of the trimmed window, as returned by `rasterio.windows.bounds`
    """
    _, cell_id, state_name, year, _ = parse_stand_path(stand_path)
    naip_path = get_yearly_path('naip', cell_id, state_name, year)
    landsat_path = get_yearly_path('landsat-leaf-on', cell_id, state_name, year)
    tpi300_path = get_fixed_layer_path('tpi300', cell_id, state_name)
    tpi2000_path = get_fixed_layer_path('tpi2000', cell_id, state_name)
    hydro_path = get_fixed_layer_path('hydro', cell_id, state_name, distance=True)
    roads_path = get_fixed_layer_path('roads', cell_id, state_name, distance=True)
    buildings_path = get_fixed_layer_path('buildings', cell_id, state_name, distance=True)

    img_stack = []
    i = 1  # 1-based index of the next layer's first band (verbose output only)

    def get_naip(naip_path):
        with rasterio.open(naip_path) as src:
            height, width = src.shape
            # trim a 5-pixel border from every edge of the tile
            height, width = height-10, width-10
            window = windows.Window(col_off=5, row_off=5, height=height, width=width)
            img = reshape_as_image(src.read(window=window)).astype(np.uint8)
            bbox = windows.bounds(window, src.transform, height=height, width=width)
            # Return the transform of the *window*, not of the full tile.
            # The full-tile transform would place anything rasterized onto
            # this (height, width) grid 5 pixels away from the image data.
            trf = src.window_transform(window)
        return img, height, width, trf, bbox

    img, height, width, trf, bbox = get_naip(naip_path)
    stands = gpd.read_file(stand_path)
    stands = gpd.clip(stands, box(*bbox))
    # zero-width buffer; presumably to repair invalid geometries left by the clip
    stands['geometry'] = stands.geometry.buffer(0)
    stands = stands.dropna(subset=['geometry'])
    stands = stands.loc[~stands.geometry.is_empty]

    img_stack.append(img)
    if verbose:
        print(i, os.path.basename(naip_path), img.shape)
    i += img.shape[-1]

    with rasterio.open(landsat_path) as src:
        # locate the upper-left corner of the NAIP window in this raster
        xmin, ymax = bbox[0], bbox[-1]
        row_off, col_off = src.index(xmin, ymax)
        window = windows.Window(col_off, row_off, width=width, height=height)

        # rescale so that a value of 3000 maps to 255, clipping above and below
        img = ((
            reshape_as_image(
                np.stack(
                    [src.read(band+1, window=window) for band in range(6)]
                    ))/3000).clip(0, 1)*255).astype(np.uint8)

        img_stack.append(img)
        if verbose:
            print(i, os.path.basename(landsat_path), img.shape)
        i += img.shape[-1]

    paths_to_load = [tpi300_path, tpi2000_path,
                     hydro_path, roads_path,
                     buildings_path]

    # NOTE(review): the Landsat-derived offsets are reused here, which assumes
    # these rasters share the Landsat tile's grid -- confirm.
    for path in paths_to_load:
        with rasterio.open(path) as src:
            window = windows.Window(col_off, row_off, width=width, height=height)
            img = src.read(1, window=window).reshape(height, width, 1).astype(np.int16)
        img_stack.append(img)
        if verbose:
            print(i, os.path.basename(path), img.shape)
        i += img.shape[-1]

    stacked = np.dstack(img_stack)

    return stacked, stands, trf, bbox

In [None]:
def rasterize_polygons(gdf, out_shape, transform):
    """Burns each geometry of a GeoDataFrame into a raster of integer labels.

    Geometries are numbered 1, 2, 3, ... in the order they appear in `gdf`;
    pixels covered by no geometry stay 0. Where geometries overlap, the one
    appearing later in `gdf` overwrites the earlier one.

    Parameters
    ----------
    gdf : GeoDataFrame
      GeoDataFrame to be rasterized
    out_shape : 2-tuple or list-like
      (height, width) of desired output raster
    transform : Affine
      rasterio-style (not GDAL-style) affine transformation matrix which
      translates pixel coordinates to geographic coordinates

    Returns
    -------
    ras : arr
      int16 array with each geometry labeled with a distinct integer
    """
    label_raster = np.zeros(out_shape, dtype=np.int16)
    for label, geometry in enumerate(gdf.geometry, start=1):
        # invert=True marks the pixels *inside* the geometry as True
        inside = features.geometry_mask(
            [geometry],
            out_shape=out_shape,
            transform=transform,
            invert=True,
        )
        label_raster[inside] = label

    return label_raster

In [None]:
from functools import partial, update_wrapper

def wrapped_partial(func, *args, **kwargs):
    """Returns `partial(func, *args, **kwargs)` carrying `func`'s metadata.

    A plain `functools.partial` object has no `__name__`; copying the wrapped
    function's name and docstring onto it lets the result be used wherever a
    named function is expected.
    """
    return update_wrapper(partial(func, *args, **kwargs), func)

def hist_intensity(region, intensities, bins=None):
    """Normalized histogram of the intensity values inside a region.

    Parameters
    ----------
    region : arr
      boolean mask selecting the pixels of `intensities` to summarize
    intensities : arr
      intensity image indexed by `region`
    bins : int or sequence of scalars, optional
      passed to `numpy.histogram`; when None, numpy's default of 10
      equal-width bins is used

    Returns
    -------
    density : arr
      fraction of the selected pixels falling in each bin; all zeros if no
      pixel was counted
    """
    if bins is None:
        # np.histogram rejects bins=None, so fall back to its own default
        bins = 10
    counts, edges = np.histogram(intensities[region], bins=bins)
    total = counts.sum()
    if total == 0:
        # empty region, or every value outside the bins: avoid 0/0 -> NaN
        return np.zeros(counts.shape, dtype=float)
    return counts / total

def summarize_labels(path, bbox, labels, image, verbose=False):
    """Summarizes each labeled region of a tile as per-channel histograms.

    For every channel of `image`, a 16-bin normalized histogram is computed
    over the pixels of each labeled region. The two outermost bin edges are
    replaced with -inf/+inf so values beyond the nominal range are counted.

    Parameters
    ----------
    path : str
      path to the stand layer the labels came from; parsed for tile metadata
    bbox : sequence of 4 numbers
      written to the XMIN, YMIN, XMAX, YMAX columns, in that order
    labels : arr
      (height, width) integer label image, 0 meaning background
    image : arr
      (height, width, channels) stack; channels are expected in the order
      given by `COL_NAMES` below (at most 15)
    verbose : bool
      if True, print each channel index as it is finished

    Returns
    -------
    results : DataFrame
      one row per labeled region with the metadata columns PATH, CELL_ID,
      STATE_NAME, AGENCY, YEAR, XMIN, YMIN, XMAX, YMAX, then STAND_LABEL and
      the histogram columns of every channel ('<channel name>_hist...')
    """
    _, cell_id, state_name, year, agency = parse_stand_path(path)

    # 1-based position of a channel in the stack -> output column prefix
    COL_NAMES = dict(enumerate(
        ['NAIP-R', 'NAIP-G', 'NAIP-B', 'NAIP-NIR',
         'L8-R', 'L8-G', 'L8-B', 'L8-NIR',
         'L8-SWIR1', 'L8-SWIR2',
         'TPI300', 'TPI2000',
         'HYDRO', 'ROADS', 'BLDGS'],
        start=1))
    # nominal value range spanned by the 17 bin edges of each channel;
    # channels not listed here use 0-255
    RANGES = {'TPI300': (-200, 200), 'TPI2000': (-500, 500),
              'HYDRO': (0, 100), 'ROADS': (0, 100), 'BLDGS': (0, 100)}
    BINS = {name: np.linspace(*RANGES.get(name, (0, 255)), 17)
            for name in COL_NAMES.values()}

    all_props = []
    for channel in range(image.shape[-1]):
        col_name = COL_NAMES[channel+1]
        edges = BINS[col_name]
        # open-ended outer bins
        edges[0], edges[-1] = -np.inf, np.inf
        make_hist = wrapped_partial(hist_intensity, bins=edges)

        table = regionprops_table(
            label_image=labels,
            intensity_image=image[:, :, channel],
            # the region label is only needed once, so take it from channel 0
            properties=['label'] if channel == 0 else [],
            extra_properties=[make_hist],
        )
        props = pd.DataFrame(table)
        props.columns = [col.replace('hist_intensity', col_name+'_hist')
                         for col in props.columns]
        props = props.rename(columns={'label': 'STAND_LABEL'})

        all_props.append(props)
        if verbose:
            print(channel, end='...')

    results = pd.concat(all_props, axis=1)

    # reserve the leading metadata columns, then fill them in
    meta_cols = ['PATH', 'CELL_ID', 'STATE_NAME', 'AGENCY',
                 'YEAR', 'XMIN', 'YMIN', 'XMAX', 'YMAX']
    for position, col in enumerate(meta_cols):
        results.insert(position, col, np.nan)

    results[['PATH', 'CELL_ID', 'STATE_NAME']] = path, cell_id, state_name
    results[['AGENCY', 'YEAR']] = agency, year
    results[['XMIN', 'YMIN', 'XMAX', 'YMAX']] = bbox

    return results

# Splitting Data into Training and Validation Subsets
For each "agency" (USFS is subdivided by National Forests), we choose 80% of the tiles for model training and set 20% aside for model validation.

Once the train/test split has been generated and saved, the code that produced it is commented out here so that it is not re-run, since re-running it would change the train/test membership assignments.

In [None]:
# stands_df['train0_test1'] = -1
# for agency in pd.unique(stands_df.AGENCY):
#     agency_idx = stands_df.loc[stands_df.AGENCY == agency].index
#     train_idx = np.random.choice(agency_idx, int(len(agency_idx)*0.8), replace=False)
#     stands_df.loc[train_idx, 'train0_test1'] = 0
#     stands_df.loc[agency_idx.difference(train_idx), 'train0_test1'] = 1

# stands_df.to_csv('/content/drive/Shareddrives/stand_mapping/data/interim/land_cover_modeling/stand_layers.csv', index=False, header=True)

In [None]:
# Reload the saved stand-layer table, which carries the `train0_test1`
# membership column (0 = training, 1 = validation) used below.
stands_df = pd.read_csv('/content/drive/Shareddrives/stand_mapping/data/interim/land_cover_modeling/stand_layers.csv')

In [None]:
# Check the reloaded table's columns, dtypes and row count.
stands_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1447 entries, 0 to 1446
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   DIRNAME       1447 non-null   object
 1   CELL_ID       1447 non-null   int64 
 2   STATE_NAME    1447 non-null   object
 3   YEAR          1447 non-null   int64 
 4   AGENCY        1447 non-null   object
 5   PATH          1447 non-null   object
 6   train0_test1  1447 non-null   int64 
dtypes: int64(3), object(4)
memory usage: 79.3+ KB


## Collecting Train- and Test-Set Samples 
Here, we select a sample of polygons from each tile to summarize for our train and test datasets.

In [None]:
SAMPLE_STANDS_PER_TILE = 10
ALL_PATHS = stands_df.loc[stands_df.train0_test1 == 0, 'PATH'].values
OUT_CSV = '/content/drive/Shareddrives/stand_mapping/data/interim/land_cover_modeling/train_histograms.csv'
OVERWRITE = False

# Resume from a previous run unless OVERWRITE is set: tiles whose path already
# appears in the output CSV are skipped.
if os.path.exists(OUT_CSV) and not OVERWRITE:
    all_results = pd.read_csv(OUT_CSV)
    finished = set(all_results.PATH.values)
    already_done = len([path for path in ALL_PATHS if path in finished])
    left_to_do = [path for path in ALL_PATHS if path not in finished]
else:
    all_results = pd.DataFrame()
    already_done = 0
    left_to_do = ALL_PATHS

with tqdm(total=len(ALL_PATHS), desc='loaded') as loaded, \
     tqdm(total=len(ALL_PATHS), desc='labeled') as labeled, \
     tqdm(total=len(ALL_PATHS), desc='summarized') as summarized, \
     tqdm(total=len(ALL_PATHS), desc='saved') as saved:

    loaded.update(already_done)
    labeled.update(already_done)
    summarized.update(already_done)
    saved.update(already_done)

    for path in left_to_do:
        # `except Exception` (not a bare `except`) so that interrupting the
        # kernel still stops this long-running loop.
        try:
            stacked, stands, trf, bbox = load_data(path)
        except Exception:  # do one retry, else skip
            try:
                stacked, stands, trf, bbox = load_data(path)
            except Exception as err:
                print('Failed to load', os.path.basename(path), repr(err))
                continue
        stands = stands.sample(n=min(len(stands), SAMPLE_STANDS_PER_TILE), replace=False)
        cover_types = np.asarray(stands['COVER_TYPE'].values)
        loaded.update()
        height, width = stacked.shape[0], stacked.shape[1]
        labels = rasterize_polygons(stands, out_shape=(height, width), transform=trf)
        labeled.update()
        try:
            results = summarize_labels(path, bbox, labels, stacked)
            if len(results) == 0:
                raise ValueError('no sampled stand covers any pixel')
            # Label k was burned from the k-th sampled stand. Look cover types
            # up by label rather than by position, so that a stand which ended
            # up with no pixels (too small, or overwritten by an overlapping
            # stand) does not invalidate the whole tile.
            label_idx = results['STAND_LABEL'].values.astype(int) - 1
            results['COVER_TYPE'] = cover_types[label_idx]
        except Exception as err:
            print('Failed to summarize', os.path.basename(path), repr(err))
            continue
        summarized.update()
        # DataFrame.append was removed in pandas 2.0; concat is the equivalent
        all_results = pd.concat([all_results, results], ignore_index=True, sort=False)
        # rewrite the CSV after every tile so an interrupted run can resume
        all_results.to_csv(OUT_CSV, index=False, header=True)
        saved.update()

HBox(children=(FloatProgress(value=0.0, description='loaded', max=1154.0, style=ProgressStyle(description_widt…

HBox(children=(FloatProgress(value=0.0, description='labeled', max=1154.0, style=ProgressStyle(description_wid…

HBox(children=(FloatProgress(value=0.0, description='summarized', max=1154.0, style=ProgressStyle(description_…

HBox(children=(FloatProgress(value=0.0, description='saved', max=1154.0, style=ProgressStyle(description_width…

Failed to summarize 125655_stands_willamette-usfs_2011.geojson
Failed to summarize 320211_stands_willamette-usfs_2011.geojson
Failed to summarize 240668_stands_dnr_2009.geojson
Failed to summarize 218187_stands_dnr_2013.geojson
Failed to summarize 311523_stands_dnr_2013.geojson
Failed to summarize 260281_stands_dnr_2013.geojson
Failed to summarize 141049_stands_dnr_2013.geojson
Failed to summarize 195721_stands_dnr_2013.geojson
Failed to summarize 289035_stands_dnr_2013.geojson
Failed to summarize 288004_stands_dnr_2013.geojson
Failed to summarize 312428_stands_dnr_2013.geojson
Failed to summarize 157454_stands_dnr_2013.geojson
Failed to summarize 169129_stands_dnr_2013.geojson
Failed to summarize 183057_stands_dnr_2013.geojson
Failed to summarize 307382_stands_dnr_2013.geojson
Failed to summarize 271108_stands_dnr_2013.geojson
Failed to summarize 270358_stands_dnr_2013.geojson
Failed to summarize 289127_stands_dnr_2013.geojson
Failed to summarize 276784_stands_dnr_2013.geojson
Failed 

In [None]:
SAMPLE_STANDS_PER_TILE = 10
ALL_PATHS = stands_df.loc[stands_df.train0_test1 == 1, 'PATH'].values
OUT_CSV = '/content/drive/Shareddrives/stand_mapping/data/interim/land_cover_modeling/validation_histograms.csv'
OVERWRITE = False

# Resume from a previous run unless OVERWRITE is set: tiles whose path already
# appears in the output CSV are skipped.
if os.path.exists(OUT_CSV) and not OVERWRITE:
    all_results = pd.read_csv(OUT_CSV)
    finished = set(all_results.PATH.values)
    already_done = len([path for path in ALL_PATHS if path in finished])
    left_to_do = [path for path in ALL_PATHS if path not in finished]
else:
    all_results = pd.DataFrame()
    already_done = 0
    left_to_do = ALL_PATHS

with tqdm(total=len(ALL_PATHS), desc='loaded') as loaded, \
     tqdm(total=len(ALL_PATHS), desc='labeled') as labeled, \
     tqdm(total=len(ALL_PATHS), desc='summarized') as summarized, \
     tqdm(total=len(ALL_PATHS), desc='saved') as saved:

    loaded.update(already_done)
    labeled.update(already_done)
    summarized.update(already_done)
    saved.update(already_done)

    for path in left_to_do:
        # `except Exception` (not a bare `except`) so that interrupting the
        # kernel still stops this long-running loop.
        try:
            stacked, stands, trf, bbox = load_data(path)
        except Exception:  # do one retry, else skip
            try:
                stacked, stands, trf, bbox = load_data(path)
            except Exception as err:
                print('Failed to load', os.path.basename(path), repr(err))
                continue
        stands = stands.sample(n=min(len(stands), SAMPLE_STANDS_PER_TILE), replace=False)
        cover_types = np.asarray(stands['COVER_TYPE'].values)
        loaded.update()
        height, width = stacked.shape[0], stacked.shape[1]
        labels = rasterize_polygons(stands, out_shape=(height, width), transform=trf)
        labeled.update()
        try:
            results = summarize_labels(path, bbox, labels, stacked)
            if len(results) == 0:
                raise ValueError('no sampled stand covers any pixel')
            # Label k was burned from the k-th sampled stand. Look cover types
            # up by label rather than by position, so that a stand which ended
            # up with no pixels (too small, or overwritten by an overlapping
            # stand) does not invalidate the whole tile.
            label_idx = results['STAND_LABEL'].values.astype(int) - 1
            results['COVER_TYPE'] = cover_types[label_idx]
        except Exception as err:
            print('Failed to summarize', os.path.basename(path), repr(err))
            continue
        summarized.update()
        # DataFrame.append was removed in pandas 2.0; concat is the equivalent
        all_results = pd.concat([all_results, results], ignore_index=True, sort=False)
        # rewrite the CSV after every tile so an interrupted run can resume
        all_results.to_csv(OUT_CSV, index=False, header=True)
        saved.update()

HBox(children=(FloatProgress(value=0.0, description='loaded', max=293.0, style=ProgressStyle(description_width…

HBox(children=(FloatProgress(value=0.0, description='labeled', max=293.0, style=ProgressStyle(description_widt…

HBox(children=(FloatProgress(value=0.0, description='summarized', max=293.0, style=ProgressStyle(description_w…

HBox(children=(FloatProgress(value=0.0, description='saved', max=293.0, style=ProgressStyle(description_width=…

Failed to summarize 143584_stands_dnr_2013.geojson
Failed to summarize 141713_stands_dnr_2013.geojson




