In [1]:
! pip install geopandas -q

[K     |████████████████████████████████| 972kB 3.2MB/s 
[K     |████████████████████████████████| 10.9MB 10.7MB/s 
[K     |████████████████████████████████| 14.7MB 298kB/s 
[?25h

In [2]:
import ee
import os
import geopandas as gpd
import pandas as pd
import numpy as np

from datetime import datetime
from datetime import timedelta

## Get Earth Engine Running
To access GEE, we will need to authenticate our account, and then initialize a connection to a server. 

In [3]:
ee.Authenticate()

To authorize access needed by Earth Engine, open the following URL in a web browser and follow the instructions. If the web browser does not start automatically, please manually browse the URL below.

    https://accounts.google.com/o/oauth2/auth?client_id=517222506229-vsmmajv00ul0bs7p89v5m89qs8eb9359.apps.googleusercontent.com&scope=https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fearthengine+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdevstorage.full_control&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&response_type=code&code_challenge=lGX-skx9Aj5SRwBawz8ICXwSGJrWvyaJb5TbdoczRCs&code_challenge_method=S256

The authorization workflow will generate a code, which you should paste in the box below. 
Enter verification code: 4/3AFSQp5S9ZstcW3loJgssSpCZ2ImUYKpQmEFOIf5HjBaYlKObixwVpU

Successfully saved authorization token.


In [4]:
ee.Initialize()

# Mount Google Drive
We have shapefiles containing the geospatial boundaries of the map tiles we'll be gathering data from on our Google Drive.

To mount our Drive and access our files, we have to authenticate with Google Drive first.

In [5]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/drive


# Retrieve LANDSAT Tiles
For each of the tiles in a GeoDataFrame, and each year for which we want to get the LANDSAT imagery, we will filter the LANDSAT collection from GEE to identify the image that includes the centroid of our tile. We will then export that LANDSAT image to our Google Drive.

Given the frequency of LANDSAT imaging (LANDSAT 8 has a 16-day repeat cycle), we will composite LANDSAT images to match particular timeframes for each year we are interested in. 

We will composite images during these timeframes, avoiding scenes that have high cloud cover and masking pixels which are likely to be clouds.  There are three timeframes from which we want LANDSAT imagery:
1. **Near-NAIP:** Near the time when NAIP (aerial) 4-band imagery was acquired. The R, G, B, and NIR bands of the LANDSAT composite will be extracted and used to transform the NAIP imagery values (which are "digital numbers" from 1-255) into surface reflectance values using histogram matching. NAIP imagery is typically flown in the August-September timeframe in the years it is acquired.
2. **Leaf-On:** Composite (median value for each band) from April 1 - September 30. This image will be used to training models to recognize forest stand boundaries and other conditions. 
3. **Leaf-Off:** Composite (median value for each band) from October 1 - March 31. This image will be used to training models to recognize forest stand boundaries and other conditions.

In [6]:
naip = ee.ImageCollection('USDA/NAIP/DOQQ')
landsat8 = ee.ImageCollection('LANDSAT/LC08/C01/T1_SR')
landsat7 = ee.ImageCollection("LANDSAT/LE07/C01/T1_SR")

In [7]:
def mask_l8_sr(image):
    """Masks out pixels likely to be cloud or shadow in a LANDSAT 8 image"""
    cloud_shadow_bit_mask = (1 << 3)  # bit 3 is shadow in QA band
    clouds_bit_mask = (1 << 5)  # bit 5 is cloud in QA band
    
    # Get the pixel QA band.
    qa = image.select('pixel_qa')
    # Both flags should be set to zero, indicating clear conditions.
    mask = qa.bitwiseAnd(cloud_shadow_bit_mask).eq(0) and (qa.bitwiseAnd(clouds_bit_mask).eq(0))
    return image.updateMask(mask)

## Fetching LANDSAT, Part I: Near-NAIP
Get 4-band landsat imagery near the date of NAIP aerial imagery, and export the resulting imagery to the same folder that we're holding the raw NAIP imagery in. 

In [8]:
def get_near_naip(gdf, state, years):
    tasks = {}
    naip_crosswalk = {}

    print('Retrieving images for {:,d} features in GeoDataFrame'.format(len(gdf)))
    for year in years:
        folder = f'naip_tiles-{state}-{year}'
        print('\n\n', year)
        for idx, row in gdf.iterrows():
            # get centroid of the tile and find the NAIP image(s) it falls within
            bbox_point = row['geometry'].centroid.buffer(1).bounds
            geom_point = ee.Geometry.Rectangle(bbox_point, 
                                               proj=f'EPSG:{gdf.crs.to_epsg()}', 
                                               evenOdd=True, 
                                               geodesic=False)

            naip_coll = naip.filterBounds(geom_point).filterDate(f'{year}-01-01', 
                                                                 f'{year}-12-31')
            naip_list = naip_coll.toList(naip_coll.size())

            num_images = naip_coll.size().getInfo()
            if num_images == 0:
                print('\n', f'CELL_ID {row.CELL_ID} matches no NAIP tiles')
            if num_images > 1:
                print('\n', f'CELL_ID {row.CELL_ID} matches {num_images} NAIP tiles')

            for i in range(num_images):
                img = ee.Image(naip_list.get(i))

                try:
                    name = img.id().getInfo()
                    naip_crosswalk[name] = {'tile_id': row['CELL_ID'], 
                                            'nearnaip_landsat': False}
                except ee.EEException:  # no match was found
                    print('\n', 
                          f'CELL_ID {row.CELL_ID} erred on NAIP tile #{i}')
                    continue  # on to next image in img_list

                # get the date of the naip image with a +/- 2 week window for 
                # filtering landsat images
                naip_date = datetime.strptime(name.split('_')[-1], '%Y%m%d')
                early_date = (naip_date - timedelta(days=14)).strftime('%Y-%m-%d')
                late_date = (naip_date + timedelta(days=14)).strftime('%Y-%m-%d')

                # get the tile bounding box for filtering landsat images
                bbox_tile = row['geometry'].bounds
                xmin, ymin, xmax, ymax = bbox_tile
                (xmin, ymin) = np.floor((xmin, ymin))
                (xmax, ymax) = np.ceil((xmax, ymax))
                geom_tile = ee.Geometry.Rectangle(bbox_tile, 
                                                  proj=f'EPSG:{gdf.crs.to_epsg()}', 
                                                  evenOdd=True, 
                                                  geodesic=False)
                
                if year < 2013:
                  landsat = landsat7
                  RBGNIR = ['B3', 'B2', 'B1', 'B4']
                else:
                  landsat = landsat8
                  RBGNIR = ['B4', 'B3', 'B2', 'B5']
                
                # filter landsat collection
                landsat_coll = landsat.filterBounds(geom_tile).filterDate(early_date, late_date)
                # mask clouds and get R, G, B, NIR bands, calculating median value across this timeframe
                landsat_img = landsat_coll.map(mask_l8_sr).select(RBGNIR).reduce('median')
                
                # if no landsat tier 1 images are found in this 4-week window
                if len(landsat_img.bandNames().getInfo()) == 0:
                    print('\n', f'CELL_ID {row.CELL_ID} matches no LANDSAT tiles from {early_date} to {late_date}')
                    continue
                
                naip_crosswalk[name]['nearnaip_landsat'] = True 
                # submit a task to the server to export the image to our Drive
                if not os.path.exists(
                    os.path.join('/content/drive/My Drive', 
                                 folder, 
                                 name+'_landsat' + '.tif')):
                
                    task = ee.batch.Export.image.toDrive(landsat_img,
                                                        description=name+'_landsat', 
                                                        folder=folder,
                                                        region=geom_tile, 
                                                        crs=f'EPSG:{gdf.crs.to_epsg()}')
                    
                    task.start()
                    tasks[idx] = task  # keep track of our tasks in a dictionary

            # report progress
            if idx % 100 == 0 and idx > 0:
                print()
            if idx % 10 == 0:
                print(idx, end='')
            else:
                print('.', end='')
    return tasks, naip_crosswalk

In [9]:
SHP_DIR = '/content/drive/Shared drives/stand_mapping/data/interim'

WA11_SHP = 'washington_utm11n_training_quads_epsg6340.shp'
WA10_SHP = 'washington_utm10n_training_quads_epsg6339.shp'
OR10_SHP = 'oregon_utm10n_training_quads_epsg6339.shp'
OR11_SHP = 'oregon_utm11n_training_quads_epsg6340.shp'

or10_gdf = gpd.read_file(os.path.join(SHP_DIR, OR10_SHP))
or11_gdf = gpd.read_file(os.path.join(SHP_DIR, OR11_SHP))
wa10_gdf = gpd.read_file(os.path.join(SHP_DIR, WA10_SHP))
wa11_gdf = gpd.read_file(os.path.join(SHP_DIR, WA11_SHP))

## Go get the near-NAIP LANDSAT imagery...

In [10]:
GDF = wa10_gdf
STATE = 'WA'
YEARS = [2009, 2011, 2013, 2015, 2017]
wa10_tasks, wa10_crosswalk = get_near_naip(GDF, STATE, YEARS)

Retrieving images for 277 features in GeoDataFrame


 2009
0.........10.........20
 CELL_ID 215589 matches 2 NAIP tiles
.
 CELL_ID 215591 matches 2 NAIP tiles
........30.........40.........50.........60.........70.........80.........90.........
100....
 CELL_ID 133123 matches 2 NAIP tiles
.....110.........120.........130.........140.........150.........160.........170.........180.........190.........
200.........210.........220.........230.........240.........250.........260.........270......

 2011
0.........10.........20
 CELL_ID 215589 matches 2 NAIP tiles
.
 CELL_ID 215591 matches 2 NAIP tiles
........30.........40.........50.........60.........70.........80.........90.........
100....
 CELL_ID 133123 matches 2 NAIP tiles
.....110.........120.........130.........140.........150.........160.........170.........180.........190.........
200.........210.........220.........230.........240.........250.........260.........270......

 2013
0.........10.........20.........30
 CELL_ID 30523

In [11]:
GDF = wa11_gdf
STATE = 'WA'
YEARS = [2009, 2011, 2013, 2015, 2017]
wa11_tasks, wa11_crosswalk = get_near_naip(GDF, STATE, YEARS)

Retrieving images for 82 features in GeoDataFrame


 2009
0.........10.........20.........30.........40.........50.........60.........70.........80.

 2011
0.........10.........20.........30.........40.........50.........60.........70.........80.

 2013
0.........10.........20.........30.........40.........50.........60.........70.........80.

 2015
0.........10.........20.........30.........40.........50.........60.........70.........80.

 2017
0.........10.........20.........30.........40.........50.........60.........70.........80.

In [12]:
GDF = or11_gdf
STATE = 'OR'
YEARS = [2009, 2011, 2012, 2014, 2016]
or11_tasks, or11_crosswalk = get_near_naip(GDF, STATE, YEARS)

Retrieving images for 524 features in GeoDataFrame


 2009
0.........10.........20.........30.........40.....
 CELL_ID 224665 matches 2 NAIP tiles
....50.........60.........70.........80.........90.........
100.........110.........120.........130.........140.........150.........160.........170
 CELL_ID 130202 matches 2 NAIP tiles
.........180.........190.........
200.........210.........220.........230......
 CELL_ID 176776 matches 2 NAIP tiles
...240..
 CELL_ID 201528 matches 2 NAIP tiles
.......250.........260.........270.........280.........290.........
300.........310.........320.
 CELL_ID 265271 matches 2 NAIP tiles
.....
 CELL_ID 259170 matches 2 NAIP tiles
...330..
 CELL_ID 277967 matches 2 NAIP tiles
.......340.........350.........360.........370.........380.........390.........
400.........410.........420.........430........
 CELL_ID 156185 matches 2 NAIP tiles
.
 CELL_ID 156187 matches 2 NAIP tiles
440.........450.........460.....
 CELL_ID 155353 matches 2 NAIP tiles
....470.

In [13]:
GDF = or10_gdf
STATE = 'OR'
YEARS = [2009, 2011, 2012, 2014, 2016]
or10_tasks, or10_crosswalk = get_near_naip(GDF, STATE, YEARS)

Retrieving images for 607 features in GeoDataFrame


 2009
0.........10.........20.........30.........40.........50.........60.........70.........80.........90.........
100.........110.........120..
 CELL_ID 254524 matches 2 NAIP tiles
.......130.........140.........150.........160.........170.........180.........190.........
200.........210.........220.........230.........240.........250.........260.........270.........280.........290.........
300.........310.........320.........330.........340.........350.........360.........370.
 CELL_ID 256194 matches 2 NAIP tiles
..
 CELL_ID 264469 matches 2 NAIP tiles
......380.........390.........
400.........410.........420.........430.........440.........450.........460.........470.........480.........490.........
500.........510.........520.........530.........540.........550.........560.........570.........580.........590.........
600......

 2011
0.........10.........20.........30.........40.........50.........60.........70.........80......

## Fetching LANDSAT, Part II: Leaf-On and Leaf-Off
Get 6-bands (R, G, B, NIR, SWIR 1, and SWIR2) of LANDSAT imagery for leaf-on (Apr 1 - Sept 30 of current year) and leaf-off (Oct 1 - Mar 31 from preceding to current year) periods for each tile. Export the resulting imagery to new LANDSAT folders. 

In [18]:
def get_landsat(gdf, state, years):
    tasks = {}
    landsat_crosswalk = {}

    print('Retrieving images for {:,d} features in GeoDataFrame'.format(len(gdf)))
    for year in years:
        folder = f'landsat_tiles-{state}-{year}'
        print('\n\n', year)
        for idx, row in gdf.iterrows():
            name = str(row['CELL_ID'])
            landsat_crosswalk[row['CELL_ID']] = {year: {'leaf-on': False, 
                                                 'leaf-off': False}}
            tasks[name] = {year: {'leaf-on': None, 'leaf-off': None}}
            # get the tile bounding box for filtering landsat images
            bbox_tile = row['geometry'].bounds
            xmin, ymin, xmax, ymax = bbox_tile
            (xmin, ymin) = np.floor((xmin, ymin))
            (xmax, ymax) = np.ceil((xmax, ymax))
            geom_tile = ee.Geometry.Rectangle(bbox_tile, 
                                              proj=f'EPSG:{gdf.crs.to_epsg()}', 
                                              evenOdd=True, 
                                              geodesic=False)
            
            if year < 2013:
                landsat = landsat7
                BANDS = ['B3', 'B2', 'B1', 'B4', 'B5', 'B7']
            else:
                landsat = landsat8
                BANDS = ['B4', 'B3', 'B2', 'B5', 'B6', 'B7']
            
            # filter landsat collection for leaf-on images
            leafon_start, leafon_stop = f'{year}-04-01', f'{year}-09-30'
            leafon_landsat_coll = landsat.filterBounds(geom_tile).filterDate(leafon_start, leafon_stop)
            # mask clouds and calculate median value for all bands across this timeframe
            leafon_img = leafon_landsat_coll.map(mask_l8_sr).select(BANDS).reduce('median')
            
            # if no landsat tier 1 images are found in this window
            if len(leafon_img.bandNames().getInfo()) == 0:
                print('\n', f'CELL_ID {row.CELL_ID} matches no LANDSAT tiles from {leafon_start} to {leafon_stop}')
          
            if len(leafon_img.bandNames().getInfo()) > 0:
                landsat_crosswalk[row['CELL_ID']][year]['leaf-on'] = True
                if not os.path.exists(
                os.path.join('/content/drive/My Drive', 
                              folder, 
                              f'{name}_landsat-leaf-on_{year}.tif')):
            
                    task = ee.batch.Export.image.toDrive(leafon_img,
                                                        description=f'{name}_landsat-leaf-on_{year}', 
                                                        folder=folder,
                                                        region=geom_tile, 
                                                        crs=f'EPSG:{gdf.crs.to_epsg()}')
                    
                    task.start()
                    tasks[name][year]['leaf-on'] = task  # keep track of our tasks in a dictionary
            
            # filter landsat collection for leaf-off images
            leafoff_start, leafoff_stop = f'{year-1}-10-01', f'{year}-03-31'
            leafoff_landsat_coll = landsat.filterBounds(geom_tile).filterDate(leafoff_start, leafoff_stop)
            # mask clouds and calculate median value for all bands across this timeframe
            leafoff_img = leafoff_landsat_coll.map(mask_l8_sr).select(BANDS).reduce('median')
            
            
            # if no landsat tier 1 images are found in this window
            if len(leafoff_img.bandNames().getInfo()) == 0:
                print('\n', f'CELL_ID {row.CELL_ID} matches no LANDSAT tiles from {leafoff_start} to {leafoff_stop}')
            if len(leafoff_img.bandNames().getInfo()) > 0:
                landsat_crosswalk[row['CELL_ID']][year]['leaf-off'] = True
                if not os.path.exists(
                os.path.join('/content/drive/My Drive', 
                              folder, 
                              f'{name}_landsat-leaf-off_{year}.tif')):
            
                    task = ee.batch.Export.image.toDrive(leafoff_img,
                                                         description=f'{name}_landsat-leaf-off_{year}', 
                                                         folder=folder,
                                                         region=geom_tile, 
                                                         crs=f'EPSG:{gdf.crs.to_epsg()}')
                    task.start()
                    tasks[name][year]['leaf-off'] = task  # keep track of our tasks in a dictionary

            # report progress
            if idx % 100 == 0 and idx > 0:
                print()
            if idx % 10 == 0:
                print(idx, end='')
            else:
                print('.', end='')
    return tasks, landsat_crosswalk

In [24]:
GDF = wa10_gdf
STATE = 'WA'
YEARS = [2009, 2011, 2013, 2015, 2017]
wa10_landsat_tasks, wa10_landsat_crosswalk = get_landsat(GDF, STATE, YEARS)

Retrieving images for 277 features in GeoDataFrame


 2009
0.........10.........20.........30.........40.........50.........60.........70.........80.........90.........
100.........110.........120.........130.........140.........150.........160.........170.........180.........190.........
200.........210.........220.........230.........240.........250.........260.........270......

 2011
0.........10.........20.........30.........40.........50.........60.........70.........80.........90.........
100.........110.........120.........130.........140.........150.........160.........170.........180.........190.........
200.........210.........220.........230.........240.........250.........260.........270......

 2013
0.........10..
 CELL_ID 311521 matches no LANDSAT tiles from 2013-04-01 to 2013-09-30
.
 CELL_ID 311523 matches no LANDSAT tiles from 2013-04-01 to 2013-09-30
...
 CELL_ID 288003 matches no LANDSAT tiles from 2013-04-01 to 2013-09-30
...20.........
 CELL_ID 209289 matches no L

In [12]:
GDF = wa11_gdf
STATE = 'WA'
YEARS = [2009, 2011, 2013, 2015, 2017]
wa11_landsat_tasks, wa11_landsat_crosswalk = get_landsat(GDF, STATE, YEARS)

Retrieving images for 82 features in GeoDataFrame


 2009
0.........10.........20.........30.........40.........50.........60.........70.........80.

 2011
0.........10.........20.........30.........40.........50.........60.........70.........80.

 2013
0.....
 CELL_ID 300233 matches no LANDSAT tiles from 2012-10-01 to 2013-03-31
..
 CELL_ID 245170 matches no LANDSAT tiles from 2012-10-01 to 2013-03-31
..10
 CELL_ID 264095 matches no LANDSAT tiles from 2012-10-01 to 2013-03-31
.
 CELL_ID 282627 matches no LANDSAT tiles from 2012-10-01 to 2013-03-31
.......
 CELL_ID 245172 matches no LANDSAT tiles from 2012-10-01 to 2013-03-31
.20........
 CELL_ID 186717 matches no LANDSAT tiles from 2012-10-01 to 2013-03-31
.
 CELL_ID 185264 matches no LANDSAT tiles from 2012-10-01 to 2013-03-31
30.......
 CELL_ID 264093 matches no LANDSAT tiles from 2012-10-01 to 2013-03-31
.
 CELL_ID 264096 matches no LANDSAT tiles from 2012-10-01 to 2013-03-31
.40.........50.
 CELL_ID 319662 matches no LANDSAT tiles

In [19]:
GDF = or10_gdf
STATE = 'OR'
YEARS = [#2009, 2011, 
         2012, 2014, 2016]
or10_landsat_tasks, or10_landsat_crosswalk = get_landsat(GDF, STATE, YEARS)

Retrieving images for 607 features in GeoDataFrame


 2012
0.........10.........20.........30.........40.........50.........60.........70.........80.........90.........
100.........110.........120.........130.........140.........150.........160.........170.........180.........190.........
200.........210.........220.........230.........240.........250.........260.........270.........280.........290.........
300.........310.........320.........330.........340.........350.........360.........370.........380.........390.........
400.........410.........420.........430.........440.........450.........460.........470.........480.........490.........
500.........510.........520.........530.........540.........550.........560.........570.........580.........590.........
600......

 2014
0.........10.........20.........30.........40.........50.........60.........70.........80.........90.........
100.........110.........120.........130.........140.........150.........160.........170.........180

In [24]:
pd.DataFrame.from_dict({(i,j): or10_landsat_crosswalk[i][j] 
                        for i in or10_landsat_crosswalk.keys() 
                        for j in or10_landsat_crosswalk[i].keys()},
                       orient='index')

Unnamed: 0,Unnamed: 1,leaf-on,leaf-off
298861,2016,True,True
298863,2016,True,True
303209,2016,True,True
303211,2016,True,True
303228,2016,True,True
...,...,...,...
315407,2016,True,True
314977,2016,True,True
314979,2016,True,True
315749,2016,True,True


In [None]:
GDF = or11_gdf
STATE = 'OR'
YEARS = [2009, 2011, 2012, 2014, 2016]
or11_landsat_tasks, or11_landsat_crosswalk = get_landsat(GDF, STATE, YEARS)

## Compile Tiles with Missing and Extra Data
Some of our USGS Quarter Quad tiles did not have NAIP imagery, near-NAIP LANDSAT, or leaf-on or leaf-off LANDSAT images during the years and timespans we specified. We want to identify these tiles so we can ultimately remove them from the training dataset.

In [17]:
NAIP_CROSSWALKS = [wa10_crosswalk, wa11_crosswalk, or10_crosswalk, or11_crosswalk]
LANDSAT_CROWSSWALKS = [wa10_landsat_crosswalk, wa11_landsat_crosswalk, or10_landsat_crosswalk, or11_landsat_crosswalk]
OUTFILE = '/content/drive/Shared drives/stand_mapping/data/interim/naip_and_landsat_tiles.csv'

naip_dfs = [pd.DataFrame.from_dict(d, orient='index') for d in NAIP_CROSSWALKS]
naip_crosswalk = pd.concat(naip_dfs).reset_index().rename({'index': 'naip_id'}, axis=1)

landsat_dfs = [pd.DataFrame.from_dict(d, orient='index') for d in LANDSAT_CROWSSWALKS]
landsat_crosswalk = pd.concat(landsat_dfs).reset_index().rename({'index': 'tile_id'}, axis=1)

# crosswalk_df = naip_crosswalk.merge(landsat_crosswalk, on=['tile_id'], how='outer')
# crosswalk_df.to_csv(OUTFILE, index=False, header=True)

# crosswalk_df.head()

Unnamed: 0,naip_id,tile_id,nearnaip_landsat
0,m_4811925_se_11_1_20090630,293740,True
1,m_4811916_nw_11_1_20090630,215381,True
2,m_4811916_ne_11_1_20090630,215383,True
3,m_4811918_nw_11_1_20090630,288473,True
4,m_4811918_ne_11_1_20090630,288475,True
...,...,...,...
7478,m_4512142_ne_10_1_20160804,315407,True
7479,m_4312017_nw_10_1_20160621,314977,True
7480,m_4312017_ne_10_1_20160621,314979,True
7481,m_4512133_nw_10_1_20160804,315749,True
