In [None]:
# Install the geospatial packages imported below that this notebook installs itself.
# NOTE(review): versions are not pinned, so a re-run may pull newer releases.
! pip install geopandas rasterio -q

[K     |████████████████████████████████| 972kB 3.0MB/s 
[K     |████████████████████████████████| 18.3MB 1.3MB/s 
[K     |████████████████████████████████| 14.8MB 295kB/s 
[K     |████████████████████████████████| 10.9MB 42.8MB/s 
[?25h

In [None]:
import os
import glob

import numpy as np
import geopandas as gpd
import rasterio
import ee

import requests
from io import BytesIO
from zipfile import ZipFile
from concurrent.futures import as_completed, ThreadPoolExecutor
from tqdm.notebook import tqdm

from datetime import datetime
from datetime import timedelta

## Get Earth Engine Running
To access GEE, we will need to authenticate our account, and then initialize a connection to a server. 

In [None]:
# Interactive sign-in: prints an authorization URL and prompts for the
# verification code it generates.
ee.Authenticate()

To authorize access needed by Earth Engine, open the following URL in a web browser and follow the instructions. If the web browser does not start automatically, please manually browse the URL below.

    https://accounts.google.com/o/oauth2/auth?client_id=517222506229-vsmmajv00ul0bs7p89v5m89qs8eb9359.apps.googleusercontent.com&scope=https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fearthengine+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdevstorage.full_control&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&response_type=code&code_challenge=nViuAj4j99FcNEsSoqFBxA3J8EFSFkrB8HkXqHlGwpA&code_challenge_method=S256

The authorization workflow will generate a code, which you should paste in the box below. 
Enter verification code: <redacted>

Successfully saved authorization token.


In [None]:
# Initialize the connection to Earth Engine that the ee.* calls below rely on.
ee.Initialize()

# Mount Google Drive
We have shapefiles containing the geospatial boundaries of the map tiles we'll be gathering data from on our Google Drive.

To mount our Drive and access our files, we have to authenticate with Google Drive first.

In [None]:
from google.colab import drive
# Mount Drive at /content/drive; the shapefile and output paths used later in
# this notebook all live under this mount point.
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


# Retrieve LANDSAT Tiles
For each of the tiles in a GeoDataFrame, and each year for which we want to get the LANDSAT imagery, we will filter the LANDSAT collection from GEE to our Area of Interest. Given the frequency of LANDSAT imaging (LANDSAT 8 has a 16-day repeat cycle), we will composite LANDSAT images using a medoid method, filtering out pixels with high cloud cover or cloud shadows. Our composite images will cover two timeframes in each year we are interested in:

1. **Leaf-On:** April 1 - September 30
2. **Leaf-Off:** October 1 (preceding year) - March 31 (current year)

## Image Processing Functions
The following are helper functions that work on individual images that we will integrate into the workflow.

In [None]:
def mask_stuff(image):
    """Hides pixels flagged as cloud or cloud shadow in an image that carries
    a `pixel_qa` band.

    A pixel stays visible only when both the cloud-shadow flag (bit 3) and
    the cloud flag (bit 5) of `pixel_qa` are unset.
    """
    SHADOW_FLAG = 1 << 3  # QA bit 3: cloud shadow
    CLOUD_FLAG = 1 << 5  # QA bit 5: cloud

    pixel_qa = image.select('pixel_qa')

    # test each flag separately, then require both tests to pass
    shadow_free = pixel_qa.bitwiseAnd(SHADOW_FLAG).eq(0)
    cloud_free = pixel_qa.bitwiseAnd(CLOUD_FLAG).eq(0)
    clear = shadow_free.And(cloud_free)

    return image.updateMask(clear)

def harmonize_to_oli(image):
    """Linearly adjusts the reflectance bands of an earlier-sensor image so
    they more closely match LANDSAT 8 OLI, using the coefficients from:

        Roy et al. (2016). "Characterization of Landsat-7 to Landsat-8
        reflective wavelength and normalized difference vegetation index
        continuity." Remote Sensing of Environment (185): 57–70.
        https://doi.org/10.1016/j.rse.2015.12.024

    The image must expose bands named R, G, B, NIR, SWIR1 and SWIR2. The
    result holds those six bands as `band * slope + intercept`, rounded and
    cast to short integers.
    """
    band_order = ['R', 'G', 'B', 'NIR', 'SWIR1', 'SWIR2']

    # coefficients are listed in band_order (R, G, B, NIR, SWIR1, SWIR2)
    slopes = ee.Image.constant(
        [0.9047, 0.8483, 0.8474, 0.8462, 0.8937, 0.9071]
    )
    # intercepts are multiplied by 10000 so they share the scaling of the
    # reflectance values they are added to
    intercepts = ee.Image.constant(
        [0.0061, 0.0088, 0.0003, 0.0412, 0.0254, 0.0172]
    ).multiply(10000)

    scaled = image.select(band_order).multiply(slopes)
    shifted = scaled.add(intercepts)

    return shifted.round().toShort()

# def export_landsat_to_drive(image, description, folder, region, crs, 
#                             overwrite=False):
#     """Starts a task on Google Earth Engine to export an image to Google Drive.
#     """
#     outpath = os.path.join('/content/drive/My Drive', folder, f'{description}.tif')
#     if (not os.path.exists(outpath)) or overwrite:
#         task = ee.batch.Export.image.toDrive(image,
#                                              description=description,
#                                              folder=folder,
#                                              scale=30,
#                                              region=region, 
#                                              crs=crs,
#                                              formatOptions={'cloudOptimized':True})
                
#           task.start()
#      return

## Collection Processing Functions
The following functions work on Google Earth Engine ImageCollections. 

In [None]:
def get_landsat_collection(aoi, year, start_date, end_date):
    """Returns a LANDSAT 5 or LANDSAT 8 surface reflectance collection
    filtered to a specific area of interest and timeframe.

    Parameters
    ----------
    aoi : ee.Geometry
        Area of interest passed to `filterBounds`.
    year : int
        Year used to choose the sensor: LANDSAT 5 (`LT05`) for 2011 and
        earlier, LANDSAT 8 (`LC08`) for 2014 and later.
    start_date, end_date : str
        Date range passed to `filterDate`.

    Returns
    -------
    coll : ee.ImageCollection

    Raises
    ------
    ValueError
        If `year` is 2012 or 2013, for which no sensor is selected.
    """
    if year <= 2011:
        sensor = 'LT05'
    elif year >= 2014:
        sensor = 'LC08'
    else:
        # we are not grabbing landsat imagery for 2012 or 2013 because we
        # can't get a single sensor that includes both windows
        # other than LANDSAT 7 which is suffering from scan line errors
        # during this timeframe
        raise ValueError(
            f'No LANDSAT sensor is configured for {year}; '
            'use a year <= 2011 or >= 2014.'
        )

    # define the collection we'll get data from
    coll = ee.ImageCollection(f'LANDSAT/{sensor}/C01/T1_SR')\
           .filterBounds(aoi).filterDate(start_date, end_date)

    return coll


def get_medoid(collection, bands=['R','G','B','NIR','SWIR1', 'SWIR2']):
    """Builds a medoid composite from the images of an image collection.

    Each image is scored by how close its `bands` values are to the per-band
    median of the whole collection, and the composite takes every pixel from
    the best-scoring image.

    Adapted to Python from a Javascript version here:
    https://github.com/google/earthengine-community/blob/73178fa9e0fd370783f871fe73eb38912f4c8bb9/toolkits/landcover/impl/composites.js#L88
    """
    # per-band median across the collection
    band_median = collection.select(bands).median()

    def score_image(image):
        """Appends a `medoid_distance` band holding the negated squared
        distance of each pixel from the collection median.

        This function is nested in `get_medoid` because it needs the median
        of the collection that contains the image.
        """
        spectral = image.select(bands)
        # negate the distance so the closest pixel has the highest score
        score = spectral.spectralDistance(band_median, 'sed')\
                .multiply(-1.0)\
                .rename('medoid_distance')

        return image.addBands(score)

    scored = collection.map(score_image)

    # qualityMosaic keeps, per pixel, the image with the highest value in the
    # named band
    composite = scored.qualityMosaic('medoid_distance')

    # drop the helper band from the result
    keep = composite.bandNames().remove('medoid_distance')

    return composite.select(keep)

def get_landsat_composite(aoi, year, start_date, end_date):
    """Returns a single medoid composite image from LANDSAT 5 or LANDSAT 8
    for an area of interest within a specified timeframe.

    Masks cloud and cloud shadow pixels, renames the sensor-specific bands to
    R, G, B, NIR, SWIR1 and SWIR2, and applies the linear adjustment based on
    Roy et al. (2016) to LANDSAT 5 composites for consistency with LANDSAT 8
    images.

    Parameters
    ----------
    aoi : ee.Geometry
        Area of interest.
    year : int
        Year used to choose the sensor: LANDSAT 5 for 2011 and earlier,
        LANDSAT 8 for 2014 and later.
    start_date, end_date : str
        Date range used to filter the collection.

    Returns
    -------
    img : ee.Image

    Raises
    ------
    ValueError
        If `year` is 2012 or 2013, for which no sensor is selected.
    """
    if year <= 2011:
        sensor, bands = 'LT05', ['B3', 'B2', 'B1', 'B4', 'B5', 'B7']
    elif year >= 2014:
        sensor, bands = 'LC08', ['B4', 'B3', 'B2', 'B5', 'B6', 'B7']
    else:
        # fail here with a clear message instead of an unbound-variable error
        raise ValueError(
            f'No LANDSAT sensor is configured for {year}; '
            'use a year <= 2011 or >= 2014.'
        )

    coll = get_landsat_collection(aoi, year, start_date, end_date)

    # mask clouds and cloud shadows, then give both sensors common band names
    masked = coll.map(mask_stuff)\
             .select(bands, ['R','G','B','NIR','SWIR1','SWIR2'])

    # get a medoid composite image
    medoid = get_medoid(masked)

    if sensor != 'LC08':
        # harmonize LANDSAT 5 reflectance to match LANDSAT 8
        return harmonize_to_oli(medoid)

    return medoid

## Bring it all together
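The following functions work in two steps: `get_landsat` prepares Earth Engine download URLs for the leaf-on and leaf-off LANDSAT composites of every tile in a GeoDataFrame, and `multithreaded_download` then fetches those files into our Google Drive folders.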

In [None]:
def get_landsat(gdf, state, years, skip_existing=True):
    """Prepares Google Earth Engine download URLs for leaf-on and leaf-off
    LANDSAT medoid composites of every tile in a GeoDataFrame, for each year
    requested.

    Nothing is downloaded here; the returned list is meant to be handed to
    `multithreaded_download`.

    Parameters
    ----------
    gdf : GeoDataFrame
        Tiles to process. Needs a 'CELL_ID' column, a 'geometry' column and a
        CRS that resolves to an EPSG code.
    state : str
        Lower-cased and used as a sub-directory of the output root.
    years : sequence of int
        Years to build composites for.
    skip_existing : bool, optional
        If True (default), no URL is requested for an output file that is
        already present in the output directory. Pass False to request every
        file, e.g. so the downloader re-validates files already on disk.

    Returns
    -------
    to_download : list of (url, filename, out_dir) tuples
        Leaf-on is listed before leaf-off for each tile.
    """
    print('Preparing download URLs for {:,d} tiles for {:,d} years'.format(len(gdf), len(years)))
    OUT_ROOT = '/content/drive/Shared drives/stand_mapping/data/interim/training_tiles/'
    to_download = []
    cell_ids = set(gdf['CELL_ID'].astype(str).values)
    # the CRS is shared by every tile, so resolve it once
    crs = f'EPSG:{gdf.crs.to_epsg()}'

    for year in years:
        print('\n', year, flush=True)
        out_dir = os.path.join(OUT_ROOT, state.lower(), 'landsat', str(year))
        existing = {os.path.basename(x) for x in glob.glob(f'{out_dir}/*')}
        n_done = sum(
            1 for cell_id in cell_ids
            if f'{cell_id}_landsat-leaf-on_{year}.tif' in existing
            and f'{cell_id}_landsat-leaf-off_{year}.tif' in existing
        )
        print('Already have {:,d} tiles for {}'.format(n_done, year))

        # ee.ImageCollection.filterDate treats the end date as exclusive, so
        # each window ends on the day after its last intended day:
        #   leaf-on:  April 1 - September 30
        #   leaf-off: October 1 (preceding year) - March 31 (current year)
        windows = [
            ('leaf-on', f'{year}-04-01', f'{year}-10-01'),
            ('leaf-off', f'{year-1}-10-01', f'{year}-04-01'),
        ]

        # count tiles by position; the index label is not guaranteed to be a
        # contiguous integer
        for pos, (_, row) in enumerate(gdf.iterrows()):
            cell_id = str(row['CELL_ID'])

            pending = []
            for season, start, stop in windows:
                outfile = f'{cell_id}_landsat-{season}_{year}.tif'
                if skip_existing and outfile in existing:
                    continue
                pending.append((outfile, start, stop))

            if pending:
                # get the tile bounding box for filtering landsat images,
                # snapped outward to whole coordinate units
                xmin, ymin, xmax, ymax = row['geometry'].bounds
                (xmin, ymin) = np.floor((xmin, ymin))
                (xmax, ymax) = np.ceil((xmax, ymax))
                aoi = ee.Geometry.Rectangle((xmin, ymin, xmax, ymax),
                                            proj=crs,
                                            evenOdd=True,
                                            geodesic=False)

                for outfile, start, stop in pending:
                    img = get_landsat_composite(aoi, year, start, stop)
                    # splitext keeps any dots that are part of the cell id
                    url_params = dict(name=os.path.splitext(outfile)[0],
                                      filePerBand=False,
                                      scale=30,
                                      crs=crs,
                                      formatOptions={'cloudOptimized':True})
                    url = img.clip(aoi).getDownloadURL(url_params)
                    to_download.append((url, outfile, out_dir))

            # report progress: a dot per tile, a count every 10 tiles and a
            # new line every 100 tiles
            if pos % 100 == 0 and pos > 0:
                print()
            if (pos+1) % 10 == 0:
                print('{:,d}'.format(pos+1), end='')
            else:
                print('.', end='')
        print()
    return to_download

In [None]:
def fetch_unzip_from_url(url, filename, out_dir, check_valid=True, retry=True,
                         timeout=600):
    """Fetches a zipfile from a URL and extracts the specified file
    from the zip archive to out_dir.

    This is primarily intended to download a zipped GeoTiff. If the file is
    already present in out_dir, nothing is downloaded.

    Parameters
    ----------
    url : str
        Address of the zip archive.
    filename : str
        Name of the archive member to extract; also the output file name.
    out_dir : str
        Directory the file is extracted to.
    check_valid : bool, optional
        If True (default), the file is opened and read with rasterio. A file
        that cannot be read is deleted.
    retry : bool, optional
        If True (default), one more attempt is made after a failed validity
        check.
    timeout : float, optional
        Seconds `requests` waits on the server before giving up, so a stalled
        connection cannot block a worker thread forever.

    Returns
    -------
    out_path : str or None
        Path of the file on disk, or None if the download, the extraction or
        the validity check failed.
    """
    from zipfile import BadZipFile

    out_path = os.path.join(out_dir, filename)

    if not os.path.exists(out_path):
        try:
            response = requests.get(url, timeout=timeout)
            response.raise_for_status()
            with ZipFile(BytesIO(response.content)) as archive:
                out_path = archive.extract(filename, out_dir)
        except (requests.RequestException, BadZipFile, KeyError) as err:
            # request failed, payload is not a zip, or the zip lacks `filename`
            print(f'Failed to download {filename}: {err}')
            return None

    if check_valid:
        try:
            with rasterio.open(out_path) as src:
                # reading the pixels is what exposes a truncated/corrupt file
                src.read(masked=True)
        except Exception:
            print(f'Failed to fetch {filename}.')
            if os.path.exists(out_path):
                os.remove(out_path)

            if retry:
                return fetch_unzip_from_url(url, filename, out_dir,
                                            check_valid=check_valid,
                                            retry=False,
                                            timeout=timeout)
            return None

    return out_path

def multithreaded_download(to_download, num_threads=12):
    """Downloads files in parallel threads, showing a progress bar.

    Parameters
    ----------
    to_download : list of tuples
        Each tuple is unpacked as the positional arguments of
        `fetch_unzip_from_url`, i.e. (url, filename, out_dir).
    num_threads : int, optional
        Number of worker threads (default 12).

    Returns
    -------
    results : list or None
        One entry per job in completion order (not input order): the value
        returned by `fetch_unzip_from_url`, or None for a job that raised.
        Returns None when `to_download` is empty.
    """
    if len(to_download) == 0:
        return None

    with ThreadPoolExecutor(max_workers=num_threads) as executor:
        print('Starting to download files from Google Earth Engine.')
        jobs = [executor.submit(fetch_unzip_from_url, *params) for params in to_download]
        results = []

        for job in tqdm(as_completed(jobs), total=len(jobs)):
            try:
                results.append(job.result())
            except Exception as err:
                # one failed job should not discard the results of the others
                print(f'Download job failed: {err}')
                results.append(None)

    return results

## Go.

In [None]:
# Folder on the shared drive that holds the training-quad shapefiles
SHP_DIR = '/content/drive/Shared drives/stand_mapping/data/interim'

# One shapefile per state and UTM zone
OR10_SHP = 'oregon_utm10n_training_quads_epsg6339.shp'
OR11_SHP = 'oregon_utm11n_training_quads_epsg6340.shp'
WA10_SHP = 'washington_utm10n_training_quads_epsg6339.shp'
WA11_SHP = 'washington_utm11n_training_quads_epsg6340.shp'

# Read each shapefile into its own GeoDataFrame
or10_gdf, or11_gdf, wa10_gdf, wa11_gdf = (
    gpd.read_file(os.path.join(SHP_DIR, shp_name))
    for shp_name in (OR10_SHP, OR11_SHP, WA10_SHP, WA11_SHP)
)

In [None]:
# Tiles read from the Washington UTM zone 11N shapefile
GDF = wa11_gdf
STATE = 'washington'  # get_landsat lower-cases this to name the output sub-directory
# each year must be <= 2011 or >= 2014 for a sensor to be selected
YEARS = [2009, 2011, 2015, 2017]
# step 1: build one download URL per tile, season and year
to_download = get_landsat(GDF, STATE, YEARS)
# step 2: fetch and unzip the files in parallel threads
results = multithreaded_download(to_download)

Preparing download URLs for 82 tiles for 4 years

 2009
Already have 1 tiles for 2009
.........10.........20.........30.........40.........50.........60.........70.........80..

 2011
Already have 0 tiles for 2011
.........10.........20.........30.........40.........50.........60.........70.........80..

 2015
Already have 0 tiles for 2015
.........10.........20.........30.........40.........50.........60.........70.........80..

 2017
Already have 0 tiles for 2017
.........10.........20.........30.........40.........50.........60.........70.........80..
Starting to download files from Google Earth Engine.


HBox(children=(FloatProgress(value=0.0, max=656.0), HTML(value='')))




In [None]:
# Tiles read from the Washington UTM zone 10N shapefile
GDF = wa10_gdf
STATE = 'washington'  # get_landsat lower-cases this to name the output sub-directory
# each year must be <= 2011 or >= 2014 for a sensor to be selected
YEARS = [2009, 2011, 2015, 2017]
# step 1: build one download URL per tile, season and year
to_download = get_landsat(GDF, STATE, YEARS)
# step 2: fetch and unzip the files in parallel threads
results = multithreaded_download(to_download)

Preparing download URLs for 277 tiles for 4 years

 2009
Already have 0 tiles for 2009
.........10.........20.........30.........40.........50.........60.........70.........80.........90.........100
.........110.........120.........130.........140.........150.........160.........170.........180.........190.........200
.........210.........220.........230.........240.........250.........260.........270.......

 2011
Already have 0 tiles for 2011
.........10.........20.........30.........40.........50.........60.........70.........80.........90.........100
.........110.........120.........130.........140.........150.........160.........170.........180.........190.........200
.........210.........220.........230.........240.........250.........260.........270.......

 2015
Already have 0 tiles for 2015
.........10.........20.........30.........40.........50.........60.........70.........80.........90.........100
.........110.........120.........130.........140.........150.........160.....

HBox(children=(FloatProgress(value=0.0, max=2216.0), HTML(value='')))




In [None]:
# Tiles read from the Oregon UTM zone 10N shapefile
GDF = or10_gdf
STATE = 'oregon'  # get_landsat lower-cases this to name the output sub-directory
# each year must be <= 2011 or >= 2014 for a sensor to be selected
# NOTE(review): the later years differ from the Washington runs (2014/2016 vs 2015/2017) — confirm this is intended
YEARS = [2009, 2011, 2014, 2016]
# step 1: build one download URL per tile, season and year
to_download = get_landsat(GDF, STATE, YEARS)
# step 2: fetch and unzip the files in parallel threads
results = multithreaded_download(to_download)

Preparing download URLs for 607 tiles for 4 years

 2009
Already have 0 tiles for 2009
.........10.........20.........30.........40.........50.........60.........70.........80.........90.........100
.........110.........120.........130.........140.........150.........160.........170.........180.........190.........200
.........210.........220.........230.........240.........250.........260.........270.........280.........290.........300
.........310.........320.........330.........340.........350.........360.........370.........380.........390.........400
.........410.........420.........430.........440.........450.........460.........470.........480.........490.........500
.........510.........520.........530.........540.........550.........560.........570.........580.........590.........600
.......

 2011
Already have 0 tiles for 2011
.........10.........20.........30.........40.........50.........60.........70.........80.........90.........100
.........110.........120.........130...

HBox(children=(FloatProgress(value=0.0, max=4856.0), HTML(value='')))




In [None]:
# Tiles read from the Oregon UTM zone 11N shapefile
GDF = or11_gdf
STATE = 'oregon'  # get_landsat lower-cases this to name the output sub-directory
# each year must be <= 2011 or >= 2014 for a sensor to be selected
# NOTE(review): the later years differ from the Washington runs (2014/2016 vs 2015/2017) — confirm this is intended
YEARS = [2009, 2011, 2014, 2016]
# step 1: build one download URL per tile, season and year
to_download = get_landsat(GDF, STATE, YEARS)
# step 2: fetch and unzip the files in parallel threads
results = multithreaded_download(to_download)

Preparing download URLs for 524 tiles for 4 years

 2009
Already have 0 tiles for 2009
.........10.........20.........30.........40.........50.........60.........70.........80.........90.........100
.........110.........120.........130.........140.........150.........160.........170.........180.........190.........200
.........210.........220.........230.........240.........250.........260.........270.........280.........290.........300
.........310.........320.........330.........340.........350.........360.........370.........380.........390.........400
.........410.........420.........430.........440.........450.........460.........470.........480.........490.........500
.........510.........520....

 2011
Already have 0 tiles for 2011
.........10.........20.........30.........40.........50.........60.........70.........80.........90.........100
.........110.........120.........130.........140.........150.........160.........170.........180.........190.........200
.........210......

HBox(children=(FloatProgress(value=0.0, max=4192.0), HTML(value='')))


