In [None]:
! pip install geopandas rasterio -q

[K     |████████████████████████████████| 972kB 8.0MB/s 
[K     |████████████████████████████████| 18.3MB 226kB/s 
[K     |████████████████████████████████| 14.8MB 313kB/s 
[K     |████████████████████████████████| 10.9MB 153kB/s 
[?25h

In [None]:
import os
import glob

import numpy as np
import geopandas as gpd
import rasterio
import ee

import requests
from io import BytesIO
from zipfile import ZipFile
from concurrent.futures import as_completed, ThreadPoolExecutor
from tqdm.notebook import tqdm

## Get Earth Engine Running
To access GEE, we will need to authenticate our account, and then initialize a connection to a server. 

In [None]:
# Start the interactive Earth Engine authorization flow: it prints a URL to
# visit and prompts for the verification code generated there.
ee.Authenticate()

To authorize access needed by Earth Engine, open the following URL in a web browser and follow the instructions. If the web browser does not start automatically, please manually browse the URL below.

    https://accounts.google.com/o/oauth2/auth?client_id=517222506229-vsmmajv00ul0bs7p89v5m89qs8eb9359.apps.googleusercontent.com&scope=https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fearthengine+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdevstorage.full_control&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&response_type=code&code_challenge=bItN5-liRLIqBub0vutbRNZtYiC6waEwIpcBYFP2YRo&code_challenge_method=S256

The authorization workflow will generate a code, which you should paste in the box below. 
Enter verification code: [redacted]

Successfully saved authorization token.


In [None]:
# Initialize the connection to the Earth Engine servers; must run after
# authentication and before any other `ee` call in this notebook.
ee.Initialize()

# Mount Google Drive
We have shapefiles containing the geospatial boundaries of the map tiles we'll be gathering data from on our Google Drive.

To mount our Drive and access our files, we have to authenticate with Google Drive first.

In [None]:
# google.colab is only available inside a Colab runtime.
from google.colab import drive
# Mount Google Drive at /content/drive; force_remount=True requests a fresh
# mount even if the drive is already mounted in this session.
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


# Workhorse Function to Retrieve NAIP Quads
For each tile in a GeoDataFrame, and for each year of NAIP imagery we want, we filter the NAIP collection on GEE to pick the best image, split the tile into six sub-tiles, and download each sub-tile as a GeoTIFF to our Google Drive.

In [None]:
def fetch_unzip_from_url(url, filename, out_dir, check_valid=True, retry=True):
    """Fetch a zip archive from a URL and extract one member into ``out_dir``.

    This is primarily intended to download a zipped GeoTiff. If the target
    file already exists in ``out_dir`` nothing is downloaded.

    Parameters
    ----------
    url : str
        Address of the zip archive to download.
    filename : str
        Name of the archive member to extract; also the name of the file
        expected at ``os.path.join(out_dir, filename)``.
    out_dir : str
        Directory the member is extracted into.
    check_valid : bool, optional
        If True, open the extracted file with rasterio and read it to make
        sure it is a readable raster.
    retry : bool, optional
        If True, attempt the download one more time when validation fails.

    Returns
    -------
    str or None
        Path to the file on disk, or None if the download, extraction, or
        validation (after the optional retry) failed.
    """
    out_path = os.path.join(out_dir, filename)

    # Nothing to do when the file is already on disk.
    if os.path.exists(out_path):
        return out_path

    try:
        response = requests.get(url)
        # The context manager closes the archive once the member is extracted.
        with ZipFile(BytesIO(response.content)) as archive:
            out_path = archive.extract(filename, out_dir)
    except Exception as err:
        # Best-effort: a failed request, a non-zip payload, or a missing
        # member skips this file instead of aborting the whole batch.
        print(f'Zipfile is corrupted or URL creation failed for {filename}: {err!r}')
        return None

    if check_valid:
        try:
            # Reading the full raster surfaces truncated or corrupt files.
            with rasterio.open(out_path) as src:
                src.read(masked=True)
        except Exception:
            print(f'Failed to fetch {filename}.')
            os.remove(out_path)

            if retry:
                # One more attempt only; retry=False stops further recursion.
                return fetch_unzip_from_url(url, filename, out_dir, retry=False)
            return None

    return out_path

def multithreaded_download(to_download, num_threads=12):
    """Download and unzip a batch of files concurrently.

    Parameters
    ----------
    to_download : list of tuple
        Each tuple holds the positional arguments passed to
        ``fetch_unzip_from_url`` (url, filename, out_dir).
    num_threads : int, optional
        Number of worker threads in the pool.

    Returns
    -------
    list or None
        Results of ``fetch_unzip_from_url`` in completion order (not in
        submission order), or None when ``to_download`` is empty.
    """
    num_downloads = len(to_download)
    print('\n', 'Attempting download of {:,d} images'.format(num_downloads))
    if num_downloads == 0:
        return None

    # Honor the caller's thread count instead of a hard-coded pool size.
    with ThreadPoolExecutor(num_threads) as executor:
        print('Starting to download files from Google Earth Engine.')
        jobs = [executor.submit(fetch_unzip_from_url, *params) for params in to_download]

        # Collect results as they finish so the progress bar advances live.
        results = [job.result() for job in tqdm(as_completed(jobs), total=len(jobs))]
    return results

In [None]:
def split_in_six(bbox):
    """Divide a bounding box into a grid of six sub-boxes (2 columns x 3 rows).

    The box is first snapped outward to whole units: the minimum corner is
    floored and the maximum corner is ceiled. The x-range is then halved and
    the y-range cut into thirds.

    Parameters
    ----------
    bbox : sequence of four numbers
        (xmin, ymin, xmax, ymax).

    Returns
    -------
    tuple of list
        Six ``[xmin, ymin, xmax, ymax]`` lists: the three boxes of the
        lower-x column in order of increasing y, followed by the three boxes
        of the upper-x column in order of increasing y.
    """
    x_lo, y_lo, x_hi, y_hi = bbox
    x_lo, y_lo = np.floor((x_lo, y_lo))
    x_hi, y_hi = np.ceil((x_hi, y_hi))

    # Grid lines: one interior split in x, two interior splits in y.
    x_edges = (x_lo, (x_hi + x_lo) / 2, x_hi)
    y_edges = (y_lo,
               y_lo + 1/3*(y_hi - y_lo),
               y_lo + 2/3*(y_hi - y_lo),
               y_hi)

    return tuple(
        [x_edges[col], y_edges[row], x_edges[col + 1], y_edges[row + 1]]
        for col in range(2)
        for row in range(3)
    )

In [None]:
def _pick_naip_image(coll, coll_size, region):
    """Choose one image from a non-empty NAIP collection.

    With a single image that image is returned. With several, each image is
    tagged with the median of its HSV Value band over ``region`` and the one
    with the lowest median is returned.
    """
    if coll_size == 1:
        return coll.first()

    def naip_darkness(image):
        """Converts NAIP into HSV color-space and returns an image
        with the median of the Value band added to image metadata.
        """
        hsv = image.select(['R', 'G', 'B']).rgbToHsv()
        median_value = hsv.select('value')\
                          .reduceRegion(reducer=ee.Reducer.median(),
                                        geometry=region,
                                        maxPixels=40000000,
                                        # the region is the whole tile, so let
                                        # EE coarsen the scale rather than fail
                                        # if it exceeds maxPixels at 1 m
                                        bestEffort=True,
                                        scale=1)\
                          .get('value')

        return image.set('median_darkness', median_value)

    # NOTE(review): an ascending sort keeps the image with the LOWEST median
    # Value, while the original comment said the goal was the "lighter" image
    # (to avoid shadows). Behavior is left as it was; confirm the intended
    # sort direction.
    return coll.map(naip_darkness).sort('median_darkness').first()


def get_naip(gdf, state, years):
    """Build download requests for NAIP imagery covering each tile and year.

    For every feature in ``gdf`` and every year, the NAIP collection is
    filtered to images that contain the tile's center point and fall within
    that calendar year. One image is selected, the tile is split into six
    sub-tiles (see ``split_in_six``), and a download URL is requested for the
    image clipped to each sub-tile. Tiles that already have six files in the
    output directory are skipped.

    Parameters
    ----------
    gdf : GeoDataFrame
        Tiles to process; needs a 'CELL_ID' column, a 'geometry' column and a
        CRS that resolves to an EPSG code.
    state : str
        State name; lower-cased to form the output sub-directory.
    years : sequence of int
        Calendar years of imagery to request.

    Returns
    -------
    list of tuple
        ``(url, filename, out_dir)`` for each sub-tile to download, ready to
        be passed to ``multithreaded_download``.
    """
    print('Preparing download URLs for {:,d} tiles for {:,d} years'.format(len(gdf), len(years)))
    OUT_ROOT = '/content/drive/Shared drives/stand_mapping/data/interim/training_tiles/'
    to_download = []
    # a set gives constant-time membership tests when scanning existing files
    cell_ids = set(gdf['CELL_ID'].astype(str))

    for year in years:
        print('\n', year, flush=True)
        out_dir = os.path.join(OUT_ROOT, state.lower(), 'naip', str(year))

        # count existing sub-tile files per '<cell_id>_naip_<year>' stem;
        # [:-6] strips the trailing '_<n>.tif' from each file name
        done_counts = {}
        for path in glob.glob(f'{out_dir}/*'):
            base = os.path.basename(path)
            if base.split('_')[0] in cell_ids:
                done_stem = base[:-6]
                done_counts[done_stem] = done_counts.get(done_stem, 0) + 1
        print('Already have {:,.0f} tiles for {}'.format(sum(done_counts.values())/6, year))

        # count rows by position so progress output does not depend on the
        # GeoDataFrame's index labels
        for pos, (_, row) in enumerate(gdf.iterrows()):
            cell_id = str(row['CELL_ID'])
            stem = f'{cell_id}_naip_{year}'

            if done_counts.get(stem, 0) < 6:
                crs = f'EPSG:{gdf.crs.to_epsg()}'

                # get the tile bounding box for filtering images
                bbox = row['geometry'].bounds
                aois = [ee.Geometry.Rectangle(box,
                                              proj=crs,
                                              evenOdd=True,
                                              geodesic=False)
                        for box in split_in_six(bbox)]

                # same outward snapping that split_in_six applies
                xmin, ymin, xmax, ymax = bbox
                (xmin, ymin) = np.floor((xmin, ymin))
                (xmax, ymax) = np.ceil((xmax, ymax))
                tile_geom = ee.Geometry.Rectangle([xmin, ymin, xmax, ymax],
                                                  proj=crs,
                                                  evenOdd=True,
                                                  geodesic=False)
                centroid = ee.Geometry.Point([(xmin+xmax)/2, (ymin+ymax)/2],
                                             proj=crs)

                # get the naip image collection for our aoi and timeframe;
                # filterDate's end date is exclusive, so end on Jan 1 of the
                # following year to keep images taken on Dec 31
                coll = ee.ImageCollection('USDA/NAIP/DOQQ')\
                         .filterBounds(centroid)\
                         .filterDate(f'{year}-01-01', f'{int(year) + 1}-01-01')
                coll_size = coll.size().getInfo()

                if coll_size == 0:
                    # if there are no naip images in this timeframe
                    print(f'No NAIP images found for {cell_id} in {year}.')
                else:
                    # the whole tile (not a single sub-tile) is the region
                    # used to compare candidate images
                    img = _pick_naip_image(coll, coll_size, tile_geom)

                    for tile, aoi in enumerate(aois, start=1):
                        out_tile = f'{stem}_{tile}'
                        url_params = dict(name=out_tile,
                                          filePerBand=False,
                                          scale=1,
                                          crs=crs,
                                          formatOptions={'cloudOptimized': True})
                        url = img.clip(aoi).getDownloadURL(url_params)
                        to_download.append((url, out_tile+'.tif', out_dir))

            # report progress
            if pos % 100 == 0 and pos > 0:
                print()
            if (pos+1) % 10 == 0:
                print('{:,d}'.format(pos+1), end='')
            else:
                print('.', end='')
        print()
    return to_download

## Go.

In [None]:
# Directory on the mounted Drive that holds the training-quad shapefiles.
SHP_DIR = '/content/drive/Shared drives/stand_mapping/data/interim'

# One shapefile per state / UTM zone combination.
WA11_SHP = 'washington_utm11n_training_quads_epsg6340.shp'
WA10_SHP = 'washington_utm10n_training_quads_epsg6339.shp'
OR10_SHP = 'oregon_utm10n_training_quads_epsg6339.shp'
OR11_SHP = 'oregon_utm11n_training_quads_epsg6340.shp'

# Read each shapefile into its own GeoDataFrame.
or10_gdf, or11_gdf, wa10_gdf, wa11_gdf = (
    gpd.read_file(os.path.join(SHP_DIR, shp_name))
    for shp_name in (OR10_SHP, OR11_SHP, WA10_SHP, WA11_SHP)
)

In [None]:
# Washington tiles from the UTM zone 11N shapefile (EPSG:6340).
GDF = wa11_gdf
STATE = 'washington'
YEARS = [2009, 2011, 2015, 2017]
# Build the per-sub-tile download URLs first, then fetch them with the thread pool.
to_download = get_naip(GDF, STATE, YEARS)
results = multithreaded_download(to_download)

Preparing download URLs for 82 tiles for 4 years

 2009
Already have 1 tiles for 2009
.........10.........20.........30.........40.........50.........60.........70.........80..

 2011
Already have 0 tiles for 2011
.........10.........20.........30.........40.........50.........60.........70.........80..

 2015
Already have 0 tiles for 2015
.........10.........20.........30.........40.........50.........60.........70.........80..

 2017
Already have 0 tiles for 2017
.........10.........20.........30.........40.........50.........60.........70.........80..

 Attempting download of 1,968 images
Starting to download files from Google Earth Engine.


HBox(children=(FloatProgress(value=0.0, max=1968.0), HTML(value='')))




In [None]:
# Washington tiles from the UTM zone 10N shapefile (EPSG:6339).
GDF = wa10_gdf
STATE = 'washington'
YEARS = [2009, 2011, 2015, 2017]
# Build the per-sub-tile download URLs first, then fetch them with the thread pool.
to_download = get_naip(GDF, STATE, YEARS)
results = multithreaded_download(to_download)

Preparing download URLs for 277 tiles for 4 years

 2009
Already have 277 tiles for 2009
.........10.........20.........30.........40.........50.........60.........70.........80.........90.........100
.........110.........120.........130.........140.........150.........160.........170.........180.........190.........200
.........210.........220.........230.........240.........250.........260.........270.......

 2011
Already have 277 tiles for 2011
.........10.........20.........30.........40.........50.........60.........70.........80.........90.........100
.........110.........120.........130.........140.........150.........160.........170.........180.........190.........200
.........210.........220.........230.........240.........250.........260.........270.......

 2015
Already have 169 tiles for 2015
.........10.........20.........30.........40.........50.........60.........70.........80.........90.........100
.........110.........120.........130.........140.........150.........16

HBox(children=(FloatProgress(value=0.0, max=2550.0), HTML(value='')))




In [None]:
# Oregon tiles from the UTM zone 11N shapefile (EPSG:6340).
GDF = or11_gdf
STATE = 'oregon'
# NOTE(review): Oregon uses 2014/2016 where Washington uses 2015/2017 —
# presumably the NAIP acquisition years differ by state; confirm.
YEARS = [2009, 2011, 2014, 2016]
# Build the per-sub-tile download URLs first, then fetch them with the thread pool.
to_download = get_naip(GDF, STATE, YEARS)
results = multithreaded_download(to_download)

Preparing download URLs for 524 tiles for 4 years

 2009
Already have 524 tiles for 2009
.........10.........20.........30.........40.........50.........60.........70.........80.........90.........100
.........110.........120.........130.........140.........150.........160.........170.........180.........190.........200
.........210.........220.........230.........240.........250.........260.........270.........280.........290.........300
.........310.........320.........330.........340.........350.........360.........370.........380.........390.........400
.........410.........420.........430.........440.........450.........460.........470.........480.........490.........500
.........510.........520....

 2011
Already have 524 tiles for 2011
.........10.........20.........30.........40.........50.........60.........70.........80.........90.........100
.........110.........120.........130.........140.........150.........160.........170.........180.........190.........200
.........210..

In [None]:
# Oregon tiles from the UTM zone 10N shapefile (EPSG:6339). The full-range
# slice selects every row; narrow it to process only a subset of tiles.
GDF = or10_gdf.iloc[:]
STATE = 'oregon'
YEARS = [2009, 2011, 
         2014, 2016
         ]
# Build the per-sub-tile download URLs first, then fetch them with the thread pool.
to_download = get_naip(GDF, STATE, YEARS)
results = multithreaded_download(to_download)

Preparing download URLs for 607 tiles for 4 years

 2009
Already have 607 tiles for 2009
.........10.........20.........30.........40.........50.........60.........70.........80.........90.........100
.........110.........120.........130.........140.........150.........160.........170.........180.........190.........200
.........210.........220.........230.........240.........250.........260.........270.........280.........290.........300
.........310.........320.........330.........340.........350.........360.........370.........380.........390.........400
.........410.........420.........430.........440.........450.........460.........470.........480.........490.........500
.........510.........520.........530.........540.........550.........560.........570.........580.........590.........600
.......

 2011
Already have 607 tiles for 2011
.........10.........20.........30.........40.........50.........60.........70.........80.........90.........100
.........110.........120.........13

HBox(children=(FloatProgress(value=0.0, max=3642.0), HTML(value='')))


