In [1]:
'''
Query CMR STAC for HLS data given a point location and date range
Return a list of asset filenames for AWS or HTTPS access
Translate to local filenames and download
'''
from datetime import datetime, timedelta
import json
import os
import time
import requests
import boto3
from botocore.exceptions import ClientError
from pystac_client import Client


In [2]:
# HLS collections to query (Landsat-based L30 and Sentinel-based S30 products)
collections = ['HLSL30.v2.0', 'HLSS30.v2.0']

# GeoJSON point (lon, lat) used to select the HLS tile of interest
pt = {
    "type": "Point",
    "coordinates": [-105.530017, 40.15442],
}

# Date window for the query.
# Alternative ways of expressing the window as a single STAC datetime string:
#   "2021-05-01T00:00:00Z/2021-08-30T23:59:59Z"   closed interval
#   "2021-05-01T00:00:00Z/.."                     open interval - does not currently work with the CMR-STAC API
#   "2021-05/2021-11"
start_date = datetime(2021, 1, 1)
end_date = datetime(2021, 12, 31)

In [5]:
def search_stac_for_HLS(pt, dt_min, dt_max, cloudcover_max=80, lim=100, url='https://cmr.earthdata.nasa.gov/stac/LPCLOUD', collections=['HLSL30.v2.0', 'HLSS30.v2.0']):
    """Search a STAC catalog for HLS granules intersecting a geometry and return their asset URLs.

    Parameters
    ----------
    pt : dict
        GeoJSON geometry passed to the search as ``intersects``.
    dt_min, dt_max : str
        Start and end of the time window; joined as ``dt_min/dt_max`` to form
        the search ``datetime`` interval.
    cloudcover_max : number
        Only items whose ``eo:cloud_cover`` property is <= this value are kept.
    lim : int
        Passed to the search as ``limit``.
    url : str
        URL of the STAC catalog to open.
    collections : list of str
        Collection ids to search. The default list is only read, never mutated.

    Returns
    -------
    list of str
        The ``href`` of every asset of every retained item. Empty when the
        search reports zero matches or no item passes the cloud-cover filter.
    """
    # open the catalog
    catalog = Client.open(url)

    # perform the search
    search = catalog.search(
        collections=collections,
        intersects=pt,
        datetime=dt_min + '/' + dt_max,
        limit=lim
    )

    # Ask for the match count once and reuse it, rather than issuing the
    # same count query a second time just to print it.
    n_matched = search.matched()

    if n_matched == 0:
        print('No granules found at point', pt, 'from', dt_min, 'to', dt_max)
        return []

    print('Found', n_matched, 'granules at point', pt, 'from', dt_min, 'to', dt_max)

    links = []
    example_printed = False

    # NOTE(review): newer pystac-client releases deprecate get_all_items() in
    # favour of item_collection() - confirm against the installed version
    # before switching.
    for item in search.get_all_items():
        if item.properties['eo:cloud_cover'] > cloudcover_max:
            continue

        # Show the properties of the first retained item as a sanity check.
        if not example_printed:
            print(item.properties)
            example_printed = True

        links.extend(asset.href for asset in item.assets.values())

    return links

In [6]:
# Pass the configured `collections` explicitly so that editing the list in the
# configuration cell actually changes what is searched.
hls_links = search_stac_for_HLS(
    pt,
    start_date.strftime("%Y-%m-%d"),
    end_date.strftime("%Y-%m-%d"),
    collections=collections,
)

Found 77 granules at point {'type': 'Point', 'coordinates': [-105.530017, 40.15442]} from 2021-01-01 to 2021-12-31
{'datetime': '2021-01-01T18:02:55.690Z', 'start_datetime': '2021-01-01T18:02:55.690Z', 'end_datetime': '2021-01-01T18:02:55.690Z', 'eo:cloud_cover': 50}


In [7]:
# Preview the first ten asset URLs returned by the search.
print(hls_links[:10])

['https://data.lpdaac.earthdatacloud.nasa.gov/lp-prod-protected/HLSS30.020/HLS.S30.T13TDE.2021001T175741.v2.0/HLS.S30.T13TDE.2021001T175741.v2.0.B12.tif', 'https://data.lpdaac.earthdatacloud.nasa.gov/lp-prod-protected/HLSS30.020/HLS.S30.T13TDE.2021001T175741.v2.0/HLS.S30.T13TDE.2021001T175741.v2.0.B02.tif', 'https://data.lpdaac.earthdatacloud.nasa.gov/lp-prod-protected/HLSS30.020/HLS.S30.T13TDE.2021001T175741.v2.0/HLS.S30.T13TDE.2021001T175741.v2.0.B06.tif', 'https://data.lpdaac.earthdatacloud.nasa.gov/lp-prod-protected/HLSS30.020/HLS.S30.T13TDE.2021001T175741.v2.0/HLS.S30.T13TDE.2021001T175741.v2.0.B8A.tif', 'https://data.lpdaac.earthdatacloud.nasa.gov/lp-prod-protected/HLSS30.020/HLS.S30.T13TDE.2021001T175741.v2.0/HLS.S30.T13TDE.2021001T175741.v2.0.B08.tif', 'https://data.lpdaac.earthdatacloud.nasa.gov/lp-prod-protected/HLSS30.020/HLS.S30.T13TDE.2021001T175741.v2.0/HLS.S30.T13TDE.2021001T175741.v2.0.SAA.tif', 'https://data.lpdaac.earthdatacloud.nasa.gov/lp-prod-protected/HLSS30.020/H

In [8]:
# Convert https links to s3 links by swapping each known https host prefix
# for the "s3://" scheme.
# NOTE(review): the second prefix also turns CMR metadata URLs into
# "s3://search/..." values - confirm those are meant to be treated as S3 paths.
HTTPS_PREFIXES = (
    'https://data.lpdaac.earthdatacloud.nasa.gov/',
    'https://cmr.earthdata.nasa.gov/',
)

s3_links = []
for https_link in hls_links:
    s3_link = https_link
    for https_prefix in HTTPS_PREFIXES:
        s3_link = s3_link.replace(https_prefix, 's3://')
    s3_links.append(s3_link)

print(s3_links[:20])

['s3://lp-prod-protected/HLSS30.020/HLS.S30.T13TDE.2021001T175741.v2.0/HLS.S30.T13TDE.2021001T175741.v2.0.B12.tif', 's3://lp-prod-protected/HLSS30.020/HLS.S30.T13TDE.2021001T175741.v2.0/HLS.S30.T13TDE.2021001T175741.v2.0.B02.tif', 's3://lp-prod-protected/HLSS30.020/HLS.S30.T13TDE.2021001T175741.v2.0/HLS.S30.T13TDE.2021001T175741.v2.0.B06.tif', 's3://lp-prod-protected/HLSS30.020/HLS.S30.T13TDE.2021001T175741.v2.0/HLS.S30.T13TDE.2021001T175741.v2.0.B8A.tif', 's3://lp-prod-protected/HLSS30.020/HLS.S30.T13TDE.2021001T175741.v2.0/HLS.S30.T13TDE.2021001T175741.v2.0.B08.tif', 's3://lp-prod-protected/HLSS30.020/HLS.S30.T13TDE.2021001T175741.v2.0/HLS.S30.T13TDE.2021001T175741.v2.0.SAA.tif', 's3://lp-prod-protected/HLSS30.020/HLS.S30.T13TDE.2021001T175741.v2.0/HLS.S30.T13TDE.2021001T175741.v2.0.B05.tif', 's3://lp-prod-protected/HLSS30.020/HLS.S30.T13TDE.2021001T175741.v2.0/HLS.S30.T13TDE.2021001T175741.v2.0.B03.tif', 's3://lp-prod-protected/HLSS30.020/HLS.S30.T13TDE.2021001T175741.v2.0/HLS.S30.T

In [9]:
def fix_links(src_link, src_dirs, dst_dir, meta_dir, add_tile_dir=True):
    """Translate a source (S3) link into the local path it should be saved to.

    Links containing ``.xml`` are placed directly in ``meta_dir`` under their
    base name. Every other link must start with one of ``src_dirs`` followed
    by ``<collection>/<granule>/.../<file>``; the collection directory is
    dropped and the rest is placed under ``dst_dir``, optionally inside a
    per-tile directory.

    Parameters
    ----------
    src_link : str
        Source link, e.g. ``s3://bucket/HLSS30.020/<granule>/<file>``.
    src_dirs : iterable of str
        Accepted source prefixes (e.g. ``s3://bucket``). The first one that
        matches ``src_link`` is stripped.
    dst_dir : str
        Local root directory for data files. May have any number of path
        components and may end with a trailing slash.
    meta_dir : str
        Local directory for ``.xml`` metadata files.
    add_tile_dir : bool, default True
        When true, insert a directory named after the third dot-separated
        field of the granule directory name (e.g. ``T13TDE``) between
        ``dst_dir`` and the granule directory.

    Returns
    -------
    str
        Local destination path ('/'-separated for data files).

    Raises
    ------
    ValueError
        If a non-XML link matches none of ``src_dirs``, has no granule
        directory after the collection directory, or (with ``add_tile_dir``)
        the granule directory name has fewer than three dot-separated fields.
    """
    if '.xml' in src_link:
        return os.path.join(meta_dir, os.path.basename(src_link))

    # Strip the first matching source prefix. Requiring the '/' after the
    # prefix keeps 's3://bucket' from matching 's3://bucket-other/...'.
    remainder = None
    for src_dir in src_dirs:
        prefix = src_dir.rstrip('/') + '/'
        if src_link.startswith(prefix):
            remainder = src_link[len(prefix):]
            break
    if remainder is None:
        raise ValueError(
            'link %r does not start with any of the source directories %r'
            % (src_link, list(src_dirs))
        )

    # remainder looks like '<collection>/<granule>/.../<file>'
    parts = [part for part in remainder.split('/') if part]
    if len(parts) < 2:
        raise ValueError('link %r has no granule directory' % (src_link,))
    granule_parts = parts[1:]  # drop the collection directory

    out_parts = []
    if dst_dir:
        # Work from the destination itself rather than fixed split indices,
        # so dst_dir can be of any depth.
        out_parts.append(dst_dir.rstrip('/'))

    if add_tile_dir:
        name_fields = granule_parts[0].split('.')
        if len(name_fields) < 3:
            raise ValueError(
                'cannot extract a tile id from %r' % (granule_parts[0],)
            )
        out_parts.append(name_fields[2])

    out_parts.extend(granule_parts)
    return '/'.join(out_parts)

# Map every S3 link to the local path it will be downloaded to.
S3_SOURCE_DIRS = ['s3://lp-prod-protected', 's3://lp-prod-public']

local_links = []
for s3_link in s3_links:
    local_links.append(
        fix_links(
            src_link=s3_link,
            src_dirs=S3_SOURCE_DIRS,
            dst_dir='./HLS_data',
            meta_dir='./HLS_metadata',
        )
    )

print(local_links[:10])

['./HLS_data/T13TDE/HLS.S30.T13TDE.2021001T175741.v2.0/HLS.S30.T13TDE.2021001T175741.v2.0.B12.tif', './HLS_data/T13TDE/HLS.S30.T13TDE.2021001T175741.v2.0/HLS.S30.T13TDE.2021001T175741.v2.0.B02.tif', './HLS_data/T13TDE/HLS.S30.T13TDE.2021001T175741.v2.0/HLS.S30.T13TDE.2021001T175741.v2.0.B06.tif', './HLS_data/T13TDE/HLS.S30.T13TDE.2021001T175741.v2.0/HLS.S30.T13TDE.2021001T175741.v2.0.B8A.tif', './HLS_data/T13TDE/HLS.S30.T13TDE.2021001T175741.v2.0/HLS.S30.T13TDE.2021001T175741.v2.0.B08.tif', './HLS_data/T13TDE/HLS.S30.T13TDE.2021001T175741.v2.0/HLS.S30.T13TDE.2021001T175741.v2.0.SAA.tif', './HLS_data/T13TDE/HLS.S30.T13TDE.2021001T175741.v2.0/HLS.S30.T13TDE.2021001T175741.v2.0.B05.tif', './HLS_data/T13TDE/HLS.S30.T13TDE.2021001T175741.v2.0/HLS.S30.T13TDE.2021001T175741.v2.0.B03.tif', './HLS_data/T13TDE/HLS.S30.T13TDE.2021001T175741.v2.0/HLS.S30.T13TDE.2021001T175741.v2.0.B11.tif', './HLS_data/T13TDE/HLS.S30.T13TDE.2021001T175741.v2.0/HLS.S30.T13TDE.2021001T175741.v2.0.B01.tif']


In [10]:
# get credentials
s3_cred_endpoint = 'https://data.lpdaac.earthdatacloud.nasa.gov/s3credentials'
def get_temp_creds(url=None, timeout=60):
    """Fetch temporary S3 credentials as a dict from the credentials endpoint.

    Parameters
    ----------
    url : str, optional
        Endpoint to query. Defaults to the module-level ``s3_cred_endpoint``,
        looked up at call time.
    timeout : float, default 60
        Seconds to wait for the server before giving up, so that a stalled
        connection cannot hang the notebook indefinitely.

    Returns
    -------
    dict
        The decoded JSON body of the response.

    Raises
    ------
    requests.HTTPError
        If the endpoint answers with an HTTP error status.

    NOTE(review): presumably this relies on Earthdata Login credentials being
    available to ``requests`` (e.g. via ~/.netrc) - confirm.
    """
    temp_creds_url = url or s3_cred_endpoint
    response = requests.get(temp_creds_url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of trying to parse an error page as JSON.
    response.raise_for_status()
    return response.json()

# Request temporary credentials and build a boto3 session from them.
temp_creds_req = get_temp_creds()

session_kwargs = {
    'aws_access_key_id': temp_creds_req['accessKeyId'],
    'aws_secret_access_key': temp_creds_req['secretAccessKey'],
    'aws_session_token': temp_creds_req['sessionToken'],
    'region_name': 'us-west-2',
}
session = boto3.Session(**session_kwargs)

In [11]:
def make_dirs(dst_links):
    """Create the parent directory of every path in ``dst_links``.

    Parameters
    ----------
    dst_links : iterable of str
        File paths whose containing directories should exist afterwards.

    Directories that already exist are left alone. Paths without a directory
    component (bare file names) are skipped, because ``os.makedirs('')``
    would raise.
    """
    # Many files share one directory, so create each distinct directory once.
    parent_dirs = {os.path.dirname(dst_link) for dst_link in dst_links}
    for parent_dir in sorted(parent_dirs):
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)
        
# Create the local directory tree for all destination paths before downloading.
make_dirs(local_links)

In [14]:
# S3 client created from the session that was built with the temporary credentials.
s3 = session.client('s3')

def download_data(s3_links, local_links, s3_session):
    """Download each S3 object to its matching local path.

    Parameters
    ----------
    s3_links : sequence of str
        Links of the form ``s3://<bucket>/<key>``.
    local_links : sequence of str
        Destination file paths, positionally paired with ``s3_links``. The
        parent directories must already exist.
    s3_session : object
        Object exposing ``download_fileobj(bucket, key, fileobj)``; the
        notebook passes a boto3 S3 client. This argument is what performs the
        download, not any module-level client.

    Raises
    ------
    ValueError
        If the two sequences differ in length.

    Any exception raised during a download is propagated after the partially
    written local file has been removed.
    """
    if len(s3_links) != len(local_links):
        raise ValueError(
            's3_links and local_links must have the same length (%d != %d)'
            % (len(s3_links), len(local_links))
        )

    scheme = 's3://'
    for s3_link, local_link in zip(s3_links, local_links):
        # ignore XML files for now, figure out how to get them later because they contain useful information
        if '.xml' in local_link:
            continue

        # Split only on the first '/', so a key that happens to contain the
        # bucket name is left intact.
        path = s3_link[len(scheme):] if s3_link.startswith(scheme) else s3_link
        s3_bucket, _, s3_key = path.partition('/')

        try:
            with open(local_link, 'wb') as f:
                s3_session.download_fileobj(s3_bucket, s3_key, f)
        except Exception:
            # Do not leave an empty or truncated file that looks like a download.
            if os.path.exists(local_link):
                os.remove(local_link)
            raise

# Download everything and report the wall-clock duration in whole seconds.
start_time = time.time()
download_data(s3_links, local_links, s3)
end_time = time.time()

elapsed_seconds = round(end_time - start_time)
print('Time to download HLS data:', elapsed_seconds, '(s)')

Time to download HLS data: 331 (s)


In [29]:
# Build a table of S3 links, local links, and information about each file.
# NOTE(review): the original comment says a .csv file is created; no write is
# visible in this cell - confirm it happens elsewhere.
import pandas


def _parse_hls_asset_name(link):
    """Extract sensor, tile, date and band from an HLS asset link.

    The base name is split on '.', e.g.
    ``HLS.S30.T13TDE.2021001T175741.v2.0.B12.tif`` gives sensor ``S30``,
    tile ``T13TDE``, date ``2021001T175741`` and band ``B12``.

    Returns an empty dict for ``.xml``/``.png`` links and for names with too
    few dot-separated fields, so those rows stay empty (NaN) in the table.
    """
    if '.xml' in link or '.png' in link:
        return {}
    fields = os.path.basename(link).split('.')
    if len(fields) < 7:
        return {}
    return {
        'sensor': fields[1],
        'tile': fields[2],
        'date': fields[3],
        'band': fields[6],
    }


stack_df = pandas.DataFrame({'S3_links': s3_links, 'local_links': local_links})

# add sensor, tile, dates, bands
# Parse all links in one pass and attach the columns at once, instead of
# growing the frame cell-by-cell with .loc inside a loop.
asset_info = pandas.DataFrame(
    [_parse_hls_asset_name(link) for link in s3_links],
    index=stack_df.index,
)
stack_df = stack_df.join(asset_info)

stack_df

                                               S3_links  \
0     s3://lp-prod-protected/HLSS30.020/HLS.S30.T13T...   
1     s3://lp-prod-protected/HLSS30.020/HLS.S30.T13T...   
2     s3://lp-prod-protected/HLSS30.020/HLS.S30.T13T...   
3     s3://lp-prod-protected/HLSS30.020/HLS.S30.T13T...   
4     s3://lp-prod-protected/HLSS30.020/HLS.S30.T13T...   
...                                                 ...   
1056  s3://lp-prod-protected/HLSS30.020/HLS.S30.T13T...   
1057  s3://lp-prod-protected/HLSS30.020/HLS.S30.T13T...   
1058  s3://lp-prod-protected/HLSS30.020/HLS.S30.T13T...   
1059  s3://lp-prod-public/HLSS30.020/HLS.S30.T13TDE....   
1060       s3://search/concepts/G2165792011-LPCLOUD.xml   

                                            local_links sensor    tile  \
0     ./HLS_data/T13TDE/HLS.S30.T13TDE.2021001T17574...    S30  T13TDE   
1     ./HLS_data/T13TDE/HLS.S30.T13TDE.2021001T17574...    S30  T13TDE   
2     ./HLS_data/T13TDE/HLS.S30.T13TDE.2021001T17574...    S30  T13TD