# Download Missing Files
1. Parse the NASA site to get a list of available files from the satellite (e.g. Aqua MODIS, Terra MODIS, SeaWiFS)
2. Store the list of file names as a .txt
3. Get list of available files that are not in the filepath
4. Download files that aren't on disk
<br>

[File Search: Using the API](https://oceandata.sci.gsfc.nasa.gov/api/file_search_help) 


[OceanData API file search GUI](https://oceandata.sci.gsfc.nasa.gov/api/file_search/)

In [1]:
import os       # Miscellaneous operating system interfaces
import glob     # Unix style pathname pattern expansion
import requests # HTTP library for Python
import getpass  # Portable password input

## Set search filters and file options

In [12]:
# ~~~~~~~~~~~~~~~
## FILE OPTIONS
# ~~~~~~~~~~~~~~~

# SET the folder that stores the downloaded data.
# Keep the trailing slash: file names are appended to this string directly.
filepath = "/Volumes/Seagate/SeaWiFs/chla/daily/"

# Temporary text file that receives the names of the available files
file = "".join([filepath, "file_list.txt"])


# ~~~~~~~~~~~~~~~
## SEARCH OPTIONS
# ~~~~~~~~~~~~~~~
# Date range, both ends written as YYYY-MM-DD

# SET start date
#   "2002-07-04"  first MODIS date
#   "1997-09-04"  first SeaWiFS date
start_date = "2010-10-01"

# SET end date
#   "2023-01-31"  last MODIS date
#   "2010-12-11"  last SeaWiFS date
end_date = "2023-01-01"

# Resolution: "4km" or "9km"
resolution = "9km"

# SET period (use an empty string for all periods)
#   Daily   = "DAY"
#   Monthly = "MO"
period = "DAY"


# ~~~~~~~~~~~~~~~
# INSTRUMENT OPTS
# ~~~~~~~~~~~~~~~

# Instruments mapped to their sensor/dt id query fragments (incomplete list)
instruments = dict(
    aqua="sensor_id=7&dtid=1043",
    terra="sensor_id=8&dtid=1083",
    seaWiFS="sensor_id=6&dtid=1123",
)

# SET instrument (must be one of the keys of `instruments`)
inst_key = "seaWiFS"

### Option Check

In [13]:
# Validity check
# Look up the sensor/dt id string for the selected instrument.
# An unknown key is raised as an error instead of only being printed: carrying
# on would leave `instrument` undefined (or still holding the value from an
# earlier run of this cell) when the search command is built.
if inst_key not in instruments:
    raise ValueError("Invalid instrument name: " + inst_key + " not found in instruments dictionary.")
instrument = instruments[inst_key]

### Provide app key or login information for Earth Data access. 

In [14]:
### Provide app key for Earth Data access.
def get_appkey():
    """Tell the user where to obtain an appkey, prompt for it, and return it.

    The key is read with getpass, which attempts to keep the typed value
    from being shown.
    """
    # An appkey can be obtained from:
    # https://oceandata.sci.gsfc.nasa.gov/appkey/
    notice = 'An appkey can be obtained from: https://oceandata.sci.gsfc.nasa.gov/appkey/\nPlease enter appkey now.'
    print(notice)
    return getpass.getpass('Enter Appkey: ')


# getpass attempts to hide login information from the terminal
def get_login():
    """Prompt for EarthData credentials and return them as (user, password).

    The username is read with input(); the password is read with getpass.
    """
    print('Please enter your EarthData login information.')
    # Tuple elements are evaluated left to right: username first, then password
    return input("Enter Username:"), getpass.getpass("Enter Password: ")


# ~~~~~~~~~~~~~~~
# Get user choice
# ~~~~~~~~~~~~~~~

# Keep showing the menu until the answer is one of the two valid options
choice = "0"
while choice not in ("1", "2"):
    for menu_line in ("How would you like to access the data?",
                      "1. App Key",
                      "2. Login"):
        print(menu_line)
    choice = input("Enter 1 or 2: ")

# Collect the credentials that belong to the selected access method
if choice == "2":
    # Username and password for Earth Data access
    username, password = get_login()
elif choice == "1":
    # App key for Earth Data access
    appKey = get_appkey()

How would you like to access the data?
1. App Key
2. Login
An appkey can be obtained from: https://oceandata.sci.gsfc.nasa.gov/appkey/
Please enter appkey now.


### Pipe available files from search to file

In [15]:
# Imported here so this cell can be run on its own
import subprocess

# If the filepath does not exist, create it
if not os.path.exists(filepath):
    os.makedirs(filepath)

# Build the optional period filter under its own name. Overwriting `period`
# in place would prepend "&period=" again every time this cell is re-run.
period_filter = "&period=" + period if period != "" else ""

# POST body for the file search, assembled from the search options
post_data = ("results_as_file=1&" + instrument + "&sdate=" + start_date +
             " 00:00:00&edate=" + end_date +
             " 23:59:59&subType=1&prod_id=chlor_a&resolution_id=" +
             resolution + period_filter)

# curl is invoked with an argument list (no shell), so characters in the
# options or in the output path are never interpreted by a shell.
# -o writes the response body to `file`.
curl = [
    "curl", "-d", post_data, "-o", file,
    "https://oceandata.sci.gsfc.nasa.gov/api/file_search",
]

# Execute command to retrieve list of files and store in file
try:
    search_ok = subprocess.run(curl).returncode == 0
except FileNotFoundError:
    # the curl executable could not be found
    search_ok = False

if not search_ok:
    print("Error encountered while searching for files.\nPlease verify search parameters and try again.")
else:
    # Check the file for "ERROR" or "No Results Found" and print an error message if found
    with open(file) as f:
        search_output = f.read()
    if "ERROR" in search_output:
        print("Error encountered while searching for files.\nPlease verify search parameters and try again.")
    if "No Results Found" in search_output:
        print("No results found.\nPlease verify search parameters and try again.\nExiting program.")


  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  4177  100  4031  100   146   1759     63  0:00:02  0:00:02 --:--:--  1824


### Get list of available files that are not already in the specified file path

In [16]:
# Create a list of availible files for download
def get_all_avail_file_list(file):
    """Return the lines of the text file `file` that name daily chlor_a .nc files.

    A line is kept when it contains all of "DAY", "chlor" and ".nc".
    Kept lines are returned unchanged, including any trailing newline.
    """
    # TODO: make more generic (the substring filter is a quick fix for now)
    required_parts = ("DAY", "chlor", ".nc")
    with open(file) as listing:
        return [
            row for row in listing
            if all(part in row for part in required_parts)
        ]

# Get list of files in filepath
def get_files(fp):
    """Return the sorted paths matching `fp + "*.nc"`.

    `fp` is used as a plain string prefix, so a folder path needs its
    trailing separator.
    """
    return sorted(glob.glob(fp + "*.nc"))

# Strip filepath from file names
def format_names(current_files, file_list):
    """Reduce both inputs to bare file names.

    Returns a pair of lists: each entry of `current_files` cut down to the
    text after its last "/", and each entry of `file_list` with leading and
    trailing newline characters removed.
    """
    on_disk_names = [path.rsplit("/", 1)[-1] for path in current_files]
    listed_names = [entry.strip('\n') for entry in file_list]
    return on_disk_names, listed_names


### Get list of availible files that are not already in the specified file path
def get_list_files_not_in_path(filepath, file_list):
    """Return the sorted names from `file_list` that have no .nc file in `filepath`.

    Names are compared after both sides have been normalised by
    format_names(); duplicates collapse because the comparison uses sets.
    """
    # Bare names of the .nc files on disk and of the files offered for download
    on_disk_names, listed_names = format_names(get_files(filepath), file_list)

    # Everything that is listed but not on disk
    missing = set(listed_names).difference(on_disk_names)
    return sorted(missing)


# Read the names of all matching files returned by the search
file_list = get_all_avail_file_list(file)
print(file_list)

# Keep only the names that are missing from the specified file path
files_needed = get_list_files_not_in_path(filepath, file_list)

# Summary counts
for label, names in (("Total matching files: ", file_list),
                     ("Total files needed: ", files_needed)):
    print(label + str(len(names)))

['SEASTAR_SEAWIFS_GAC.20101001.L3m.DAY.CHL.chlor_a.9km.nc\n', 'SEASTAR_SEAWIFS_GAC.20101002.L3m.DAY.CHL.chlor_a.9km.nc\n', 'SEASTAR_SEAWIFS_GAC.20101003.L3m.DAY.CHL.chlor_a.9km.nc\n', 'SEASTAR_SEAWIFS_GAC.20101004.L3m.DAY.CHL.chlor_a.9km.nc\n', 'SEASTAR_SEAWIFS_GAC.20101005.L3m.DAY.CHL.chlor_a.9km.nc\n', 'SEASTAR_SEAWIFS_GAC.20101006.L3m.DAY.CHL.chlor_a.9km.nc\n', 'SEASTAR_SEAWIFS_GAC.20101007.L3m.DAY.CHL.chlor_a.9km.nc\n', 'SEASTAR_SEAWIFS_GAC.20101008.L3m.DAY.CHL.chlor_a.9km.nc\n', 'SEASTAR_SEAWIFS_GAC.20101009.L3m.DAY.CHL.chlor_a.9km.nc\n', 'SEASTAR_SEAWIFS_GAC.20101010.L3m.DAY.CHL.chlor_a.9km.nc\n', 'SEASTAR_SEAWIFS_GAC.20101011.L3m.DAY.CHL.chlor_a.9km.nc\n', 'SEASTAR_SEAWIFS_GAC.20101012.L3m.DAY.CHL.chlor_a.9km.nc\n', 'SEASTAR_SEAWIFS_GAC.20101013.L3m.DAY.CHL.chlor_a.9km.nc\n', 'SEASTAR_SEAWIFS_GAC.20101014.L3m.DAY.CHL.chlor_a.9km.nc\n', 'SEASTAR_SEAWIFS_GAC.20101015.L3m.DAY.CHL.chlor_a.9km.nc\n', 'SEASTAR_SEAWIFS_GAC.20101016.L3m.DAY.CHL.chlor_a.9km.nc\n', 'SEASTAR_SEAWIFS_GAC.20

### Methods to download files not in folder with AppKey or Login 

In [17]:
### Configure a connection to download data from an Earthdata Login enabled server
#   https://urs.earthdata.nasa.gov/documentation/for_users/data_access/python
def access_data_login(
    username,
    password,
    filepath,
    files_needed,
):
    """Download the named files from OceanData with an Earthdata login.

    Files that already exist in `filepath` are skipped. Each download is
    streamed into a temporary ".part" file that is renamed to its final name
    only after the transfer has finished, so an interrupted download does not
    leave a truncated file under the final name.

    Args:
        filepath: Destination folder including its trailing separator
            (file names are appended by string concatenation). Created when
            missing.
        files_needed: File names to download.
        username: Earthdata user name.
        password: Earthdata password.

    Returns:
        The list of file names that could not be downloaded. When it is not
        empty the names are also written to "failed_files.txt" in `filepath`.
    """

    # overriding requests.Session.rebuild_auth to maintain headers when redirected
    class SessionWithHeaderRedirection(requests.Session):
        AUTH_HOST = 'urs.earthdata.nasa.gov'

        def __init__(self, username, password):
            super().__init__()
            self.auth = (username, password)

        # Overrides from the library to keep headers when redirected to or
        # from the NASA auth host.
        def rebuild_auth(self, prepared_request, response):
            headers = prepared_request.headers
            url = prepared_request.url

            if 'Authorization' in headers:
                original_parsed = requests.utils.urlparse(response.request.url)
                redirect_parsed = requests.utils.urlparse(url)

                # Drop the credentials only when the redirect changes host
                # and neither end of it is the auth host.
                if (
                        original_parsed.hostname != redirect_parsed.hostname
                ) and redirect_parsed.hostname != self.AUTH_HOST and original_parsed.hostname != self.AUTH_HOST:
                    del headers['Authorization']

            return

    # Progress reporting
    total_files = len(files_needed)

    if not os.path.isdir(filepath):
        os.makedirs(filepath)

    # list to hold the names of the files that could not be downloaded
    bad_files = []

    # create session with the user credentials that will be used to
    # authenticate access to the data; the `with` closes it when done
    with SessionWithHeaderRedirection(username, password) as session:
        # loop over the files and submit a request for each
        for i, file in enumerate(files_needed, start=1):
            target = filepath + file
            if os.path.isfile(target):
                continue

            # build the url from the file name of the file we wish to retrieve
            url = "https://oceandata.sci.gsfc.nasa.gov/ob/getfile/" + file
            partial = target + ".part"

            try:
                # submit the request using the session; the `with` releases
                # the streamed connection even when an error occurs
                with session.get(url, stream=True) as response:
                    if response.status_code == 200:
                        # Progress reporting
                        prog = "Progress: " + str(i) + " of " + str(total_files)
                        print(prog, end="\r")
                    else:
                        print("\n Status: ", response.status_code)

                    # raise an exception in case of http errors
                    response.raise_for_status()

                    # save the file under its temporary name
                    with open(partial, 'wb') as fd:
                        for chunk in response.iter_content(chunk_size=1024 * 1024):
                            fd.write(chunk)

                # transfer finished: move the file to its final name
                os.replace(partial, target)

            except requests.exceptions.RequestException as e:
                # RequestException also covers connection errors and timeouts,
                # so one failing file does not abort the remaining downloads
                bad_files.append(file)
                print(e)

            finally:
                # remove whatever an unfinished transfer left behind
                if os.path.exists(partial):
                    os.remove(partial)

    # Print report of files that failed to download
    if not bad_files:
        print("All files downloaded successfully")
    else:
        print("Failed to download ", str(len(bad_files)), " files.")
        print("A list of failed files are saved in ",
              filepath + "failed_files.txt")
        print("Files that failed to download: ", bad_files)

        # Store the bad files in a text file with each file on a separate line, overwriting the file if it already exists
        with open(filepath + "failed_files.txt", "w") as f:
            for s in bad_files:
                f.write(s + "\n")

    return bad_files


### Download files not in folder
def access_data_appkey(filepath, files_needed, appKey):
    """Download the named files from OceanData with an appkey.

    Files that already exist in `filepath` are skipped. Each download is
    streamed into a temporary ".part" file that is renamed to its final name
    only after the transfer has finished, so an interrupted download does not
    leave a truncated file under the final name.

    Args:
        filepath: Destination folder including its trailing separator
            (file names are appended by string concatenation). Created when
            missing.
        files_needed: File names to download.
        appKey: Appkey appended to every request URL.

    Returns:
        The list of file names that could not be downloaded. When it is not
        empty the names are also written to "failed_files.txt" in `filepath`.
    """
    # Create file path if it does not exist
    if not os.path.isdir(filepath):
        os.makedirs(filepath)

    # Progress reporting
    total_files = len(files_needed)

    # Download files
    bad_files = []
    for i, f in enumerate(files_needed, start=1):
        target = filepath + f
        if os.path.isfile(target):
            continue

        url = "https://oceandata.sci.gsfc.nasa.gov/ob/getfile/" + f + "?appkey=" + appKey
        partial = target + ".part"

        try:
            # stream=True keeps the body out of memory until it is written in
            # chunks; the `with` releases the connection even on errors
            with requests.get(url, allow_redirects=True, stream=True) as r:
                # print status or progress
                if r.status_code == 200:
                    prog = "Progress: " + str(i) + " of " + str(total_files)
                    print(prog, end="\r")
                else:  # Print status code if not 200
                    print("\n Status: ", r.status_code)

                # raise an exception in case of http errors
                r.raise_for_status()

                # save the file under its temporary name
                with open(partial, 'wb') as fd:
                    for chunk in r.iter_content(chunk_size=1024 * 1024):
                        fd.write(chunk)

            # transfer finished: move the file to its final name
            os.replace(partial, target)

        except requests.exceptions.RequestException as e:
            # RequestException also covers connection errors and timeouts,
            # so one failing file does not abort the remaining downloads
            bad_files.append(f)
            # The error text can contain the request URL, which carries the
            # appkey; mask it before printing.
            message = str(e)
            if appKey:
                message = message.replace(appKey, "<appkey>")
            print(message)

        finally:
            # remove whatever an unfinished transfer left behind
            if os.path.exists(partial):
                os.remove(partial)

    # Print report of files that failed to download
    if not bad_files:
        print("All files downloaded successfully")
    else:
        print("Failed to download ", str(len(bad_files)), " files.")
        print("A list of failed files are saved in ",
              filepath + "failed_files.txt")
        print("Files that failed to download: ", bad_files)

        # Store the bad files in a text file with each file on a separate line, overwriting the file if it already exists
        with open(filepath + "failed_files.txt", "w") as fi:
            for s in bad_files:
                fi.write(s + "\n")

    return bad_files

### Download files not already in folder using user choice for AppKey or Login

In [18]:
# Download files not in folder using user choice for appkey or login
# The selected access method is bound to one callable so the first attempt and
# the optional retry go through the same code path.
if choice == "1":
    # app key access
    def _download(names):
        return access_data_appkey(filepath, names, appKey)
elif choice == "2":
    # login access
    def _download(names):
        return access_data_login(username, password, filepath, names)
else:
    _download = None

if _download is not None:
    bad_files = _download(files_needed)
    if len(bad_files) > 0:
        repeat = input(
            "Would you like to try to download these files again? (y/n): ")
        if repeat == "y":
            # Second attempt, limited to the files that failed
            bad_files = _download(bad_files)

# Delete file list
#os.remove(file)

All files downloaded successfully
