In [8]:
import cdsapi
import os, sys
from pathlib import Path
import time
from tqdm import tqdm
from loguru import logger
import earthaccess
import numpy as np
import pandas as pd
import xarray as xr

# Download NDVI files

This notebook walks through downloading NDVI data, specifically the [MCD19A3CMG v061](https://lpdaac.usgs.gov/products/mcd19a3cmgv061/) data product, using the `earthaccess` module.

### Set up credentials
Enter your Earthdata Login username and password when prompted.

In [2]:
# Authenticate before searching/downloading granules.
# NOTE(review): persist=True presumably stores the credentials locally so later
# sessions do not prompt again — confirm against the earthaccess documentation.
earthaccess.login(persist=True)

<earthaccess.auth.Auth at 0x7f06441c02e0>

### Input parameters
`BATCH` is used to label parallel downloads and make sure they don't get mixed up.

In [37]:
# Root folder for the monthly NDVI netCDF outputs; the temporary download
# folder (hdfs<BATCH>) is created inside it as well
DEST_PATH = Path("../../../data/01-raw/ndvi")
# Batch number: suffix of the download dump folder and of the log file name
BATCH = 1

In [4]:
# Bounding box ordered as (min x, min y, max x, max y); it is passed to the
# granule search and reused to slice the x/y coordinates of each daily file
# (presumably longitude/latitude in degrees — confirm against the product grid)
PH_BBOX = (116.5, 4.25, 127, 21.5)
# First and last year (both inclusive) of the monthly ranges to download
start_year = 2003
end_year = 2006

### Setup logging

In [56]:
LOG_PATH = Path("../../logs/")

# Configure logger: remove the handlers registered so far, then log INFO and
# above to stderr so progress is visible in the cell output
logger.remove()
logger.add(sys.stderr, format="{time} {level} {message}", level="INFO")

# Configure daily rotation for file logging.
# The file name carries the "ndvi" prefix and the batch number so that logs of
# this notebook are not written into (or confused with) the ERA5 download logs
# and parallel batches each get their own file.
daily_sink_file_fmt = str(LOG_PATH / "ndvi_{time:YYYY-MM-DD}_batch") + f"{BATCH}.log"
logger.add(
    daily_sink_file_fmt,
    rotation="00:00",  # start a new log file at midnight
    format="{time} {level} {message}",
    level="INFO",
)

12

### Generate monthly bounding dates

In [13]:
def generate_monthly_date_pairs(start_year, end_year):
    """Build the first-day/last-day date strings of every month in a year range.

    Args:
        start_year: First year to include.
        end_year: Last year to include (inclusive).

    Returns:
        A list of ``(start, end)`` tuples of ``YYYY-MM-DD`` strings, one tuple
        per calendar month, ordered chronologically. The list is empty when
        ``start_year`` is greater than ``end_year``.
    """
    iso_fmt = "%Y-%m-%d"
    # First day of each month, January through December, for every year
    month_starts = (
        pd.Timestamp(yr, mo, 1)
        for yr in range(start_year, end_year + 1)
        for mo in range(1, 13)
    )
    # MonthEnd(0) rolls the first day forward to the last day of the same month
    return [
        (first.strftime(iso_fmt), (first + pd.offsets.MonthEnd(0)).strftime(iso_fmt))
        for first in month_starts
    ]

In [77]:
# One (start, end) date-string pair per month of the configured year range
date_pairs = generate_monthly_date_pairs(start_year, end_year)
# Show the pairs as the cell output for a quick visual check
date_pairs

[('2003-01-01', '2003-01-31'),
 ('2003-02-01', '2003-02-28'),
 ('2003-03-01', '2003-03-31'),
 ('2003-04-01', '2003-04-30'),
 ('2003-05-01', '2003-05-31'),
 ('2003-06-01', '2003-06-30'),
 ('2003-07-01', '2003-07-31'),
 ('2003-08-01', '2003-08-31'),
 ('2003-09-01', '2003-09-30'),
 ('2003-10-01', '2003-10-31'),
 ('2003-11-01', '2003-11-30'),
 ('2003-12-01', '2003-12-31'),
 ('2004-01-01', '2004-01-31'),
 ('2004-02-01', '2004-02-29'),
 ('2004-03-01', '2004-03-31'),
 ('2004-04-01', '2004-04-30'),
 ('2004-05-01', '2004-05-31'),
 ('2004-06-01', '2004-06-30'),
 ('2004-07-01', '2004-07-31'),
 ('2004-08-01', '2004-08-31'),
 ('2004-09-01', '2004-09-30'),
 ('2004-10-01', '2004-10-31'),
 ('2004-11-01', '2004-11-30'),
 ('2004-12-01', '2004-12-31'),
 ('2005-01-01', '2005-01-31'),
 ('2005-02-01', '2005-02-28'),
 ('2005-03-01', '2005-03-31'),
 ('2005-04-01', '2005-04-30'),
 ('2005-05-01', '2005-05-31'),
 ('2005-06-01', '2005-06-30'),
 ('2005-07-01', '2005-07-31'),
 ('2005-08-01', '2005-08-31'),
 ('2005-

### Create monthly batch dump folder

In [None]:
# Make sure the per-batch download folder exists (no error if it already does)
(DEST_PATH / f"hdfs{BATCH}").mkdir(parents=True, exist_ok=True)

In [78]:
# Empty the download folder so leftovers from a previous run are not counted
# when the number of downloaded files is checked later
dump_dir = DEST_PATH / f"hdfs{BATCH}"
for leftover in list(dump_dir.iterdir()):
    leftover.unlink()

### Download daily files and save as a single monthly file
Check the logfile in `logs/` to see the progress

In [None]:
# Data variables selected from each daily file before the monthly concatenation
VARIABLES = ["NDVI_gapfill", "EVI"]  # EVI optional to keep

In [80]:
def _clear_download_dir(folder):
    """Delete every file directly inside ``folder``; the folder itself is kept."""
    for fn in os.listdir(folder):
        os.remove(folder / fn)


# Temporary folder receiving the daily granules of the month being processed
download_dir = DEST_PATH / f"hdfs{BATCH}"

# For each (start, end) month pair: search the granules, download them into the
# dump folder, subset every daily file and write one netCDF file per month.
for months in date_pairs:
    start_time = time.time()
    # "2003-01-01" -> "200301"
    month_fn = "".join(months[0].split("-")[:-1])
    output_filename = f"NDVI_PH_{month_fn}.nc"
    logger.info(
        "==========================================================================================="
    )

    logger.info(
        f"Downloading data for the month {months[0]} to filename {output_filename}"
    )

    # Months that already have an output file are not requested again
    if (DEST_PATH / output_filename).is_file():
        logger.warning(
            f"{output_filename} already exists in {DEST_PATH}! Skipping request."
        )
        continue

    results = earthaccess.search_data(
        short_name="MCD19A3CMG",
        cloud_hosted=True,
        bounding_box=PH_BBOX,
        temporal=months,
    )

    # Without granules there is nothing to concatenate (xr.concat fails on an
    # empty list), so skip the month explicitly instead of crashing later
    if len(results) == 0:
        logger.error(f"{month_fn} has no granules to download! Skipping...")
        continue

    earthaccess.download(results, download_dir)

    # Stop and check if folder has expected number of files.
    # The listing is sorted because os.listdir returns names in arbitrary order;
    # sorting makes the concatenation order deterministic.
    downloaded_fns = sorted(os.listdir(download_dir))
    if len(results) != len(downloaded_fns):
        logger.error(
            f"{month_fn} has incomplete downloaded data! Please check and rerun, skipping..."
        )
        _clear_download_dir(download_dir)
        continue

    logger.info(f"Processing daily data to create {output_filename}...")

    # Read each daily file, keep only the needed subset and tag it with its time
    ds_list = []
    for fn in downloaded_fns:
        # The context manager closes the source file once the subset has been
        # loaded into memory, so no file handle is left open when the dump
        # folder is cleaned at the end of the iteration
        with xr.open_dataset(download_dir / fn, engine="rasterio") as daily_raw:
            # subset to bounding box and variable list
            ds = daily_raw[VARIABLES]
            # y is sliced from max to min (presumably the y coordinate is
            # stored in descending order — confirm against the product grid)
            ds = ds.sel(
                y=slice(PH_BBOX[3], PH_BBOX[1]), x=slice(PH_BBOX[0], PH_BBOX[2])
            )
            # add time dimension, built from the file's date/time attributes
            # (fractional seconds are dropped)
            file_dt = pd.to_datetime(
                f"{ds.attrs['EQUATORCROSSINGDATE.1']} {ds.attrs['EQUATORCROSSINGTIME.1'].split('.')[0]}"
            ).to_datetime64()
            ds = ds.assign_coords(time=file_dt)
            ds = ds.expand_dims(dim="time")
            ds_list.append(ds.load())

    # concatenate daily datasets and save as netcdf
    month_ds = xr.concat(ds_list, dim="time")
    month_ds.to_netcdf(DEST_PATH / output_filename)

    end_time = time.time()
    runtime_seconds = end_time - start_time
    human_readable_runtime = time.strftime("%H:%M:%S", time.gmtime(runtime_seconds))

    logger.success(f"File {output_filename} done in {human_readable_runtime}")
    # clean download dump folder
    _clear_download_dir(download_dir)

2024-03-15T21:09:03.632061+0800 INFO Downloading data for the month 2003-01-01 to filename NDVI_PH_200301.nc
2024-03-15T21:09:03.634072+0800 INFO Downloading data for the month 2003-02-01 to filename NDVI_PH_200302.nc
2024-03-15T21:09:03.636955+0800 INFO Downloading data for the month 2003-03-01 to filename NDVI_PH_200303.nc
2024-03-15T21:09:03.638849+0800 INFO Downloading data for the month 2003-04-01 to filename NDVI_PH_200304.nc
2024-03-15T21:09:03.640529+0800 INFO Downloading data for the month 2003-05-01 to filename NDVI_PH_200305.nc
2024-03-15T21:09:03.642484+0800 INFO Downloading data for the month 2003-06-01 to filename NDVI_PH_200306.nc
2024-03-15T21:09:03.644530+0800 INFO Downloading data for the month 2003-07-01 to filename NDVI_PH_200307.nc
2024-03-15T21:09:03.646684+0800 INFO Downloading data for the month 2003-08-01 to filename NDVI_PH_200308.nc
2024-03-15T21:09:03.648828+0800 INFO Downloading data for the month 2003-09-01 to filename NDVI_PH_200309.nc
2024-03-15T21:09:03

Granules found: 31
 Getting 31 granules, approx download size: 0.89 GB


QUEUEING TASKS | : 100%|██████████| 31/31 [00:00<00:00, 2875.99it/s]
PROCESSING TASKS | : 100%|██████████| 31/31 [00:40<00:00,  1.29s/it]
COLLECTING RESULTS | : 100%|██████████| 31/31 [00:00<00:00, 272585.79it/s]
2024-03-15T21:09:46.964723+0800 INFO Processing daily data to create NDVI_PH_200608.nc...
2024-03-15T21:10:01.074645+0800 SUCCESS File NDVI_PH_200608.nc done in 00:00:57
2024-03-15T21:10:01.168065+0800 INFO Downloading data for the month 2006-09-01 to filename NDVI_PH_200609.nc
2024-03-15T21:10:01.170350+0800 INFO Downloading data for the month 2006-10-01 to filename NDVI_PH_200610.nc
2024-03-15T21:10:01.171890+0800 INFO Downloading data for the month 2006-11-01 to filename NDVI_PH_200611.nc
2024-03-15T21:10:01.173160+0800 INFO Downloading data for the month 2006-12-01 to filename NDVI_PH_200612.nc
