## Performing Stage-In programmatically via a Python notebook
### Steps
- Download and install the UDS library from https://pypi.org/project/mdps-ds-lib/
- Set the log level
- Create the environment variables
- Call the stage-in class and check that the files are downloaded.

In [1]:
# Install the pinned UDS client library; the %pip magic installs into the environment of the running kernel.
%pip install mdps-ds-lib==0.5.1.dev10100

Collecting mdps-ds-lib==0.5.1.dev10100
  Using cached mdps_ds_lib-0.5.1.dev10100-py3-none-any.whl.metadata (1.4 kB)
Collecting elasticsearch==7.13.4 (from mdps-ds-lib==0.5.1.dev10100)
  Using cached elasticsearch-7.13.4-py2.py3-none-any.whl.metadata (7.7 kB)
Collecting fastjsonschema<3.0.0,>=2.19.1 (from mdps-ds-lib==0.5.1.dev10100)
  Using cached fastjsonschema-2.20.0-py3-none-any.whl.metadata (2.1 kB)
Collecting jsonschema<5.0.0,>=4.23.0 (from mdps-ds-lib==0.5.1.dev10100)
  Using cached jsonschema-4.23.0-py3-none-any.whl.metadata (7.9 kB)
Collecting requests-aws4auth==1.2.3 (from mdps-ds-lib==0.5.1.dev10100)
  Using cached requests_aws4auth-1.2.3-py2.py3-none-any.whl.metadata (18 kB)
Collecting xmltodict==0.13.0 (from mdps-ds-lib==0.5.1.dev10100)
  Using cached xmltodict-0.13.0-py2.py3-none-any.whl.metadata (7.7 kB)
Using cached mdps_ds_lib-0.5.1.dev10100-py3-none-any.whl (75 kB)
Using cached elasticsearch-7.13.4-py2.py3-none-any.whl (356 kB)
Using cached requests_aws4auth-1.2.3-py2.

In [2]:
import logging

# Show warnings and above. The named constant replaces the bare number 30
# (logging.WARNING == 30), so the chosen verbosity is readable at a glance.
# Each record is rendered as: timestamp [LEVEL] [logger name::line number] message
logging.basicConfig(
    level=logging.WARNING,
    format="%(asctime)s [%(levelname)s] [%(name)s::%(lineno)d] %(message)s",
)


In [3]:
import os

# Stage-in reads all of its settings from environment variables, so they are
# assigned here in a single update call.
os.environ.update({
    # AWS credentials: 'xxx' is a placeholder. Fill in real values locally and
    # never commit them together with the notebook.
    'AWS_ACCESS_KEY_ID': 'xxx',
    'AWS_SECRET_ACCESS_KEY': 'xxx',
    'AWS_SESSION_TOKEN': 'xxx',

    'EDL_USERNAME': '/unity/uds/user/wphyo/edl_username',  # Parameter Store key for the Earthdata Login username
    'EDL_PASSWORD': '/unity/uds/user/wphyo/edl_dwssap',  # Parameter Store key for the Earthdata Login password
    'EDL_PASSWORD_TYPE': 'PARAM_STORE',  # Can be hardcoded to PARAM_STORE when Parameter Store is used
    'EDL_BASE_URL': 'urs.earthdata.nasa.gov',  # Earthdata Login URL used to obtain the token for downloading files
    'GRANULES_DOWNLOAD_TYPE': 'DAAC',  # Selects the download class: DAAC, HTTP, and so on
    'DOWNLOADING_KEYS': 'data,metadata',  # Asset keys to download

    # URL of the STAC JSON that lists which granules and assets to download.
    'STAC_JSON': 'https://raw.githubusercontent.com/GodwinShen/emit-ghg/main/test/catalog.json',
    # Alternatively, keep the file locally and point to it by path:
    # 'STAC_JSON': os.path.join(os.getcwd(), 'stage_in.json'),
    'DOWNLOAD_DIR': os.path.join(os.getcwd(), 'downloaded_files'),  # Base directory where files will be downloaded
    'OUTPUT_FILE': os.path.join(os.getcwd(), 'stage_in_result.json'),  # Local file where the result is written for review
})

In [4]:
from mdps_ds_lib.lib.utils.file_utils import FileUtils

# Make sure the base download directory exists before stage-in runs.
# Creating it by hand works just as well; this call is only a convenience.
download_dir = os.environ['DOWNLOAD_DIR']
FileUtils.mk_dir_p(download_dir)


In [5]:
from mdps_ds_lib.stage_in_out.download_granules_factory import DownloadGranulesFactory
from mdps_ds_lib.stage_in_out.stage_in_out_utils import StageInOutUtils

# No arguments are passed to the downloader: every parameter was supplied
# through the environment variables set earlier.
download_type = os.getenv('GRANULES_DOWNLOAD_TYPE', 'MISSING_GRANULES_DOWNLOAD_TYPE')
downloader = DownloadGranulesFactory().get_class(download_type)

# Run the download, then hand the returned result to the library's output writer.
result_str = downloader.download()
StageInOutUtils.write_output_to_file(result_str)
print('done')

done


In [8]:
from glob import glob

# List every entry directly under the download directory to confirm the files arrived.
download_pattern = os.path.join(os.environ['DOWNLOAD_DIR'], '*')
print(glob(download_pattern))

['/home/jovyan/downloaded_files/G2721220118-LPCLOUD.xml', '/home/jovyan/downloaded_files/EMIT_L1B_RAD_001_20230620T084426_2317106_011.nc', '/home/jovyan/downloaded_files/EMIT_L2A_RFL_001_20230620T084426_2317106_011.nc', '/home/jovyan/downloaded_files/downloaded_feature_collection.json', '/home/jovyan/downloaded_files/G2721699381-LPCLOUD.xml']


## Performing Stage-out programmatically as auxiliary files
#### Steps
- Create environment variables
- Call the stage-out class and check that the files are uploaded.

In [9]:
# Settings for uploading the previously downloaded files as auxiliary files.
os.environ.update({
    'GRANULES_UPLOAD_TYPE': 'UPLOAD_AUXILIARY_FILE_AS_GRANULE',  # Upload as auxiliary files
    'STAGING_BUCKET': 'uds-test-cumulus-internal',  # S3 bucket where the files will reside
    'VERIFY_SSL': 'FALSE',  # Optional
    'RESULT_PATH_PREFIX': 'stage_out',  # Optional. S3 prefix for the result, which triggers the automatic cataloging
    'PARALLEL_COUNT': '1',  # Number of threads used for uploading

    'OUTPUT_DIRECTORY': os.path.join(os.getcwd(), 'output_result_dir'),  # Success / failure results are stored here locally for review
    'BASE_DIRECTORY': os.path.join(os.getcwd(), 'downloaded_files'),  # Base folder to upload
    'OUTPUT_FILE': os.path.join(os.getcwd(), 'stage_out_result.json'),  # Local file where the result overview is written for review
})

In [10]:
# Components of the collection identifier.
tenant = 'UDS_DEMO'
tenant_venue = 'TEST'
collection_name = 'UDS_UNIT_COLLECTION'

# The dotted version is flattened by removing the dots: '24.09.10.11.00' -> '2409101100'.
collection_version = ''.join('24.09.10.11.00'.split('.'))

# Assemble the full collection URN and publish it through the environment.
temp_collection_id = f'URN:NASA:UNITY:{tenant}:{tenant_venue}:{collection_name}___{collection_version}'
os.environ['COLLECTION_ID'] = temp_collection_id


In [11]:
# NOTE: `upoad_granules_factory` (sic) is the spelling used by the import that
# ran successfully in this notebook; do not "correct" it here.
from mdps_ds_lib.stage_in_out.upoad_granules_factory import UploadGranulesFactory
from mdps_ds_lib.stage_in_out.stage_in_out_utils import StageInOutUtils

# Create the local result directory (OUTPUT_DIRECTORY) before uploading.
FileUtils.mk_dir_p(os.environ['OUTPUT_DIRECTORY'])

# No arguments are passed to the uploader: every parameter was supplied through
# the environment variables set earlier. When GRANULES_UPLOAD_TYPE is unset,
# the STAC-catalog uploader is used as the fallback.
upload_type = os.getenv('GRANULES_UPLOAD_TYPE', UploadGranulesFactory.UPLOAD_S3_BY_STAC_CATALOG)
uploader = UploadGranulesFactory().get_class(upload_type)

# Run the upload, then hand the returned result to the library's output writer.
upload_result_str = uploader.upload()
StageInOutUtils.write_output_to_file(upload_result_str)
print('done')

done


In [12]:
from mdps_ds_lib.lib.aws.aws_s3 import AwsS3

# Verify the upload by listing what the staging bucket holds under the collection ID prefix.
s3 = AwsS3()
uploaded_entries = s3.get_child_s3_files(os.environ['STAGING_BUCKET'], os.environ['COLLECTION_ID'])

# Render each returned entry as text and print one per line.
s3_keys = list(map(str, uploaded_entries))
print('\n'.join(s3_keys))

('URN:NASA:UNITY:UDS_DEMO:TEST:UDS_UNIT_COLLECTION___2409101100/URN:NASA:UNITY:UDS_DEMO:TEST:UDS_UNIT_COLLECTION___2409101100:URN:NASA:UNITY:UDS_DEMO:TEST:UDS_UNIT_COLLECTION___2409101100:EMIT_L1B_RAD_001_20230620T084426_2317106_011/EMIT_L1B_RAD_001_20230620T084426_2317106_011.nc', 1852557979)
('URN:NASA:UNITY:UDS_DEMO:TEST:UDS_UNIT_COLLECTION___2409101100/URN:NASA:UNITY:UDS_DEMO:TEST:UDS_UNIT_COLLECTION___2409101100:URN:NASA:UNITY:UDS_DEMO:TEST:UDS_UNIT_COLLECTION___2409101100:EMIT_L1B_RAD_001_20230620T084426_2317106_011/EMIT_L1B_RAD_001_20230620T084426_2317106_011.nc.stac.json', 1724)
('URN:NASA:UNITY:UDS_DEMO:TEST:UDS_UNIT_COLLECTION___2409101100/URN:NASA:UNITY:UDS_DEMO:TEST:UDS_UNIT_COLLECTION___2409101100:URN:NASA:UNITY:UDS_DEMO:TEST:UDS_UNIT_COLLECTION___2409101100:EMIT_L2A_RFL_001_20230620T084426_2317106_011/EMIT_L2A_RFL_001_20230620T084426_2317106_011.nc', 1851092294)
('URN:NASA:UNITY:UDS_DEMO:TEST:UDS_UNIT_COLLECTION___2409101100/URN:NASA:UNITY:UDS_DEMO:TEST:UDS_UNIT_COLLECTIO

### Performing a Dry-Run Example to see errors

In [6]:
# Settings for the dry-run example.
# NOTE(review): OUTPUT_DIRECTORY is not assigned in this cell, and the recorded
# output reports it as missing. If an earlier cell in the same kernel has already
# set it, that particular error will presumably not appear -- confirm when re-running.
os.environ.update({
    'VERIFY_SSL': 'FALSE',
    'DRY_RUN': 'TRUE',
    'RESULT_PATH_PREFIX': 'integration_test/stage_out',
    'PROJECT': 'DEMO',
    'VENUE': 'DEV',
    'STAGING_BUCKET': 'uds-sbx-cumulus-staging',

    'GRANULES_SEARCH_DOMAIN': 'UNITY',

    'OUTPUT_FILE': 'dry-run-error-example/some_output/output.json',
    'UPLOAD_DIR': '',  # not needed
    'CATALOG_FILE': 'dry-run-error-example/catalog.json',
})

In [7]:
# NOTE: `upoad_granules_factory` (sic) is the spelling used by the imports that
# ran successfully in this notebook.
from mdps_ds_lib.stage_in_out.upoad_granules_factory import UploadGranulesFactory

# Select the STAC-catalog uploader explicitly, run it with the dry-run settings
# from the environment, and print whatever it returns.
catalog_uploader = UploadGranulesFactory().get_class(UploadGranulesFactory.UPLOAD_S3_BY_STAC_CATALOG)
upload_result = catalog_uploader.upload()
print(upload_result)

{}


There are ERRORS in the setup.
{
    "error": "missing OUTPUT_DIRECTORY to write result files"
}
{
    "granule_file": "dry-run-error-example/some_granules/test_file01.nc.stac.json",
    "error": "unable to read the stac file",
    "details": "missing file: dry-run-error-example/some_granules/test_file01.nc.stac.json"
}
{
    "granule_file": "dry-run-error-example/some_granules/test_file02.nc.stac.json",
    "error": "missing uploading file for data - data",
    "details": "dry-run-error-example/some_granules/./test_file02.nc"
}
{
    "granule_file": "dry-run-error-example/some_granules/test_file02.nc.stac.json",
    "error": "missing uploading file for metadata - metadata1",
    "details": "dry-run-error-example/some_granules/./test_file02.nc.cas"
}
{
    "granule_file": "dry-run-error-example/some_granules/test_file03.nc.stac.json",
    "error": "unable to read the stac file",
    "details": "missing file: dry-run-error-example/some_granules/test_file03.nc.stac.json"
}
{
    "granule

### Performing Normal Stage-out 
1. Retrieve the collection from the STAC JSON (granules can belong to different collections)
1. Asset key names can be anything
1. Print the filenames as they are uploaded
1. Accept an empty string as `RESULT_PATH_PREFIX`

In [8]:
# Start from a clean slate: remove any output directory left over from a previous run.
# shutil.rmtree replaces the `%rm -rf` alias, which IPython only defines on POSIX
# systems; ignore_errors=True mirrors `-f` (no error if the directory is absent).
import shutil

shutil.rmtree('normal-stage-out/output_dir', ignore_errors=True)

In [9]:
# Settings for the normal (non-dry-run) stage-out example.
os.environ.update({
    'VERIFY_SSL': 'FALSE',
    'RESULT_PATH_PREFIX': '',  # point 4: an empty string is accepted as the prefix
    'DRY_RUN': '',  # resets the value set for the dry-run example
    'PROJECT': 'DEMO',
    'VENUE': 'DEV1',
    'STAGING_BUCKET': 'uds-sbx-cumulus-staging',
    'GRANULES_SEARCH_DOMAIN': 'UNITY',
    'OUTPUT_FILE': 'normal-stage-out/some_output/output.json',
    'UPLOAD_DIR': '',  # not needed
    'OUTPUT_DIRECTORY': 'normal-stage-out/output_dir',
    'CATALOG_FILE': 'normal-stage-out/catalog.json',
})

# Create the local result directory that was just configured.
FileUtils.mk_dir_p(os.environ['OUTPUT_DIRECTORY'])

In [10]:
# NOTE: `upoad_granules_factory` (sic) is the spelling used by the imports that
# ran successfully in this notebook.
from mdps_ds_lib.stage_in_out.upoad_granules_factory import UploadGranulesFactory

# Select the STAC-catalog uploader explicitly, run it with the settings taken
# from the environment, and print whatever it returns.
catalog_uploader = UploadGranulesFactory().get_class(UploadGranulesFactory.UPLOAD_S3_BY_STAC_CATALOG)
upload_result = catalog_uploader.upload()
print(upload_result)

2024-10-29 18:20:02,485 [AUDIT] [mdps_ds_lib.stage_in_out.upload_granules_by_complete_catalog_s3::11] uploading type=data, name=data, href=normal-stage-out/some_granules/./test_file01.nc
2024-10-29 18:20:02,626 [AUDIT] [mdps_ds_lib.stage_in_out.upload_granules_by_complete_catalog_s3::11] uploading type=metadata, name=metadata1, href=normal-stage-out/some_granules/./test_file01.nc.cas
2024-10-29 18:20:02,753 [AUDIT] [mdps_ds_lib.stage_in_out.upload_granules_by_complete_catalog_s3::11] uploading type=metadata, name=metadata2, href=normal-stage-out/some_granules/./test_file01.nc.stac.json
2024-10-29 18:20:03,475 [AUDIT] [mdps_ds_lib.stage_in_out.upload_granules_by_complete_catalog_s3::11] uploading type=data, name=data, href=normal-stage-out/some_granules/./test_file02.nc
2024-10-29 18:20:03,651 [AUDIT] [mdps_ds_lib.stage_in_out.upload_granules_by_complete_catalog_s3::11] uploading type=metadata, name=metadata1, href=normal-stage-out/some_granules/./test_file02.nc.cas
2024-10-29 18:20:03,

{"type": "Catalog", "id": "NA", "stac_version": "1.0.0", "description": "NA", "links": [{"rel": "root", "href": "/tmp/normal-stage-out/catalog.json", "type": "application/json"}, {"rel": "item", "href": "normal-stage-out/output_dir/successful_features.json", "type": "application/json"}, {"rel": "item", "href": "normal-stage-out/output_dir/failed_features.json", "type": "application/json"}]}


In [11]:
from mdps_ds_lib.lib.aws.aws_s3 import AwsS3

# Verify the upload by listing what the staging bucket holds under the
# project/venue URN prefix.
s3 = AwsS3()
urn_prefix = f'URN:NASA:UNITY:{os.environ["PROJECT"]}:{os.environ["VENUE"]}:'
uploaded_entries = s3.get_child_s3_files(os.environ['STAGING_BUCKET'], urn_prefix)

# Render each returned entry as text and print one per line.
s3_keys = list(map(str, uploaded_entries))
print('\n'.join(s3_keys))

('URN:NASA:UNITY:DEMO:DEV1:NA_01/URN:NASA:UNITY:DEMO:DEV1:NA_01:test_file01/test_file01.nc', 11)
('URN:NASA:UNITY:DEMO:DEV1:NA_01/URN:NASA:UNITY:DEMO:DEV1:NA_01:test_file01/test_file01.nc.cas', 3800)
('URN:NASA:UNITY:DEMO:DEV1:NA_01/URN:NASA:UNITY:DEMO:DEV1:NA_01:test_file01/test_file01.nc.stac.json', 754)
('URN:NASA:UNITY:DEMO:DEV1:NA_02/URN:NASA:UNITY:DEMO:DEV1:NA_02:test_file02/test_file02.nc', 11)
('URN:NASA:UNITY:DEMO:DEV1:NA_02/URN:NASA:UNITY:DEMO:DEV1:NA_02:test_file02/test_file02.nc.cas', 3800)
('URN:NASA:UNITY:DEMO:DEV1:NA_02/URN:NASA:UNITY:DEMO:DEV1:NA_02:test_file02/test_file02.nc.stac.json', 754)
('URN:NASA:UNITY:DEMO:DEV1:NA_03/URN:NASA:UNITY:DEMO:DEV1:NA_03:test_file03/test_file03.nc', 11)
('URN:NASA:UNITY:DEMO:DEV1:NA_03/URN:NASA:UNITY:DEMO:DEV1:NA_03:test_file03/test_file03.nc.cas', 3800)
('URN:NASA:UNITY:DEMO:DEV1:NA_03/URN:NASA:UNITY:DEMO:DEV1:NA_03:test_file03/test_file03.nc.stac.json', 754)
('URN:NASA:UNITY:DEMO:DEV1:NA_04/URN:NASA:UNITY:DEMO:DEV1:NA_04:test_file04/