Commit 71d40f7

Fix failing integration tests (#869)

1 parent: 84be54e
7 files changed (+134, −115 lines)

.github/workflows/integration-test.yml (15 additions & 0 deletions)

```diff
@@ -73,6 +73,21 @@ jobs:
 
       - name: Checkout source
         uses: actions/checkout@v4
+        with:
+          # Getting the correct commit for a pull_request_target event appears to be
+          # a known, problematic issue: https://github.com/actions/checkout/issues/518
+          # It seems that ideally, we want github.event.pull_request.merge_commit_sha,
+          # but that it is not reliable, and can sometimes be a null value.  It
+          # appears that this is the most reasonable way to ensure that we are pulling
+          # the same code that triggered things, based upon this particular comment:
+          # https://github.com/actions/checkout/issues/518#issuecomment-1661941548
+          ref: "refs/pull/${{ github.event.number }}/merge"
+          fetch-depth: 2
+
+      - name: Sanity check
+        # Continuing from previous comment in checkout step above.
+        run: |
+          [[ "$(git rev-parse 'HEAD~2')" == "${{ github.event.pull_request.head.sha }}" ]]
 
       - name: Install package with dependencies
         uses: ./.github/actions/install-pkg
```
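For context, `refs/pull/<N>/merge` is the synthetic merge commit GitHub creates between the PR head and the base branch, so the sanity check above asserts that the checked-out code really corresponds to `github.event.pull_request.head.sha`. Below is a minimal Python sketch of the same assertion, not part of the commit, assuming it runs inside such a checkout with `fetch-depth: 2` and that the expected SHA is exported in a hypothetical `PR_HEAD_SHA` environment variable:

```python
# Sketch only: re-implements the workflow's sanity check in Python.
# Assumes the working directory is a checkout of refs/pull/<N>/merge
# fetched with depth >= 2, and that PR_HEAD_SHA (a hypothetical name)
# holds github.event.pull_request.head.sha.
import os
import subprocess

resolved = subprocess.run(
    ["git", "rev-parse", "HEAD~2"],
    capture_output=True,
    text=True,
    check=True,
).stdout.strip()

expected = os.environ["PR_HEAD_SHA"]
if resolved != expected:
    raise SystemExit(f"checked out {resolved}, but PR head is {expected}")
```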

.github/workflows/test.yml (1 addition & 1 deletion)

```diff
@@ -27,7 +27,7 @@ jobs:
         run: mypy
 
       - name: Test
-        run: pytest tests/unit --cov=earthaccess --cov=tests --cov-report=term-missing --capture=no --tb=native --log-cli-level=INFO
+        run: pytest tests/unit --verbose --cov=earthaccess --cov-report=term-missing --capture=no --tb=native --log-cli-level=INFO
 
       - name: Upload coverage
         # Don't upload coverage when using the `act` tool to run the workflow locally
```

earthaccess/api.py (20 additions & 20 deletions)

```diff
@@ -1,4 +1,5 @@
 import logging
+from pathlib import Path
 
 import requests
 import s3fs
@@ -202,9 +203,10 @@ def login(strategy: str = "all", persist: bool = False, system: System = PROD) -
 
 def download(
     granules: Union[DataGranule, List[DataGranule], str, List[str]],
-    local_path: Optional[str],
+    local_path: Optional[Union[Path, str]] = None,
     provider: Optional[str] = None,
     threads: int = 8,
+    *,
     pqdm_kwargs: Optional[Mapping[str, Any]] = None,
 ) -> List[str]:
     """Retrieves data granules from a remote storage system.
@@ -215,7 +217,11 @@ def download(
 
     Parameters:
         granules: a granule, list of granules, a granule link (HTTP), or a list of granule links (HTTP)
-        local_path: local directory to store the remote data granules
+        local_path: Local directory to store the remote data granules. If not
+            supplied, defaults to a subdirectory of the current working directory
+            of the form `data/YYYY-MM-DD-UUID`, where `YYYY-MM-DD` is the year,
+            month, and day of the current date, and `UUID` is the last 6 digits
+            of a UUID4 value.
         provider: if we download a list of URLs, we need to specify the provider.
         threads: parallel number of threads to use to download the files, adjust as necessary, default = 8
         pqdm_kwargs: Additional keyword arguments to pass to pqdm, a parallel processing library.
@@ -228,31 +234,29 @@ def download(
     Raises:
         Exception: A file download failed.
     """
-    provider = _normalize_location(provider)
-    pqdm_kwargs = {
-        "exception_behavior": "immediate",
-        "n_jobs": threads,
-        **(pqdm_kwargs or {}),
-    }
+    provider = _normalize_location(str(provider))
+
     if isinstance(granules, DataGranule):
         granules = [granules]
     elif isinstance(granules, str):
         granules = [granules]
+
     try:
-        results = earthaccess.__store__.get(
-            granules, local_path, provider, threads, pqdm_kwargs
+        return earthaccess.__store__.get(
+            granules, local_path, provider, threads, pqdm_kwargs=pqdm_kwargs
         )
     except AttributeError as err:
         logger.error(
             f"{err}: You must call earthaccess.login() before you can download data"
         )
-        return []
-    return results
+
+    return []
 
 
 def open(
     granules: Union[List[str], List[DataGranule]],
     provider: Optional[str] = None,
+    *,
     pqdm_kwargs: Optional[Mapping[str, Any]] = None,
 ) -> List[AbstractFileSystem]:
     """Returns a list of file-like objects that can be used to access files
@@ -269,15 +273,11 @@ def open(
     Returns:
         A list of "file pointers" to remote (i.e. s3 or https) files.
     """
-    provider = _normalize_location(provider)
-    pqdm_kwargs = {
-        "exception_behavior": "immediate",
-        **(pqdm_kwargs or {}),
-    }
-    results = earthaccess.__store__.open(
-        granules=granules, provider=provider, pqdm_kwargs=pqdm_kwargs
+    return earthaccess.__store__.open(
+        granules=granules,
+        provider=_normalize_location(provider),
+        pqdm_kwargs=pqdm_kwargs,
     )
-    return results
 
 
 def get_s3_credentials(
```
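Taken together, these `api.py` changes give `local_path` a generated default, accept `Path` objects, and make `pqdm_kwargs` keyword-only (the bare `*` in the signature). A usage sketch under the new signatures; the collection short name and the pqdm/tqdm option shown are illustrative, not part of this commit:

```python
import earthaccess

earthaccess.login()

# Any search result works here; ATL06 is only an example collection.
granules = earthaccess.search_data(short_name="ATL06", count=2)

# local_path omitted: files land in ./data/YYYY-MM-DD-xxxxxx,
# where xxxxxx is the first 6 hex digits of a UUID4 value.
# pqdm_kwargs must now be passed by keyword because of the bare `*`.
files = earthaccess.download(granules, threads=4, pqdm_kwargs={"disable": True})
```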

earthaccess/store.py (51 additions & 43 deletions)

```diff
@@ -63,22 +63,20 @@ def __repr__(self) -> str:
 def _open_files(
     url_mapping: Mapping[str, Union[DataGranule, None]],
     fs: fsspec.AbstractFileSystem,
-    threads: int = 8,
+    *,
     pqdm_kwargs: Optional[Mapping[str, Any]] = None,
 ) -> List[fsspec.spec.AbstractBufferedFile]:
     def multi_thread_open(data: tuple[str, Optional[DataGranule]]) -> EarthAccessFile:
         url, granule = data
         return EarthAccessFile(fs.open(url), granule)  # type: ignore
 
     pqdm_kwargs = {
-        "exception_behavior": "immediate",
+        "exception_behaviour": "immediate",
+        "n_jobs": 8,
         **(pqdm_kwargs or {}),
     }
 
-    fileset = pqdm(
-        url_mapping.items(), multi_thread_open, n_jobs=threads, **pqdm_kwargs
-    )
-    return fileset
+    return pqdm(url_mapping.items(), multi_thread_open, **pqdm_kwargs)
 
 
 def make_instance(
@@ -344,6 +342,7 @@ def open(
         self,
         granules: Union[List[str], List[DataGranule]],
         provider: Optional[str] = None,
+        *,
         pqdm_kwargs: Optional[Mapping[str, Any]] = None,
     ) -> List[fsspec.spec.AbstractBufferedFile]:
         """Returns a list of file-like objects that can be used to access files
@@ -361,14 +360,15 @@ def open(
             A list of "file pointers" to remote (i.e. s3 or https) files.
         """
         if len(granules):
-            return self._open(granules, provider, pqdm_kwargs)
+            return self._open(granules, provider, pqdm_kwargs=pqdm_kwargs)
         return []
 
     @singledispatchmethod
     def _open(
         self,
         granules: Union[List[str], List[DataGranule]],
         provider: Optional[str] = None,
+        *,
         pqdm_kwargs: Optional[Mapping[str, Any]] = None,
     ) -> List[Any]:
         raise NotImplementedError("granules should be a list of DataGranule or URLs")
@@ -378,7 +378,8 @@ def _open_granules(
         self,
         granules: List[DataGranule],
         provider: Optional[str] = None,
-        threads: int = 8,
+        *,
+        pqdm_kwargs: Optional[Mapping[str, Any]] = None,
     ) -> List[Any]:
         fileset: List = []
         total_size = round(sum([granule.size() for granule in granules]) / 1024, 2)
@@ -411,7 +412,7 @@ def _open_granules(
                         fileset = _open_files(
                             url_mapping,
                             fs=s3_fs,
-                            threads=threads,
+                            pqdm_kwargs=pqdm_kwargs,
                         )
                     except Exception as e:
                         raise RuntimeError(
@@ -420,19 +421,19 @@ def _open_granules(
                            f"Exception: {traceback.format_exc()}"
                         ) from e
                 else:
-                    fileset = self._open_urls_https(url_mapping, threads=threads)
-                    return fileset
+                    fileset = self._open_urls_https(url_mapping, pqdm_kwargs=pqdm_kwargs)
         else:
             url_mapping = _get_url_granule_mapping(granules, access="on_prem")
-            fileset = self._open_urls_https(url_mapping, threads=threads)
-            return fileset
+            fileset = self._open_urls_https(url_mapping, pqdm_kwargs=pqdm_kwargs)
+
+        return fileset
 
     @_open.register
     def _open_urls(
         self,
         granules: List[str],
         provider: Optional[str] = None,
-        threads: int = 8,
+        *,
         pqdm_kwargs: Optional[Mapping[str, Any]] = None,
     ) -> List[Any]:
         fileset: List = []
@@ -460,7 +461,6 @@ def _open_urls(
                         fileset = _open_files(
                             url_mapping,
                             fs=s3_fs,
-                            threads=threads,
                             pqdm_kwargs=pqdm_kwargs,
                         )
                     except Exception as e:
@@ -481,15 +481,16 @@ def _open_urls(
             raise ValueError(
                 "We cannot open S3 links when we are not in-region, try using HTTPS links"
             )
-        fileset = self._open_urls_https(url_mapping, threads, pqdm_kwargs)
+        fileset = self._open_urls_https(url_mapping, pqdm_kwargs=pqdm_kwargs)
         return fileset
 
     def get(
         self,
         granules: Union[List[DataGranule], List[str]],
-        local_path: Union[Path, str, None] = None,
+        local_path: Optional[Union[Path, str]] = None,
         provider: Optional[str] = None,
         threads: int = 8,
+        *,
        pqdm_kwargs: Optional[Mapping[str, Any]] = None,
     ) -> List[str]:
         """Retrieves data granules from a remote storage system.
@@ -503,7 +504,11 @@ def get(
 
         Parameters:
             granules: A list of granules(DataGranule) instances or a list of granule links (HTTP).
-            local_path: Local directory to store the remote data granules.
+            local_path: Local directory to store the remote data granules. If not
+                supplied, defaults to a subdirectory of the current working directory
+                of the form `data/YYYY-MM-DD-UUID`, where `YYYY-MM-DD` is the year,
+                month, and day of the current date, and `UUID` is the last 6 digits
+                of a UUID4 value.
             provider: a valid cloud provider, each DAAC has a provider code for their cloud distributions
             threads: Parallel number of threads to use to download the files;
                 adjust as necessary, default = 8.
@@ -514,26 +519,28 @@ def get(
         Returns:
             List of downloaded files
         """
+        if not granules:
+            raise ValueError("List of URLs or DataGranule instances expected")
+
         if local_path is None:
-            today = datetime.datetime.today().strftime("%Y-%m-%d")
+            today = datetime.datetime.now().strftime("%Y-%m-%d")
             uuid = uuid4().hex[:6]
             local_path = Path.cwd() / "data" / f"{today}-{uuid}"
-        elif isinstance(local_path, str):
-            local_path = Path(local_path)
 
-        if len(granules):
-            files = self._get(granules, local_path, provider, threads, pqdm_kwargs)
-            return files
-        else:
-            raise ValueError("List of URLs or DataGranule instances expected")
+        pqdm_kwargs = {
+            "n_jobs": threads,
+            **(pqdm_kwargs or {}),
+        }
+
+        return self._get(granules, Path(local_path), provider, pqdm_kwargs=pqdm_kwargs)
 
     @singledispatchmethod
     def _get(
         self,
         granules: Union[List[DataGranule], List[str]],
         local_path: Path,
         provider: Optional[str] = None,
-        threads: int = 8,
+        *,
         pqdm_kwargs: Optional[Mapping[str, Any]] = None,
     ) -> List[str]:
         """Retrieves data granules from a remote storage system.
@@ -566,7 +573,7 @@ def _get_urls(
         granules: List[str],
         local_path: Path,
         provider: Optional[str] = None,
-        threads: int = 8,
+        *,
         pqdm_kwargs: Optional[Mapping[str, Any]] = None,
     ) -> List[str]:
         data_links = granules
@@ -590,7 +597,7 @@ def _get_urls(
         else:
             # if we are not in AWS
             return self._download_onprem_granules(
-                data_links, local_path, threads, pqdm_kwargs
+                data_links, local_path, pqdm_kwargs=pqdm_kwargs
             )
 
     @_get.register
@@ -599,7 +606,7 @@ def _get_granules(
         granules: List[DataGranule],
         local_path: Path,
         provider: Optional[str] = None,
-        threads: int = 8,
+        *,
         pqdm_kwargs: Optional[Mapping[str, Any]] = None,
     ) -> List[str]:
         data_links: List = []
@@ -615,7 +622,7 @@ def _get_granules(
                 for granule in granules
             )
         )
-        total_size = round(sum([granule.size() for granule in granules]) / 1024, 2)
+        total_size = round(sum(granule.size() for granule in granules) / 1024, 2)
         logger.info(
             f" Getting {len(granules)} granules, approx download size: {total_size} GB"
         )
@@ -642,7 +649,7 @@ def _get_granules(
         # if the data are cloud-based, but we are not in AWS,
         # it will be downloaded as if it was on prem
         return self._download_onprem_granules(
-            data_links, local_path, threads, pqdm_kwargs
+            data_links, local_path, pqdm_kwargs=pqdm_kwargs
         )
 
     def _download_file(self, url: str, directory: Path) -> str:
@@ -684,7 +691,7 @@ def _download_onprem_granules(
         self,
         urls: List[str],
         directory: Path,
-        threads: int = 8,
+        *,
         pqdm_kwargs: Optional[Mapping[str, Any]] = None,
     ) -> List[Any]:
         """Downloads a list of URLS into the data directory.
@@ -711,25 +718,26 @@ def _download_onprem_granules(
 
         arguments = [(url, directory) for url in urls]
 
-        results = pqdm(
-            arguments,
-            self._download_file,
-            n_jobs=threads,
-            argument_type="args",
-            **pqdm_kwargs,
-        )
-        return results
+        pqdm_kwargs = {
+            "exception_behaviour": "immediate",
+            **(pqdm_kwargs or {}),
+            # We don't want a user to be able to override the following kwargs,
+            # which is why they appear *after* spreading pqdm_kwargs above.
+            "argument_type": "args",
+        }
+
+        return pqdm(arguments, self._download_file, **pqdm_kwargs)
 
     def _open_urls_https(
         self,
         url_mapping: Mapping[str, Union[DataGranule, None]],
-        threads: int = 8,
+        *,
         pqdm_kwargs: Optional[Mapping[str, Any]] = None,
     ) -> List[fsspec.AbstractFileSystem]:
         https_fs = self.get_fsspec_session()
 
         try:
-            return _open_files(url_mapping, https_fs, threads, pqdm_kwargs)
+            return _open_files(url_mapping, https_fs, pqdm_kwargs=pqdm_kwargs)
         except Exception:
             logger.exception(
                 "An exception occurred while trying to access remote files via HTTPS"
```

scripts/integration-test.sh (1 addition & 1 deletion)

```diff
@@ -1,7 +1,7 @@
 #!/usr/bin/env bash
 
 set -x
-pytest tests/integration --cov=earthaccess --cov=tests/integration --cov-report=term-missing "${@}" --capture=no --tb=native --log-cli-level=INFO
+pytest tests/integration --cov=earthaccess --cov-report=term-missing "${@}" --capture=no --tb=native --log-cli-level=INFO
 RET=$?
 set +x
```
