Fix #4886 look for a tarball first fallback to individual files #4904

Merged
merged 17 commits on Feb 18, 2021
2 changes: 2 additions & 0 deletions changelog/4904.bugfix.rst
@@ -0,0 +1,2 @@
Fixed the `~.SRSClient` which silently failed to download SRS files when the yearly tarball did not exist (e.g. for the current year).
The client now searches the FTP archive for tarballs and SRS files before returning them as results.
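
As an illustration of the fixed behavior, a minimal Fido query is sketched below; the dates are arbitrary examples chosen so that no yearly tarball exists yet and the client must fall back to the individual daily SRS files.

# Minimal sketch; dates are arbitrary and assume the SWPC FTP archive is reachable.
from sunpy.net import Fido, attrs as a

result = Fido.search(a.Time("2021-01-01", "2021-01-02"), a.Instrument.srs_table)
files = Fido.fetch(result)  # e.g. 20210101SRS.txt and 20210102SRS.txt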
4 changes: 3 additions & 1 deletion sunpy/net/dataretriever/sources/gong.py
@@ -18,12 +18,14 @@ class GONGClient(GenericClient):
<sunpy.net.fido_factory.UnifiedResponse object at ...>
Results from 1 Provider:
<BLANKLINE>
3 Results from the GONGClient:
5 Results from the GONGClient:
Start Time End Time Instrument ... Source ExtentType
----------------------- ----------------------- ---------- ... ------ ----------
2019-12-31 22:14:00.000 2019-12-31 22:14:59.999 GONG ... NSO SYNOPTIC
2019-12-31 23:04:00.000 2019-12-31 23:04:59.999 GONG ... NSO SYNOPTIC
2019-12-31 23:54:00.000 2019-12-31 23:54:59.999 GONG ... NSO SYNOPTIC
2020-01-01 00:14:00.000 2020-01-01 00:14:59.999 GONG ... NSO SYNOPTIC
2020-01-01 01:14:00.000 2020-01-01 01:14:59.999 GONG ... NSO SYNOPTIC
<BLANKLINE>
<BLANKLINE>

144 changes: 77 additions & 67 deletions sunpy/net/dataretriever/sources/noaa.py
@@ -3,16 +3,17 @@
# Google Summer of Code 2014
import pathlib
import tarfile
from datetime import datetime
from collections import OrderedDict

from astropy import units as u
from astropy.time import Time, TimeDelta
from astropy.time import Time

from sunpy.extern.parse import parse
from sunpy import log
from sunpy.net import attrs as a
from sunpy.net.dataretriever import GenericClient, QueryResponse
from sunpy.time import TimeRange
from sunpy.util.parfive_helpers import Downloader
from sunpy.util.scraper import Scraper

__all__ = ['NOAAIndicesClient', 'NOAAPredictClient', 'SRSClient']

@@ -150,59 +151,83 @@ class SRSClient(GenericClient):
2 Results from the SRSClient:
Start Time End Time Instrument ... Source Provider
----------------------- ----------------------- ---------- ... ------ --------
2016-01-01 00:00:00.000 2016-01-02 00:00:00.000 SOON ... SWPC NOAA
2016-01-01 00:00:00.000 2016-01-02 00:00:00.000 SOON ... SWPC NOAA
2016-01-01 00:00:00.000 2016-01-01 23:59:59.999 SOON ... SWPC NOAA
2016-01-02 00:00:00.000 2016-01-02 23:59:59.999 SOON ... SWPC NOAA
<BLANKLINE>
<BLANKLINE>

"""
BASE_URL = 'ftp://ftp.swpc.noaa.gov/pub/warehouse/'
MIN_YEAR = 1996

def _get_url_for_timerange(self, timerange):
"""
Returns a list of urls corresponding to a
given time-range.
Returns a list of urls corresponding to a given time-range.
"""
result = list()
base_url = 'ftp://ftp.swpc.noaa.gov/pub/warehouse/'
total_days = int(timerange.days.value) + 1
all_dates = timerange.split(total_days)
today_year = int(Time.now().strftime('%Y'))
for day in all_dates:
end_year = int(day.end.strftime('%Y'))
if end_year > today_year or end_year < 1996:
continue
elif end_year == today_year:
suffix = '{}/SRS/{}SRS.txt'.format(
end_year, day.end.strftime('%Y%m%d'))
else:
suffix = '{}/{}_SRS.tar.gz'.format(
end_year, day.end.strftime('%Y'))
url = base_url + suffix
result.append(url)
return result
# Validate the time range; SRS files have been produced daily since 1996.
cur_year = Time.now().datetime.year
req_start_year = timerange.start.datetime.year
req_end_year = timerange.end.datetime.year

# Return early if the requested range lies entirely before MIN_YEAR or entirely after the current year
if req_start_year <= req_end_year < self.MIN_YEAR \
or req_end_year >= req_start_year > cur_year:
return result

# No point searching below the minimum or above the maximum supported year
start_year = max(req_start_year, self.MIN_YEAR)
end_year = min(req_end_year, cur_year)

# Search for tarballs for all years in the query
tarball_timerange = TimeRange(f'{start_year}-01-01', f'{end_year}-12-31 23:59:59.999')
tarball_urls = dict()
tarball_scraper = Scraper(self.BASE_URL + '%Y/%Y_SRS.tar.gz')
tarballs = tarball_scraper.filelist(tarball_timerange)
max_tarball_year = None
for tb_url in tarballs:
date = tarball_scraper._extractDateURL(tb_url)
year = date.to_datetime().year
max_tarball_year = year
tarball_urls[year] = tb_url
log.debug('SRS tarball found for year %d', year)

# Build a time range for the period not covered by tarballs (a tarball is assumed to
# cover a full year) and look for individual SRS files over that period.
srs_urls = dict()
min_file_year = max_tarball_year if max_tarball_year else start_year
min_file_date = (datetime(max_tarball_year, 12, 31, 23, 59, 59) if max_tarball_year else
datetime(start_year, 1, 1, 0, 0, 0))
max_file_date = min(timerange.end.datetime, Time.now().datetime)
if min_file_date < max_file_date:
file_timerange = TimeRange(f'{min_file_year}-01-01', max_file_date)
srsfile_scraper = Scraper(self.BASE_URL + '%Y/SRS/%Y%m%dSRS.txt')
srsfiles = srsfile_scraper.filelist(file_timerange)
for srs_url in srsfiles:
date = srsfile_scraper._extractDateURL(srs_url)
srs_urls[(date.datetime.year, date.datetime.month, date.datetime.day)] = srs_url
log.debug('SRS file found for date %s', date)

# Iterate over all days; if a day falls in a year covered by a tarball, or has an
# individual SRS file, add it to the result with the corresponding extdict.
for day in timerange.get_dates():
day_ymd = (int(day.strftime('%Y')), int(day.strftime('%m')), int(day.strftime('%d')))
extdict = {'year': day_ymd[0], 'month': day_ymd[1], 'day': day_ymd[2]}
if self.MIN_YEAR <= day_ymd[0] <= cur_year:
if day_ymd[0] in tarball_urls.keys():
result.append((extdict, tarball_urls[day_ymd[0]]))
elif day_ymd in srs_urls.keys():
result.append((extdict, srs_urls[day_ymd]))

return result

def post_search_hook(self, exdict, matchdict):
# Update the extracted metadata to use the queried times rather than those
# scraped from the downloaded tarball (which covers a full year of data).
rowdict = super().post_search_hook(exdict, matchdict)
rowdict["Start Time"] = matchdict["Start Time"]
rowdict["End Time"] = matchdict["End Time"]
rowdict["Start Time"].format = 'iso'
rowdict["End Time"].format = 'iso'
return rowdict

def search(self, *args, **kwargs):
extractor1 = '{}/warehouse/{:4d}/SRS/{year:4d}{month:2d}{day:2d}SRS.txt'
extractor2 = '{}/warehouse/{year:4d}/{}'
matchdict = self._get_match_dict(*args, **kwargs)
timerange = TimeRange(matchdict['Start Time'], matchdict['End Time'])
metalist = []
for url in self._get_url_for_timerange(timerange):
exdict1 = parse(extractor1, url)
exdict2 = parse(extractor2, url)
exdict = (exdict2 if exdict1 is None else exdict1).named
exdict['url'] = url
rowdict = self.post_search_hook(exdict, matchdict)
for extdict, url in self._get_url_for_timerange(timerange):
extdict['url'] = url
rowdict = self.post_search_hook(extdict, matchdict)
metalist.append(rowdict)
return QueryResponse(metalist, client=self)
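
In outline, the lookup implemented above is a tarball-first search with a per-file fallback. The sketch below is a simplified illustration of that pattern (it omits the year clamping and the per-day extdict bookkeeping of the real method, and the dates are arbitrary):

# Simplified sketch of the tarball-first lookup; not a drop-in replacement for the method above.
from sunpy.time import TimeRange
from sunpy.util.scraper import Scraper

BASE_URL = 'ftp://ftp.swpc.noaa.gov/pub/warehouse/'
timerange = TimeRange('2020-01-01', '2020-01-02')

# First pass: look for yearly tarballs covering the query.
tarballs = Scraper(BASE_URL + '%Y/%Y_SRS.tar.gz').filelist(timerange)
if not tarballs:
    # Second pass: fall back to the individual daily SRS files.
    srs_files = Scraper(BASE_URL + '%Y/SRS/%Y%m%dSRS.txt').filelist(timerange)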

@@ -219,20 +244,14 @@ def fetch(self, qres, path=None, error_callback=None, **kwargs):
-------
Results Object
"""

urls = [qrblock['url'] for qrblock in qres]

filenames = []
local_filenames = []

for i, [url, qre] in enumerate(zip(urls, qres)):
for url, qre in zip(urls, qres):
name = url.split('/')[-1]

day = Time(qre['Start Time'].strftime('%Y-%m-%d')) + TimeDelta(i*u.day)

day = qre['Start Time']
if name not in filenames:
filenames.append(name)

if name.endswith('.gz'):
local_filenames.append('{}SRS.txt'.format(day.strftime('%Y%m%d')))
else:
@@ -250,37 +269,28 @@ def fetch(self, qres, path=None, error_callback=None, **kwargs):
# OrderedDict is required to maintain ordering because it will be zipped with paths later
urls = list(OrderedDict.fromkeys(urls))

dobj = Downloader(max_conn=5)

downloader = Downloader(max_conn=2)
for aurl, fname in zip(urls, paths):
dobj.enqueue_file(aurl, filename=fname)
downloader.enqueue_file(aurl, filename=fname)

paths = dobj.download()
paths = downloader.download()

outfiles = []
for fname, srs_filename in zip(local_paths, local_filenames):

name = fname.name

past_year = False
for i, fname2 in enumerate(paths):
for fname2 in paths:
fname2 = pathlib.Path(fname2)

if fname2.name.endswith('.txt'):
continue

year = fname2.name.split('_SRS')[0]

if year in name:
TarFile = tarfile.open(fname2)
filepath = fname.parent
member = TarFile.getmember('SRS/' + srs_filename)
member.name = name
TarFile.extract(member, path=filepath)
TarFile.close()

with tarfile.open(fname2) as open_tar:
filepath = fname.parent
member = open_tar.getmember('SRS/' + srs_filename)
member.name = name
open_tar.extract(member, path=filepath)
outfiles.append(fname)

past_year = True
break
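
For reference, the member extraction above reduces to standard tarfile usage; in this sketch the archive name and member name are example values only.

# Minimal sketch of pulling one day's report out of a yearly tarball, mirroring fetch() above.
import tarfile

with tarfile.open("2015_SRS.tar.gz") as open_tar:
    member = open_tar.getmember("SRS/20150101SRS.txt")
    member.name = "20150101SRS.txt"  # drop the leading 'SRS/' directory
    open_tar.extract(member, path=".")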

31 changes: 28 additions & 3 deletions sunpy/net/dataretriever/sources/tests/test_noaa.py
@@ -137,13 +137,13 @@ def test_fetch(mock_wait, mock_search, mock_enqueue, tmp_path, indices_client):
path / "observed-solar-cycle-indices.json"))


@no_vso
@mock.patch('sunpy.net.dataretriever.sources.noaa.NOAAIndicesClient.search',
return_value=mock_query_object('2012/10/4', '2012/10/6'))
# The return value of download is irrelevant
@mock.patch('parfive.Downloader.download',
return_value=None)
@mock.patch('parfive.Downloader.enqueue_file')
@no_vso
def test_fido(mock_wait, mock_search, mock_enqueue, tmp_path, indices_client):
path = tmp_path / "sub"
path.mkdir()
@@ -159,6 +159,7 @@ def test_fido(mock_wait, mock_search, mock_enqueue, tmp_path, indices_client):
path / "observed-solar-cycle-indices.json"))


@no_vso
@pytest.mark.remote_data
def test_srs_tar_unpack():
qr = Fido.search(a.Instrument("soon") & a.Time("2015/01/01", "2015/01/01T23:59:29"))
@@ -167,6 +168,7 @@ def test_srs_tar_unpack():
assert res.data[0].endswith("20150101SRS.txt")


@no_vso
@pytest.mark.remote_data
def test_srs_tar_unpack_midyear():
qr = Fido.search(a.Instrument("soon") & a.Time("2011/06/07", "2011/06/08T23:59:29"))
@@ -176,15 +178,27 @@ def test_srs_tar_unpack_midyear():
assert res.data[-1].endswith("20110608SRS.txt")


@no_vso
@pytest.mark.remote_data
@mock.patch('ftplib.FTP.nlst', side_effect=[[''], ['20200101SRS.txt', '20200102SRS.txt']])
def test_srs_missing_tarball(mock_ftp_nlst):
qr = Fido.search(a.Time('2020-01-01', '2020-01-02'), a.Instrument.srs_table)
urls = [qrblock['url'] for qrblock in qr[0]]
assert urls[0].endswith('20200101SRS.txt')
assert urls[1].endswith('20200102SRS.txt')


@no_vso
@pytest.mark.remote_data
def test_srs_current_year():
year = datetime.date.today().year
qr = Fido.search(a.Instrument("soon") & a.Time(f"{year}/01/01", f"{year}/01/01T23:59:29"))
qr = Fido.search(a.Instrument("soon") & a.Time(f"{year}/02/01", f"{year}/02/01T23:59:29"))
res = Fido.fetch(qr)
assert len(res) == 1
assert res.data[0].endswith(f"{year}0101SRS.txt")
assert res.data[0].endswith(f"{year}0201SRS.txt")


@no_vso
@pytest.mark.remote_data
def test_srs_save_path(tmpdir):
qr = Fido.search(a.Instrument.srs_table, a.Time("2016/10/01", "2016/10/02"))
@@ -194,6 +208,7 @@ def test_srs_save_path(tmpdir):
assert files[1].endswith("20161002SRS.txt")


@pytest.mark.remote_data
@pytest.mark.filterwarnings('ignore:ERFA function')
def test_srs_out_of_range(srs_client):
res = srs_client.search(a.Time('1995/01/01', '1995/02/01'))
@@ -202,6 +217,16 @@ def test_srs_out_of_range(srs_client):
assert len(res) == 0


@pytest.mark.remote_data
@pytest.mark.filterwarnings('ignore:ERFA function')
def test_srs_start_or_end_out_of_range(srs_client):
res = srs_client.search(a.Time('1995/12/30', '1996/01/02'))
assert len(res) == 1
cur_year = datetime.date.today().year
res = srs_client.search(a.Time(f'{cur_year}/01/01', f'{cur_year+2}/01/01'))
assert len(res) > 0


def test_no_time(predict_client, indices_client):
res = indices_client.search(a.Instrument.noaa_indices)
assert len(res) == 1
4 changes: 2 additions & 2 deletions sunpy/net/tests/strategies.py
@@ -54,7 +54,7 @@ def offline_instruments():
Returns a strategy for any instrument that does not need the internet to do
a query.
"""
offline_instr = ['noaa-indices', 'noaa-predict', 'soon']
offline_instr = ['noaa-indices', 'noaa-predict']
offline_instr = st.builds(a.Instrument, st.sampled_from(offline_instr))

return st.one_of(offline_instr)
Expand All @@ -65,7 +65,7 @@ def online_instruments():
Returns a strategy for any instrument that does need the internet to do
a query.
"""
online_instr = ['lyra', 'goes', 'eve', 'rhessi', 'norh']
online_instr = ['lyra', 'goes', 'eve', 'rhessi', 'norh', 'soon']
online_instr = st.builds(a.Instrument, st.sampled_from(online_instr))

return online_instr
4 changes: 2 additions & 2 deletions sunpy/net/tests/test_fido.py
@@ -40,8 +40,6 @@ def offline_query(draw, instrument=offline_instruments()):
# If we have AttrAnd then we don't have GOES
if isinstance(query, a.Instrument) and query.value == 'goes':
query &= draw(goes_time())
elif isinstance(query, a.Instrument) and query.value == 'soon':
query &= draw(srs_time())
else:
query = attr.and_(query, draw(time_attr()))
return query
@@ -55,6 +53,8 @@ def online_query(draw, instrument=online_instruments()):
query &= a.Level.zero
if isinstance(query, a.Instrument) and query.value == 'norh':
query &= a.Wavelength(17*u.GHz)
if isinstance(query, a.Instrument) and query.value == 'soon':
query &= draw(srs_time())

return query
