Fix #4886 look for a tarball first fallback to individual files (#4904)
* Fix #4886 look for a tarball first fallback to individual files

* Update scraper to use relativedelta not units

* Add changelog

* Fix scraper to pass tests

* Review updates and more test fixes

* More scraper range fixes

* Changed test and updated changelog

* Rework date logic and update gong doc test.

* Update sunpy/net/dataretriever/sources/noaa.py

Co-authored-by: Nabil Freij <nabil.freij@gmail.com>

* Apply suggestions from code review

Co-authored-by: Stuart Mumford <stuart@cadair.com>

Co-authored-by: Nabil Freij <nabil.freij@gmail.com>
Co-authored-by: Stuart Mumford <stuart@cadair.com>
3 people committed Feb 18, 2021
1 parent 6869bbf commit 6685160
Showing 8 changed files with 174 additions and 99 deletions.
2 changes: 2 additions & 0 deletions changelog/4904.bugfix.rst
@@ -0,0 +1,2 @@
Fixed the `~.SRSClient`, which silently failed to download SRS files when the tarball for a previous year did not exist.
The client now searches for the tarballs and individual SRS files on the FTP archive before returning them as results.
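In outline, the fix swaps "construct a URL and hope it exists" for "list what actually exists, preferring tarballs". A minimal sketch of that strategy, with a hypothetical `list_remote` callable standing in for the real FTP scraping:

```python
def resolve_srs_urls(days, list_remote):
    # `list_remote` is a hypothetical callable that returns the URLs matching a
    # pattern which actually exist on the server (the real client uses
    # sunpy.util.scraper.Scraper against ftp://ftp.swpc.noaa.gov/pub/warehouse/).
    urls = {}
    tarballs = {}
    for day in days:
        year = day.year
        if year not in tarballs:
            # Prefer the yearly tarball when the archive has one...
            hits = list_remote(f'{year}/{year}_SRS.tar.gz')
            tarballs[year] = hits[0] if hits else None
        if tarballs[year] is not None:
            urls[day] = tarballs[year]
        else:
            # ...and fall back to the individual daily SRS file otherwise.
            hits = list_remote(f'{year}/SRS/{day:%Y%m%d}SRS.txt')
            if hits:
                urls[day] = hits[0]
    return urls
```

Days with neither a tarball nor a daily file simply drop out of the results, which is what fixes the silent failure described in the changelog.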
4 changes: 3 additions & 1 deletion sunpy/net/dataretriever/sources/gong.py
@@ -18,12 +18,14 @@ class GONGClient(GenericClient):
<sunpy.net.fido_factory.UnifiedResponse object at ...>
Results from 1 Provider:
<BLANKLINE>
3 Results from the GONGClient:
5 Results from the GONGClient:
Start Time End Time Instrument ... Source ExtentType
----------------------- ----------------------- ---------- ... ------ ----------
2019-12-31 22:14:00.000 2019-12-31 22:14:59.999 GONG ... NSO SYNOPTIC
2019-12-31 23:04:00.000 2019-12-31 23:04:59.999 GONG ... NSO SYNOPTIC
2019-12-31 23:54:00.000 2019-12-31 23:54:59.999 GONG ... NSO SYNOPTIC
2020-01-01 00:14:00.000 2020-01-01 00:14:59.999 GONG ... NSO SYNOPTIC
2020-01-01 01:14:00.000 2020-01-01 01:14:59.999 GONG ... NSO SYNOPTIC
<BLANKLINE>
<BLANKLINE>
144 changes: 77 additions & 67 deletions sunpy/net/dataretriever/sources/noaa.py
@@ -3,16 +3,17 @@
# Google Summer of Code 2014
import pathlib
import tarfile
from datetime import datetime
from collections import OrderedDict

from astropy import units as u
from astropy.time import Time, TimeDelta
from astropy.time import Time

from sunpy.extern.parse import parse
from sunpy import log
from sunpy.net import attrs as a
from sunpy.net.dataretriever import GenericClient, QueryResponse
from sunpy.time import TimeRange
from sunpy.util.parfive_helpers import Downloader
from sunpy.util.scraper import Scraper

__all__ = ['NOAAIndicesClient', 'NOAAPredictClient', 'SRSClient']

@@ -150,59 +151,83 @@ class SRSClient(GenericClient):
2 Results from the SRSClient:
Start Time End Time Instrument ... Source Provider
----------------------- ----------------------- ---------- ... ------ --------
2016-01-01 00:00:00.000 2016-01-02 00:00:00.000 SOON ... SWPC NOAA
2016-01-01 00:00:00.000 2016-01-02 00:00:00.000 SOON ... SWPC NOAA
2016-01-01 00:00:00.000 2016-01-01 23:59:59.999 SOON ... SWPC NOAA
2016-01-02 00:00:00.000 2016-01-02 23:59:59.999 SOON ... SWPC NOAA
<BLANKLINE>
<BLANKLINE>
"""
BASE_URL = 'ftp://ftp.swpc.noaa.gov/pub/warehouse/'
MIN_YEAR = 1996

def _get_url_for_timerange(self, timerange):
"""
Returns a list of urls corresponding to a
given time-range.
Returns a list of (extdict, url) pairs corresponding to a given time range.
"""
result = list()
base_url = 'ftp://ftp.swpc.noaa.gov/pub/warehouse/'
total_days = int(timerange.days.value) + 1
all_dates = timerange.split(total_days)
today_year = int(Time.now().strftime('%Y'))
for day in all_dates:
end_year = int(day.end.strftime('%Y'))
if end_year > today_year or end_year < 1996:
continue
elif end_year == today_year:
suffix = '{}/SRS/{}SRS.txt'.format(
end_year, day.end.strftime('%Y%m%d'))
else:
suffix = '{}/{}_SRS.tar.gz'.format(
end_year, day.end.strftime('%Y'))
url = base_url + suffix
result.append(url)
return result
# Validate the time range; SRS files have been generated daily since 1996
cur_year = Time.now().datetime.year
req_start_year = timerange.start.datetime.year
req_end_year = timerange.end.datetime.year

# Return early if the requested range ends before the minimum year or starts after the current year
if req_start_year <= req_end_year < self.MIN_YEAR \
or req_end_year >= req_start_year > cur_year:
return result

# No point searching below the minimum or above the maximum valid year
start_year = max(req_start_year, self.MIN_YEAR)
end_year = min(req_end_year, cur_year)

# Search for tarballs for all years in the query
tarball_timerange = TimeRange(f'{start_year}-01-01', f'{end_year}-12-31 23:59:59.999')
tarball_urls = dict()
tarball_scraper = Scraper(self.BASE_URL + '%Y/%Y_SRS.tar.gz')
tarballs = tarball_scraper.filelist(tarball_timerange)
max_tarball_year = None
for tb_url in tarballs:
date = tarball_scraper._extractDateURL(tb_url)
year = date.to_datetime().year
max_tarball_year = year
tarball_urls[year] = tb_url
log.debug('SRS tarball found for year %d', year)

# Create a new time range for the times not covered by tarballs (each tarball is
# assumed to cover a full year) and look for individual SRS files over this range.
srs_urls = dict()
min_file_year = max_tarball_year if max_tarball_year else start_year
min_file_date = (datetime(max_tarball_year, 12, 31, 23, 59, 59) if max_tarball_year else
datetime(start_year, 1, 1, 0, 0, 0))
max_file_date = min(timerange.end.datetime, Time.now().datetime)
if min_file_date < max_file_date:
file_timerange = TimeRange(f'{min_file_year}-01-01', max_file_date)
srsfile_scraper = Scraper(self.BASE_URL + '%Y/SRS/%Y%m%dSRS.txt')
srsfiles = srsfile_scraper.filelist(file_timerange)
for srs_url in srsfiles:
date = srsfile_scraper._extractDateURL(srs_url)
srs_urls[(date.datetime.year, date.datetime.month, date.datetime.day)] = srs_url
log.debug('SRS file found for date %s', date)

# Now iterate over all days; if the day falls in a year we have a tarball for, or there
# is an individual SRS file for that day, add it to the results with the corresponding extdict
for day in timerange.get_dates():
day_ymd = (int(day.strftime('%Y')), int(day.strftime('%m')), int(day.strftime('%d')))
extdict = {'year': day_ymd[0], 'month': day_ymd[1], 'day': day_ymd[2]}
if self.MIN_YEAR <= day_ymd[0] <= cur_year:
if day_ymd[0] in tarball_urls:
result.append((extdict, tarball_urls[day_ymd[0]]))
elif day_ymd in srs_urls:
result.append((extdict, srs_urls[day_ymd]))

def post_search_hook(self, exdict, matchdict):
# Update the extracted metadata to include the queried times rather than
# those scraped from the downloaded tarball (which covers a full year).
rowdict = super().post_search_hook(exdict, matchdict)
rowdict["Start Time"] = matchdict["Start Time"]
rowdict["End Time"] = matchdict["End Time"]
rowdict["Start Time"].format = 'iso'
rowdict["End Time"].format = 'iso'
return rowdict
return result

def search(self, *args, **kwargs):
extractor1 = '{}/warehouse/{:4d}/SRS/{year:4d}{month:2d}{day:2d}SRS.txt'
extractor2 = '{}/warehouse/{year:4d}/{}'
matchdict = self._get_match_dict(*args, **kwargs)
timerange = TimeRange(matchdict['Start Time'], matchdict['End Time'])
metalist = []
for url in self._get_url_for_timerange(timerange):
exdict1 = parse(extractor1, url)
exdict2 = parse(extractor2, url)
exdict = (exdict2 if exdict1 is None else exdict1).named
exdict['url'] = url
rowdict = self.post_search_hook(exdict, matchdict)
for extdict, url in self._get_url_for_timerange(timerange):
extdict['url'] = url
rowdict = self.post_search_hook(extdict, matchdict)
metalist.append(rowdict)
return QueryResponse(metalist, client=self)
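
For context, a usage sketch of the resulting search behaviour (mirroring the class docstring above; the actual rows depend on what the SWPC archive currently serves):

```python
from sunpy.net import Fido, attrs as a

# Days in an archived year resolve to that year's tarball URL, while days in
# the current year resolve to individual daily SRS.txt files.
results = Fido.search(a.Time("2016/1/1", "2016/1/2"), a.Instrument.soon)
print(results)
```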

@@ -219,20 +244,14 @@ def fetch(self, qres, path=None, error_callback=None, **kwargs):
-------
Results Object
"""

urls = [qrblock['url'] for qrblock in qres]

filenames = []
local_filenames = []

for i, [url, qre] in enumerate(zip(urls, qres)):
for url, qre in zip(urls, qres):
name = url.split('/')[-1]

day = Time(qre['Start Time'].strftime('%Y-%m-%d')) + TimeDelta(i*u.day)

day = qre['Start Time']
if name not in filenames:
filenames.append(name)

if name.endswith('.gz'):
local_filenames.append('{}SRS.txt'.format(day.strftime('%Y%m%d')))
else:
@@ -250,37 +269,28 @@ def fetch(self, qres, path=None, error_callback=None, **kwargs):
# OrderedDict is required to maintain ordering because it will be zipped with paths later
urls = list(OrderedDict.fromkeys(urls))

dobj = Downloader(max_conn=5)

downloader = Downloader(max_conn=2)
for aurl, fname in zip(urls, paths):
dobj.enqueue_file(aurl, filename=fname)
downloader.enqueue_file(aurl, filename=fname)

paths = dobj.download()
paths = downloader.download()

outfiles = []
for fname, srs_filename in zip(local_paths, local_filenames):

name = fname.name

past_year = False
for i, fname2 in enumerate(paths):
for fname2 in paths:
fname2 = pathlib.Path(fname2)

if fname2.name.endswith('.txt'):
continue

year = fname2.name.split('_SRS')[0]

if year in name:
TarFile = tarfile.open(fname2)
filepath = fname.parent
member = TarFile.getmember('SRS/' + srs_filename)
member.name = name
TarFile.extract(member, path=filepath)
TarFile.close()

with tarfile.open(fname2) as open_tar:
filepath = fname.parent
member = open_tar.getmember('SRS/' + srs_filename)
member.name = name
open_tar.extract(member, path=filepath)
outfiles.append(fname)

past_year = True
break

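The URL discovery in `_get_url_for_timerange` above leans on `Scraper`: a URL template with strftime-style placeholders plus `filelist`, which returns only the URLs that actually exist within a time range. A standalone sketch of the pattern (the template is the one from the diff; the time range is illustrative):

```python
from sunpy.time import TimeRange
from sunpy.util.scraper import Scraper

# List the yearly tarballs that actually exist on the SWPC FTP archive; any
# year without a hit becomes a candidate for the per-file fallback.
tarball_scraper = Scraper('ftp://ftp.swpc.noaa.gov/pub/warehouse/%Y/%Y_SRS.tar.gz')
print(tarball_scraper.filelist(TimeRange('2015-01-01', '2016-12-31')))
```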
31 changes: 28 additions & 3 deletions sunpy/net/dataretriever/sources/tests/test_noaa.py
@@ -137,13 +137,13 @@ def test_fetch(mock_wait, mock_search, mock_enqueue, tmp_path, indices_client):
path / "observed-solar-cycle-indices.json"))


@no_vso
@mock.patch('sunpy.net.dataretriever.sources.noaa.NOAAIndicesClient.search',
return_value=mock_query_object('2012/10/4', '2012/10/6'))
# The return value of download is irrelevant
@mock.patch('parfive.Downloader.download',
return_value=None)
@mock.patch('parfive.Downloader.enqueue_file')
@no_vso
def test_fido(mock_wait, mock_search, mock_enqueue, tmp_path, indices_client):
path = tmp_path / "sub"
path.mkdir()
@@ -159,6 +159,7 @@ def test_fido(mock_wait, mock_search, mock_enqueue, tmp_path, indices_client):
path / "observed-solar-cycle-indices.json"))


@no_vso
@pytest.mark.remote_data
def test_srs_tar_unpack():
qr = Fido.search(a.Instrument("soon") & a.Time("2015/01/01", "2015/01/01T23:59:29"))
@@ -167,6 +168,7 @@ def test_srs_tar_unpack():
assert res.data[0].endswith("20150101SRS.txt")


@no_vso
@pytest.mark.remote_data
def test_srs_tar_unpack_midyear():
qr = Fido.search(a.Instrument("soon") & a.Time("2011/06/07", "2011/06/08T23:59:29"))
@@ -176,15 +178,27 @@ def test_srs_tar_unpack_midyear():
assert res.data[-1].endswith("20110608SRS.txt")


@no_vso
@pytest.mark.remote_data
@mock.patch('ftplib.FTP.nlst', side_effect=[[''], ['20200101SRS.txt', '20200102SRS.txt']])
def test_srs_missing_tarball(mock_ftp_nlst):
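# The first mocked nlst() call (the tarball listing) returns no usable match, so
# the client must fall back to the daily SRS files returned by the second call.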
qr = Fido.search(a.Time('2020-01-01', '2020-01-02'), a.Instrument.srs_table)
urls = [qrblock['url'] for qrblock in qr[0]]
assert urls[0].endswith('20200101SRS.txt')
assert urls[1].endswith('20200102SRS.txt')


@no_vso
@pytest.mark.remote_data
def test_srs_current_year():
year = datetime.date.today().year
qr = Fido.search(a.Instrument("soon") & a.Time(f"{year}/01/01", f"{year}/01/01T23:59:29"))
qr = Fido.search(a.Instrument("soon") & a.Time(f"{year}/02/01", f"{year}/02/01T23:59:29"))
res = Fido.fetch(qr)
assert len(res) == 1
assert res.data[0].endswith(f"{year}0101SRS.txt")
assert res.data[0].endswith(f"{year}0201SRS.txt")


@no_vso
@pytest.mark.remote_data
def test_srs_save_path(tmpdir):
qr = Fido.search(a.Instrument.srs_table, a.Time("2016/10/01", "2016/10/02"))
@@ -194,6 +208,7 @@ def test_srs_save_path(tmpdir):
assert files[1].endswith("20161002SRS.txt")


@pytest.mark.remote_data
@pytest.mark.filterwarnings('ignore:ERFA function')
def test_srs_out_of_range(srs_client):
res = srs_client.search(a.Time('1995/01/01', '1995/02/01'))
@@ -202,6 +217,16 @@ def test_srs_out_of_range(srs_client):
assert len(res) == 0


@pytest.mark.remote_data
@pytest.mark.filterwarnings('ignore:ERFA function')
def test_srs_start_or_end_out_of_range(srs_client):
res = srs_client.search(a.Time('1995/12/30', '1996/01/02'))
assert len(res) == 1
cur_year = datetime.date.today().year
res = srs_client.search(a.Time(f'{cur_year}/01/01', f'{cur_year+2}/01/01'))
assert len(res) > 0


def test_no_time(predict_client, indices_client):
res = indices_client.search(a.Instrument.noaa_indices)
assert len(res) == 1
4 changes: 2 additions & 2 deletions sunpy/net/tests/strategies.py
@@ -54,7 +54,7 @@ def offline_instruments():
Returns a strategy for any instrument that does not need the internet to do
a query.
"""
offline_instr = ['noaa-indices', 'noaa-predict', 'soon']
offline_instr = ['noaa-indices', 'noaa-predict']
offline_instr = st.builds(a.Instrument, st.sampled_from(offline_instr))

return st.one_of(offline_instr)
@@ -65,7 +65,7 @@ def online_instruments():
Returns a strategy for any instrument that does need the internet to do
a query.
"""
online_instr = ['lyra', 'goes', 'eve', 'rhessi', 'norh']
online_instr = ['lyra', 'goes', 'eve', 'rhessi', 'norh', 'soon']
online_instr = st.builds(a.Instrument, st.sampled_from(online_instr))

return online_instr
4 changes: 2 additions & 2 deletions sunpy/net/tests/test_fido.py
@@ -40,8 +40,6 @@ def offline_query(draw, instrument=offline_instruments()):
# If we have AttrAnd then we don't have GOES
if isinstance(query, a.Instrument) and query.value == 'goes':
query &= draw(goes_time())
elif isinstance(query, a.Instrument) and query.value == 'soon':
query &= draw(srs_time())
else:
query = attr.and_(query, draw(time_attr()))
return query
@@ -55,6 +53,8 @@ def online_query(draw, instrument=online_instruments()):
query &= a.Level.zero
if isinstance(query, a.Instrument) and query.value == 'norh':
query &= a.Wavelength(17*u.GHz)
if isinstance(query, a.Instrument) and query.value == 'soon':
query &= draw(srs_time())

return query

