Fix #4886 look for a tarball first fallback to individual files #4904

Merged
merged 17 commits on Feb 18, 2021
2 changes: 2 additions & 0 deletions changelog/4904.bugfix.rst
@@ -0,0 +1,2 @@
Fixed the `~.SRSClient` which silently failed to download SRS files when the yearly tarball did not exist (e.g. for the current year).
The client now searches the FTP archive for tarballs and SRS files before returning them as results.
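
As an illustration of the fixed behavior, a minimal Fido query is sketched below; the dates are arbitrary examples chosen so that no yearly tarball exists yet and the client must fall back to the individual daily SRS files.

# Minimal sketch; dates are arbitrary and assume the SWPC FTP archive is reachable.
from sunpy.net import Fido, attrs as a

result = Fido.search(a.Time("2021-01-01", "2021-01-02"), a.Instrument.srs_table)
files = Fido.fetch(result)  # e.g. 20210101SRS.txt and 20210102SRS.txt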
4 changes: 3 additions & 1 deletion sunpy/net/dataretriever/sources/gong.py
@@ -18,12 +18,14 @@ class GONGClient(GenericClient):
<sunpy.net.fido_factory.UnifiedResponse object at ...>
Results from 1 Provider:
<BLANKLINE>
3 Results from the GONGClient:
5 Results from the GONGClient:
Start Time End Time Instrument ... Source ExtentType
----------------------- ----------------------- ---------- ... ------ ----------
2019-12-31 22:14:00.000 2019-12-31 22:14:59.999 GONG ... NSO SYNOPTIC
2019-12-31 23:04:00.000 2019-12-31 23:04:59.999 GONG ... NSO SYNOPTIC
2019-12-31 23:54:00.000 2019-12-31 23:54:59.999 GONG ... NSO SYNOPTIC
2020-01-01 00:14:00.000 2020-01-01 00:14:59.999 GONG ... NSO SYNOPTIC
2020-01-01 01:14:00.000 2020-01-01 01:14:59.999 GONG ... NSO SYNOPTIC
<BLANKLINE>
<BLANKLINE>

144 changes: 77 additions & 67 deletions sunpy/net/dataretriever/sources/noaa.py
@@ -3,16 +3,17 @@
# Google Summer of Code 2014
import pathlib
import tarfile
from datetime import datetime
from collections import OrderedDict

from astropy import units as u
from astropy.time import Time, TimeDelta
from astropy.time import Time

from sunpy.extern.parse import parse
from sunpy import log
from sunpy.net import attrs as a
from sunpy.net.dataretriever import GenericClient, QueryResponse
from sunpy.time import TimeRange
from sunpy.util.parfive_helpers import Downloader
from sunpy.util.scraper import Scraper

__all__ = ['NOAAIndicesClient', 'NOAAPredictClient', 'SRSClient']

@@ -150,59 +151,83 @@ class SRSClient(GenericClient):
2 Results from the SRSClient:
Start Time End Time Instrument ... Source Provider
----------------------- ----------------------- ---------- ... ------ --------
2016-01-01 00:00:00.000 2016-01-02 00:00:00.000 SOON ... SWPC NOAA
2016-01-01 00:00:00.000 2016-01-02 00:00:00.000 SOON ... SWPC NOAA
2016-01-01 00:00:00.000 2016-01-01 23:59:59.999 SOON ... SWPC NOAA
2016-01-02 00:00:00.000 2016-01-02 23:59:59.999 SOON ... SWPC NOAA
<BLANKLINE>
<BLANKLINE>

"""
BASE_URL = 'ftp://ftp.swpc.noaa.gov/pub/warehouse/'
MIN_YEAR = 1996

def _get_url_for_timerange(self, timerange):
"""
Returns a list of urls corresponding to a
given time-range.
Returns a list of urls corresponding to a given time-range.
"""
result = list()
base_url = 'ftp://ftp.swpc.noaa.gov/pub/warehouse/'
total_days = int(timerange.days.value) + 1
all_dates = timerange.split(total_days)
today_year = int(Time.now().strftime('%Y'))
for day in all_dates:
end_year = int(day.end.strftime('%Y'))
if end_year > today_year or end_year < 1996:
continue
elif end_year == today_year:
suffix = '{}/SRS/{}SRS.txt'.format(
end_year, day.end.strftime('%Y%m%d'))
else:
suffix = '{}/{}_SRS.tar.gz'.format(
end_year, day.end.strftime('%Y'))
url = base_url + suffix
result.append(url)
return result
# Validate the time range; SRS files have been produced daily since 1996.
cur_year = Time.now().datetime.year
req_start_year = timerange.start.datetime.year
req_end_year = timerange.end.datetime.year

# Return early if the requested range lies entirely before MIN_YEAR or entirely after the current year
if req_start_year <= req_end_year < self.MIN_YEAR \
or req_end_year >= req_start_year > cur_year:
return result

# No point searching below the minimum or above the maximum supported year
start_year = max(req_start_year, self.MIN_YEAR)
end_year = min(req_end_year, cur_year)

# Search for tarballs for all years in the query
tarball_timerange = TimeRange(f'{start_year}-01-01', f'{end_year}-12-31 23:59:59.999')
tarball_urls = dict()
tarball_scraper = Scraper(self.BASE_URL + '%Y/%Y_SRS.tar.gz')
tarballs = tarball_scraper.filelist(tarball_timerange)
max_tarball_year = None
for tb_url in tarballs:
date = tarball_scraper._extractDateURL(tb_url)
year = date.to_datetime().year
max_tarball_year = year
tarball_urls[year] = tb_url
log.debug('SRS tarball found for year %d', year)

# Build a time range for the period not covered by tarballs (a tarball is assumed to
# cover a full year) and look for individual SRS files over that period.
srs_urls = dict()
min_file_year = max_tarball_year if max_tarball_year else start_year
min_file_date = (datetime(max_tarball_year, 12, 31, 23, 59, 59) if max_tarball_year else
datetime(start_year, 1, 1, 0, 0, 0))
max_file_date = min(timerange.end.datetime, Time.now().datetime)
if min_file_date < max_file_date:
file_timerange = TimeRange(f'{min_file_year}-01-01', max_file_date)
srsfile_scraper = Scraper(self.BASE_URL + '%Y/SRS/%Y%m%dSRS.txt')
srsfiles = srsfile_scraper.filelist(file_timerange)
for srs_url in srsfiles:
date = srsfile_scraper._extractDateURL(srs_url)
srs_urls[(date.datetime.year, date.datetime.month, date.datetime.day)] = srs_url
log.debug('SRS file found for date %s', date)

# Iterate over all days; if a day falls in a year covered by a tarball, or has an
# individual SRS file, add it to the result with the corresponding extdict.
for day in timerange.get_dates():
day_ymd = (int(day.strftime('%Y')), int(day.strftime('%m')), int(day.strftime('%d')))
extdict = {'year': day_ymd[0], 'month': day_ymd[1], 'day': day_ymd[2]}
if self.MIN_YEAR <= day_ymd[0] <= cur_year:
if day_ymd[0] in tarball_urls.keys():
result.append((extdict, tarball_urls[day_ymd[0]]))
elif day_ymd in srs_urls.keys():
result.append((extdict, srs_urls[day_ymd]))

return result

def post_search_hook(self, exdict, matchdict):
# Update the extracted metadata to use the queried times rather than those
# scraped from the downloaded tarball (which covers a full year of data).
rowdict = super().post_search_hook(exdict, matchdict)
rowdict["Start Time"] = matchdict["Start Time"]
rowdict["End Time"] = matchdict["End Time"]
rowdict["Start Time"].format = 'iso'
rowdict["End Time"].format = 'iso'
return rowdict

def search(self, *args, **kwargs):
extractor1 = '{}/warehouse/{:4d}/SRS/{year:4d}{month:2d}{day:2d}SRS.txt'
extractor2 = '{}/warehouse/{year:4d}/{}'
matchdict = self._get_match_dict(*args, **kwargs)
timerange = TimeRange(matchdict['Start Time'], matchdict['End Time'])
metalist = []
for url in self._get_url_for_timerange(timerange):
exdict1 = parse(extractor1, url)
exdict2 = parse(extractor2, url)
exdict = (exdict2 if exdict1 is None else exdict1).named
exdict['url'] = url
rowdict = self.post_search_hook(exdict, matchdict)
for extdict, url in self._get_url_for_timerange(timerange):
extdict['url'] = url
rowdict = self.post_search_hook(extdict, matchdict)
metalist.append(rowdict)
return QueryResponse(metalist, client=self)
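
In outline, the lookup implemented above is a tarball-first search with a per-file fallback. The sketch below is a simplified illustration of that pattern (it omits the year clamping and the per-day extdict bookkeeping of the real method, and the dates are arbitrary):

# Simplified sketch of the tarball-first lookup; not a drop-in replacement for the method above.
from sunpy.time import TimeRange
from sunpy.util.scraper import Scraper

BASE_URL = 'ftp://ftp.swpc.noaa.gov/pub/warehouse/'
timerange = TimeRange('2020-01-01', '2020-01-02')

# First pass: look for yearly tarballs covering the query.
tarballs = Scraper(BASE_URL + '%Y/%Y_SRS.tar.gz').filelist(timerange)
if not tarballs:
    # Second pass: fall back to the individual daily SRS files.
    srs_files = Scraper(BASE_URL + '%Y/SRS/%Y%m%dSRS.txt').filelist(timerange)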

@@ -219,20 +244,14 @@ def fetch(self, qres, path=None, error_callback=None, **kwargs):
-------
Results Object
"""

urls = [qrblock['url'] for qrblock in qres]

filenames = []
local_filenames = []

for i, [url, qre] in enumerate(zip(urls, qres)):
for url, qre in zip(urls, qres):
name = url.split('/')[-1]

day = Time(qre['Start Time'].strftime('%Y-%m-%d')) + TimeDelta(i*u.day)

day = qre['Start Time']
if name not in filenames:
filenames.append(name)

if name.endswith('.gz'):
local_filenames.append('{}SRS.txt'.format(day.strftime('%Y%m%d')))
else:
@@ -250,37 +269,28 @@ def fetch(self, qres, path=None, error_callback=None, **kwargs):
# OrderedDict is required to maintain ordering because it will be zipped with paths later
urls = list(OrderedDict.fromkeys(urls))

dobj = Downloader(max_conn=5)

downloader = Downloader(max_conn=2)
for aurl, fname in zip(urls, paths):
dobj.enqueue_file(aurl, filename=fname)
downloader.enqueue_file(aurl, filename=fname)

paths = dobj.download()
paths = downloader.download()

outfiles = []
for fname, srs_filename in zip(local_paths, local_filenames):

name = fname.name

past_year = False
for i, fname2 in enumerate(paths):
for fname2 in paths:
fname2 = pathlib.Path(fname2)

if fname2.name.endswith('.txt'):
continue

year = fname2.name.split('_SRS')[0]

if year in name:
TarFile = tarfile.open(fname2)
filepath = fname.parent
member = TarFile.getmember('SRS/' + srs_filename)
member.name = name
TarFile.extract(member, path=filepath)
TarFile.close()

with tarfile.open(fname2) as open_tar:
filepath = fname.parent
member = open_tar.getmember('SRS/' + srs_filename)
member.name = name
open_tar.extract(member, path=filepath)
outfiles.append(fname)

past_year = True
break
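
For reference, the member extraction above reduces to standard tarfile usage; in this sketch the archive name and member name are example values only.

# Minimal sketch of pulling one day's report out of a yearly tarball, mirroring fetch() above.
import tarfile

with tarfile.open("2015_SRS.tar.gz") as open_tar:
    member = open_tar.getmember("SRS/20150101SRS.txt")
    member.name = "20150101SRS.txt"  # drop the leading 'SRS/' directory
    open_tar.extract(member, path=".")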

31 changes: 28 additions & 3 deletions sunpy/net/dataretriever/sources/tests/test_noaa.py
@@ -137,13 +137,13 @@ def test_fetch(mock_wait, mock_search, mock_enqueue, tmp_path, indices_client):
path / "observed-solar-cycle-indices.json"))


@no_vso
@mock.patch('sunpy.net.dataretriever.sources.noaa.NOAAIndicesClient.search',
return_value=mock_query_object('2012/10/4', '2012/10/6'))
# The return value of download is irrelevant
@mock.patch('parfive.Downloader.download',
return_value=None)
@mock.patch('parfive.Downloader.enqueue_file')
@no_vso
def test_fido(mock_wait, mock_search, mock_enqueue, tmp_path, indices_client):
path = tmp_path / "sub"
path.mkdir()
@@ -159,6 +159,7 @@ def test_fido(mock_wait, mock_search, mock_enqueue, tmp_path, indices_client):
path / "observed-solar-cycle-indices.json"))


@no_vso
@pytest.mark.remote_data
def test_srs_tar_unpack():
qr = Fido.search(a.Instrument("soon") & a.Time("2015/01/01", "2015/01/01T23:59:29"))
@@ -167,6 +168,7 @@ def test_srs_tar_unpack():
assert res.data[0].endswith("20150101SRS.txt")


@no_vso
@pytest.mark.remote_data
def test_srs_tar_unpack_midyear():
qr = Fido.search(a.Instrument("soon") & a.Time("2011/06/07", "2011/06/08T23:59:29"))
@@ -176,15 +178,27 @@ def test_srs_tar_unpack_midyear():
assert res.data[-1].endswith("20110608SRS.txt")


@no_vso
@pytest.mark.remote_data
@mock.patch('ftplib.FTP.nlst', side_effect=[[''], ['20200101SRS.txt', '20200102SRS.txt']])
def test_srs_missing_tarball(mock_ftp_nlst):
qr = Fido.search(a.Time('2020-01-01', '2020-01-02'), a.Instrument.srs_table)
urls = [qrblock['url'] for qrblock in qr[0]]
assert urls[0].endswith('20200101SRS.txt')
assert urls[1].endswith('20200102SRS.txt')


@no_vso
@pytest.mark.remote_data
def test_srs_current_year():
year = datetime.date.today().year
qr = Fido.search(a.Instrument("soon") & a.Time(f"{year}/01/01", f"{year}/01/01T23:59:29"))
qr = Fido.search(a.Instrument("soon") & a.Time(f"{year}/02/01", f"{year}/02/01T23:59:29"))
res = Fido.fetch(qr)
assert len(res) == 1
assert res.data[0].endswith(f"{year}0101SRS.txt")
assert res.data[0].endswith(f"{year}0201SRS.txt")


@no_vso
@pytest.mark.remote_data
def test_srs_save_path(tmpdir):
qr = Fido.search(a.Instrument.srs_table, a.Time("2016/10/01", "2016/10/02"))
@@ -194,6 +208,7 @@ def test_srs_save_path(tmpdir):
assert files[1].endswith("20161002SRS.txt")


@pytest.mark.remote_data
@pytest.mark.filterwarnings('ignore:ERFA function')
def test_srs_out_of_range(srs_client):
res = srs_client.search(a.Time('1995/01/01', '1995/02/01'))
@@ -202,6 +217,16 @@ def test_srs_out_of_range(srs_client):
assert len(res) == 0


@pytest.mark.remote_data
@pytest.mark.filterwarnings('ignore:ERFA function')
def test_srs_start_or_end_out_of_range(srs_client):
res = srs_client.search(a.Time('1995/12/30', '1996/01/02'))
assert len(res) == 1
cur_year = datetime.date.today().year
res = srs_client.search(a.Time(f'{cur_year}/01/01', f'{cur_year+2}/01/01'))
assert len(res) > 0


def test_no_time(predict_client, indices_client):
res = indices_client.search(a.Instrument.noaa_indices)
assert len(res) == 1
4 changes: 2 additions & 2 deletions sunpy/net/tests/strategies.py
@@ -54,7 +54,7 @@ def offline_instruments():
Returns a strategy for any instrument that does not need the internet to do
a query.
"""
offline_instr = ['noaa-indices', 'noaa-predict', 'soon']
offline_instr = ['noaa-indices', 'noaa-predict']
offline_instr = st.builds(a.Instrument, st.sampled_from(offline_instr))

return st.one_of(offline_instr)
Expand All @@ -65,7 +65,7 @@ def online_instruments():
Returns a strategy for any instrument that does need the internet to do
a query.
"""
online_instr = ['lyra', 'goes', 'eve', 'rhessi', 'norh']
online_instr = ['lyra', 'goes', 'eve', 'rhessi', 'norh', 'soon']
online_instr = st.builds(a.Instrument, st.sampled_from(online_instr))

return online_instr
4 changes: 2 additions & 2 deletions sunpy/net/tests/test_fido.py
@@ -40,8 +40,6 @@ def offline_query(draw, instrument=offline_instruments()):
# If we have AttrAnd then we don't have GOES
if isinstance(query, a.Instrument) and query.value == 'goes':
query &= draw(goes_time())
elif isinstance(query, a.Instrument) and query.value == 'soon':
query &= draw(srs_time())
else:
query = attr.and_(query, draw(time_attr()))
return query
@@ -55,6 +53,8 @@ def online_query(draw, instrument=online_instruments()):
query &= a.Level.zero
if isinstance(query, a.Instrument) and query.value == 'norh':
query &= a.Wavelength(17*u.GHz)
if isinstance(query, a.Instrument) and query.value == 'soon':
query &= draw(srs_time())

return query
