Merge pull request #7453 from ViciousEagle03/my_fix

Fix filename sanitization for downloaded files (do not replace periods, do not change case, and do not leave Unicode characters decomposed)
sunpy · Mar 21, 2024 · f610592 · f610592
2 parents 2e61930 + cbc88fb
commit f610592
Show file tree

Hide file tree

Showing 6 changed files with 25 additions and 34 deletions.
diff --git a/changelog/7453.bugfix.rst b/changelog/7453.bugfix.rst
@@ -0,0 +1 @@
+Fixed the sanitization of the names of files downloaded via VSO so that periods are no longer replaced, case is no longer forced to be lowercase, and Unicode characters are no longer left decomposed.
diff --git a/docs/tutorial/acquiring_data/index.rst b/docs/tutorial/acquiring_data/index.rst
@@ -489,7 +489,7 @@ Here we'll just download the first file in the result:
     >>> downloaded_files = Fido.fetch(results[0, 0]) # doctest: +REMOTE_DATA
     >>> downloaded_files # doctest: +REMOTE_DATA
     <parfive.results.Results object at ...>
-    ['.../aia_lev1_335a_2020_01_01t00_00_00_64z_image_lev1.fits']
+    ['.../aia.lev1.335A_2020_01_01T00_00_00.64Z.image_lev1.fits']
 
 This downloads the files to the location set in the sunpy config file.
 It also returns a `parfive.Results` object ``downloaded_files``, which contains local file paths to all the downloaded data.

diff --git a/sunpy/net/tests/test_fido.py b/sunpy/net/tests/test_fido.py
@@ -231,7 +231,7 @@ def test_path(tmp_path):
     results = Fido.search(
         a.Time("2022/1/1", "2022/1/1"), a.Instrument.aia)
     file = Fido.fetch(results, path=tmp_path / "{file}")
-    assert file == [f'{tmp_path}/aia_lev1_335a_2022_01_01t00_00_00_62z_image_lev1.fits']
+    assert file == [str(pathlib.Path(tmp_path, "aia.lev1.335A_2022_01_01T00_00_00.62Z.image_lev1.fits"))]
 
 
 @pytest.mark.remote_data

diff --git a/sunpy/net/vso/tests/test_vso.py b/sunpy/net/vso/tests/test_vso.py
@@ -130,11 +130,10 @@ def test_path(client, tmpdir):
     files = client.fetch(qr, path=tmp_dir)
 
     assert len(files) == 1
-
-    # The construction of a VSO filename is bonkers complex, so there is no
+    # The construction of a VSO filename is BONKERS, so there is no
     # practical way to determine what it should be in this test, so we just
     # put it here.
-    assert "aia_lev1_171a_2020_06_07t06_33_09_35z_image_lev1.fits" in files[0]
+    assert "aia.lev1.171A_2020_06_07T06_33_09.35Z.image_lev1.fits" in files[0]
 
 
 @pytest.mark.filterwarnings('ignore:ERFA function.*dubious year')
@@ -357,10 +356,10 @@ def test_incorrect_content_disposition(client):
     results = client.search(
         core_attrs.Time('2011/1/1 01:00', '2011/1/1 01:02'),
         core_attrs.Instrument('mdi'), response_format="table")
-    files = client.fetch(results[0:1])
+    files = client.fetch(results[:1])
 
     assert len(files) == 1
-    assert files[0].endswith("mdi_vw_v_9466622_9466622.tar")
+    assert files[0].endswith("mdi_vw_V_9466622_9466622.tar")
     assert "Content" not in files[0]
 
 
@@ -435,7 +434,7 @@ def test_iris_filename(client):
     url = "https://www.lmsal.com/solarsoft/irisa/data/level2_compressed/2018/01/02/20180102_153155_3610108077/iris_l2_20180102_153155_3610108077_SJI_1330_t000.fits.gz"
     search_results = client.search(a.Time("2018-01-02 15:31:55", "2018-01-02 15:31:55"), a.Instrument.iris)
     filename = client.mk_filename(pattern, search_results[0], None, url)
-    assert filename.endswith("iris_l2_20180102_153155_3610108077_sji_1330_t000_fits.gz")
+    assert filename.endswith("iris_l2_20180102_153155_3610108077_SJI_1330_t000.fits.gz")
 
 
 @pytest.mark.remote_data

diff --git a/sunpy/util/net.py b/sunpy/util/net.py
@@ -2,7 +2,6 @@
 This module provides general net utility functions.
 """
 import os
-import re
 import sys
 import shutil
 from unicodedata import normalize
@@ -15,14 +14,16 @@
 __all__ = ['parse_header', 'slugify', 'get_content_disposition', 'get_filename',
            'get_system_filename', 'download_file', 'download_fileobj']
 
-# Characters not allowed in slugified version.
-_punct_re = re.compile(r'[:\t !"#$%&\'()*\-/<=>?@\[\\\]^_`{|},.]+')
-
 
 def slugify(text, delim='_'):
-    """
+    r"""
     Slugify given unicode text.
 
+    This function normalizes the text to Unicode NFKC form and then replaces
+    each occurrence of the following characters with the delimiter:
+
+    : (tab) (space) ! " # $ % & ' ( ) * - / < = > ? @ [ \\ ] ^ _ ` { | } ,
+
     Parameters
     ----------
     text : `str`
@@ -35,21 +36,12 @@ def slugify(text, delim='_'):
     `str` :
         The slugify `str` name.
     """
-    text = normalize('NFKD', text)
-
-    period = '.'
-
-    name_and_extension = text.rsplit(period, 1)
-    name = name_and_extension[0]
+    text = normalize('NFKC', text)
 
-    name = str(delim).join(
-        filter(None, (word for word in _punct_re.split(name.lower()))))
+    chars_to_replace = ":\t !\"#$%&'()*-/<=>?@[\\]^_`{|},"
+    trans_map = str.maketrans({c: delim for c in chars_to_replace})
 
-    if len(name_and_extension) == 2:
-        extension = name_and_extension[1]
-        return str(period).join([name, extension])
-    else:
-        return name
+    return text.translate(trans_map)
 
 
 def get_content_disposition(content_disposition):

diff --git a/sunpy/util/tests/test_net.py b/sunpy/util/tests/test_net.py
@@ -18,11 +18,10 @@ def test_content_disposition_unicode():
 
 
 def test_slugify():
-    assert sunpy.util.net.slugify("äb c", "b_c")
-    assert sunpy.util.net.slugify("file.greg.fits") == "file_greg.fits"
-    assert sunpy.util.net.slugify("file.greg.fits", "x") == "filexgreg.fits"
-    assert sunpy.util.net.slugify("filegreg.fits") == "filegreg.fits"
-    assert sunpy.util.net.slugify("filegreg") == "filegreg"
-    assert sunpy.util.net.slugify("f/i*l:e,gr.eg.fits") == "f_i_l_e_gr_eg.fits"
-    assert sunpy.util.net.slugify(
-        "part1.part2.part3.part4.part5") == "part1_part2_part3_part4.part5"
+    assert sunpy.util.net.slugify("ä™") == "äTM"  # Unicode NFKC normalization
+    assert sunpy.util.net.slugify("filegreg") == "filegreg"  # no file extension
+    assert sunpy.util.net.slugify("filegreg.fits") == "filegreg.fits"  # one file extension
+    assert sunpy.util.net.slugify("file.greg.fits") == "file.greg.fits"  # more than one apparent file extension
+    assert sunpy.util.net.slugify("AbCdEf") == "AbCdEf"  # uppercase characters
+    assert sunpy.util.net.slugify("f/i*l:e,gr.eg.fits") == "f_i_l_e_gr.eg.fits"  # special characters
+    assert sunpy.util.net.slugify("file greg'.fits", "x") == "filexgregx.fits"  # custom delimiter