Skip to content

Commit

Permalink
Merge pull request #7453 from ViciousEagle03/my_fix
Browse files: browse the repository at this point in the history
Fix filename sanitization for downloaded files (do not replace periods, do not change case, and do not leave Unicode characters decomposed)
  • Loading branch information
nabobalis committed Mar 21, 2024
2 parents 2e61930 + cbc88fb commit f610592
Show file tree
Hide file tree
Showing 6 changed files with 25 additions and 34 deletions.
1 change: 1 addition & 0 deletions changelog/7453.bugfix.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Fixed the sanitization of the names of files downloaded via VSO so that periods are no longer replaced and case is no longer forced to be lowercase.
2 changes: 1 addition & 1 deletion docs/tutorial/acquiring_data/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -489,7 +489,7 @@ Here we'll just download the first file in the result:
>>> downloaded_files = Fido.fetch(results[0, 0]) # doctest: +REMOTE_DATA
>>> downloaded_files # doctest: +REMOTE_DATA
<parfive.results.Results object at ...>
['.../aia_lev1_335a_2020_01_01t00_00_00_64z_image_lev1.fits']
['.../aia.lev1.335A_2020_01_01T00_00_00.64Z.image_lev1.fits']
This downloads the files to the location set in the sunpy config file.
It also returns a `parfive.Results` object ``downloaded_files``, which contains local file paths to all the downloaded data.
Expand Down
2 changes: 1 addition & 1 deletion sunpy/net/tests/test_fido.py
Original file line number Diff line number Diff line change
Expand Up @@ -231,7 +231,7 @@ def test_path(tmp_path):
results = Fido.search(
a.Time("2022/1/1", "2022/1/1"), a.Instrument.aia)
file = Fido.fetch(results, path=tmp_path / "{file}")
assert file == [f'{tmp_path}/aia_lev1_335a_2022_01_01t00_00_00_62z_image_lev1.fits']
assert file == [str(pathlib.Path(tmp_path, "aia.lev1.335A_2022_01_01T00_00_00.62Z.image_lev1.fits"))]


@pytest.mark.remote_data
Expand Down
11 changes: 5 additions & 6 deletions sunpy/net/vso/tests/test_vso.py
Original file line number Diff line number Diff line change
Expand Up @@ -130,11 +130,10 @@ def test_path(client, tmpdir):
files = client.fetch(qr, path=tmp_dir)

assert len(files) == 1

# The construction of a VSO filename is bonkers complex, so there is no
# The construction of a VSO filename is BONKERS, so there is no
# practical way to determine what it should be in this test, so we just
# put it here.
assert "aia_lev1_171a_2020_06_07t06_33_09_35z_image_lev1.fits" in files[0]
assert "aia.lev1.171A_2020_06_07T06_33_09.35Z.image_lev1.fits" in files[0]


@pytest.mark.filterwarnings('ignore:ERFA function.*dubious year')
Expand Down Expand Up @@ -357,10 +356,10 @@ def test_incorrect_content_disposition(client):
results = client.search(
core_attrs.Time('2011/1/1 01:00', '2011/1/1 01:02'),
core_attrs.Instrument('mdi'), response_format="table")
files = client.fetch(results[0:1])
files = client.fetch(results[:1])

assert len(files) == 1
assert files[0].endswith("mdi_vw_v_9466622_9466622.tar")
assert files[0].endswith("mdi_vw_V_9466622_9466622.tar")
assert "Content" not in files[0]


Expand Down Expand Up @@ -435,7 +434,7 @@ def test_iris_filename(client):
url = "https://www.lmsal.com/solarsoft/irisa/data/level2_compressed/2018/01/02/20180102_153155_3610108077/iris_l2_20180102_153155_3610108077_SJI_1330_t000.fits.gz"
search_results = client.search(a.Time("2018-01-02 15:31:55", "2018-01-02 15:31:55"), a.Instrument.iris)
filename = client.mk_filename(pattern, search_results[0], None, url)
assert filename.endswith("iris_l2_20180102_153155_3610108077_sji_1330_t000_fits.gz")
assert filename.endswith("iris_l2_20180102_153155_3610108077_SJI_1330_t000.fits.gz")


@pytest.mark.remote_data
Expand Down
28 changes: 10 additions & 18 deletions sunpy/util/net.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@
This module provides general net utility functions.
"""
import os
import re
import sys
import shutil
from unicodedata import normalize
Expand All @@ -15,14 +14,16 @@
__all__ = ['parse_header', 'slugify', 'get_content_disposition', 'get_filename',
'get_system_filename', 'download_file', 'download_fileobj']

# Characters not allowed in slugified version.
_punct_re = re.compile(r'[:\t !"#$%&\'()*\-/<=>?@\[\\\]^_`{|},.]+')


def slugify(text, delim='_'):
"""
r"""
Slugify given unicode text.
This function performs a Unicode normalization to NFKC form, followed by replacing
the following characters by the delimiter:
: (tab) (space) ! " # $ % & ' ( ) * - / < = > ? @ [ \\ ] ^ _ ` { | } ,
Parameters
----------
text : `str`
Expand All @@ -35,21 +36,12 @@ def slugify(text, delim='_'):
`str` :
The slugify `str` name.
"""
text = normalize('NFKD', text)

period = '.'

name_and_extension = text.rsplit(period, 1)
name = name_and_extension[0]
text = normalize('NFKC', text)

name = str(delim).join(
filter(None, (word for word in _punct_re.split(name.lower()))))
chars_to_replace = ":\t !\"#$%&'()*-/<=>?@[\\]^_`{|},"
trans_map = str.maketrans({c: delim for c in chars_to_replace})

if len(name_and_extension) == 2:
extension = name_and_extension[1]
return str(period).join([name, extension])
else:
return name
return text.translate(trans_map)


def get_content_disposition(content_disposition):
Expand Down
15 changes: 7 additions & 8 deletions sunpy/util/tests/test_net.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,11 +18,10 @@ def test_content_disposition_unicode():


def test_slugify():
assert sunpy.util.net.slugify("äb c", "b_c")
assert sunpy.util.net.slugify("file.greg.fits") == "file_greg.fits"
assert sunpy.util.net.slugify("file.greg.fits", "x") == "filexgreg.fits"
assert sunpy.util.net.slugify("filegreg.fits") == "filegreg.fits"
assert sunpy.util.net.slugify("filegreg") == "filegreg"
assert sunpy.util.net.slugify("f/i*l:e,gr.eg.fits") == "f_i_l_e_gr_eg.fits"
assert sunpy.util.net.slugify(
"part1.part2.part3.part4.part5") == "part1_part2_part3_part4.part5"
assert sunpy.util.net.slugify("ä™") == "äTM" # Unicode NFKC normalization
assert sunpy.util.net.slugify("filegreg") == "filegreg" # no file extension
assert sunpy.util.net.slugify("filegreg.fits") == "filegreg.fits" # one file extension
assert sunpy.util.net.slugify("file.greg.fits") == "file.greg.fits" # more than one apparent file extension
assert sunpy.util.net.slugify("AbCdEf") == "AbCdEf" # uppercase characters
assert sunpy.util.net.slugify("f/i*l:e,gr.eg.fits") == "f_i_l_e_gr.eg.fits" # special characters
assert sunpy.util.net.slugify("file greg'.fits", "x") == "filexgregx.fits" # custom delimiter

0 comments on commit f610592

Please sign in to comment.