Skip to content

Commit 06a8c06

Browse files
[3.13] gh-133890: Handle UnicodeEncodeError in tarfile (GH-134147) (GH-134196)
UnicodeEncodeError is now handled the same way as OSError during TarFile member extraction. (cherry picked from commit 9983c7d)
1 parent ea9c962 commit 06a8c06

File tree

3 files changed

+49
-6
lines changed

3 files changed

+49
-6
lines changed

Lib/tarfile.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2376,7 +2376,7 @@ def _get_extract_tarinfo(self, member, filter_function, path):
23762376
unfiltered = tarinfo
23772377
try:
23782378
tarinfo = filter_function(tarinfo, path)
2379-
except (OSError, FilterError) as e:
2379+
except (OSError, UnicodeEncodeError, FilterError) as e:
23802380
self._handle_fatal_error(e)
23812381
except ExtractError as e:
23822382
self._handle_nonfatal_error(e)
@@ -2397,7 +2397,7 @@ def _extract_one(self, tarinfo, path, set_attrs, numeric_owner):
23972397
self._extract_member(tarinfo, os.path.join(path, tarinfo.name),
23982398
set_attrs=set_attrs,
23992399
numeric_owner=numeric_owner)
2400-
except OSError as e:
2400+
except (OSError, UnicodeEncodeError) as e:
24012401
self._handle_fatal_error(e)
24022402
except ExtractError as e:
24032403
self._handle_nonfatal_error(e)

Lib/test/test_tarfile.py

Lines changed: 45 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -3457,11 +3457,12 @@ class ArchiveMaker:
34573457
with t.open() as tar:
34583458
... # `tar` is now a TarFile with 'filename' in it!
34593459
"""
3460-
def __init__(self):
3460+
def __init__(self, **kwargs):
34613461
self.bio = io.BytesIO()
3462+
self.tar_kwargs = dict(kwargs)
34623463

34633464
def __enter__(self):
3464-
self.tar_w = tarfile.TarFile(mode='w', fileobj=self.bio)
3465+
self.tar_w = tarfile.TarFile(mode='w', fileobj=self.bio, **self.tar_kwargs)
34653466
return self
34663467

34673468
def __exit__(self, *exc):
@@ -4040,7 +4041,10 @@ def test_tar_filter(self):
40404041
# that in the test archive.)
40414042
with tarfile.TarFile.open(tarname) as tar:
40424043
for tarinfo in tar.getmembers():
4043-
filtered = tarfile.tar_filter(tarinfo, '')
4044+
try:
4045+
filtered = tarfile.tar_filter(tarinfo, '')
4046+
except UnicodeEncodeError:
4047+
continue
40444048
self.assertIs(filtered.name, tarinfo.name)
40454049
self.assertIs(filtered.type, tarinfo.type)
40464050

@@ -4051,11 +4055,48 @@ def test_data_filter(self):
40514055
for tarinfo in tar.getmembers():
40524056
try:
40534057
filtered = tarfile.data_filter(tarinfo, '')
4054-
except tarfile.FilterError:
4058+
except (tarfile.FilterError, UnicodeEncodeError):
40554059
continue
40564060
self.assertIs(filtered.name, tarinfo.name)
40574061
self.assertIs(filtered.type, tarinfo.type)
40584062

4063+
@unittest.skipIf(sys.platform == 'win32', 'requires native bytes paths')
4064+
def test_filter_unencodable(self):
4065+
# Sanity check using a valid path.
4066+
tarinfo = tarfile.TarInfo(os_helper.TESTFN)
4067+
filtered = tarfile.tar_filter(tarinfo, '')
4068+
self.assertIs(filtered.name, tarinfo.name)
4069+
filtered = tarfile.data_filter(tarinfo, '')
4070+
self.assertIs(filtered.name, tarinfo.name)
4071+
4072+
tarinfo = tarfile.TarInfo('test\x00')
4073+
self.assertRaises(ValueError, tarfile.tar_filter, tarinfo, '')
4074+
self.assertRaises(ValueError, tarfile.data_filter, tarinfo, '')
4075+
tarinfo = tarfile.TarInfo('\ud800')
4076+
self.assertRaises(UnicodeEncodeError, tarfile.tar_filter, tarinfo, '')
4077+
self.assertRaises(UnicodeEncodeError, tarfile.data_filter, tarinfo, '')
4078+
4079+
@unittest.skipIf(sys.platform == 'win32', 'requires native bytes paths')
4080+
def test_extract_unencodable(self):
4081+
# Create a member with name \xed\xa0\x80 which is UTF-8 encoded
4082+
# lone surrogate \ud800.
4083+
with ArchiveMaker(encoding='ascii', errors='surrogateescape') as arc:
4084+
arc.add('\udced\udca0\udc80')
4085+
with os_helper.temp_cwd() as tmp:
4086+
tar = arc.open(encoding='utf-8', errors='surrogatepass',
4087+
errorlevel=1)
4088+
self.assertEqual(tar.getnames(), ['\ud800'])
4089+
with self.assertRaises(UnicodeEncodeError):
4090+
tar.extractall(filter=tarfile.tar_filter)
4091+
self.assertEqual(os.listdir(), [])
4092+
4093+
tar = arc.open(encoding='utf-8', errors='surrogatepass',
4094+
errorlevel=0, debug=1)
4095+
with support.captured_stderr() as stderr:
4096+
tar.extractall(filter=tarfile.tar_filter)
4097+
self.assertEqual(os.listdir(), [])
4098+
self.assertIn('tarfile: UnicodeEncodeError ', stderr.getvalue())
4099+
40594100
def test_default_filter_warns(self):
40604101
"""Ensure the default filter warns"""
40614102
with ArchiveMaker() as arc:
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
The :mod:`tarfile` module now handles :exc:`UnicodeEncodeError` in the same
2+
way as :exc:`OSError` when it cannot extract a member.

0 commit comments

Comments
 (0)