Skip to content

Commit

Permalink
Sanitize formats before sorting
Browse files Browse the repository at this point in the history
Closes #4501
  • Loading branch information
pukkandan committed Feb 12, 2023
1 parent 365b900 commit 39f32f1
Showing 1 changed file with 32 additions and 38 deletions.
70 changes: 32 additions & 38 deletions yt_dlp/YoutubeDL.py
Original file line number Diff line number Diff line change
Expand Up @@ -2561,7 +2561,6 @@ def sanitize_numeric_fields(info):
info_dict['requested_subtitles'] = self.process_subtitles(
info_dict['id'], subtitles, automatic_captions)

self.sort_formats(info_dict)
formats = self._get_formats(info_dict)

# or None ensures --clean-infojson removes it
Expand Down Expand Up @@ -2601,22 +2600,40 @@ def is_wellformed(f):
if not formats:
self.raise_no_formats(info_dict)

formats_dict = {}

# We check that all the formats have the format and format_id fields
for i, format in enumerate(formats):
for format in formats:
sanitize_string_field(format, 'format_id')
sanitize_numeric_fields(format)
format['url'] = sanitize_url(format['url'])
if format.get('ext') is None:
format['ext'] = determine_ext(format['url']).lower()
if format.get('protocol') is None:
format['protocol'] = determine_protocol(format)
if format.get('resolution') is None:
format['resolution'] = self.format_resolution(format, default=None)
if format.get('dynamic_range') is None and format.get('vcodec') != 'none':
format['dynamic_range'] = 'SDR'
if format.get('aspect_ratio') is None:
format['aspect_ratio'] = try_call(lambda: round(format['width'] / format['height'], 2))
if (info_dict.get('duration') and format.get('tbr')
and not format.get('filesize') and not format.get('filesize_approx')):
format['filesize_approx'] = int(info_dict['duration'] * format['tbr'] * (1024 / 8))
format['http_headers'] = self._calc_headers(collections.ChainMap(format, info_dict))

# This is copied to http_headers by the above _calc_headers and can now be removed
if '__x_forwarded_for_ip' in info_dict:
del info_dict['__x_forwarded_for_ip']

self.sort_formats({'formats': formats})

# Sanitize and group by format_id
formats_dict = {}
for i, format in enumerate(formats):
if not format.get('format_id'):
format['format_id'] = str(i)
else:
# Sanitize format_id from characters used in format selector expression
format['format_id'] = re.sub(r'[\s,/+\[\]()]', '_', format['format_id'])
format_id = format['format_id']
if format_id not in formats_dict:
formats_dict[format_id] = []
formats_dict[format_id].append(format)
formats_dict.setdefault(format['format_id'], []).append(format)

# Make sure all formats have unique format_id
common_exts = set(itertools.chain(*self._format_selection_exts.values()))
Expand All @@ -2625,40 +2642,17 @@ def is_wellformed(f):
for i, format in enumerate(ambiguous_formats):
if ambigious_id:
format['format_id'] = '%s-%d' % (format_id, i)
if format.get('ext') is None:
format['ext'] = determine_ext(format['url']).lower()
# Ensure there is no conflict between id and ext in format selection
# See https://github.com/yt-dlp/yt-dlp/issues/1282
if format['format_id'] != format['ext'] and format['format_id'] in common_exts:
format['format_id'] = 'f%s' % format['format_id']

for i, format in enumerate(formats):
if format.get('format') is None:
format['format'] = '{id} - {res}{note}'.format(
id=format['format_id'],
res=self.format_resolution(format),
note=format_field(format, 'format_note', ' (%s)'),
)
if format.get('protocol') is None:
format['protocol'] = determine_protocol(format)
if format.get('resolution') is None:
format['resolution'] = self.format_resolution(format, default=None)
if format.get('dynamic_range') is None and format.get('vcodec') != 'none':
format['dynamic_range'] = 'SDR'
if format.get('aspect_ratio') is None:
format['aspect_ratio'] = try_call(lambda: round(format['width'] / format['height'], 2))
if (info_dict.get('duration') and format.get('tbr')
and not format.get('filesize') and not format.get('filesize_approx')):
format['filesize_approx'] = int(info_dict['duration'] * format['tbr'] * (1024 / 8))

# Add HTTP headers, so that external programs can use them from the
# json output
full_format_info = info_dict.copy()
full_format_info.update(format)
format['http_headers'] = self._calc_headers(full_format_info)
# Remove private housekeeping stuff
if '__x_forwarded_for_ip' in info_dict:
del info_dict['__x_forwarded_for_ip']
if format.get('format') is None:
format['format'] = '{id} - {res}{note}'.format(
id=format['format_id'],
res=self.format_resolution(format),
note=format_field(format, 'format_note', ' (%s)'),
)

if self.params.get('check_formats') is True:
formats = LazyList(self._check_formats(formats[::-1]), reverse=True)
Expand Down

0 comments on commit 39f32f1

Please sign in to comment.