Skip to content

Commit

Permalink
Merge branch 'master' of https://github.com/yt-dlp/yt-dlp into ytdlp
Browse files Browse the repository at this point in the history
* 'master' of https://github.com/yt-dlp/yt-dlp:
  [youtube] Detect DRM better
  [outtmpl] Limit changes during sanitization
  [youtube:tab] Return shorts url if video is a short (#3168)
  [VideoConvertor] Ensure all streams are copied
  [Concat] Ensure final directory exists
  • Loading branch information
Lesmiscore committed Mar 27, 2022
2 parents 0810ffa + 727029c commit 124fec7
Show file tree
Hide file tree
Showing 7 changed files with 42 additions and 24 deletions.
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -233,6 +233,7 @@ Some of yt-dlp's default options are different from that of youtube-dl and youtu
* Some private fields such as filenames are removed by default from the infojson. Use `--no-clean-infojson` or `--compat-options no-clean-infojson` to revert this
* When `--embed-subs` and `--write-subs` are used together, the subtitles are written to disk and also embedded in the media file. You can use just `--embed-subs` to embed the subs and automatically delete the separate file. See [#630 (comment)](https://github.com/yt-dlp/yt-dlp/issues/630#issuecomment-893659460) for more info. `--compat-options no-keep-subs` can be used to revert this
* `certifi` will be used for SSL root certificates, if installed. If you want to use system certificates (e.g. self-signed), use `--compat-options no-certifi`
* youtube-dl tries to remove some superfluous punctuation from filenames. While this can sometimes be helpful, it is often undesirable, so yt-dlp tries to keep the fields in the filenames as close to their original values as possible. You can use `--compat-options filename-sanitization` to revert to youtube-dl's behavior

For ease of use, a few more compat options are available:
* `--compat-options all`: Use all compat options
Expand Down
6 changes: 4 additions & 2 deletions test/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -160,10 +160,12 @@ def test_sanitize_filename(self):
sanitize_filename('New World record at 0:12:34'),
'New World record at 0_12_34')

self.assertEqual(sanitize_filename('--gasdgf'), '_-gasdgf')
self.assertEqual(sanitize_filename('--gasdgf'), '--gasdgf')
self.assertEqual(sanitize_filename('--gasdgf', is_id=True), '--gasdgf')
self.assertEqual(sanitize_filename('.gasdgf'), 'gasdgf')
self.assertEqual(sanitize_filename('--gasdgf', is_id=False), '_-gasdgf')
self.assertEqual(sanitize_filename('.gasdgf'), '.gasdgf')
self.assertEqual(sanitize_filename('.gasdgf', is_id=True), '.gasdgf')
self.assertEqual(sanitize_filename('.gasdgf', is_id=False), 'gasdgf')

forbidden = '"\0\\/'
for fc in forbidden:
Expand Down
7 changes: 5 additions & 2 deletions yt_dlp/YoutubeDL.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,7 @@
MaxDownloadsReached,
merge_headers,
network_exceptions,
NO_DEFAULT,
number_of_digits,
orderedSet,
OUTTMPL_TYPES,
Expand Down Expand Up @@ -1207,8 +1208,10 @@ def get_value(mdict):
na = self.params.get('outtmpl_na_placeholder', 'NA')

def filename_sanitizer(key, value, restricted=self.params.get('restrictfilenames')):
return sanitize_filename(str(value), restricted=restricted,
is_id=re.search(r'(^|[_.])id(\.|$)', key))
return sanitize_filename(str(value), restricted=restricted, is_id=(
bool(re.search(r'(^|[_.])id(\.|$)', key))
if 'filename-sanitization' in self.params.get('compat_opts', [])
else NO_DEFAULT))

sanitizer = sanitize if callable(sanitize) else filename_sanitizer
sanitize = bool(sanitize)
Expand Down
10 changes: 8 additions & 2 deletions yt_dlp/extractor/youtube.py
Original file line number Diff line number Diff line change
Expand Up @@ -827,12 +827,17 @@ def _extract_video(self, renderer):
renderer, ('thumbnailOverlays', ..., 'thumbnailOverlayTimeStatusRenderer', 'style'), get_all=False, expected_type=str)
badges = self._extract_badges(renderer)
thumbnails = self._extract_thumbnails(renderer, 'thumbnail')
navigation_url = urljoin('https://www.youtube.com/', traverse_obj(
renderer, ('navigationEndpoint', 'commandMetadata', 'webCommandMetadata', 'url'), expected_type=str))
url = f'https://www.youtube.com/watch?v={video_id}'
if overlay_style == 'SHORTS' or (navigation_url and '/shorts/' in navigation_url):
url = f'https://www.youtube.com/shorts/{video_id}'

return {
'_type': 'url',
'ie_key': YoutubeIE.ie_key(),
'id': video_id,
'url': f'https://www.youtube.com/watch?v={video_id}',
'url': url,
'title': title,
'description': description,
'duration': duration,
Expand Down Expand Up @@ -3067,7 +3072,7 @@ def _extract_formats(self, streaming_data, video_id, player_url, is_live, durati
streaming_formats = traverse_obj(streaming_data, (..., ('formats', 'adaptiveFormats'), ...), default=[])

for fmt in streaming_formats:
if fmt.get('targetDurationSec') or fmt.get('drmFamilies'):
if fmt.get('targetDurationSec'):
continue

itag = str_or_none(fmt.get('itag'))
Expand Down Expand Up @@ -3176,6 +3181,7 @@ def _extract_formats(self, streaming_data, video_id, player_url, is_live, durati
'fps': int_or_none(fmt.get('fps')) or None,
'height': height,
'quality': q(quality),
'has_drm': bool(fmt.get('drmFamilies')),
'tbr': tbr,
'url': fmt_url,
'width': int_or_none(fmt.get('width')),
Expand Down
2 changes: 1 addition & 1 deletion yt_dlp/options.py
Original file line number Diff line number Diff line change
Expand Up @@ -342,7 +342,7 @@ def _dict_from_options_callback(
action='callback', callback=_set_from_options_callback,
callback_kwargs={
'allowed_values': {
'filename', 'format-sort', 'abort-on-error', 'format-spec', 'no-playlist-metafiles',
'filename', 'filename-sanitization', 'format-sort', 'abort-on-error', 'format-spec', 'no-playlist-metafiles',
'multistreams', 'no-live-chat', 'playlist-index', 'list-formats', 'no-direct-merge',
'no-youtube-channel-redirect', 'no-youtube-unavailable-videos', 'no-attach-info-json', 'embed-metadata',
'embed-thumbnail-atomicparsley', 'seperate-video-versions', 'no-clean-infojson', 'no-keep-subs', 'no-certifi',
Expand Down
6 changes: 4 additions & 2 deletions yt_dlp/postprocessor/ffmpeg.py
Original file line number Diff line number Diff line change
Expand Up @@ -618,9 +618,9 @@ def _target_ext(self, source_ext):

@staticmethod
def _options(target_ext):
yield from FFmpegPostProcessor.stream_copy_opts(False)
if target_ext == 'avi':
return ['-c:v', 'libxvid', '-vtag', 'XVID']
return []
yield from ('-c:v', 'libxvid', '-vtag', 'XVID')

@PostProcessor._restrict_to(images=False)
def run(self, info):
Expand Down Expand Up @@ -1202,6 +1202,8 @@ def __init__(self, downloader, only_multi_video=False):
super().__init__(downloader)

def concat_files(self, in_files, out_file):
if not self._downloader._ensure_dir_exists(out_file):
return
if len(in_files) == 1:
if os.path.realpath(in_files[0]) != os.path.realpath(out_file):
self.to_screen(f'Moving "{in_files[0]}" to "{out_file}"')
Expand Down
34 changes: 19 additions & 15 deletions yt_dlp/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -675,36 +675,40 @@ def timeconvert(timestr):
return timestamp


def sanitize_filename(s, restricted=False, is_id=False):
def sanitize_filename(s, restricted=False, is_id=NO_DEFAULT):
"""Sanitizes a string so it could be used as part of a filename.
If restricted is set, use a stricter subset of allowed characters.
Set is_id if this is not an arbitrary string, but an ID that should be kept
if possible.
@param restricted Use a stricter subset of allowed characters
@param is_id Whether this is an ID that should be kept unchanged if possible.
If unset, yt-dlp's new sanitization rules are in effect
"""
if s == '':
return ''

def replace_insane(char):
if restricted and char in ACCENT_CHARS:
return ACCENT_CHARS[char]
elif not restricted and char == '\n':
return ' '
return '\0 '
elif char == '?' or ord(char) < 32 or ord(char) == 127:
return ''
elif char == '"':
return '' if restricted else '\''
elif char == ':':
return '_-' if restricted else ' -'
return '\0_\0-' if restricted else '\0 \0-'
elif char in '\\/|*<>':
return '_'
if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
return '_'
if restricted and ord(char) > 127:
return '_'
return '\0_'
if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace() or ord(char) > 127):
return '\0_'
return char

if s == '':
return ''
# Handle timestamps
s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s) # Handle timestamps
result = ''.join(map(replace_insane, s))
if is_id is NO_DEFAULT:
result = re.sub('(\0.)(?:(?=\\1)..)+', r'\1', result) # Remove repeated substitute chars
STRIP_RE = '(?:\0.|[ _-])*'
result = re.sub(f'^\0.{STRIP_RE}|{STRIP_RE}\0.$', '', result) # Remove substitute chars from start/end
result = result.replace('\0', '') or '_'

if not is_id:
while '__' in result:
result = result.replace('__', '_')
Expand Down

0 comments on commit 124fec7

Please sign in to comment.