Skip to content

Commit

Permalink
Implement n-param descrambling using JSInterp
Browse files Browse the repository at this point in the history
Fixes #29326, closes #29790, closes #30004, closes #30024, closes #30052,
closes #30088, closes #30097, closes #30102, closes #30109, closes #30119,
closes #30125, closes #30128, closes #30162, closes #30173, closes #30186,
closes #30192, closes #30221, closes #30239, closes #30539, closes #30552.
  • Loading branch information
dirkf committed Jan 31, 2022
1 parent 6ca7b77 commit af9e725
Showing 1 changed file with 99 additions and 16 deletions.
115 changes: 99 additions & 16 deletions youtube_dl/extractor/youtube.py
Original file line number Diff line number Diff line change
Expand Up @@ -1254,6 +1254,17 @@ def _extract_player_info(cls, player_url):
raise ExtractorError('Cannot identify player %r' % player_url)
return id_m.group('id')

def _get_player_code(self, video_id, player_url, player_id=None):
    """Return the player code for player_url, downloading it at most once.

    The code is kept in self._code_cache, keyed by player id.

    Args:
        video_id -- passed through to the download call
        player_url -- URL the player code is downloaded from
        player_id -- cache key; derived from player_url when falsy
    """
    player_id = player_id or self._extract_player_info(player_url)

    # Serve from the in-memory cache when this player was already fetched
    if player_id in self._code_cache:
        return self._code_cache[player_id]

    code = self._download_webpage(
        player_url, video_id,
        note='Downloading player ' + player_id,
        errnote='Download of %s failed' % player_url)
    self._code_cache[player_id] = code
    return code

def _extract_signature_function(self, video_id, player_url, example_sig):
player_id = self._extract_player_info(player_url)

Expand All @@ -1266,12 +1277,7 @@ def _extract_signature_function(self, video_id, player_url, example_sig):
if cache_spec is not None:
return lambda s: ''.join(s[i] for i in cache_spec)

if player_id not in self._code_cache:
self._code_cache[player_id] = self._download_webpage(
player_url, video_id,
note='Downloading player ' + player_id,
errnote='Download of %s failed' % player_url)
code = self._code_cache[player_id]
code = self._get_player_code(video_id, player_url, player_id)
res = self._parse_sig_js(code)

test_string = ''.join(map(compat_chr, range(len(example_sig))))
Expand Down Expand Up @@ -1350,11 +1356,6 @@ def _decrypt_signature(self, s, video_id, player_url):
if player_url is None:
raise ExtractorError('Cannot decrypt signature without player_url')

if player_url.startswith('//'):
player_url = 'https:' + player_url
elif not re.match(r'https?://', player_url):
player_url = compat_urlparse.urljoin(
'https://www.youtube.com', player_url)
try:
player_id = (player_url, self._signature_cache_id(s))
if player_id not in self._player_cache:
Expand All @@ -1371,6 +1372,88 @@ def _decrypt_signature(self, s, video_id, player_url):
raise ExtractorError(
'Signature extraction failed: ' + tb, cause=e)

def _extract_player_url(self, webpage):
    """Find the player URL in webpage and return it as an absolute URL.

    Looks for a "PLAYER_JS_URL" or "jsUrl" key in the page text.
    Returns None when webpage is empty or no such key is found.
    """
    found = self._search_regex(
        r'"(?:PLAYER_JS_URL|jsUrl)"\s*:\s*"([^"]+)"',
        webpage or '', 'player URL', fatal=False)
    if not found:
        return None
    # Protocol-relative URL: only the scheme is missing
    if found.startswith('//'):
        return 'https:' + found
    # Already absolute: use as is
    if re.match(r'https?://', found):
        return found
    # Anything else is resolved against the YouTube site root
    return compat_urlparse.urljoin(
        'https://www.youtube.com', found)

# from yt-dlp
# See also:
# 1. https://github.com/ytdl-org/youtube-dl/issues/29326#issuecomment-894619419
# 2. https://code.videolan.org/videolan/vlc/-/blob/4fb284e5af69aa9ac2100ccbdd3b88debec9987f/share/lua/playlist/youtube.lua#L116
# 3. https://github.com/ytdl-org/youtube-dl/issues/30097#issuecomment-950157377
def _extract_n_function_name(self, jscode):
return self._search_regex(

This comment has been minimized.

Copy link
@pukkandan

pukkandan Feb 1, 2022

Contributor

A new player breaks this: yt-dlp/yt-dlp@48416bc

(r'\.get\("n"\)\)&&\(b=(?P<nfunc>[a-zA-Z0-9$]{3})\([a-zA-Z0-9]\)',),
jscode, 'Initial JS player n function name', group='nfunc')

def _extract_n_function(self, video_id, player_url):
    """Return a callable that maps an "n" parameter value to its response.

    The function code is looked up in the downloader's cache under
    ('youtube-nsig', player_id). On a miss it is extracted from the
    player code and stored there for later runs.

    Args:
        video_id -- passed through when the player code has to be fetched
        player_url -- URL of the player; also used to derive the player id
    """
    player_id = self._extract_player_info(player_url)
    func_code = self._downloader.cache.load('youtube-nsig', player_id)

    if func_code:
        # NOTE(review): the interpreter is built from the cached function
        # code rather than the full player code -- presumably enough for
        # extract_function_from_code() below; confirm against JSInterpreter
        jsi = JSInterpreter(func_code)
    else:
        # player_id was computed above; no need to derive it a second time
        jscode = self._get_player_code(video_id, player_url, player_id)
        funcname = self._extract_n_function_name(jscode)
        jsi = JSInterpreter(jscode)
        func_code = jsi.extract_function_code(funcname)
        self._downloader.cache.store('youtube-nsig', player_id, func_code)

    if self._downloader.params.get('youtube_print_sig_code'):
        self.to_screen('Extracted nsig function from {0}:\n{1}\n'.format(player_id, func_code[1]))

    # The JS function is called with a one-element argument list
    return lambda s: jsi.extract_function_from_code(*func_code)([s])

def _n_descramble(self, n_param, player_url, video_id):
    """Compute the response to YT's "n" parameter challenge

    Results and the descrambling function are both memoised in
    self._player_cache, so each challenge string is computed once.

    Args:
    n_param     -- challenge string that is the value of the
                   URL's "n" query parameter
    player_url  -- URL of YT player JS
    video_id

    Any exception raised while descrambling is re-raised as an
    ExtractorError carrying the formatted traceback.
    """

    result_key = ('nsig_value', n_param)
    if result_key in self._player_cache:
        return self._player_cache[result_key]

    try:
        func_key = ('nsig', player_url)
        if func_key not in self._player_cache:
            self._player_cache[func_key] = self._extract_n_function(video_id, player_url)
        descramble = self._player_cache[func_key]

        response = descramble(n_param)
        self._player_cache[result_key] = response

        if self._downloader.params.get('verbose', False):
            message = 'Decrypted nsig {0} => {1}'.format(n_param, response)
            self._downloader.to_screen('[debug] [%s] %s' % (self.IE_NAME, message))
        return response
    except Exception as e:
        raise ExtractorError(traceback.format_exc(), cause=e, video_id=video_id)

def _unthrottle_format_urls(self, video_id, player_url, formats):
    """Replace the "n" query parameter of each format URL, in place.

    For every format whose URL has an "n" parameter, the value is passed
    to _n_descramble() and, when a response comes back, the URL is
    rebuilt with that response as the new "n" value.

    Args:
        video_id -- passed through to _n_descramble()
        player_url -- URL of the player; when falsy nothing is changed
        formats -- iterable of format dicts, each with a 'url' key
    """
    if not player_url:
        # No player is known, so the challenge cannot be answered.
        # Leave the URLs as they are instead of failing the extraction.
        return
    for fmt in formats:
        parsed_fmt_url = compat_urlparse.urlparse(fmt['url'])
        qs = compat_urlparse.parse_qs(parsed_fmt_url.query)
        n_param = qs.get('n')
        if not n_param:
            continue
        # parse_qs gives a list per key; use the last occurrence
        n_param = n_param[-1]
        n_response = self._n_descramble(n_param, player_url, video_id)
        if n_response:
            qs['n'] = [n_response]
            fmt['url'] = compat_urlparse.urlunparse(
                parsed_fmt_url._replace(query=compat_urllib_parse_urlencode(qs, True)))

def _mark_watched(self, video_id, player_response):
playback_url = url_or_none(try_get(
player_response,
Expand Down Expand Up @@ -1632,11 +1715,7 @@ def feed_entry(name):
if not (sc and fmt_url and encrypted_sig):
continue
if not player_url:
if not webpage:
continue
player_url = self._search_regex(
r'"(?:PLAYER_JS_URL|jsUrl)"\s*:\s*"([^"]+)"',
webpage, 'player URL', fatal=False)
player_url = self._extract_player_url(webpage)
if not player_url:
continue
signature = self._decrypt_signature(sc['s'][0], video_id, player_url)
Expand Down Expand Up @@ -1782,6 +1861,10 @@ def feed_entry(name):
is_live = video_details.get('isLive')
owner_profile_url = microformat.get('ownerProfileUrl')

if not player_url:
player_url = self._extract_player_url(webpage)
self._unthrottle_format_urls(video_id, player_url, formats)

info = {
'id': video_id,
'title': self._live_title(video_title) if is_live else video_title,
Expand Down

6 comments on commit af9e725

@garoto
Copy link

@garoto garoto commented on af9e725 Jan 31, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Are you considering making a new release based on these recent changes alone?

@dirkf
Copy link
Contributor Author

@dirkf dirkf commented on af9e725 Jan 31, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

As soon as possible, but no sooner ...

To be less cryptic, I need to reach an understanding of the whole release process and especially the Windows end of it, since the Windows build would probably be the most valuable part of the release for users.

@garoto
Copy link

@garoto garoto commented on af9e725 Jan 31, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Got it, makes sense. Don't forget to look at how yt-dlp is doing their builds nowadays, since I seem to recall they modernized the whole workflow quite a bit.

@pukkandan
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The previous devs were building the Windows release with wine (I don't know the exact process) and releasing with devscripts/create-github-releases.py. If you have a Windows machine, you can just do setup.py py2exe instead. But in the long term, I recommend moving the release process to GH actions like yt-dlp. Once set up properly, it makes it much easier.

@meowthink
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Glad to see youtube-dl not being throttled. But a note here:
YouTube has updated its player recently.
To get this to run correctly, I had to make a patch, like the one yt-dlp recently committed.
[youtube] Fix n-sig for player e06dea74

diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py
index 797c35fd5..520efc17a 100644
--- a/youtube_dl/extractor/common.py
+++ b/youtube_dl/extractor/common.py
@@ -1004,6 +1004,8 @@ class InfoExtractor(object):
             if group is None:
                 # return the first matching group
                 return next(g for g in mobj.groups() if g is not None)
+            elif isinstance(group, (list, tuple)):
+                return tuple(mobj.group(g) for g in group)
             else:
                 return mobj.group(group)
         elif default is not NO_DEFAULT:
diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py
index 63918924d..0ad467aa3 100644
--- a/youtube_dl/extractor/youtube.py
+++ b/youtube_dl/extractor/youtube.py
@@ -28,6 +28,7 @@ from ..utils import (
     dict_get,
     float_or_none,
     int_or_none,
+    js_to_json,
     mimetype2ext,
     parse_codecs,
     parse_duration,
@@ -1391,9 +1392,14 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
     # 2. https://code.videolan.org/videolan/vlc/-/blob/4fb284e5af69aa9ac2100ccbdd3b88debec9987f/share/lua/playlist/youtube.lua#L116
     # 3. https://github.com/ytdl-org/youtube-dl/issues/30097#issuecomment-950157377
     def _extract_n_function_name(self, jscode):
-        return self._search_regex(
-            (r'\.get\("n"\)\)&&\(b=(?P<nfunc>[a-zA-Z0-9$]{3})\([a-zA-Z0-9]\)',),
-            jscode, 'Initial JS player n function name', group='nfunc')
+        nfunc, idx = self._search_regex(
+            r'\.get\("n"\)\)&&\(b=(?P<nfunc>[a-zA-Z0-9$]{3})(\[(?P<idx>\d+)\])?\([a-zA-Z0-9]\)',
+            jscode, 'Initial JS player n function name', group=('nfunc', 'idx'))
+        if not idx:
+            return nfunc
+        return json.loads(js_to_json(self._search_regex(
+            r'var {}\s*=\s*(\[.+?\]);'.format(nfunc), jscode,
+            'Initial JS player n function list ({}.{})'.format(nfunc, idx))))[int(idx)]

     def _extract_n_function(self, video_id, player_url):
         player_id = self._extract_player_info(player_url)

@dirkf
Copy link
Contributor Author

@dirkf dirkf commented on af9e725 Feb 1, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

A full yt-dl update is about to appear, without f'', and without changing search_regex() (the change is a good thing, though, and should be adopted in a partial back-port of extractor/common.py soon).

Please sign in to comment.