Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[pornhub] Fix PC page video extraction #23082

Closed
wants to merge 2 commits into from
Closed
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
57 changes: 32 additions & 25 deletions youtube_dl/extractor/pornhub.py
Original file line number Diff line number Diff line change
Expand Up @@ -173,6 +173,32 @@ def dl_webpage(platform):
'https://www.%s/view_video.php?viewkey=%s' % (host, video_id),
video_id, 'Downloading %s webpage' % platform)

def parse_js(_webpage, _regex):
    """Collect simple JavaScript variable assignments found in a page.

    The text captured by ``_regex`` in ``_webpage`` is split on ``;`` and
    every ``name = value`` statement is evaluated with a very small
    interpreter that understands ``+`` concatenation, ``/* ... */``
    comments and references to previously assigned names.

    ``self`` is not a parameter: it is taken from the enclosing method.

    :param _webpage: page source to search.
    :param _regex: pattern handed to ``self._search_regex``; its captured
        text is what gets split into statements.
    :return: dict mapping variable names (surrounding whitespace removed)
        to their evaluated string values.
    """
    js_vars = {}
    # NOTE(review): presumably _search_regex raises when the pattern does
    # not match (no default is passed) -- confirm against the extractor
    # base class.
    assignments = self._search_regex(
        _regex, _webpage,
        'encoded url').split(';')

    def parse_js_value(inp):
        # Remove inline /* ... */ comments before evaluating.
        inp = re.sub(r'/\*(?:(?!\*/).)*?\*/', '', inp)
        if '+' in inp:
            # Concatenation: evaluate each operand and join the results.
            # Limitation: a '+' inside a quoted literal is split as well.
            inps = inp.split('+')
            return functools.reduce(
                operator.concat, map(parse_js_value, inps))
        inp = inp.strip()
        if inp in js_vars:
            # Reference to a variable assigned by an earlier statement.
            return js_vars[inp]
        # Otherwise treat the operand as a literal; remove_quotes is
        # presumably responsible for dropping the surrounding quotes.
        return remove_quotes(inp)

    for assn in assignments:
        assn = assn.strip()
        if not assn:
            continue
        assn = re.sub(r'var\s+', '', assn)
        if '=' not in assn:
            # Not an assignment (e.g. a bare call); unpacking the split
            # below would raise ValueError, so skip it.
            continue
        vname, value = assn.split('=', 1)
        # Strip the name so that ``var a = "x"`` is stored as 'a' rather
        # than 'a ', which no later look-up or reference would match.
        js_vars[vname.strip()] = parse_js_value(value)
    return js_vars

webpage = dl_webpage('pc')

error_msg = self._html_search_regex(
Expand Down Expand Up @@ -212,13 +238,17 @@ def dl_webpage(platform):
thumbnail = flashvars.get('image_url')
duration = int_or_none(flashvars.get('video_duration'))
media_definitions = flashvars.get('mediaDefinitions')
js_vars = parse_js(webpage, r'(var.+?rahttps.+?)\n')
if isinstance(media_definitions, list):
for definition in media_definitions:
if not isinstance(definition, dict):
continue
video_url = definition.get('videoUrl')
if not video_url or not isinstance(video_url, compat_str):
continue
js_vars_qual = js_vars.get('quality_%sp' % (definition.get('quality')))
if not js_vars_qual:
continue
video_url = js_vars_qual
if video_url in video_urls_set:
continue
video_urls_set.add(video_url)
Expand All @@ -230,30 +260,7 @@ def dl_webpage(platform):
if not video_urls:
tv_webpage = dl_webpage('tv')

assignments = self._search_regex(
r'(var.+?mediastring.+?)</script>', tv_webpage,
'encoded url').split(';')

js_vars = {}

def parse_js_value(inp):
inp = re.sub(r'/\*(?:(?!\*/).)*?\*/', '', inp)
if '+' in inp:
inps = inp.split('+')
return functools.reduce(
operator.concat, map(parse_js_value, inps))
inp = inp.strip()
if inp in js_vars:
return js_vars[inp]
return remove_quotes(inp)

for assn in assignments:
assn = assn.strip()
if not assn:
continue
assn = re.sub(r'var\s+', '', assn)
vname, value = assn.split('=', 1)
js_vars[vname] = parse_js_value(value)
js_vars = parse_js(tv_webpage, r'(var.+?mediastring.+?)</script>')

video_url = js_vars['mediastring']
if video_url not in video_urls_set:
Expand Down