-
Notifications
You must be signed in to change notification settings - Fork 10k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Frontend master extractor #16328
Frontend master extractor #16328
Changes from 13 commits
78790ba
a4619f7
540516c
52e17c4
78909d9
f02a975
c000009
65cf161
20d3041
f51e50d
21aaaaa
5ce9a04
f13d3cc
f0e3ba8
7f69c7d
5297860
8d8f188
3d20dd3
a648b86
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,288 @@ | ||
# coding: utf-8 | ||
from __future__ import unicode_literals | ||
|
||
import collections | ||
|
||
import re | ||
|
||
from .common import InfoExtractor | ||
from ..compat import ( | ||
compat_urlparse, | ||
compat_str) | ||
|
||
from ..utils import ( | ||
ExtractorError, | ||
urlencode_postdata, | ||
qualities | ||
) | ||
|
||
|
||
class FrontEndMasterBaseIE(InfoExtractor): | ||
_API_BASE = 'https://api.frontendmasters.com/v1/kabuki/courses' | ||
_VIDEO_BASE = 'http://www.frontendmasters.com/courses' | ||
_COOKIES_BASE = 'https://api.frontendmasters.com' | ||
_LOGIN_URL = 'https://frontendmasters.com/login/' | ||
|
||
def _real_initialize(self): | ||
self._login() | ||
|
||
def _login(self): | ||
(username, password) = self._get_login_info() | ||
if username is None: | ||
return | ||
|
||
login_page = self._download_webpage( | ||
self._LOGIN_URL, None, 'Downloading login page') | ||
|
||
login_form = self._hidden_inputs(login_page) | ||
|
||
login_form.update({ | ||
"username": username, | ||
"password": password | ||
}) | ||
|
||
post_url = self._search_regex( | ||
r'<form[^>]+action=(["\'])(?P<url>.+?)\1', login_page, | ||
'post_url', default=self._LOGIN_URL, group='url') | ||
|
||
if not post_url.startswith('http'): | ||
post_url = compat_urlparse.urljoin(self._LOGIN_URL, post_url) | ||
|
||
response = self._download_webpage( | ||
post_url, None, 'Logging in', | ||
data=urlencode_postdata(login_form), | ||
headers={'Content-Type': 'application/x-www-form-urlencoded'} | ||
) | ||
|
||
error = self._search_regex( | ||
r'<div[^>]+class=["\']Message MessageAlert["\'][^>]*>([^<]+)</div>', | ||
response, 'error message', default=None) | ||
|
||
if error: | ||
raise ExtractorError('Unable to login: check username and password', | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This does not necessarily mean that. There is a clear captured error message that should be output. |
||
expected=True) | ||
|
||
def _match_course_id(self, url): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Unused. |
||
if '_VALID_URL_RE' not in self.__dict__: | ||
self._VALID_URL_RE = re.compile(self._VALID_URL) | ||
m = self._VALID_URL_RE.match(url) | ||
assert m | ||
return compat_str(m.group('courseid')) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Use |
||
|
||
def _download_course(self, course_id, url, display_id): | ||
response = self._download_json( | ||
'%s/%s' % (self._API_BASE, course_id), course_id, | ||
'Downloading course JSON', | ||
headers={ | ||
'Content-Type': 'application/json;charset=utf-8', | ||
'Referer': url, | ||
}) | ||
return response | ||
|
||
def _pair_section_with_video_elemen_index(self, lesson_elements): | ||
sections = {} | ||
current_section = None | ||
current_section_number = 0 | ||
for elem in lesson_elements: | ||
if not isinstance(elem, int): | ||
elem_name = elem | ||
if not isinstance(elem_name, str): | ||
# convert unicode to str | ||
elem_name = elem.encode('utf-8') | ||
(current_section, current_section_number) = \ | ||
(elem_name, current_section_number + 1) | ||
else: | ||
if current_section: | ||
sections[elem] = (current_section, current_section_number) | ||
|
||
return sections | ||
|
||
|
||
class FrontEndMasterIE(FrontEndMasterBaseIE): | ||
IE_NAME = 'frontend-masters' | ||
_VALID_URL = r'https?://(?:www\.)?frontendmasters\.com/courses/(?P<courseid>[a-z\-]+)/?(?P<id>[a-z\-]+)?/?' | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
|
||
|
||
_NETRC_MACHINE = 'frontend-masters' | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. No dashes. |
||
|
||
_TESTS = [{ | ||
'url': 'https://frontendmasters.com/courses/javascript-basics/', | ||
'info_dict': { | ||
'id': 'javascript-basics', | ||
'title': 'Introduction to JavaScript Programming', | ||
'description': 'md5:269412fbb76d86954761599ad8e4cbc9' | ||
}, | ||
'playlist_count': 19, | ||
'skip': 'Requires FrontendMasters account credentials' | ||
}, { | ||
'url': 'https://frontendmasters.com/courses/web-development/tools', | ||
'md5': '7f161159710d6b7016a4f4af6fcb05e2', | ||
'info_dict': { | ||
'id': 'tools', | ||
'title': 'Tools', | ||
'display_id': 'tools', | ||
'description': 'md5:82c1ea6472e88ed5acd1829fe992e4f7', | ||
'ext': 'mp4' | ||
}, | ||
'skip': 'Requires FrontendMasters account credentials', | ||
}] | ||
|
||
def _download_single_video(self, url, course_id, video_id): | ||
course_json_content = self._download_course(course_id=course_id, | ||
url=url, | ||
display_id=course_id) | ||
|
||
# Necessary to get mandatory informations like title and video_url | ||
lesson_index = course_json_content.get('lessonSlugs').index(video_id) | ||
lesson_hash = course_json_content.get('lessonHashes')[lesson_index] | ||
lesson_data = course_json_content.get('lessonData')[lesson_hash] | ||
# This is necessary to get the link for the video | ||
lesson_source_base = lesson_data['sourceBase'] | ||
|
||
lesson_title = lesson_data['title'] | ||
|
||
# Some optional fields | ||
lesson_description = lesson_data.get('description') | ||
lesson_index = lesson_data.get('index') | ||
lesson_slug = lesson_data.get('slug') | ||
lesson_thumbnail_url = lesson_data.get('thumbnail') | ||
lesson_section_elements = course_json_content.get('lessonElements') | ||
|
||
try: | ||
course_sections_pairing = self._pair_section_with_video_elemen_index( | ||
lesson_section_elements) | ||
lesson_section = course_sections_pairing.get(lesson_index)[0] | ||
lesson_section_number = course_sections_pairing.get(lesson_index)[1] | ||
except Exception: | ||
lesson_section = None | ||
lesson_section_number = None | ||
|
||
|
||
QUALITIES_PREFERENCE = ('low', 'medium', 'high') | ||
quality_key = qualities(QUALITIES_PREFERENCE) | ||
QUALITIES = { | ||
'low': {'width': 480, 'height': 360}, | ||
'medium': {'width': 1280, 'height': 720}, | ||
'high': {'width': 1920, 'height': 1080} | ||
} | ||
|
||
AllowedQuality = collections.namedtuple('AllowedQuality', | ||
['ext', 'qualities']) | ||
ALLOWED_QUALITIES = [ | ||
AllowedQuality('webm', ['low', 'medium', 'high']), | ||
AllowedQuality('mp4', ['low', 'medium', 'high']) | ||
] | ||
|
||
cookies = self._get_cookies(self._COOKIES_BASE) | ||
cookies_str = ";".join(["%s=%s" % (cookie.key, cookie.value) | ||
for cookie in cookies.values()]) | ||
video_request_url = "%s/source" | ||
video_request_headers = { | ||
"origin": "https://frontendmasters.com", | ||
"referer": lesson_source_base, | ||
"cookie": cookies_str | ||
} | ||
|
||
if self._downloader.params.get('listformats', False): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Remove. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why? I took inspiration from the Pluralsight Extractor for the Qualities bit, and for what concern the cookies etc. it's to avoid "400 - Bad request" response from the server There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I've already pointed out: this must be removed. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Do you mean the part with the cookies or the guessing of the formats? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Are you trolling or what? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I found this pattern in other extractors, which are already part of the package - see the pluralsight one line 235. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Bother to read previous comments. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 👍 |
||
allowed_qualities = ALLOWED_QUALITIES | ||
else: | ||
def guess_allowed_qualities(): | ||
req_format = self._downloader.params.get('format') or 'best' | ||
req_format_split = req_format.split('-', 1) | ||
if len(req_format_split) > 1: | ||
req_ext, req_quality = req_format_split | ||
req_quality = '-'.join(req_quality.split('-')[:2]) | ||
for allowed_quality in ALLOWED_QUALITIES: | ||
if req_ext == allowed_quality.ext and req_quality in allowed_quality.qualities: | ||
return (AllowedQuality(req_ext, (req_quality,)),) | ||
req_ext = 'webm' if self._downloader.params.get( | ||
'prefer_free_formats') else 'mp4' | ||
return (AllowedQuality(req_ext, ('high',)),) | ||
|
||
allowed_qualities = guess_allowed_qualities() | ||
|
||
formats = [] | ||
for ext, qualities_ in allowed_qualities: | ||
for quality in qualities_: | ||
f = QUALITIES[quality].copy() | ||
video_request_params = { | ||
'r': f['height'], | ||
'f': ext | ||
} | ||
video_response = self._download_json( | ||
video_request_url % lesson_source_base, video_id, | ||
query=video_request_params, headers=video_request_headers) | ||
|
||
# To avoid the possibility of problems with multiple sequential calls to ViewClip API and start | ||
# to return 429 HTTP errors after some time (see the problem Pluralsight has on | ||
# https://github.com/rg3/youtube-dl/pull/6989) and avoid also the risk of | ||
# account ban (see https://github.com/rg3/youtube-dl/issues/6842), | ||
# we will sleep random amount of time before each call to ViewClip. | ||
|
||
# self._sleep( | ||
# random.randint(2, 5), lesson_slug, | ||
# '%(video_id)s: Waiting for %(timeout)s seconds to avoid throttling') | ||
# | ||
# if not video_response: | ||
# continue | ||
|
||
video_url = video_response.get('url') | ||
clip_f = f.copy() | ||
clip_f.update({ | ||
'url': video_url, | ||
'ext': ext, | ||
'format_id': '%s-%s' % (ext, quality), | ||
'quality': quality_key(quality), | ||
'height': f['height'] | ||
}) | ||
formats.append(clip_f) | ||
|
||
self._sort_formats(formats) | ||
|
||
return { | ||
'id': video_id, | ||
'display_id': lesson_slug, | ||
'title': lesson_title, | ||
'description': lesson_description, | ||
'chapter': lesson_section, | ||
'chapter_number': lesson_section_number, | ||
'thumbnail': lesson_thumbnail_url, | ||
'formats': formats | ||
} | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. There are also vtt subtitles available. |
||
|
||
def _download_entire_course(self, url, course_id): | ||
|
||
course_json_content = self._download_course(course_id=course_id, | ||
url=url, | ||
display_id=None) | ||
|
||
title = course_json_content.get('title') | ||
description = course_json_content.get('description') | ||
course_display_id = course_json_content.get('slug') | ||
|
||
videos_data = course_json_content.get('lessonData').values() | ||
videos_data = sorted(videos_data, key=lambda video: video.get('index')) | ||
|
||
entries = [] | ||
for video in videos_data: | ||
video_slug = video.get('slug') | ||
clip_url = "%s/%s/%s" % ( | ||
self._VIDEO_BASE, course_display_id, video_slug) | ||
entries.append({ | ||
'_type': 'url_transparent', | ||
'url': clip_url, | ||
'ie_key': FrontEndMasterIE.ie_key() | ||
}) | ||
|
||
return self.playlist_result(entries, course_id, title, description) | ||
|
||
def _real_extract(self, url): | ||
mobj = re.match(self._VALID_URL, url) | ||
video_id = mobj.group('id') | ||
course_id = mobj.group('courseid') | ||
if video_id: | ||
return self._download_single_video(url, course_id, video_id) | ||
else: | ||
return self._download_entire_course(url, course_id) | ||
|
||
|
||
|
||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Check code with flake8. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Single quotes.