Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[GetCourseRuIE] & [AcademyMel] Add extractor #8873

Merged
merged 39 commits into from Jan 19, 2024
Merged
Show file tree
Hide file tree
Changes from 35 commits
Commits
Show all changes
39 commits
Select commit Hold shift + click to select a range
b48082a
[GetCourseRuIE] Add extractor
Dec 29, 2023
548d724
[AcademyMelIE] Add extractor; [GetCourseRuIE] extractor fixes
Jan 5, 2024
0763788
Merge branch 'yt-dlp:master' into getcourse-ru
divStar Jan 5, 2024
28e4eb2
[AcademyMelIE] extractor fixes to allow multiple videos per site
Jan 5, 2024
c3af377
[AcademyMelIE]&[GetCourseRuIE] extractor fixes
Jan 14, 2024
e90283e
[AcademyMelIE]&[GetCourseRuIE] removed unused variables and to_screen…
Jan 14, 2024
14f69f7
[AcademyMelIE]&[GetCourseRuIE] implemented review remarks
Jan 14, 2024
72f9a57
[ie/academymel] Cleanup
bashonly Jan 14, 2024
965935f
[ie/getcourseru] Cleanup
bashonly Jan 14, 2024
8a54a76
[ie/getcourseru] More lenient `_VALID_URL`
bashonly Jan 14, 2024
effa2ea
[AcademyMelIE]&[GetCourseRuIE] test the playlist
Jan 14, 2024
7759ab6
[GetCourseRuPlayerIE]&[GetCourseRuIE] adding more generic getcourse.r…
Jan 17, 2024
2323a99
[GetCourseRuPlayerIE]&[GetCourseRuIE] fixing URLs and playlist_id
Jan 18, 2024
24bb540
[GetCourseRuPlayerIE]&[GetCourseRuIE] fixing flake8 remarks, login-re…
Jan 18, 2024
b0302b5
[GetCourseRuPlayerIE]&[GetCourseRuIE] xdgetId is now parsed from logi…
Jan 18, 2024
7149abe
[GetCourseRuPlayerIE]&[GetCourseRuIE] added another test-case, that w…
Jan 18, 2024
36c6dd1
Only match exactly player02 and add getcourse.io
seproDev Jan 18, 2024
b5748f6
Add Embed regex
seproDev Jan 18, 2024
d7fa3c4
Use gcFileId as this is also used in webpage
seproDev Jan 18, 2024
d7b18d9
Add getcourse.io test
seproDev Jan 18, 2024
db82da7
Add marafon.mani-beauty.com domain
seproDev Jan 18, 2024
9ac446d
Use embed regex for consistency to find iframes
seproDev Jan 18, 2024
7bbcc4e
Extract og title as fallback.
seproDev Jan 18, 2024
d60d40a
Fix regex
seproDev Jan 18, 2024
799f5a6
Only login if username/password is passed
seproDev Jan 18, 2024
a2d1d06
Refactor login code
seproDev Jan 18, 2024
273782d
Add simple sign to login
seproDev Jan 18, 2024
17e21ce
Don't abort on no login
seproDev Jan 18, 2024
0cf1aa6
Extract proper id for pages
seproDev Jan 18, 2024
c765ee8
Cleanup imports
seproDev Jan 18, 2024
21c6dfa
Use separate netrc machine per hostname
seproDev Jan 18, 2024
624158e
typo
seproDev Jan 18, 2024
e488d10
Apply suggestions from code review
seproDev Jan 18, 2024
07c7431
Inline xdgetId and requestSimpleSign
seproDev Jan 18, 2024
cf8ae1d
Use correct netrc in error message
seproDev Jan 18, 2024
b8677c9
[GetCourseRuPlayerIE]&[GetCourseRuIE] added on.psbook.ru domain and f…
Jan 19, 2024
0db6db3
cosmetic
bashonly Jan 19, 2024
6113c3d
cosmetic pt2
bashonly Jan 19, 2024
9944e98
reverse title fallback
bashonly Jan 19, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
4 changes: 4 additions & 0 deletions yt_dlp/extractor/_extractors.py
Expand Up @@ -680,6 +680,10 @@
GeniusIE,
GeniusLyricsIE,
)
from .getcourseru import (
GetCourseRuPlayerIE,
GetCourseRuIE
)
from .gettr import (
GettrIE,
GettrStreamingIE,
Expand Down
178 changes: 178 additions & 0 deletions yt_dlp/extractor/getcourseru.py
@@ -0,0 +1,178 @@
import re
import time
import urllib.parse

from .common import InfoExtractor
from ..utils import ExtractorError, int_or_none, url_or_none, urlencode_postdata
from ..utils.traversal import traverse_obj


class GetCourseRuPlayerIE(InfoExtractor):
    """Extractor for signed ``player02.getcourse.ru/sign-player`` pages.

    The player page exposes a ``window.configs`` JSON object; the HLS master
    playlist URL, the file id and basic metadata are all read from it.
    """

    _VALID_URL = r'https?://player02\.getcourse\.ru/sign-player/?\?(?:[^#]+&)?json=[^#&]+'
    _TESTS = [{
        'url': 'http://player02.getcourse.ru/sign-player/?json=eyJ2aWRlb19oYXNoIjoiMTkwYmRmOTNmMWIyOTczNTMwOTg1M2E3YTE5ZTI0YjMiLCJ1c2VyX2lkIjozNTk1MjUxODMsInN1Yl9sb2dpbl91c2VyX2lkIjpudWxsLCJsZXNzb25faWQiOm51bGwsImlwIjoiNDYuMTQyLjE4Mi4yNDciLCJnY19ob3N0IjoiYWNhZGVteW1lbC5vbmxpbmUiLCJ0aW1lIjoxNzA1NDQ5NjQyLCJwYXlsb2FkIjoidV8zNTk1MjUxODMiLCJ1aV9sYW5ndWFnZSI6InJ1IiwiaXNfaGF2ZV9jdXN0b21fc3R5bGUiOnRydWV9&s=354ad2c993d95d5ac629e3133d6cefea&vh-static-feature=zigzag',
        'info_dict': {
            'id': '513573381',
            'title': '190bdf93f1b29735309853a7a19e24b3',
            'ext': 'mp4',
            'thumbnail': 'https://preview-htz.kinescopecdn.net/preview/190bdf93f1b29735309853a7a19e24b3/preview.jpg?version=1702370546&host=vh-80',
            'duration': 1693,
        },
        'skip': 'JWT expired',
    }]
    # Matches an <iframe> whose src is a signed player URL (used for embed detection)
    _EMBED_REGEX = [rf'<iframe[^>]+\bsrc=[\'"](?P<url>{_VALID_URL}[^\'"]*)']

    def _real_extract(self, url):
        """Download the player page and build the info dict from ``window.configs``."""
        player_page = self._download_webpage(url, None, 'Downloading player page')
        configs = self._search_json(
            r'window\.configs\s*=', player_page, 'config', None)

        # The file id is stringified because info dict ids are strings
        video_id = str(configs['gcFileId'])
        formats, subtitles = self._extract_m3u8_formats_and_subtitles(
            configs['masterPlaylistUrl'], video_id)

        # Optional metadata; keys whose value fails validation are simply omitted
        metadata = traverse_obj(configs, {
            'title': ('videoHash', {str}),
            'thumbnail': ('previewUrl', {url_or_none}),
            'duration': ('videoDuration', {int_or_none}),
        })

        return {
            **metadata,
            'id': video_id,
            'formats': formats,
            'subtitles': subtitles,
        }


class GetCourseRuIE(InfoExtractor):
    """Extractor for pages hosted on getcourse.ru / getcourse.io (or custom domains).

    A page is returned as a playlist of every embedded signed player
    (``GetCourseRuPlayerIE``) iframe found in it. Credentials, when supplied,
    are looked up per hostname and used to log in before the page is fetched.
    """

    _NETRC_MACHINE = 'getcourseru'
    _LOGIN_URL_PATH = '/cms/system/login'
    _TESTS = [{
        'url': 'http://academymel.online/3video_1',
        'info_dict': {
            'id': '3059742',
            'display_id': '3video_1',
            'title': 'Промоуроки Академии МЕЛ',
        },
        'playlist_count': 1,
        'playlist': [{
            'info_dict': {
                'id': '513573381',
                'ext': 'mp4',
                'title': 'Промоуроки Академии МЕЛ',
                'thumbnail': 'https://preview-htz.kinescopecdn.net/preview/190bdf93f1b29735309853a7a19e24b3/preview.jpg?version=1702370546&host=vh-80',
                'duration': 1693,
            },
        }],
    }, {
        'url': 'https://academymel.getcourse.ru/3video_1',
        'info_dict': {
            'id': '3059742',
            'display_id': '3video_1',
            'title': 'Промоуроки Академии МЕЛ',
        },
        'playlist_count': 1,
        'playlist': [{
            'info_dict': {
                'id': '513573381',
                'ext': 'mp4',
                'title': 'Промоуроки Академии МЕЛ',
                'thumbnail': 'https://preview-htz.kinescopecdn.net/preview/190bdf93f1b29735309853a7a19e24b3/preview.jpg?version=1702370546&host=vh-80',
                'duration': 1693,
            },
        }],
    }, {
        'url': 'https://academymel.getcourse.ru/pl/teach/control/lesson/view?id=319141781&editMode=0',
        'info_dict': {
            'id': '319141781',
            'title': '1. Разминка у стены',
        },
        'playlist_count': 1,
        'playlist': [{
            'info_dict': {
                'id': '4919601',
                'ext': 'mp4',
                'title': '1. Разминка у стены',
                'thumbnail': 'https://preview-htz.vhcdn.com/preview/5a521788e7dc25b4f70c3dff6512d90e/preview.jpg?version=1703223532&host=vh-81',
                'duration': 704,
            },
        }],
        'skip': 'paid lesson',
    }, {
        'url': 'https://manibeauty.getcourse.ru/pl/teach/control/lesson/view?id=272499894',
        'info_dict': {
            'id': '272499894',
            'title': 'Мотивация к тренировкам',
        },
        'playlist_count': 1,
        'playlist': [{
            'info_dict': {
                'id': '4242723',
                'ext': 'mp4',
                'title': 'Мотивация к тренировкам',
                'thumbnail': 'https://preview-htz.vhcdn.com/preview/70ed5b9f489dd03b4aff55bfdff71a26/preview.jpg?version=1685115787&host=vh-71',
                'duration': 30,
            },
        }],
        'skip': 'paid lesson',
    }, {
        'url': 'https://gaismasmandalas.getcourse.io/ATLAUTSEVBUT',
        'only_matching': True,
    }]
    # Custom (non-getcourse.ru/.io) hostnames that are also handled by this extractor
    _DOMAINS = [
        'academymel.online',
        'marafon.mani-beauty.com',
    ]
    # Any *.getcourse.(ru|io) subdomain except player02 (handled by the player
    # extractor), or one of the explicitly listed custom domains
    _BASE_URL_RE = rf'https?://(?:(?!player02\.)[^.]+\.getcourse\.(?:ru|io)|{"|".join(map(re.escape, _DOMAINS))})'
    _VALID_URL = [
        # Generic page: the path itself is used as the id
        rf'{_BASE_URL_RE}/(?!pl/|teach/)(?P<id>[^?#]+)',
        # Lesson view: the numeric ``id`` query parameter is used as the id.
        # ``(?:pl/)?`` is a non-capturing optional prefix.
        rf'{_BASE_URL_RE}/(?:pl/)?teach/control/lesson/view\?(?:[^#]+&)?id=(?P<id>\d+)',
    ]

    def _login(self, hostname, username, password):
        """Log in to ``hostname`` unless a ``PHPSESSID5`` cookie is already present.

        The login page is fetched first because the POST needs two values
        scraped from it: the login form's ``data-xdget-id`` and the
        ``window.requestSimpleSign`` token.
        """
        if self._get_cookies(f'https://{hostname}').get('PHPSESSID5'):
            return

        login_url = f'https://{hostname}{self._LOGIN_URL_PATH}'
        webpage = self._download_webpage(login_url, None)

        xdget_id = self._html_search_regex(
            r'<form[^>]+\bclass="[^"]*\bstate-login[^"]*"[^>]+\bdata-xdget-id="([^"]+)"',
            webpage, 'xdgetId')
        simple_sign = self._html_search_regex(
            r'window\.requestSimpleSign\s*=\s*"([\da-f]+)"', webpage, 'simple sign')

        self._request_webpage(
            login_url, None, 'Logging in', 'Failed to log in',
            data=urlencode_postdata({
                'action': 'processXdget',
                'xdgetId': xdget_id,
                'params[action]': 'login',
                'params[url]': login_url,
                'params[object_type]': 'cms_page',
                'params[object_id]': -1,
                'params[email]': username,
                'params[password]': password,
                'requestTime': int(time.time()),
                'requestSimpleSign': simple_sign,
            }))

    def _real_extract(self, url):
        """Return a playlist of all signed-player iframes embedded in the page.

        Raises ``ExtractorError`` (expected) when the request ends up on the
        login page or the server answers 404.
        """
        hostname = urllib.parse.urlparse(url).hostname
        # Credentials are looked up under the page's own hostname
        username, password = self._get_login_info(netrc_machine=hostname)
        if username:
            self._login(hostname, username, password)

        display_id = self._match_id(url)
        # NB: 404 is returned due to yt-dlp not properly following redirects #9020
        webpage, urlh = self._download_webpage_handle(url, display_id, expected_status=404)
        if self._LOGIN_URL_PATH in urlh.url or urlh.status == 404:
            raise ExtractorError(
                f'This video is only available for registered users. {self._login_hint("any", netrc=hostname)}',
                expected=True)

        # Prefer the id exposed in the page script; fall back to the id from the URL
        playlist_id = self._search_regex(
            r'window\.(?:lessonId|gcsObjectId)\s*=\s*(\d+)', webpage, 'playlist id', default=display_id)

        title = self._html_extract_title(webpage) or self._og_search_title(webpage)

        # url_transparent lets the page title override the title reported by the player extractor
        return self.playlist_from_matches(
            re.findall(GetCourseRuPlayerIE._EMBED_REGEX[0], webpage),
            playlist_id, title, display_id=display_id, ie=GetCourseRuPlayerIE, video_kwargs={
                'url_transparent': True,
                'title': title,
            })