From b48082a9d3252969628cc09857889aab901c59dd Mon Sep 17 00:00:00 2001 From: "Dr. Steven Strange" Date: Fri, 29 Dec 2023 01:18:08 +0100 Subject: [PATCH 01/38] [GetCourseRuIE] Add extractor --- yt_dlp/extractor/_extractors.py | 3 +++ yt_dlp/extractor/getcourseru.py | 22 ++++++++++++++++++++++ 2 files changed, 25 insertions(+) create mode 100644 yt_dlp/extractor/getcourseru.py diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index 62103f13c14..dd3cd48f515 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -680,6 +680,9 @@ GeniusIE, GeniusLyricsIE, ) +from .getcourseru import ( + GetCourseRuIE +) from .gettr import ( GettrIE, GettrStreamingIE, diff --git a/yt_dlp/extractor/getcourseru.py b/yt_dlp/extractor/getcourseru.py new file mode 100644 index 00000000000..d972383ca29 --- /dev/null +++ b/yt_dlp/extractor/getcourseru.py @@ -0,0 +1,22 @@ +from .common import InfoExtractor + + +class GetCourseRuIE(InfoExtractor): + _VALID_URL = r'^https?:\/\/[^\/]+\.getcourse\.ru\/sign-player\/\?.*$' + _TESTS = [{ + 'url': 'https://player02.getcourse.ru/sign-player/?json=eyJ2aWRlb19oYXNoIjoiZTJlZWE3MTI5ZDk3OWQzYzYzMDYzMDUzOGJkMzZlZjEiLCJ1c2VyX2lkIjozNTc3NjY5NjIsInN1Yl9sb2dpbl91c2VyX2lkIjpudWxsLCJsZXNzb25faWQiOm51bGwsImlwIjoiNDYuMTQyLjE4My44NSIsImdjX2hvc3QiOiJhY2FkZW15bWVsLm9ubGluZSIsInRpbWUiOjE3MDM4MDY1NzksInBheWxvYWQiOiJ1XzM1Nzc2Njk2MiIsInVpX2xhbmd1YWdlIjoicnUiLCJpc19oYXZlX2N1c3RvbV9zdHlsZSI6dHJ1ZX0=&s=a2ed5bd648a2ae7a4f7684abe815ec7a', + 'info_dict': { + 'id': 'master.m3u8?user-cdn=cdnvideo&acc-id=714517&user-id=357766962&loc-mode=ru&version=10:2:1:0:2:cdnvideo&consumer=vod&jwt=eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJ1c2VyLWlkIjozNTc3NjY5NjJ9', + 'title': 'master', + 'ext': 'mp4', + 'duration': 1871 + # note: the original URL is necessary to obtain an up-to-date URL, because the URL is always changing + } + }] + + def _real_extract(self, url): + webpage = self._download_webpage(url, '') + m3u8_url = 
self._search_regex(r'\"masterPlaylistUrl\":\"(?P.*?)\"', webpage, 'm3u8').replace('\\', '') + self.to_screen('masterPlaylistUrl is "%s"' % m3u8_url) + + return self.url_result(m3u8_url, 'Generic') From 548d72463c2e6be912ec93a292ebbebe0e432db7 Mon Sep 17 00:00:00 2001 From: "Dr. Steven Strange" Date: Fri, 5 Jan 2024 03:02:27 +0100 Subject: [PATCH 02/38] [AcademyMelIE] Add extractor; [GetCourseRuIE] extractor fixes --- yt_dlp/extractor/_extractors.py | 1 + yt_dlp/extractor/academymel.py | 97 +++++++++++++++++++++++++++++++++ yt_dlp/extractor/getcourseru.py | 31 +++++++++-- 3 files changed, 125 insertions(+), 4 deletions(-) create mode 100644 yt_dlp/extractor/academymel.py diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index dd3cd48f515..ab9f0c9dcdb 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -42,6 +42,7 @@ AbemaTVTitleIE, ) from .academicearth import AcademicEarthCourseIE +from .academymel import AcademyMelIE from .acast import ( ACastIE, ACastChannelIE, diff --git a/yt_dlp/extractor/academymel.py b/yt_dlp/extractor/academymel.py new file mode 100644 index 00000000000..e9b53951afc --- /dev/null +++ b/yt_dlp/extractor/academymel.py @@ -0,0 +1,97 @@ +from time import time +from .common import InfoExtractor +from ..cookies import LenientSimpleCookie +from ..utils import urlencode_postdata, ExtractorError, RegexNotFoundError + + +class AcademyMelIE(InfoExtractor): + _TEST_EMAIL = 'meriat@jaga.email' # use this as username in the test/local_parameters.json if running the test + _TEST_PASSWORD = 'bBY-ccbp$8' # use this as password in the test/local_parameters.json if running the test + + _CACHE_KEY = 'academymel' + _CACHE_SUBKEY = 'login-cookie-header' + + _NETRC_MACHINE = 'academymel' + _LOGIN_URL = 'https://academymel.online/cms/system/login' + _VALID_URL = r'^https?:\/\/academymel\.online\/(?P.*)$' + + _TESTS = [{ + 'url': 'http://academymel.online/3video_1', + 'info_dict': { + 'id': 
'master.m3u8?user-cdn=cdnvideo&acc-id=714517&user-id=359525183&loc-mode=ru&version=10:2:1:0:2:cdnvideo&consumer=vod&jwt=eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJ1c2VyLWlkIjozNTk1MjUxODN9', + 'title': 'master', + 'ext': 'mp4', + 'duration': 1693 + } + }, { + 'url': 'http://academymel.online/3video_2', + 'info_dict': { + 'id': 'master.m3u8?user-cdn=cdnvideo&acc-id=714517&user-id=359525183&loc-mode=ru&version=10:2:1:0:2:cdnvideo&consumer=vod&jwt=eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJ1c2VyLWlkIjozNTk1MjUxODN9', + 'title': 'master', + 'ext': 'mp4', + 'duration': 1871 + } + }] + + def _perform_login(self, username, password): + login_body = urlencode_postdata({ + 'action': 'processXdget', + 'xdgetId': 'r6335_1_1', + 'params[action]': 'login', + 'params[url]': 'http://academymel.online/cms/system/login?required=true', + 'params[object_type]': 'cms_page', + 'params[object_id]': -1, + 'params[email]': username, + 'params[password]': password, + 'requestTime': int(time()) + }) + + try: + webpage = self._request_webpage(self._LOGIN_URL, + None, + data=login_body, + note='Logging into the academymel.online', + errnote='Failed to log in into academymel.online', + fatal=True) + except ExtractorError: + raise ExtractorError('Could not log in into academymel.online (login URL: "%s")' % self._LOGIN_URL, + expected=True) + + # The response itself is a JSON, but it is not needed - only the Set-Cookie value(s) are + cookie_header = webpage.get_header('Set-Cookie') + set_cookie_header = LenientSimpleCookie(cookie_header) + set_cookie_header.load(cookie_header) + self.cache.store(self._CACHE_KEY, self._CACHE_SUBKEY, set_cookie_header) + + def _real_extract(self, url): + valid_url = self._match_valid_url(url) + + if not valid_url: + raise ExtractorError('Invalid URL found', expected=True) + + set_cookie_header = self.cache.load(self._CACHE_KEY, self._CACHE_SUBKEY) + + if not set_cookie_header: + raise ExtractorError('The set-cookie has not been loaded', expected=True) + + try: + webpage 
= self._download_webpage(url, + None, + headers=set_cookie_header, + fatal=True, + note='Downloading video website', + errnote='Failed to download video website') + except ExtractorError: + raise ExtractorError('Could not download the video website at "%s"' % url, expected=True) + + try: + video_url = self._search_regex( + r']+src=\"(?Phttps?:\/\/[^\/]+\.getcourse\.ru\/sign-player\/\?.*)\"', webpage, + 'url', + fatal=True) + except RegexNotFoundError: + raise ExtractorError('Could not extract a GetCourse.ru video URL', expected=True) + + self.to_screen('AcademyMel video URL found: %s' % video_url) + + return self.url_result(video_url, 'GetCourseRu') diff --git a/yt_dlp/extractor/getcourseru.py b/yt_dlp/extractor/getcourseru.py index d972383ca29..2fe190805e9 100644 --- a/yt_dlp/extractor/getcourseru.py +++ b/yt_dlp/extractor/getcourseru.py @@ -1,22 +1,45 @@ from .common import InfoExtractor +from ..utils import ExtractorError class GetCourseRuIE(InfoExtractor): + _NETRC_MACHINE = 'getcourseru' _VALID_URL = r'^https?:\/\/[^\/]+\.getcourse\.ru\/sign-player\/\?.*$' + _TESTS = [{ - 'url': 'https://player02.getcourse.ru/sign-player/?json=eyJ2aWRlb19oYXNoIjoiZTJlZWE3MTI5ZDk3OWQzYzYzMDYzMDUzOGJkMzZlZjEiLCJ1c2VyX2lkIjozNTc3NjY5NjIsInN1Yl9sb2dpbl91c2VyX2lkIjpudWxsLCJsZXNzb25faWQiOm51bGwsImlwIjoiNDYuMTQyLjE4My44NSIsImdjX2hvc3QiOiJhY2FkZW15bWVsLm9ubGluZSIsInRpbWUiOjE3MDM4MDY1NzksInBheWxvYWQiOiJ1XzM1Nzc2Njk2MiIsInVpX2xhbmd1YWdlIjoicnUiLCJpc19oYXZlX2N1c3RvbV9zdHlsZSI6dHJ1ZX0=&s=a2ed5bd648a2ae7a4f7684abe815ec7a', + 'url': 'http://player02.getcourse.ru/sign-player/?json=eyJ2aWRlb19oYXNoIjoiZTJlZWE3MTI5ZDk3OWQzYzYzMDYzMDUzOGJkMzZlZjEiLCJ1c2VyX2lkIjozNTc3NjY5NjIsInN1Yl9sb2dpbl91c2VyX2lkIjpudWxsLCJsZXNzb25faWQiOm51bGwsImlwIjoiNDYuMTQyLjE4My44NSIsImdjX2hvc3QiOiJhY2FkZW15bWVsLm9ubGluZSIsInRpbWUiOjE3MDM4MDY1NzksInBheWxvYWQiOiJ1XzM1Nzc2Njk2MiIsInVpX2xhbmd1YWdlIjoicnUiLCJpc19oYXZlX2N1c3RvbV9zdHlsZSI6dHJ1ZX0=&s=a2ed5bd648a2ae7a4f7684abe815ec7a', 'info_dict': { 'id': 
'master.m3u8?user-cdn=cdnvideo&acc-id=714517&user-id=357766962&loc-mode=ru&version=10:2:1:0:2:cdnvideo&consumer=vod&jwt=eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJ1c2VyLWlkIjozNTc3NjY5NjJ9', 'title': 'master', 'ext': 'mp4', 'duration': 1871 # note: the original URL is necessary to obtain an up-to-date URL, because the URL is always changing - } + }, + 'skip': 'Requires authentication', + 'note': 'This extractor is used by AcademyMel extractor, which has a login feature' }] def _real_extract(self, url): - webpage = self._download_webpage(url, '') - m3u8_url = self._search_regex(r'\"masterPlaylistUrl\":\"(?P.*?)\"', webpage, 'm3u8').replace('\\', '') + valid_url = self._match_valid_url(url) + + if not valid_url: + raise ExtractorError('Invalid URL found', expected=True) + + try: + webpage = self._download_webpage(url, + None, + fatal=True, + note='Retrieving masterPlaylist URL...', + errnote='Failed to retrieve the masterPlaylist URL') + except ExtractorError: + raise ExtractorError('Failed to retrieve the masterPlaylist URL', expected=True) + + try: + m3u8_url = (self._search_regex(r'\"masterPlaylistUrl\":\"(?P.*?)\"', webpage, 'm3u8', fatal=True) + .replace('\\', '')) + except ExtractorError: + raise ExtractorError('Could not extract the masterPlaylist URL from the GetCourse.ru response', expected=True) + self.to_screen('masterPlaylistUrl is "%s"' % m3u8_url) return self.url_result(m3u8_url, 'Generic') From 28e4eb24c040a400035dada81bb2361109e2aa3f Mon Sep 17 00:00:00 2001 From: "Dr. 
Steven Strange" Date: Fri, 5 Jan 2024 04:38:55 +0100 Subject: [PATCH 03/38] [AcademyMelIE] extractor fixes to allow multiple videos per site --- yt_dlp/extractor/academymel.py | 38 +++++++++++++++++++++------------- 1 file changed, 24 insertions(+), 14 deletions(-) diff --git a/yt_dlp/extractor/academymel.py b/yt_dlp/extractor/academymel.py index e9b53951afc..15703b0e407 100644 --- a/yt_dlp/extractor/academymel.py +++ b/yt_dlp/extractor/academymel.py @@ -1,12 +1,15 @@ -from time import time +import re +import time + +from datetime import datetime from .common import InfoExtractor from ..cookies import LenientSimpleCookie -from ..utils import urlencode_postdata, ExtractorError, RegexNotFoundError +from ..utils import urlencode_postdata, ExtractorError class AcademyMelIE(InfoExtractor): _TEST_EMAIL = 'meriat@jaga.email' # use this as username in the test/local_parameters.json if running the test - _TEST_PASSWORD = 'bBY-ccbp$8' # use this as password in the test/local_parameters.json if running the test + _TEST_PASSWORD = 'bBY-ccbp$8' # use this as password in the test/local_parameters.json if running the test _CACHE_KEY = 'academymel' _CACHE_SUBKEY = 'login-cookie-header' @@ -43,7 +46,7 @@ def _perform_login(self, username, password): 'params[object_id]': -1, 'params[email]': username, 'params[password]': password, - 'requestTime': int(time()) + 'requestTime': int(time.time()) }) try: @@ -63,6 +66,16 @@ def _perform_login(self, username, password): set_cookie_header.load(cookie_header) self.cache.store(self._CACHE_KEY, self._CACHE_SUBKEY, set_cookie_header) + def playlist_from_entries(self, entries, valid_url): + current_timestamp = int(time.time()) + current_datetime = datetime.fromtimestamp(current_timestamp) + formatted_datetime = current_datetime.strftime("%d.%m.%Y, %H:%M") + + return self.playlist_result(entries, + 'academymel-playlist-%d' % current_timestamp, + 'AcademyMel playlist (%s)' % formatted_datetime, + 'AcademyMel playlist for %s (at %s)' % (valid_url, 
formatted_datetime)) + def _real_extract(self, url): valid_url = self._match_valid_url(url) @@ -84,14 +97,11 @@ def _real_extract(self, url): except ExtractorError: raise ExtractorError('Could not download the video website at "%s"' % url, expected=True) - try: - video_url = self._search_regex( - r']+src=\"(?Phttps?:\/\/[^\/]+\.getcourse\.ru\/sign-player\/\?.*)\"', webpage, - 'url', - fatal=True) - except RegexNotFoundError: - raise ExtractorError('Could not extract a GetCourse.ru video URL', expected=True) - - self.to_screen('AcademyMel video URL found: %s' % video_url) + entries = [] + for video_url in re.findall( + r']+src=\"(?Phttps?://[^/]+\.getcourse\.ru/sign-player/\?.*)\"', + webpage): + self.to_screen('AcademyMel video URL found: %s' % video_url) + entries.append(self.url_result(video_url, 'GetCourseRu')) - return self.url_result(video_url, 'GetCourseRu') + return self.playlist_from_entries(entries, valid_url) From c3af3773ee1d13fe8f6ad3740fc7172daaa3169a Mon Sep 17 00:00:00 2001 From: "Dr. 
Steven Strange" Date: Sun, 14 Jan 2024 01:54:09 +0100 Subject: [PATCH 04/38] [AcademyMelIE]&[GetCourseRuIE] extractor fixes --- yt_dlp/extractor/academymel.py | 67 +++++++++++---------------------- yt_dlp/extractor/getcourseru.py | 49 +++++++++++++++--------- 2 files changed, 54 insertions(+), 62 deletions(-) diff --git a/yt_dlp/extractor/academymel.py b/yt_dlp/extractor/academymel.py index 15703b0e407..e07f62ec436 100644 --- a/yt_dlp/extractor/academymel.py +++ b/yt_dlp/extractor/academymel.py @@ -3,7 +3,6 @@ from datetime import datetime from .common import InfoExtractor -from ..cookies import LenientSimpleCookie from ..utils import urlencode_postdata, ExtractorError @@ -21,19 +20,11 @@ class AcademyMelIE(InfoExtractor): _TESTS = [{ 'url': 'http://academymel.online/3video_1', 'info_dict': { - 'id': 'master.m3u8?user-cdn=cdnvideo&acc-id=714517&user-id=359525183&loc-mode=ru&version=10:2:1:0:2:cdnvideo&consumer=vod&jwt=eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJ1c2VyLWlkIjozNTk1MjUxODN9', - 'title': 'master', + 'id': '4885302', + 'title': 'Промоуроки Академии МЕЛ', 'ext': 'mp4', 'duration': 1693 } - }, { - 'url': 'http://academymel.online/3video_2', - 'info_dict': { - 'id': 'master.m3u8?user-cdn=cdnvideo&acc-id=714517&user-id=359525183&loc-mode=ru&version=10:2:1:0:2:cdnvideo&consumer=vod&jwt=eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJ1c2VyLWlkIjozNTk1MjUxODN9', - 'title': 'master', - 'ext': 'mp4', - 'duration': 1871 - } }] def _perform_login(self, username, password): @@ -49,22 +40,12 @@ def _perform_login(self, username, password): 'requestTime': int(time.time()) }) - try: - webpage = self._request_webpage(self._LOGIN_URL, - None, - data=login_body, - note='Logging into the academymel.online', - errnote='Failed to log in into academymel.online', - fatal=True) - except ExtractorError: - raise ExtractorError('Could not log in into academymel.online (login URL: "%s")' % self._LOGIN_URL, - expected=True) - - # The response itself is a JSON, but it is not needed - only the 
Set-Cookie value(s) are - cookie_header = webpage.get_header('Set-Cookie') - set_cookie_header = LenientSimpleCookie(cookie_header) - set_cookie_header.load(cookie_header) - self.cache.store(self._CACHE_KEY, self._CACHE_SUBKEY, set_cookie_header) + self._request_webpage(self._LOGIN_URL, + None, + data=login_body, + note='Logging into the academymel.online', + errnote='Failed to log in into academymel.online', + fatal=True) def playlist_from_entries(self, entries, valid_url): current_timestamp = int(time.time()) @@ -82,26 +63,24 @@ def _real_extract(self, url): if not valid_url: raise ExtractorError('Invalid URL found', expected=True) - set_cookie_header = self.cache.load(self._CACHE_KEY, self._CACHE_SUBKEY) + webpage = self._download_webpage(url, + None, + fatal=True, + note='Downloading video website', + errnote='Failed to download video website') - if not set_cookie_header: - raise ExtractorError('The set-cookie has not been loaded', expected=True) - - try: - webpage = self._download_webpage(url, - None, - headers=set_cookie_header, - fatal=True, - note='Downloading video website', - errnote='Failed to download video website') - except ExtractorError: - raise ExtractorError('Could not download the video website at "%s"' % url, expected=True) + title = self._search_regex(r'(?P<title>.*)', webpage, 'title') entries = [] + processed_urls = set() # Set to keep track of processed URLs + for video_url in re.findall( - r']+src=\"(?Phttps?://[^/]+\.getcourse\.ru/sign-player/\?.*)\"', - webpage): - self.to_screen('AcademyMel video URL found: %s' % video_url) - entries.append(self.url_result(video_url, 'GetCourseRu')) + r'data-iframe-src=\"(?Phttps?://[^/]+\.getcourse\.ru/sign-player/\?.*?)\"', + webpage, + re.DOTALL + re.VERBOSE): + # Check if the URL has not been processed before + if video_url not in processed_urls: + entries.append(self.url_result(video_url, 'GetCourseRu', url_transparent=True, title=title)) + processed_urls.add(video_url) # Add the URL to the set of 
processed URLs return self.playlist_from_entries(entries, valid_url) diff --git a/yt_dlp/extractor/getcourseru.py b/yt_dlp/extractor/getcourseru.py index 2fe190805e9..af7d9dc5260 100644 --- a/yt_dlp/extractor/getcourseru.py +++ b/yt_dlp/extractor/getcourseru.py @@ -25,21 +25,34 @@ def _real_extract(self, url): if not valid_url: raise ExtractorError('Invalid URL found', expected=True) - try: - webpage = self._download_webpage(url, - None, - fatal=True, - note='Retrieving masterPlaylist URL...', - errnote='Failed to retrieve the masterPlaylist URL') - except ExtractorError: - raise ExtractorError('Failed to retrieve the masterPlaylist URL', expected=True) - - try: - m3u8_url = (self._search_regex(r'\"masterPlaylistUrl\":\"(?P.*?)\"', webpage, 'm3u8', fatal=True) - .replace('\\', '')) - except ExtractorError: - raise ExtractorError('Could not extract the masterPlaylist URL from the GetCourse.ru response', expected=True) - - self.to_screen('masterPlaylistUrl is "%s"' % m3u8_url) - - return self.url_result(m3u8_url, 'Generic') + webpage = self._download_webpage(url, + None, + fatal=True, + note='Retrieving metadata...', + errnote='Failed to retrieve metadata') + + window_configs = self._search_json( + r'window\.configs\s*=\s*', + webpage, + 'config', + video_id=None, + fatal=True) + + self.to_screen('videoId: %s, videoHash: %s, masterPlaylistUrl: %s, thumbnail_url: %s' + % (window_configs.get('videoId'), + window_configs.get('videoHash'), + window_configs.get('masterPlaylistUrl'), + window_configs.get('previewUrl'))) + + formats, subtitles = self._extract_m3u8_formats_and_subtitles( + window_configs.get('masterPlaylistUrl'), + window_configs.get('videoId')) + + return { + 'id': str(window_configs.get('videoId')), + 'title': window_configs.get('videoHash'), + 'thumbnail': window_configs.get('thumbnailUrl'), + 'duration': int(window_configs.get('videoDuration')), + 'formats': formats, + 'subtitles': subtitles + } From e90283edac9b352352ca6756b354ed3c7ec32ab1 Mon Sep 17 
00:00:00 2001 From: "Dr. Steven Strange" Date: Sun, 14 Jan 2024 02:14:00 +0100 Subject: [PATCH 05/38] [AcademyMelIE]&[GetCourseRuIE] removed unused variables and to_screen output --- yt_dlp/extractor/academymel.py | 3 --- yt_dlp/extractor/getcourseru.py | 6 ------ 2 files changed, 9 deletions(-) diff --git a/yt_dlp/extractor/academymel.py b/yt_dlp/extractor/academymel.py index e07f62ec436..dfd2372e226 100644 --- a/yt_dlp/extractor/academymel.py +++ b/yt_dlp/extractor/academymel.py @@ -10,9 +10,6 @@ class AcademyMelIE(InfoExtractor): _TEST_EMAIL = 'meriat@jaga.email' # use this as username in the test/local_parameters.json if running the test _TEST_PASSWORD = 'bBY-ccbp$8' # use this as password in the test/local_parameters.json if running the test - _CACHE_KEY = 'academymel' - _CACHE_SUBKEY = 'login-cookie-header' - _NETRC_MACHINE = 'academymel' _LOGIN_URL = 'https://academymel.online/cms/system/login' _VALID_URL = r'^https?:\/\/academymel\.online\/(?P.*)$' diff --git a/yt_dlp/extractor/getcourseru.py b/yt_dlp/extractor/getcourseru.py index af7d9dc5260..12d58567721 100644 --- a/yt_dlp/extractor/getcourseru.py +++ b/yt_dlp/extractor/getcourseru.py @@ -38,12 +38,6 @@ def _real_extract(self, url): video_id=None, fatal=True) - self.to_screen('videoId: %s, videoHash: %s, masterPlaylistUrl: %s, thumbnail_url: %s' - % (window_configs.get('videoId'), - window_configs.get('videoHash'), - window_configs.get('masterPlaylistUrl'), - window_configs.get('previewUrl'))) - formats, subtitles = self._extract_m3u8_formats_and_subtitles( window_configs.get('masterPlaylistUrl'), window_configs.get('videoId')) From 14f69f732c77ee25f967a54993aef73ed8e68ecc Mon Sep 17 00:00:00 2001 From: "Dr. 
Steven Strange" Date: Sun, 14 Jan 2024 22:14:45 +0100 Subject: [PATCH 06/38] [AcademyMelIE]&[GetCourseRuIE] implemented review remarks --- yt_dlp/extractor/_extractors.py | 4 +- yt_dlp/extractor/academymel.py | 87 +++++++++++---------------------- yt_dlp/extractor/getcourseru.py | 44 ++++++----------- 3 files changed, 46 insertions(+), 89 deletions(-) diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index 3dddf2be5b6..145a4d21b22 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -681,9 +681,7 @@ GeniusIE, GeniusLyricsIE, ) -from .getcourseru import ( - GetCourseRuIE -) +from .getcourseru import GetCourseRuIE from .gettr import ( GettrIE, GettrStreamingIE, diff --git a/yt_dlp/extractor/academymel.py b/yt_dlp/extractor/academymel.py index dfd2372e226..01901237e85 100644 --- a/yt_dlp/extractor/academymel.py +++ b/yt_dlp/extractor/academymel.py @@ -1,18 +1,18 @@ import re import time -from datetime import datetime from .common import InfoExtractor -from ..utils import urlencode_postdata, ExtractorError +from .getcourseru import GetCourseRuIE +from ..utils import urlencode_postdata, update_url_query class AcademyMelIE(InfoExtractor): _TEST_EMAIL = 'meriat@jaga.email' # use this as username in the test/local_parameters.json if running the test _TEST_PASSWORD = 'bBY-ccbp$8' # use this as password in the test/local_parameters.json if running the test - _NETRC_MACHINE = 'academymel' _LOGIN_URL = 'https://academymel.online/cms/system/login' - _VALID_URL = r'^https?:\/\/academymel\.online\/(?P.*)$' + _NETRC_MACHINE = 'academymel' + _VALID_URL = r'https?://academymel\.online/(?P[^/?#]+)' _TESTS = [{ 'url': 'http://academymel.online/3video_1', @@ -25,59 +25,30 @@ class AcademyMelIE(InfoExtractor): }] def _perform_login(self, username, password): - login_body = urlencode_postdata({ - 'action': 'processXdget', - 'xdgetId': 'r6335_1_1', - 'params[action]': 'login', - 'params[url]': 
'http://academymel.online/cms/system/login?required=true', - 'params[object_type]': 'cms_page', - 'params[object_id]': -1, - 'params[email]': username, - 'params[password]': password, - 'requestTime': int(time.time()) - }) - - self._request_webpage(self._LOGIN_URL, - None, - data=login_body, - note='Logging into the academymel.online', - errnote='Failed to log in into academymel.online', - fatal=True) - - def playlist_from_entries(self, entries, valid_url): - current_timestamp = int(time.time()) - current_datetime = datetime.fromtimestamp(current_timestamp) - formatted_datetime = current_datetime.strftime("%d.%m.%Y, %H:%M") - - return self.playlist_result(entries, - 'academymel-playlist-%d' % current_timestamp, - 'AcademyMel playlist (%s)' % formatted_datetime, - 'AcademyMel playlist for %s (at %s)' % (valid_url, formatted_datetime)) + self._request_webpage( + self._LOGIN_URL, None, 'Logging in', 'Failed to log in', + data=urlencode_postdata({ + 'action': 'processXdget', + 'xdgetId': 'r6335_1_1', + 'params[action]': 'login', + 'params[url]': update_url_query(self._LOGIN_URL, {'required': 'true'}), + 'params[object_type]': 'cms_page', + 'params[object_id]': -1, + 'params[email]': username, + 'params[password]': password, + 'requestTime': int(time.time()) + })) def _real_extract(self, url): - valid_url = self._match_valid_url(url) - - if not valid_url: - raise ExtractorError('Invalid URL found', expected=True) - - webpage = self._download_webpage(url, - None, - fatal=True, - note='Downloading video website', - errnote='Failed to download video website') - - title = self._search_regex(r'(?P<title>.*)', webpage, 'title') - - entries = [] - processed_urls = set() # Set to keep track of processed URLs - - for video_url in re.findall( - r'data-iframe-src=\"(?Phttps?://[^/]+\.getcourse\.ru/sign-player/\?.*?)\"', - webpage, - re.DOTALL + re.VERBOSE): - # Check if the URL has not been processed before - if video_url not in processed_urls: - 
entries.append(self.url_result(video_url, 'GetCourseRu', url_transparent=True, title=title)) - processed_urls.add(video_url) # Add the URL to the set of processed URLs - - return self.playlist_from_entries(entries, valid_url) + playlist_id = self._match_id(url) + if not self._get_cookies(self._LOGIN_URL).get('PHPSESSID5'): + self.raise_login_required() + webpage = self._download_webpage(url, playlist_id) + title = self._html_extract_title(webpage) + + return self.playlist_from_matches( + re.findall(r'data-iframe-src="(https?://[^."]+\.getcourse\.ru/sign-player/[^"]+)', webpage), + playlist_id, title, ie=GetCourseRuIE, video_kwargs={ + 'url_transparent': True, + 'title': title, + }) diff --git a/yt_dlp/extractor/getcourseru.py b/yt_dlp/extractor/getcourseru.py index 12d58567721..070c7be0c71 100644 --- a/yt_dlp/extractor/getcourseru.py +++ b/yt_dlp/extractor/getcourseru.py @@ -1,18 +1,18 @@ from .common import InfoExtractor -from ..utils import ExtractorError +from ..utils import int_or_none, traverse_obj, url_or_none class GetCourseRuIE(InfoExtractor): _NETRC_MACHINE = 'getcourseru' - _VALID_URL = r'^https?:\/\/[^\/]+\.getcourse\.ru\/sign-player\/\?.*$' + _VALID_URL = r'https?://[^.]+\.getcourse\.ru/sign-player/\?json=(?P[^#]+)' _TESTS = [{ - 'url': 'http://player02.getcourse.ru/sign-player/?json=eyJ2aWRlb19oYXNoIjoiZTJlZWE3MTI5ZDk3OWQzYzYzMDYzMDUzOGJkMzZlZjEiLCJ1c2VyX2lkIjozNTc3NjY5NjIsInN1Yl9sb2dpbl91c2VyX2lkIjpudWxsLCJsZXNzb25faWQiOm51bGwsImlwIjoiNDYuMTQyLjE4My44NSIsImdjX2hvc3QiOiJhY2FkZW15bWVsLm9ubGluZSIsInRpbWUiOjE3MDM4MDY1NzksInBheWxvYWQiOiJ1XzM1Nzc2Njk2MiIsInVpX2xhbmd1YWdlIjoicnUiLCJpc19oYXZlX2N1c3RvbV9zdHlsZSI6dHJ1ZX0=&s=a2ed5bd648a2ae7a4f7684abe815ec7a', + 'url': 
'http://player02.getcourse.ru/sign-player/?json=eyJ2aWRlb19oYXNoIjoiMTkwYmRmOTNmMWIyOTczNTMwOTg1M2E3YTE5ZTI0YjMiLCJ1c2VyX2lkIjozNTk1MjUxODMsInN1Yl9sb2dpbl91c2VyX2lkIjpudWxsLCJsZXNzb25faWQiOm51bGwsImlwIjoiNDYuMTQyLjE4Mi4yNDciLCJnY19ob3N0IjoiYWNhZGVteW1lbC5vbmxpbmUiLCJ0aW1lIjoxNzA1MjY2Njc2LCJwYXlsb2FkIjoidV8zNTk1MjUxODMiLCJ1aV9sYW5ndWFnZSI6InJ1IiwiaXNfaGF2ZV9jdXN0b21fc3R5bGUiOnRydWV9&s=0c5e0c6aef19da36add3135a162b8eba&vh-static-feature=zigzag', 'info_dict': { - 'id': 'master.m3u8?user-cdn=cdnvideo&acc-id=714517&user-id=357766962&loc-mode=ru&version=10:2:1:0:2:cdnvideo&consumer=vod&jwt=eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJ1c2VyLWlkIjozNTc3NjY5NjJ9', - 'title': 'master', + 'id': '4885302', + 'title': '190bdf93f1b29735309853a7a19e24b3', 'ext': 'mp4', - 'duration': 1871 + 'duration': 1693 # note: the original URL is necessary to obtain an up-to-date URL, because the URL is always changing }, 'skip': 'Requires authentication', @@ -20,33 +20,21 @@ class GetCourseRuIE(InfoExtractor): }] def _real_extract(self, url): - valid_url = self._match_valid_url(url) - - if not valid_url: - raise ExtractorError('Invalid URL found', expected=True) - - webpage = self._download_webpage(url, - None, - fatal=True, - note='Retrieving metadata...', - errnote='Failed to retrieve metadata') + webpage = self._download_webpage(url, None, 'Downloading player page') window_configs = self._search_json( - r'window\.configs\s*=\s*', - webpage, - 'config', - video_id=None, - fatal=True) - + r'window\.configs\s*=', webpage, 'config', None) + video_id = str(window_configs['videoId']) formats, subtitles = self._extract_m3u8_formats_and_subtitles( - window_configs.get('masterPlaylistUrl'), - window_configs.get('videoId')) + window_configs['masterPlaylistUrl'], video_id) return { - 'id': str(window_configs.get('videoId')), - 'title': window_configs.get('videoHash'), - 'thumbnail': window_configs.get('thumbnailUrl'), - 'duration': int(window_configs.get('videoDuration')), + **traverse_obj(window_configs, 
{ + 'title': ('videoHash', {str}), + 'thumbnail': ('thumbnailUrl', {url_or_none}), + 'duration': ('videoDuration', {int_or_none}), + }), + 'id': video_id, 'formats': formats, 'subtitles': subtitles } From 72f9a57674e928612124a117ee6978bb09293616 Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Sun, 14 Jan 2024 21:57:37 +0000 Subject: [PATCH 07/38] [ie/academymel] Cleanup --- yt_dlp/extractor/academymel.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/yt_dlp/extractor/academymel.py b/yt_dlp/extractor/academymel.py index 01901237e85..10e65b68bfe 100644 --- a/yt_dlp/extractor/academymel.py +++ b/yt_dlp/extractor/academymel.py @@ -3,17 +3,13 @@ from .common import InfoExtractor from .getcourseru import GetCourseRuIE -from ..utils import urlencode_postdata, update_url_query +from ..utils import update_url_query, urlencode_postdata class AcademyMelIE(InfoExtractor): - _TEST_EMAIL = 'meriat@jaga.email' # use this as username in the test/local_parameters.json if running the test - _TEST_PASSWORD = 'bBY-ccbp$8' # use this as password in the test/local_parameters.json if running the test - - _LOGIN_URL = 'https://academymel.online/cms/system/login' _NETRC_MACHINE = 'academymel' _VALID_URL = r'https?://academymel\.online/(?P[^/?#]+)' - + _LOGIN_URL = 'https://academymel.online/cms/system/login' _TESTS = [{ 'url': 'http://academymel.online/3video_1', 'info_dict': { From 965935fc32d8110efe4db877a3c1907a25091ee4 Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Sun, 14 Jan 2024 22:00:07 +0000 Subject: [PATCH 08/38] [ie/getcourseru] Cleanup --- yt_dlp/extractor/getcourseru.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/yt_dlp/extractor/getcourseru.py b/yt_dlp/extractor/getcourseru.py index 070c7be0c71..7837d2b2466 100644 --- a/yt_dlp/extractor/getcourseru.py +++ b/yt_dlp/extractor/getcourseru.py @@ -3,9 +3,7 @@ class 
GetCourseRuIE(InfoExtractor): - _NETRC_MACHINE = 'getcourseru' _VALID_URL = r'https?://[^.]+\.getcourse\.ru/sign-player/\?json=(?P[^#]+)' - _TESTS = [{ 'url': 'http://player02.getcourse.ru/sign-player/?json=eyJ2aWRlb19oYXNoIjoiMTkwYmRmOTNmMWIyOTczNTMwOTg1M2E3YTE5ZTI0YjMiLCJ1c2VyX2lkIjozNTk1MjUxODMsInN1Yl9sb2dpbl91c2VyX2lkIjpudWxsLCJsZXNzb25faWQiOm51bGwsImlwIjoiNDYuMTQyLjE4Mi4yNDciLCJnY19ob3N0IjoiYWNhZGVteW1lbC5vbmxpbmUiLCJ0aW1lIjoxNzA1MjY2Njc2LCJwYXlsb2FkIjoidV8zNTk1MjUxODMiLCJ1aV9sYW5ndWFnZSI6InJ1IiwiaXNfaGF2ZV9jdXN0b21fc3R5bGUiOnRydWV9&s=0c5e0c6aef19da36add3135a162b8eba&vh-static-feature=zigzag', 'info_dict': { @@ -13,15 +11,12 @@ class GetCourseRuIE(InfoExtractor): 'title': '190bdf93f1b29735309853a7a19e24b3', 'ext': 'mp4', 'duration': 1693 - # note: the original URL is necessary to obtain an up-to-date URL, because the URL is always changing }, - 'skip': 'Requires authentication', - 'note': 'This extractor is used by AcademyMel extractor, which has a login feature' + 'skip': 'JWT expired', }] def _real_extract(self, url): webpage = self._download_webpage(url, None, 'Downloading player page') - window_configs = self._search_json( r'window\.configs\s*=', webpage, 'config', None) video_id = str(window_configs['videoId']) From 8a54a769a7f0b4ae2ff1c0395793b17558449855 Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Sun, 14 Jan 2024 22:07:16 +0000 Subject: [PATCH 09/38] [ie/getcourseru] More lenient `_VALID_URL` --- yt_dlp/extractor/getcourseru.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/yt_dlp/extractor/getcourseru.py b/yt_dlp/extractor/getcourseru.py index 7837d2b2466..8bd651d1332 100644 --- a/yt_dlp/extractor/getcourseru.py +++ b/yt_dlp/extractor/getcourseru.py @@ -3,7 +3,7 @@ class GetCourseRuIE(InfoExtractor): - _VALID_URL = r'https?://[^.]+\.getcourse\.ru/sign-player/\?json=(?P[^#]+)' + _VALID_URL = r'https?://[^.]+\.getcourse\.ru/sign-player/?\?(?:[^#]+&)?json=(?P[^#&]+)' _TESTS = [{ 'url': 
'http://player02.getcourse.ru/sign-player/?json=eyJ2aWRlb19oYXNoIjoiMTkwYmRmOTNmMWIyOTczNTMwOTg1M2E3YTE5ZTI0YjMiLCJ1c2VyX2lkIjozNTk1MjUxODMsInN1Yl9sb2dpbl91c2VyX2lkIjpudWxsLCJsZXNzb25faWQiOm51bGwsImlwIjoiNDYuMTQyLjE4Mi4yNDciLCJnY19ob3N0IjoiYWNhZGVteW1lbC5vbmxpbmUiLCJ0aW1lIjoxNzA1MjY2Njc2LCJwYXlsb2FkIjoidV8zNTk1MjUxODMiLCJ1aV9sYW5ndWFnZSI6InJ1IiwiaXNfaGF2ZV9jdXN0b21fc3R5bGUiOnRydWV9&s=0c5e0c6aef19da36add3135a162b8eba&vh-static-feature=zigzag', 'info_dict': { From effa2ea391d9313fa2cce1f372426aa3ec131d5c Mon Sep 17 00:00:00 2001 From: "Dr. Steven Strange" Date: Sun, 14 Jan 2024 23:34:06 +0100 Subject: [PATCH 10/38] [AcademyMelIE]&[GetCourseRuIE] test the playlist --- yt_dlp/extractor/academymel.py | 15 +++++++++++---- yt_dlp/extractor/getcourseru.py | 2 +- 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/yt_dlp/extractor/academymel.py b/yt_dlp/extractor/academymel.py index 10e65b68bfe..1ee3790bf5f 100644 --- a/yt_dlp/extractor/academymel.py +++ b/yt_dlp/extractor/academymel.py @@ -13,11 +13,18 @@ class AcademyMelIE(InfoExtractor): _TESTS = [{ 'url': 'http://academymel.online/3video_1', 'info_dict': { - 'id': '4885302', + 'id': '3video_1', 'title': 'Промоуроки Академии МЕЛ', - 'ext': 'mp4', - 'duration': 1693 - } + }, + 'playlist_count': 1, + 'playlist': [{ + 'info_dict': { + 'id': '4885302', + 'ext': 'mp4', + 'title': 'Промоуроки Академии МЕЛ', + 'duration': 1693 + }, + }] }] def _perform_login(self, username, password): diff --git a/yt_dlp/extractor/getcourseru.py b/yt_dlp/extractor/getcourseru.py index 8bd651d1332..347d4c37793 100644 --- a/yt_dlp/extractor/getcourseru.py +++ b/yt_dlp/extractor/getcourseru.py @@ -5,7 +5,7 @@ class GetCourseRuIE(InfoExtractor): _VALID_URL = r'https?://[^.]+\.getcourse\.ru/sign-player/?\?(?:[^#]+&)?json=(?P[^#&]+)' _TESTS = [{ - 'url': 
'http://player02.getcourse.ru/sign-player/?json=eyJ2aWRlb19oYXNoIjoiMTkwYmRmOTNmMWIyOTczNTMwOTg1M2E3YTE5ZTI0YjMiLCJ1c2VyX2lkIjozNTk1MjUxODMsInN1Yl9sb2dpbl91c2VyX2lkIjpudWxsLCJsZXNzb25faWQiOm51bGwsImlwIjoiNDYuMTQyLjE4Mi4yNDciLCJnY19ob3N0IjoiYWNhZGVteW1lbC5vbmxpbmUiLCJ0aW1lIjoxNzA1MjY2Njc2LCJwYXlsb2FkIjoidV8zNTk1MjUxODMiLCJ1aV9sYW5ndWFnZSI6InJ1IiwiaXNfaGF2ZV9jdXN0b21fc3R5bGUiOnRydWV9&s=0c5e0c6aef19da36add3135a162b8eba&vh-static-feature=zigzag', + 'url': 'http://player02.getcourse.ru/sign-player/?json=eyJ2aWRlb19oYXNoIjoiMTkwYmRmOTNmMWIyOTczNTMwOTg1M2E3YTE5ZTI0YjMiLCJ1c2VyX2lkIjozNTk1MjUxODMsInN1Yl9sb2dpbl91c2VyX2lkIjpudWxsLCJsZXNzb25faWQiOm51bGwsImlwIjoiNDYuMTQyLjE4Mi4yNDciLCJnY19ob3N0IjoiYWNhZGVteW1lbC5vbmxpbmUiLCJ0aW1lIjoxNzA1MjcwMzU0LCJwYXlsb2FkIjoidV8zNTk1MjUxODMiLCJ1aV9sYW5ndWFnZSI6InJ1IiwiaXNfaGF2ZV9jdXN0b21fc3R5bGUiOnRydWV9&s=031d44cc738c58863a436d98f1032132&vh-static-feature=zigzag', 'info_dict': { 'id': '4885302', 'title': '190bdf93f1b29735309853a7a19e24b3', From 7759ab6205e5016c6dd5eb2a4f47686613b05ed4 Mon Sep 17 00:00:00 2001 From: "Dr. 
Steven Strange" Date: Wed, 17 Jan 2024 01:41:29 +0100 Subject: [PATCH 11/38] [GetCourseRuPlayerIE]&[GetCourseRuIE] adding more generic getcourse.ru extractors --- yt_dlp/extractor/_extractors.py | 6 ++- yt_dlp/extractor/academymel.py | 57 -------------------- yt_dlp/extractor/getcourseru.py | 95 +++++++++++++++++++++++++++++++-- 3 files changed, 94 insertions(+), 64 deletions(-) delete mode 100644 yt_dlp/extractor/academymel.py diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index 145a4d21b22..2167e118dc2 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -42,7 +42,6 @@ AbemaTVTitleIE, ) from .academicearth import AcademicEarthCourseIE -from .academymel import AcademyMelIE from .acast import ( ACastIE, ACastChannelIE, @@ -681,7 +680,10 @@ GeniusIE, GeniusLyricsIE, ) -from .getcourseru import GetCourseRuIE +from .getcourseru import ( + GetCourseRuPlayerIE, + GetCourseRuIE +) from .gettr import ( GettrIE, GettrStreamingIE, diff --git a/yt_dlp/extractor/academymel.py b/yt_dlp/extractor/academymel.py deleted file mode 100644 index 1ee3790bf5f..00000000000 --- a/yt_dlp/extractor/academymel.py +++ /dev/null @@ -1,57 +0,0 @@ -import re -import time - -from .common import InfoExtractor -from .getcourseru import GetCourseRuIE -from ..utils import update_url_query, urlencode_postdata - - -class AcademyMelIE(InfoExtractor): - _NETRC_MACHINE = 'academymel' - _VALID_URL = r'https?://academymel\.online/(?P[^/?#]+)' - _LOGIN_URL = 'https://academymel.online/cms/system/login' - _TESTS = [{ - 'url': 'http://academymel.online/3video_1', - 'info_dict': { - 'id': '3video_1', - 'title': 'Промоуроки Академии МЕЛ', - }, - 'playlist_count': 1, - 'playlist': [{ - 'info_dict': { - 'id': '4885302', - 'ext': 'mp4', - 'title': 'Промоуроки Академии МЕЛ', - 'duration': 1693 - }, - }] - }] - - def _perform_login(self, username, password): - self._request_webpage( - self._LOGIN_URL, None, 'Logging in', 'Failed to log in', - 
data=urlencode_postdata({ - 'action': 'processXdget', - 'xdgetId': 'r6335_1_1', - 'params[action]': 'login', - 'params[url]': update_url_query(self._LOGIN_URL, {'required': 'true'}), - 'params[object_type]': 'cms_page', - 'params[object_id]': -1, - 'params[email]': username, - 'params[password]': password, - 'requestTime': int(time.time()) - })) - - def _real_extract(self, url): - playlist_id = self._match_id(url) - if not self._get_cookies(self._LOGIN_URL).get('PHPSESSID5'): - self.raise_login_required() - webpage = self._download_webpage(url, playlist_id) - title = self._html_extract_title(webpage) - - return self.playlist_from_matches( - re.findall(r'data-iframe-src="(https?://[^."]+\.getcourse\.ru/sign-player/[^"]+)', webpage), - playlist_id, title, ie=GetCourseRuIE, video_kwargs={ - 'url_transparent': True, - 'title': title, - }) diff --git a/yt_dlp/extractor/getcourseru.py b/yt_dlp/extractor/getcourseru.py index 347d4c37793..8444f6976a5 100644 --- a/yt_dlp/extractor/getcourseru.py +++ b/yt_dlp/extractor/getcourseru.py @@ -1,15 +1,21 @@ +import pprint +from time import time +from re import findall +from urllib.parse import urlparse + from .common import InfoExtractor -from ..utils import int_or_none, traverse_obj, url_or_none +from ..utils import int_or_none, traverse_obj, update_url_query, url_or_none, urlencode_postdata -class GetCourseRuIE(InfoExtractor): - _VALID_URL = r'https?://[^.]+\.getcourse\.ru/sign-player/?\?(?:[^#]+&)?json=(?P[^#&]+)' +class GetCourseRuPlayerIE(InfoExtractor): + _VALID_URL = r'https?://player\d{2,}\.getcourse\.ru/sign-player/?\?(?:[^#]+&)?json=(?P[^#&]+)' _TESTS = [{ - 'url': 
'http://player02.getcourse.ru/sign-player/?json=eyJ2aWRlb19oYXNoIjoiMTkwYmRmOTNmMWIyOTczNTMwOTg1M2E3YTE5ZTI0YjMiLCJ1c2VyX2lkIjozNTk1MjUxODMsInN1Yl9sb2dpbl91c2VyX2lkIjpudWxsLCJsZXNzb25faWQiOm51bGwsImlwIjoiNDYuMTQyLjE4Mi4yNDciLCJnY19ob3N0IjoiYWNhZGVteW1lbC5vbmxpbmUiLCJ0aW1lIjoxNzA1MjcwMzU0LCJwYXlsb2FkIjoidV8zNTk1MjUxODMiLCJ1aV9sYW5ndWFnZSI6InJ1IiwiaXNfaGF2ZV9jdXN0b21fc3R5bGUiOnRydWV9&s=031d44cc738c58863a436d98f1032132&vh-static-feature=zigzag', + 'url': 'http://player02.getcourse.ru/sign-player/?json=eyJ2aWRlb19oYXNoIjoiMTkwYmRmOTNmMWIyOTczNTMwOTg1M2E3YTE5ZTI0YjMiLCJ1c2VyX2lkIjozNTk1MjUxODMsInN1Yl9sb2dpbl91c2VyX2lkIjpudWxsLCJsZXNzb25faWQiOm51bGwsImlwIjoiNDYuMTQyLjE4Mi4yNDciLCJnY19ob3N0IjoiYWNhZGVteW1lbC5vbmxpbmUiLCJ0aW1lIjoxNzA1NDQ5NjQyLCJwYXlsb2FkIjoidV8zNTk1MjUxODMiLCJ1aV9sYW5ndWFnZSI6InJ1IiwiaXNfaGF2ZV9jdXN0b21fc3R5bGUiOnRydWV9&s=354ad2c993d95d5ac629e3133d6cefea&vh-static-feature=zigzag', 'info_dict': { 'id': '4885302', 'title': '190bdf93f1b29735309853a7a19e24b3', 'ext': 'mp4', + 'thumbnail': 'https://preview-htz.kinescopecdn.net/preview/190bdf93f1b29735309853a7a19e24b3/preview.jpg?version=1702370546&host=vh-80', 'duration': 1693 }, 'skip': 'JWT expired', @@ -26,10 +32,89 @@ def _real_extract(self, url): return { **traverse_obj(window_configs, { 'title': ('videoHash', {str}), - 'thumbnail': ('thumbnailUrl', {url_or_none}), + 'thumbnail': ('previewUrl', {url_or_none}), 'duration': ('videoDuration', {int_or_none}), }), 'id': video_id, 'formats': formats, 'subtitles': subtitles } + + +class GetCourseRuIE(InfoExtractor): + _NETRC_MACHINE = 'getcourseru' + _LOGIN_URL_SUFFIX = 'cms/system/login' + _TESTS = [{ + 'url': 'http://academymel.online/3video_1', + 'info_dict': { + 'id': '3video_1', + 'title': 'Промоуроки Академии МЕЛ', + }, + 'playlist_count': 1, + 'playlist': [{ + 'info_dict': { + 'id': '4885302', + 'ext': 'mp4', + 'title': 'Промоуроки Академии МЕЛ', + 'thumbnail': 
'https://preview-htz.kinescopecdn.net/preview/190bdf93f1b29735309853a7a19e24b3/preview.jpg?version=1702370546&host=vh-80', + 'duration': 1693 + }, + }] + }, { + 'url': 'https://academymel.getcourse.ru/3video_1', + 'info_dict': { + 'id': '3video_1', + 'title': 'Промоуроки Академии МЕЛ', + }, + 'playlist_count': 1, + 'playlist': [{ + 'info_dict': { + 'id': '4885302', + 'ext': 'mp4', + 'title': 'Промоуроки Академии МЕЛ', + 'thumbnail': 'https://preview-htz.kinescopecdn.net/preview/190bdf93f1b29735309853a7a19e24b3/preview.jpg?version=1702370546&host=vh-80', + 'duration': 1693 + }, + }] + }] + _DOMAINS = [ + r'(?!player\d{2,})[^.]+\.getcourse\.ru', + 'academymel.online' + ] + _VALID_URL = rf'https?://({"|".join(_DOMAINS)})/(?P<id>[^#]+)' + + def _login(self, url, username, password): + parsed_url = urlparse(url) + base_url = f"{parsed_url.scheme}://{parsed_url.netloc}/" + + self._request_webpage( + base_url + self._LOGIN_URL_SUFFIX, None, 'Logging in', 'Failed to log in', + data=urlencode_postdata({ + 'action': 'processXdget', + 'xdgetId': 'r6335_1_1', + 'params[action]': 'login', + 'params[url]': update_url_query(base_url + self._LOGIN_URL_SUFFIX, {'required': 'true'}), + 'params[object_type]': 'cms_page', + 'params[object_id]': -1, + 'params[email]': username, + 'params[password]': password, + 'requestTime': int(time()) + })) + + def _real_extract(self, url): + username, password = self._get_login_info() + self._login(url, username, password) + if not self._get_cookies(url).get('PHPSESSID5'): + self.raise_login_required() + + playlist_id = self._match_id(url) + webpage = self._download_webpage(url, playlist_id) + title = self._html_extract_title(webpage) + + return self.playlist_from_matches( + findall(r'data-iframe-src="(https?://player\d{2,}\.getcourse\.ru/sign-player/?\?(?:[^#]+&)?json=[^"]+)', + webpage), + playlist_id, title, ie=GetCourseRuPlayerIE, video_kwargs={ + 'url_transparent': True, + 'title': title, + }) From 2323a99538010778857a69f9418290b54d187153 Mon Sep
17 00:00:00 2001 From: "Dr. Steven Strange" Date: Thu, 18 Jan 2024 01:29:37 +0100 Subject: [PATCH 12/38] [GetCourseRuPlayerIE]&[GetCourseRuIE] fixing URLs and playlist_id --- yt_dlp/extractor/getcourseru.py | 35 ++++++++++++++++++++++++++++----- 1 file changed, 30 insertions(+), 5 deletions(-) diff --git a/yt_dlp/extractor/getcourseru.py b/yt_dlp/extractor/getcourseru.py index 8444f6976a5..a4426d1b4bd 100644 --- a/yt_dlp/extractor/getcourseru.py +++ b/yt_dlp/extractor/getcourseru.py @@ -1,6 +1,6 @@ import pprint from time import time -from re import findall +from re import escape, findall from urllib.parse import urlparse from .common import InfoExtractor @@ -76,12 +76,32 @@ class GetCourseRuIE(InfoExtractor): 'duration': 1693 }, }] + }, { + 'url': 'https://academymel.getcourse.ru/pl/teach/control/lesson/view?id=319141781&editMode=0', + 'info_dict': { + 'id': '319141781', + 'title': '1. Разминка у стены', + }, + 'playlist_count': 1, + 'playlist': [{ + 'info_dict': { + 'id': '4919601', + 'ext': 'mp4', + 'title': '1. 
Разминка у стены', + 'thumbnail': 'https://preview-htz.vhcdn.com/preview/5a521788e7dc25b4f70c3dff6512d90e/preview.jpg?version=1703223532&host=vh-81', + 'duration': 704 + }, + }], + 'skip': 'paid lesson' }] _DOMAINS = [ - r'(?!player\d{2,})[^.]+\.getcourse\.ru', 'academymel.online' ] - _VALID_URL = rf'https?://({"|".join(_DOMAINS)})/(?P<id>[^#]+)' + _BASE_URL_RE = rf'https?://(?:(?!player\d+)[^.]+\.getcourse\.ru|{"|".join(map(escape, _DOMAINS))})' + _VALID_URL = [ + rf'{_BASE_URL_RE}/(?P<id>[^/?#]+)/?(?:[?#]|$)', + rf'{_BASE_URL_RE}/[^?#]+/view/?\?(?:[^#]+&)?id=(?P<id>\d+)', + ] def _login(self, url, username, password): parsed_url = urlparse(url) @@ -92,6 +112,7 @@ def _login(self, url, username, password): data=urlencode_postdata({ 'action': 'processXdget', 'xdgetId': 'r6335_1_1', + #'xdgetId': '99945', 'params[action]': 'login', 'params[url]': update_url_query(base_url + self._LOGIN_URL_SUFFIX, {'required': 'true'}), 'params[object_type]': 'cms_page', @@ -104,11 +125,15 @@ def _login(self, url, username, password): def _real_extract(self, url): username, password = self._get_login_info() self._login(url, username, password) + if not self._get_cookies(url).get('PHPSESSID5'): self.raise_login_required() - playlist_id = self._match_id(url) - webpage = self._download_webpage(url, playlist_id) + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + playlist_id = self._search_regex( + r'window\.lessonId\s*=\s*(\d+)', webpage, 'playlist id', default=display_id) + title = self._html_extract_title(webpage) return self.playlist_from_matches( From 24bb5404b8cdce49a1d76c4ec9213c2c4aa25f05 Mon Sep 17 00:00:00 2001 From: "Dr. 
Steven Strange" Date: Thu, 18 Jan 2024 01:38:42 +0100 Subject: [PATCH 13/38] [GetCourseRuPlayerIE]&[GetCourseRuIE] fixing flake8 remarks, login-related TODO --- yt_dlp/extractor/getcourseru.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/yt_dlp/extractor/getcourseru.py b/yt_dlp/extractor/getcourseru.py index a4426d1b4bd..bd1e8766b72 100644 --- a/yt_dlp/extractor/getcourseru.py +++ b/yt_dlp/extractor/getcourseru.py @@ -1,4 +1,3 @@ -import pprint from time import time from re import escape, findall from urllib.parse import urlparse @@ -107,12 +106,12 @@ def _login(self, url, username, password): parsed_url = urlparse(url) base_url = f"{parsed_url.scheme}://{parsed_url.netloc}/" + # TODO: find proper `xdgetId` by parsing the login webpage and extracting it self._request_webpage( base_url + self._LOGIN_URL_SUFFIX, None, 'Logging in', 'Failed to log in', data=urlencode_postdata({ 'action': 'processXdget', 'xdgetId': 'r6335_1_1', - #'xdgetId': '99945', 'params[action]': 'login', 'params[url]': update_url_query(base_url + self._LOGIN_URL_SUFFIX, {'required': 'true'}), 'params[object_type]': 'cms_page', From b0302b510c25b4344099a74f87c15bd149d2cbfd Mon Sep 17 00:00:00 2001 From: "Dr. 
Steven Strange" Date: Thu, 18 Jan 2024 02:03:40 +0100 Subject: [PATCH 14/38] [GetCourseRuPlayerIE]&[GetCourseRuIE] xdgetId is now parsed from login page --- yt_dlp/extractor/getcourseru.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/yt_dlp/extractor/getcourseru.py b/yt_dlp/extractor/getcourseru.py index bd1e8766b72..3e08d2c84b4 100644 --- a/yt_dlp/extractor/getcourseru.py +++ b/yt_dlp/extractor/getcourseru.py @@ -106,12 +106,16 @@ def _login(self, url, username, password): parsed_url = urlparse(url) base_url = f"{parsed_url.scheme}://{parsed_url.netloc}/" - # TODO: find proper `xdgetId` by parsing the login webpage and extracting it + webpage = self._download_webpage(base_url + self._LOGIN_URL_SUFFIX, None) + xdget_id = self._html_search_regex( + r']*class="[^"]*state-login[^"]*"[^>]*data-xdget-id="([^"]+)"', + webpage, 'xdgetId') + self._request_webpage( base_url + self._LOGIN_URL_SUFFIX, None, 'Logging in', 'Failed to log in', data=urlencode_postdata({ 'action': 'processXdget', - 'xdgetId': 'r6335_1_1', + 'xdgetId': xdget_id, 'params[action]': 'login', 'params[url]': update_url_query(base_url + self._LOGIN_URL_SUFFIX, {'required': 'true'}), 'params[object_type]': 'cms_page', From 7149abed680506ab1d931bbc57690e395acd0e55 Mon Sep 17 00:00:00 2001 From: "Dr. 
Steven Strange" Date: Thu, 18 Jan 2024 02:12:36 +0100 Subject: [PATCH 15/38] [GetCourseRuPlayerIE]&[GetCourseRuIE] added another test-case, that was successfully tested (with proper credentials) --- yt_dlp/extractor/getcourseru.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/yt_dlp/extractor/getcourseru.py b/yt_dlp/extractor/getcourseru.py index 3e08d2c84b4..326be3d6652 100644 --- a/yt_dlp/extractor/getcourseru.py +++ b/yt_dlp/extractor/getcourseru.py @@ -92,6 +92,23 @@ class GetCourseRuIE(InfoExtractor): }, }], 'skip': 'paid lesson' + }, { + 'url': 'https://manibeauty.getcourse.ru/pl/teach/control/lesson/view?id=272499894', + 'info_dict': { + 'id': '272499894', + 'title': 'Мотивация к тренировкам', + }, + 'playlist_count': 1, + 'playlist': [{ + 'info_dict': { + 'id': '4242723', + 'ext': 'mp4', + 'title': 'Мотивация к тренировкам', + 'thumbnail': 'https://preview-htz.vhcdn.com/preview/70ed5b9f489dd03b4aff55bfdff71a26/preview.jpg?version=1685115787&host=vh-71', + 'duration': 30 + }, + }], + 'skip': 'paid lesson' }] _DOMAINS = [ 'academymel.online' From 36c6dd1c7ab43577c55bf1e221e986d5c3c727db Mon Sep 17 00:00:00 2001 From: sepro <4618135+seproDev@users.noreply.github.com> Date: Thu, 18 Jan 2024 06:24:01 +0100 Subject: [PATCH 16/38] Only match exactly player02 and add getcourse.io --- yt_dlp/extractor/getcourseru.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/yt_dlp/extractor/getcourseru.py b/yt_dlp/extractor/getcourseru.py index 326be3d6652..3b6a43492dd 100644 --- a/yt_dlp/extractor/getcourseru.py +++ b/yt_dlp/extractor/getcourseru.py @@ -7,7 +7,7 @@ class GetCourseRuPlayerIE(InfoExtractor): - _VALID_URL = r'https?://player\d{2,}\.getcourse\.ru/sign-player/?\?(?:[^#]+&)?json=(?P[^#&]+)' + _VALID_URL = r'https?://player02\.getcourse\.ru/sign-player/?\?(?:[^#]+&)?json=[^#&]+' _TESTS = [{ 'url': 
'http://player02.getcourse.ru/sign-player/?json=eyJ2aWRlb19oYXNoIjoiMTkwYmRmOTNmMWIyOTczNTMwOTg1M2E3YTE5ZTI0YjMiLCJ1c2VyX2lkIjozNTk1MjUxODMsInN1Yl9sb2dpbl91c2VyX2lkIjpudWxsLCJsZXNzb25faWQiOm51bGwsImlwIjoiNDYuMTQyLjE4Mi4yNDciLCJnY19ob3N0IjoiYWNhZGVteW1lbC5vbmxpbmUiLCJ0aW1lIjoxNzA1NDQ5NjQyLCJwYXlsb2FkIjoidV8zNTk1MjUxODMiLCJ1aV9sYW5ndWFnZSI6InJ1IiwiaXNfaGF2ZV9jdXN0b21fc3R5bGUiOnRydWV9&s=354ad2c993d95d5ac629e3133d6cefea&vh-static-feature=zigzag', 'info_dict': { @@ -113,7 +113,7 @@ class GetCourseRuIE(InfoExtractor): _DOMAINS = [ 'academymel.online' ] - _BASE_URL_RE = rf'https?://(?:(?!player\d+)[^.]+\.getcourse\.ru|{"|".join(map(escape, _DOMAINS))})' + _BASE_URL_RE = rf'https?://(?:(?!player02\.)[^.]+\.getcourse\.(?:ru|io)|{"|".join(map(re.escape, _DOMAINS))})' _VALID_URL = [ rf'{_BASE_URL_RE}/(?P[^/?#]+)/?(?:[?#]|$)', rf'{_BASE_URL_RE}/[^?#]+/view/?\?(?:[^#]+&)?id=(?P\d+)', From b5748f60e767cf20682687e8412254aca488a249 Mon Sep 17 00:00:00 2001 From: sepro <4618135+seproDev@users.noreply.github.com> Date: Thu, 18 Jan 2024 06:24:12 +0100 Subject: [PATCH 17/38] Add Embed regex --- yt_dlp/extractor/getcourseru.py | 1 + 1 file changed, 1 insertion(+) diff --git a/yt_dlp/extractor/getcourseru.py b/yt_dlp/extractor/getcourseru.py index 3b6a43492dd..2a3b3ef6c5d 100644 --- a/yt_dlp/extractor/getcourseru.py +++ b/yt_dlp/extractor/getcourseru.py @@ -19,6 +19,7 @@ class GetCourseRuPlayerIE(InfoExtractor): }, 'skip': 'JWT expired', }] + _EMBED_REGEX = [rf'(?x)]+\bsrc=[\'"](?P{_VALID_URL}[^\'"]*)'] def _real_extract(self, url): webpage = self._download_webpage(url, None, 'Downloading player page') From d7fa3c48a30fedf3d33a33f2055315d772943d5f Mon Sep 17 00:00:00 2001 From: sepro <4618135+seproDev@users.noreply.github.com> Date: Thu, 18 Jan 2024 06:24:46 +0100 Subject: [PATCH 18/38] Use gcFileId as this is also used in webpage --- yt_dlp/extractor/getcourseru.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/yt_dlp/extractor/getcourseru.py 
b/yt_dlp/extractor/getcourseru.py index 2a3b3ef6c5d..fb84d966238 100644 --- a/yt_dlp/extractor/getcourseru.py +++ b/yt_dlp/extractor/getcourseru.py @@ -11,7 +11,7 @@ class GetCourseRuPlayerIE(InfoExtractor): _TESTS = [{ 'url': 'http://player02.getcourse.ru/sign-player/?json=eyJ2aWRlb19oYXNoIjoiMTkwYmRmOTNmMWIyOTczNTMwOTg1M2E3YTE5ZTI0YjMiLCJ1c2VyX2lkIjozNTk1MjUxODMsInN1Yl9sb2dpbl91c2VyX2lkIjpudWxsLCJsZXNzb25faWQiOm51bGwsImlwIjoiNDYuMTQyLjE4Mi4yNDciLCJnY19ob3N0IjoiYWNhZGVteW1lbC5vbmxpbmUiLCJ0aW1lIjoxNzA1NDQ5NjQyLCJwYXlsb2FkIjoidV8zNTk1MjUxODMiLCJ1aV9sYW5ndWFnZSI6InJ1IiwiaXNfaGF2ZV9jdXN0b21fc3R5bGUiOnRydWV9&s=354ad2c993d95d5ac629e3133d6cefea&vh-static-feature=zigzag', 'info_dict': { - 'id': '4885302', + 'id': '513573381', 'title': '190bdf93f1b29735309853a7a19e24b3', 'ext': 'mp4', 'thumbnail': 'https://preview-htz.kinescopecdn.net/preview/190bdf93f1b29735309853a7a19e24b3/preview.jpg?version=1702370546&host=vh-80', @@ -25,7 +25,7 @@ def _real_extract(self, url): webpage = self._download_webpage(url, None, 'Downloading player page') window_configs = self._search_json( r'window\.configs\s*=', webpage, 'config', None) - video_id = str(window_configs['videoId']) + video_id = str(window_configs['gcFileId']) formats, subtitles = self._extract_m3u8_formats_and_subtitles( window_configs['masterPlaylistUrl'], video_id) @@ -53,7 +53,7 @@ class GetCourseRuIE(InfoExtractor): 'playlist_count': 1, 'playlist': [{ 'info_dict': { - 'id': '4885302', + 'id': '513573381', 'ext': 'mp4', 'title': 'Промоуроки Академии МЕЛ', 'thumbnail': 'https://preview-htz.kinescopecdn.net/preview/190bdf93f1b29735309853a7a19e24b3/preview.jpg?version=1702370546&host=vh-80', @@ -69,7 +69,7 @@ class GetCourseRuIE(InfoExtractor): 'playlist_count': 1, 'playlist': [{ 'info_dict': { - 'id': '4885302', + 'id': '513573381', 'ext': 'mp4', 'title': 'Промоуроки Академии МЕЛ', 'thumbnail': 'https://preview-htz.kinescopecdn.net/preview/190bdf93f1b29735309853a7a19e24b3/preview.jpg?version=1702370546&host=vh-80', From 
d7b18d9b554911d3d1d86cd1837422b9e0aa306a Mon Sep 17 00:00:00 2001 From: sepro <4618135+seproDev@users.noreply.github.com> Date: Thu, 18 Jan 2024 06:25:02 +0100 Subject: [PATCH 19/38] Add getcourse.io test --- yt_dlp/extractor/getcourseru.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/yt_dlp/extractor/getcourseru.py b/yt_dlp/extractor/getcourseru.py index fb84d966238..b7e29917644 100644 --- a/yt_dlp/extractor/getcourseru.py +++ b/yt_dlp/extractor/getcourseru.py @@ -110,6 +110,9 @@ class GetCourseRuIE(InfoExtractor): }, }], 'skip': 'paid lesson' + }, { + 'url': 'https://gaismasmandalas.getcourse.io/ATLAUTSEVBUT', + 'only_matching': True, }] _DOMAINS = [ 'academymel.online' From db82da757ce142472240c16cb68d3ae05b900b64 Mon Sep 17 00:00:00 2001 From: sepro <4618135+seproDev@users.noreply.github.com> Date: Thu, 18 Jan 2024 06:25:13 +0100 Subject: [PATCH 20/38] Add marafon.mani-beauty.com domain --- yt_dlp/extractor/getcourseru.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/yt_dlp/extractor/getcourseru.py b/yt_dlp/extractor/getcourseru.py index b7e29917644..61cba91bcc3 100644 --- a/yt_dlp/extractor/getcourseru.py +++ b/yt_dlp/extractor/getcourseru.py @@ -115,7 +115,8 @@ class GetCourseRuIE(InfoExtractor): 'only_matching': True, }] _DOMAINS = [ - 'academymel.online' + 'academymel.online', + 'marafon.mani-beauty.com', ] _BASE_URL_RE = rf'https?://(?:(?!player02\.)[^.]+\.getcourse\.(?:ru|io)|{"|".join(map(re.escape, _DOMAINS))})' _VALID_URL = [ From 9ac446d9aef69d42416f2bf566cd1a7e8c650b18 Mon Sep 17 00:00:00 2001 From: sepro <4618135+seproDev@users.noreply.github.com> Date: Thu, 18 Jan 2024 06:25:48 +0100 Subject: [PATCH 21/38] Use embed regex for consistency to find iframes --- yt_dlp/extractor/getcourseru.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/yt_dlp/extractor/getcourseru.py b/yt_dlp/extractor/getcourseru.py index 61cba91bcc3..e8bc58a4d67 100644 --- a/yt_dlp/extractor/getcourseru.py +++ 
b/yt_dlp/extractor/getcourseru.py @@ -162,9 +162,8 @@ def _real_extract(self, url): title = self._html_extract_title(webpage) return self.playlist_from_matches( - findall(r'data-iframe-src="(https?://player\d{2,}\.getcourse\.ru/sign-player/?\?(?:[^#]+&)?json=[^"]+)', - webpage), - playlist_id, title, ie=GetCourseRuPlayerIE, video_kwargs={ + re.findall(GetCourseRuPlayerIE._EMBED_REGEX[0], webpage), + playlist_id, title, display_id=display_id, ie=GetCourseRuPlayerIE, video_kwargs={ 'url_transparent': True, 'title': title, }) From 7bbcc4e89d8795ce990ec1df190eb864682e1bce Mon Sep 17 00:00:00 2001 From: sepro <4618135+seproDev@users.noreply.github.com> Date: Thu, 18 Jan 2024 06:26:13 +0100 Subject: [PATCH 22/38] Extract og title as fallback. Web interface allows both to be set independently --- yt_dlp/extractor/getcourseru.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/yt_dlp/extractor/getcourseru.py b/yt_dlp/extractor/getcourseru.py index e8bc58a4d67..21e4bc380c1 100644 --- a/yt_dlp/extractor/getcourseru.py +++ b/yt_dlp/extractor/getcourseru.py @@ -159,7 +159,7 @@ def _real_extract(self, url): playlist_id = self._search_regex( r'window\.lessonId\s*=\s*(\d+)', webpage, 'playlist id', default=display_id) - title = self._html_extract_title(webpage) + title = self._html_extract_title(webpage) or self._og_search_title(webpage) return self.playlist_from_matches( re.findall(GetCourseRuPlayerIE._EMBED_REGEX[0], webpage), From d60d40abddc3a7e1e0a8e35ec673fae61fa659bd Mon Sep 17 00:00:00 2001 From: sepro <4618135+seproDev@users.noreply.github.com> Date: Thu, 18 Jan 2024 06:27:03 +0100 Subject: [PATCH 23/38] Fix regex --- yt_dlp/extractor/getcourseru.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/yt_dlp/extractor/getcourseru.py b/yt_dlp/extractor/getcourseru.py index 21e4bc380c1..c25f81036e4 100644 --- a/yt_dlp/extractor/getcourseru.py +++ b/yt_dlp/extractor/getcourseru.py @@ -120,8 +120,8 @@ class GetCourseRuIE(InfoExtractor): ] 
_BASE_URL_RE = rf'https?://(?:(?!player02\.)[^.]+\.getcourse\.(?:ru|io)|{"|".join(map(re.escape, _DOMAINS))})' _VALID_URL = [ - rf'{_BASE_URL_RE}/(?P<id>[^/?#]+)/?(?:[?#]|$)', - rf'{_BASE_URL_RE}/[^?#]+/view/?\?(?:[^#]+&)?id=(?P<id>\d+)', + rf'{_BASE_URL_RE}/(?!pl/|teach/)(?P<id>[^?#]+)', + rf'{_BASE_URL_RE}/(:?pl/)?teach/control/lesson/view\?(?:[^#]+&)?id=(?P<id>\d+)', ] def _login(self, url, username, password): From 799f5a673784406af80ca262d7a921ca7de531f6 Mon Sep 17 00:00:00 2001 From: sepro <4618135+seproDev@users.noreply.github.com> Date: Thu, 18 Jan 2024 06:27:38 +0100 Subject: [PATCH 24/38] Only login if username/password is passed --- yt_dlp/extractor/getcourseru.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/yt_dlp/extractor/getcourseru.py b/yt_dlp/extractor/getcourseru.py index c25f81036e4..50871bba03e 100644 --- a/yt_dlp/extractor/getcourseru.py +++ b/yt_dlp/extractor/getcourseru.py @@ -149,7 +149,8 @@ def _login(self, url, username, password): def _real_extract(self, url): username, password = self._get_login_info() - self._login(url, username, password) + if username: + self._login(url, username, password) if not self._get_cookies(url).get('PHPSESSID5'): self.raise_login_required() From a2d1d067931394a1021d4e646ef29458d29e9491 Mon Sep 17 00:00:00 2001 From: sepro <4618135+seproDev@users.noreply.github.com> Date: Thu, 18 Jan 2024 06:28:58 +0100 Subject: [PATCH 25/38] Refactor login code --- yt_dlp/extractor/getcourseru.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/yt_dlp/extractor/getcourseru.py b/yt_dlp/extractor/getcourseru.py index 50871bba03e..dd3f02c07f3 100644 --- a/yt_dlp/extractor/getcourseru.py +++ b/yt_dlp/extractor/getcourseru.py @@ -43,7 +43,7 @@ def _real_extract(self, url): class GetCourseRuIE(InfoExtractor): _NETRC_MACHINE = 'getcourseru' - _LOGIN_URL_SUFFIX = 'cms/system/login' + _LOGIN_URL_PATH = '/cms/system/login' _TESTS = [{ 'url': 'http://academymel.online/3video_1', 'info_dict': { @@ 
-125,21 +125,21 @@ class GetCourseRuIE(InfoExtractor): ] def _login(self, url, username, password): - parsed_url = urlparse(url) - base_url = f"{parsed_url.scheme}://{parsed_url.netloc}/" + domain = urllib.parse.urlparse(url).netloc + login_url = f'https://{domain}{self._LOGIN_URL_PATH}' - webpage = self._download_webpage(base_url + self._LOGIN_URL_SUFFIX, None) + webpage = self._download_webpage(login_url, None) xdget_id = self._html_search_regex( r']*class="[^"]*state-login[^"]*"[^>]*data-xdget-id="([^"]+)"', webpage, 'xdgetId') self._request_webpage( - base_url + self._LOGIN_URL_SUFFIX, None, 'Logging in', 'Failed to log in', + login_url, None, 'Logging in', 'Failed to log in', data=urlencode_postdata({ 'action': 'processXdget', 'xdgetId': xdget_id, 'params[action]': 'login', - 'params[url]': update_url_query(base_url + self._LOGIN_URL_SUFFIX, {'required': 'true'}), + 'params[url]': login_url, 'params[object_type]': 'cms_page', 'params[object_id]': -1, 'params[email]': username, From 273782d14dc329f080a1f35d6b2579ac6a1fc7e5 Mon Sep 17 00:00:00 2001 From: sepro <4618135+seproDev@users.noreply.github.com> Date: Thu, 18 Jan 2024 06:29:50 +0100 Subject: [PATCH 26/38] Add simple sign to login The site sends this. Seems like a csrf value. Even though it is unchecked, better to send it imo. 
--- yt_dlp/extractor/getcourseru.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/yt_dlp/extractor/getcourseru.py b/yt_dlp/extractor/getcourseru.py index dd3f02c07f3..1cacb0c6483 100644 --- a/yt_dlp/extractor/getcourseru.py +++ b/yt_dlp/extractor/getcourseru.py @@ -133,6 +133,10 @@ def _login(self, url, username, password): r']*class="[^"]*state-login[^"]*"[^>]*data-xdget-id="([^"]+)"', webpage, 'xdgetId') + simple_sign = self._html_search_regex( + r'window.requestSimpleSign\s*=\s*"([\da-f]+)"', + webpage, 'simple sign') + self._request_webpage( login_url, None, 'Logging in', 'Failed to log in', data=urlencode_postdata({ @@ -144,7 +148,8 @@ def _login(self, url, username, password): 'params[object_id]': -1, 'params[email]': username, 'params[password]': password, - 'requestTime': int(time()) + 'requestTime': int(time.time()), + 'requestSimpleSign': simple_sign, })) def _real_extract(self, url): From 17e21cee515d48d28b13844723412f7838102183 Mon Sep 17 00:00:00 2001 From: sepro <4618135+seproDev@users.noreply.github.com> Date: Thu, 18 Jan 2024 06:30:15 +0100 Subject: [PATCH 27/38] Don't abort on no login Some pages exist that can be accessed without logging in --- yt_dlp/extractor/getcourseru.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/yt_dlp/extractor/getcourseru.py b/yt_dlp/extractor/getcourseru.py index 1cacb0c6483..1be80bfb653 100644 --- a/yt_dlp/extractor/getcourseru.py +++ b/yt_dlp/extractor/getcourseru.py @@ -125,6 +125,8 @@ class GetCourseRuIE(InfoExtractor): ] def _login(self, url, username, password): + if self._get_cookies(url).get('PHPSESSID5'): + return domain = urllib.parse.urlparse(url).netloc login_url = f'https://{domain}{self._LOGIN_URL_PATH}' @@ -157,11 +159,12 @@ def _real_extract(self, url): if username: self._login(url, username, password) - if not self._get_cookies(url).get('PHPSESSID5'): + display_id = self._match_id(url) + # NB: 404 is returned due to yt-dlp not properly following 
redirects #9020 + webpage, urlh = self._download_webpage_handle(url, display_id, expected_status=404) + if self._LOGIN_URL_PATH in urlh.url or urlh.status == 404: self.raise_login_required() - display_id = self._match_id(url) - webpage = self._download_webpage(url, display_id) playlist_id = self._search_regex( r'window\.lessonId\s*=\s*(\d+)', webpage, 'playlist id', default=display_id) From 0cf1aa652dbcc62d5bd54636a9c0955181277af2 Mon Sep 17 00:00:00 2001 From: sepro <4618135+seproDev@users.noreply.github.com> Date: Thu, 18 Jan 2024 06:31:19 +0100 Subject: [PATCH 28/38] Extract proper id for pages --- yt_dlp/extractor/getcourseru.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/yt_dlp/extractor/getcourseru.py b/yt_dlp/extractor/getcourseru.py index 1be80bfb653..1f1bec9fa0b 100644 --- a/yt_dlp/extractor/getcourseru.py +++ b/yt_dlp/extractor/getcourseru.py @@ -47,7 +47,8 @@ class GetCourseRuIE(InfoExtractor): _TESTS = [{ 'url': 'http://academymel.online/3video_1', 'info_dict': { - 'id': '3video_1', + 'id': '3059742', + 'display_id': '3video_1', 'title': 'Промоуроки Академии МЕЛ', }, 'playlist_count': 1, @@ -63,7 +64,8 @@ class GetCourseRuIE(InfoExtractor): }, { 'url': 'https://academymel.getcourse.ru/3video_1', 'info_dict': { - 'id': '3video_1', + 'id': '3059742', + 'display_id': '3video_1', 'title': 'Промоуроки Академии МЕЛ', }, 'playlist_count': 1, @@ -166,7 +168,7 @@ def _real_extract(self, url): self.raise_login_required() playlist_id = self._search_regex( - r'window\.lessonId\s*=\s*(\d+)', webpage, 'playlist id', default=display_id) + r'window\.(?:lessonId|gcsObjectId)\s*=\s*(\d+)', webpage, 'playlist id', default=display_id) title = self._html_extract_title(webpage) or self._og_search_title(webpage) From c765ee8f4881550671c00d5d26c6b4406ed4c077 Mon Sep 17 00:00:00 2001 From: sepro <4618135+seproDev@users.noreply.github.com> Date: Thu, 18 Jan 2024 06:31:43 +0100 Subject: [PATCH 29/38] Cleanup imports --- 
yt_dlp/extractor/getcourseru.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/yt_dlp/extractor/getcourseru.py b/yt_dlp/extractor/getcourseru.py index 1f1bec9fa0b..258872db5d0 100644 --- a/yt_dlp/extractor/getcourseru.py +++ b/yt_dlp/extractor/getcourseru.py @@ -1,9 +1,10 @@ -from time import time -from re import escape, findall -from urllib.parse import urlparse +import re +import time +import urllib.parse from .common import InfoExtractor -from ..utils import int_or_none, traverse_obj, update_url_query, url_or_none, urlencode_postdata +from ..utils import int_or_none, url_or_none, urlencode_postdata +from ..utils.traversal import traverse_obj class GetCourseRuPlayerIE(InfoExtractor): From 21c6dfaf13043d87714252209fc80a7130e61b18 Mon Sep 17 00:00:00 2001 From: sepro <4618135+seproDev@users.noreply.github.com> Date: Thu, 18 Jan 2024 17:23:40 +0100 Subject: [PATCH 30/38] Use separate netrc machine per hostname --- yt_dlp/extractor/getcourseru.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/yt_dlp/extractor/getcourseru.py b/yt_dlp/extractor/getcourseru.py index 258872db5d0..6ed4e8a4bab 100644 --- a/yt_dlp/extractor/getcourseru.py +++ b/yt_dlp/extractor/getcourseru.py @@ -127,11 +127,10 @@ class GetCourseRuIE(InfoExtractor): rf'{_BASE_URL_RE}/(:?pl/)?teach/control/lesson/view\?(?:[^#]+&)?id=(?P<id>\d+)', ] - def _login(self, url, username, password): - if self._get_cookies(url).get('PHPSESSID5'): + def _login(self, hostanme, username, password): + if self._get_cookies(f'https://{hostanme}').get('PHPSESSID5'): return - domain = urllib.parse.urlparse(url).netloc - login_url = f'https://{domain}{self._LOGIN_URL_PATH}' + login_url = f'https://{hostanme}{self._LOGIN_URL_PATH}' webpage = self._download_webpage(login_url, None) xdget_id = self._html_search_regex( @@ -158,9 +157,10 @@ def _login(self, url, username, password): })) def _real_extract(self, url): - username, password = self._get_login_info() + hostanme = 
urllib.parse.urlparse(url).hostname + username, password = self._get_login_info(netrc_machine=hostanme) if username: - self._login(url, username, password) + self._login(hostanme, username, password) display_id = self._match_id(url) # NB: 404 is returned due to yt-dlp not properly following redirects #9020 From 624158e0cc73cd4a5d3a6044519d979a7b2be806 Mon Sep 17 00:00:00 2001 From: sepro <4618135+seproDev@users.noreply.github.com> Date: Thu, 18 Jan 2024 17:46:45 +0100 Subject: [PATCH 31/38] typo --- yt_dlp/extractor/getcourseru.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/yt_dlp/extractor/getcourseru.py b/yt_dlp/extractor/getcourseru.py index 6ed4e8a4bab..dcbcef3ee6e 100644 --- a/yt_dlp/extractor/getcourseru.py +++ b/yt_dlp/extractor/getcourseru.py @@ -127,10 +127,10 @@ class GetCourseRuIE(InfoExtractor): rf'{_BASE_URL_RE}/(:?pl/)?teach/control/lesson/view\?(?:[^#]+&)?id=(?P\d+)', ] - def _login(self, hostanme, username, password): - if self._get_cookies(f'https://{hostanme}').get('PHPSESSID5'): + def _login(self, hostname, username, password): + if self._get_cookies(f'https://{hostname}').get('PHPSESSID5'): return - login_url = f'https://{hostanme}{self._LOGIN_URL_PATH}' + login_url = f'https://{hostname}{self._LOGIN_URL_PATH}' webpage = self._download_webpage(login_url, None) xdget_id = self._html_search_regex( @@ -157,10 +157,10 @@ def _login(self, hostanme, username, password): })) def _real_extract(self, url): - hostanme = urllib.parse.urlparse(url).hostname - username, password = self._get_login_info(netrc_machine=hostanme) + hostname = urllib.parse.urlparse(url).hostname + username, password = self._get_login_info(netrc_machine=hostname) if username: - self._login(hostanme, username, password) + self._login(hostname, username, password) display_id = self._match_id(url) # NB: 404 is returned due to yt-dlp not properly following redirects #9020 From e488d1040ae0943fa2a73361eaecebf8ddaedf40 Mon Sep 17 00:00:00 2001 From: 
sepro <4618135+seproDev@users.noreply.github.com> Date: Thu, 18 Jan 2024 21:28:22 +0100 Subject: [PATCH 32/38] Apply suggestions from code review Co-authored-by: bashonly <88596187+bashonly@users.noreply.github.com> --- yt_dlp/extractor/getcourseru.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/yt_dlp/extractor/getcourseru.py b/yt_dlp/extractor/getcourseru.py index dcbcef3ee6e..f6cd55307ca 100644 --- a/yt_dlp/extractor/getcourseru.py +++ b/yt_dlp/extractor/getcourseru.py @@ -20,7 +20,7 @@ class GetCourseRuPlayerIE(InfoExtractor): }, 'skip': 'JWT expired', }] - _EMBED_REGEX = [rf'(?x)]+\bsrc=[\'"](?P{_VALID_URL}[^\'"]*)'] + _EMBED_REGEX = [rf']+\bsrc=[\'"](?P{_VALID_URL}[^\'"]*)'] def _real_extract(self, url): webpage = self._download_webpage(url, None, 'Downloading player page') @@ -134,7 +134,7 @@ def _login(self, hostname, username, password): webpage = self._download_webpage(login_url, None) xdget_id = self._html_search_regex( - r']*class="[^"]*state-login[^"]*"[^>]*data-xdget-id="([^"]+)"', + r']+\bclass="[^"]*\bstate-login[^"]*"[^>]+\bdata-xdget-id="([^"]+)"', webpage, 'xdgetId') simple_sign = self._html_search_regex( From 07c7431951efec57815fa3d78ce0ac4e5b039967 Mon Sep 17 00:00:00 2001 From: sepro <4618135+seproDev@users.noreply.github.com> Date: Thu, 18 Jan 2024 21:30:06 +0100 Subject: [PATCH 33/38] Inline xdgetId and requestSimpleSign --- yt_dlp/extractor/getcourseru.py | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/yt_dlp/extractor/getcourseru.py b/yt_dlp/extractor/getcourseru.py index f6cd55307ca..23ac3563ac9 100644 --- a/yt_dlp/extractor/getcourseru.py +++ b/yt_dlp/extractor/getcourseru.py @@ -131,21 +131,15 @@ def _login(self, hostname, username, password): if self._get_cookies(f'https://{hostname}').get('PHPSESSID5'): return login_url = f'https://{hostname}{self._LOGIN_URL_PATH}' - webpage = self._download_webpage(login_url, None) - xdget_id = self._html_search_regex( - 
r']+\bclass="[^"]*\bstate-login[^"]*"[^>]+\bdata-xdget-id="([^"]+)"', - webpage, 'xdgetId') - - simple_sign = self._html_search_regex( - r'window.requestSimpleSign\s*=\s*"([\da-f]+)"', - webpage, 'simple sign') self._request_webpage( login_url, None, 'Logging in', 'Failed to log in', data=urlencode_postdata({ 'action': 'processXdget', - 'xdgetId': xdget_id, + 'xdgetId': self._html_search_regex( + r']+\bclass="[^"]*\bstate-login[^"]*"[^>]+\bdata-xdget-id="([^"]+)"', + webpage, 'xdgetId'), 'params[action]': 'login', 'params[url]': login_url, 'params[object_type]': 'cms_page', @@ -153,7 +147,8 @@ def _login(self, hostname, username, password): 'params[email]': username, 'params[password]': password, 'requestTime': int(time.time()), - 'requestSimpleSign': simple_sign, + 'requestSimpleSign': self._html_search_regex( + r'window.requestSimpleSign\s*=\s*"([\da-f]+)"', webpage, 'simple sign'), })) def _real_extract(self, url): From cf8ae1df1077e52a2bbbfa3fb7965ca309da74a1 Mon Sep 17 00:00:00 2001 From: sepro <4618135+seproDev@users.noreply.github.com> Date: Thu, 18 Jan 2024 21:48:32 +0100 Subject: [PATCH 34/38] Use correct netrc in error message --- yt_dlp/extractor/getcourseru.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/yt_dlp/extractor/getcourseru.py b/yt_dlp/extractor/getcourseru.py index 23ac3563ac9..2b1b4188786 100644 --- a/yt_dlp/extractor/getcourseru.py +++ b/yt_dlp/extractor/getcourseru.py @@ -3,7 +3,7 @@ import urllib.parse from .common import InfoExtractor -from ..utils import int_or_none, url_or_none, urlencode_postdata +from ..utils import ExtractorError, int_or_none, url_or_none, urlencode_postdata from ..utils.traversal import traverse_obj @@ -161,7 +161,9 @@ def _real_extract(self, url): # NB: 404 is returned due to yt-dlp not properly following redirects #9020 webpage, urlh = self._download_webpage_handle(url, display_id, expected_status=404) if self._LOGIN_URL_PATH in urlh.url or urlh.status == 404: - 
self.raise_login_required() + raise ExtractorError( + f'This video is only available for registered users. {self._login_hint("any", netrc=hostname)}', + expected=True) playlist_id = self._search_regex( r'window\.(?:lessonId|gcsObjectId)\s*=\s*(\d+)', webpage, 'playlist id', default=display_id) From b8677c94ce75e6ecad06a1dc47165fe8c48e44e1 Mon Sep 17 00:00:00 2001 From: "Dr. Steven Strange" Date: Fri, 19 Jan 2024 01:11:52 +0100 Subject: [PATCH 35/38] [GetCourseRuPlayerIE]&[GetCourseRuIE] added on.psbook.ru domain and fixed an ID in test --- yt_dlp/extractor/getcourseru.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/yt_dlp/extractor/getcourseru.py b/yt_dlp/extractor/getcourseru.py index 2b1b4188786..01f25d637e5 100644 --- a/yt_dlp/extractor/getcourseru.py +++ b/yt_dlp/extractor/getcourseru.py @@ -105,7 +105,7 @@ class GetCourseRuIE(InfoExtractor): 'playlist_count': 1, 'playlist': [{ 'info_dict': { - 'id': '4242723', + 'id': '447479687', 'ext': 'mp4', 'title': 'Мотивация к тренировкам', 'thumbnail': 'https://preview-htz.vhcdn.com/preview/70ed5b9f489dd03b4aff55bfdff71a26/preview.jpg?version=1685115787&host=vh-71', @@ -120,6 +120,7 @@ class GetCourseRuIE(InfoExtractor): _DOMAINS = [ 'academymel.online', 'marafon.mani-beauty.com', + 'on.psbook.ru' ] _BASE_URL_RE = rf'https?://(?:(?!player02\.)[^.]+\.getcourse\.(?:ru|io)|{"|".join(map(re.escape, _DOMAINS))})' _VALID_URL = [ From 0db6db349f4465a66306723ca2295cf89faf750f Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Fri, 19 Jan 2024 20:13:15 +0000 Subject: [PATCH 36/38] cosmetic --- yt_dlp/extractor/getcourseru.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/yt_dlp/extractor/getcourseru.py b/yt_dlp/extractor/getcourseru.py index 01f25d637e5..1a265aaf3b8 100644 --- a/yt_dlp/extractor/getcourseru.py +++ b/yt_dlp/extractor/getcourseru.py @@ -44,6 +44,16 @@ def _real_extract(self, url): class 
GetCourseRuIE(InfoExtractor): _NETRC_MACHINE = 'getcourseru' + _DOMAINS = [ + 'academymel.online', + 'marafon.mani-beauty.com', + 'on.psbook.ru' + ] + _BASE_URL_RE = rf'https?://(?:(?!player02\.)[^.]+\.getcourse\.(?:ru|io)|{"|".join(map(re.escape, _DOMAINS))})' + _VALID_URL = [ + rf'{_BASE_URL_RE}/(?!pl/|teach/)(?P[^?#]+)', + rf'{_BASE_URL_RE}/(:?pl/)?teach/control/lesson/view\?(?:[^#]+&)?id=(?P\d+)', + ] _LOGIN_URL_PATH = '/cms/system/login' _TESTS = [{ 'url': 'http://academymel.online/3video_1', @@ -117,16 +127,6 @@ class GetCourseRuIE(InfoExtractor): 'url': 'https://gaismasmandalas.getcourse.io/ATLAUTSEVBUT', 'only_matching': True, }] - _DOMAINS = [ - 'academymel.online', - 'marafon.mani-beauty.com', - 'on.psbook.ru' - ] - _BASE_URL_RE = rf'https?://(?:(?!player02\.)[^.]+\.getcourse\.(?:ru|io)|{"|".join(map(re.escape, _DOMAINS))})' - _VALID_URL = [ - rf'{_BASE_URL_RE}/(?!pl/|teach/)(?P[^?#]+)', - rf'{_BASE_URL_RE}/(:?pl/)?teach/control/lesson/view\?(?:[^#]+&)?id=(?P\d+)', - ] def _login(self, hostname, username, password): if self._get_cookies(f'https://{hostname}').get('PHPSESSID5'): From 6113c3d0866510f9686dee1a7475794027d26a94 Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Fri, 19 Jan 2024 20:19:12 +0000 Subject: [PATCH 37/38] cosmetic pt2 --- yt_dlp/extractor/getcourseru.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/yt_dlp/extractor/getcourseru.py b/yt_dlp/extractor/getcourseru.py index 1a265aaf3b8..d3037c2ea9a 100644 --- a/yt_dlp/extractor/getcourseru.py +++ b/yt_dlp/extractor/getcourseru.py @@ -9,6 +9,7 @@ class GetCourseRuPlayerIE(InfoExtractor): _VALID_URL = r'https?://player02\.getcourse\.ru/sign-player/?\?(?:[^#]+&)?json=[^#&]+' + _EMBED_REGEX = [rf']+\bsrc=[\'"](?P{_VALID_URL}[^\'"]*)'] _TESTS = [{ 'url': 
'http://player02.getcourse.ru/sign-player/?json=eyJ2aWRlb19oYXNoIjoiMTkwYmRmOTNmMWIyOTczNTMwOTg1M2E3YTE5ZTI0YjMiLCJ1c2VyX2lkIjozNTk1MjUxODMsInN1Yl9sb2dpbl91c2VyX2lkIjpudWxsLCJsZXNzb25faWQiOm51bGwsImlwIjoiNDYuMTQyLjE4Mi4yNDciLCJnY19ob3N0IjoiYWNhZGVteW1lbC5vbmxpbmUiLCJ0aW1lIjoxNzA1NDQ5NjQyLCJwYXlsb2FkIjoidV8zNTk1MjUxODMiLCJ1aV9sYW5ndWFnZSI6InJ1IiwiaXNfaGF2ZV9jdXN0b21fc3R5bGUiOnRydWV9&s=354ad2c993d95d5ac629e3133d6cefea&vh-static-feature=zigzag', 'info_dict': { @@ -20,7 +21,6 @@ class GetCourseRuPlayerIE(InfoExtractor): }, 'skip': 'JWT expired', }] - _EMBED_REGEX = [rf']+\bsrc=[\'"](?P{_VALID_URL}[^\'"]*)'] def _real_extract(self, url): webpage = self._download_webpage(url, None, 'Downloading player page') @@ -54,7 +54,6 @@ class GetCourseRuIE(InfoExtractor): rf'{_BASE_URL_RE}/(?!pl/|teach/)(?P[^?#]+)', rf'{_BASE_URL_RE}/(:?pl/)?teach/control/lesson/view\?(?:[^#]+&)?id=(?P\d+)', ] - _LOGIN_URL_PATH = '/cms/system/login' _TESTS = [{ 'url': 'http://academymel.online/3video_1', 'info_dict': { @@ -128,6 +127,8 @@ class GetCourseRuIE(InfoExtractor): 'only_matching': True, }] + _LOGIN_URL_PATH = '/cms/system/login' + def _login(self, hostname, username, password): if self._get_cookies(f'https://{hostname}').get('PHPSESSID5'): return From 9944e98d1cfac887d504854c52256d4952d5dbb1 Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Fri, 19 Jan 2024 20:21:31 +0000 Subject: [PATCH 38/38] reverse title fallback --- yt_dlp/extractor/getcourseru.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/yt_dlp/extractor/getcourseru.py b/yt_dlp/extractor/getcourseru.py index d3037c2ea9a..6fdbcd7366d 100644 --- a/yt_dlp/extractor/getcourseru.py +++ b/yt_dlp/extractor/getcourseru.py @@ -169,8 +169,7 @@ def _real_extract(self, url): playlist_id = self._search_regex( r'window\.(?:lessonId|gcsObjectId)\s*=\s*(\d+)', webpage, 'playlist id', default=display_id) - - title = self._html_extract_title(webpage) or self._og_search_title(webpage) 
+ title = self._og_search_title(webpage) or self._html_extract_title(webpage) return self.playlist_from_matches( re.findall(GetCourseRuPlayerIE._EMBED_REGEX[0], webpage),