Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[GetCourseRuIE] & [AcademyMel] Add extractor #8873

Merged
merged 39 commits into from Jan 19, 2024
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
Show all changes
39 commits
Select commit Hold shift + click to select a range
b48082a
[GetCourseRuIE] Add extractor
Dec 29, 2023
548d724
[AcademyMelIE] Add extractor; [GetCourseRuIE] extractor fixes
Jan 5, 2024
0763788
Merge branch 'yt-dlp:master' into getcourse-ru
divStar Jan 5, 2024
28e4eb2
[AcademyMelIE] extractor fixes to allow multiple videos per site
Jan 5, 2024
c3af377
[AcademyMelIE]&[GetCourseRuIE] extractor fixes
Jan 14, 2024
e90283e
[AcademyMelIE]&[GetCourseRuIE] removed unused variables and to_screen…
Jan 14, 2024
14f69f7
[AcademyMelIE]&[GetCourseRuIE] implemented review remarks
Jan 14, 2024
72f9a57
[ie/academymel] Cleanup
bashonly Jan 14, 2024
965935f
[ie/getcourseru] Cleanup
bashonly Jan 14, 2024
8a54a76
[ie/getcourseru] More lenient `_VALID_URL`
bashonly Jan 14, 2024
effa2ea
[AcademyMelIE]&[GetCourseRuIE] test the playlist
Jan 14, 2024
7759ab6
[GetCourseRuPlayerIE]&[GetCourseRuIE] adding more generic getcourse.r…
Jan 17, 2024
2323a99
[GetCourseRuPlayerIE]&[GetCourseRuIE] fixing URLs and playlist_id
Jan 18, 2024
24bb540
[GetCourseRuPlayerIE]&[GetCourseRuIE] fixing flake8 remarks, login-re…
Jan 18, 2024
b0302b5
[GetCourseRuPlayerIE]&[GetCourseRuIE] xdgetId is now parsed from logi…
Jan 18, 2024
7149abe
[GetCourseRuPlayerIE]&[GetCourseRuIE] added another test-case, that w…
Jan 18, 2024
36c6dd1
Only match exactly player02 and add getcourse.io
seproDev Jan 18, 2024
b5748f6
Add Embed regex
seproDev Jan 18, 2024
d7fa3c4
Use gcFileId as this is also used in webpage
seproDev Jan 18, 2024
d7b18d9
Add getcourse.io test
seproDev Jan 18, 2024
db82da7
Add marafon.mani-beauty.com domain
seproDev Jan 18, 2024
9ac446d
Use embed regex for consistency to find iframes
seproDev Jan 18, 2024
7bbcc4e
Extract og title as fallback.
seproDev Jan 18, 2024
d60d40a
Fix regex
seproDev Jan 18, 2024
799f5a6
Only login if username/password is passed
seproDev Jan 18, 2024
a2d1d06
Refactor login code
seproDev Jan 18, 2024
273782d
Add simple sign to login
seproDev Jan 18, 2024
17e21ce
Don't abort on no login
seproDev Jan 18, 2024
0cf1aa6
Extract proper id for pages
seproDev Jan 18, 2024
c765ee8
Cleanup imports
seproDev Jan 18, 2024
21c6dfa
Use separate netrc machine per hostname
seproDev Jan 18, 2024
624158e
typo
seproDev Jan 18, 2024
e488d10
Apply suggestions from code review
seproDev Jan 18, 2024
07c7431
Inline xdgetId and requestSimpleSign
seproDev Jan 18, 2024
cf8ae1d
Use correct netrc in error message
seproDev Jan 18, 2024
b8677c9
[GetCourseRuPlayerIE]&[GetCourseRuIE] added on.psbook.ru domain and f…
Jan 19, 2024
0db6db3
cosmetic
bashonly Jan 19, 2024
6113c3d
cosmetic pt2
bashonly Jan 19, 2024
9944e98
reverse title fallback
bashonly Jan 19, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
4 changes: 4 additions & 0 deletions yt_dlp/extractor/_extractors.py
Expand Up @@ -42,6 +42,7 @@
AbemaTVTitleIE,
)
from .academicearth import AcademicEarthCourseIE
from .academymel import AcademyMelIE
from .acast import (
ACastIE,
ACastChannelIE,
Expand Down Expand Up @@ -680,6 +681,9 @@
GeniusIE,
GeniusLyricsIE,
)
from .getcourseru import (
GetCourseRuIE
)
divStar marked this conversation as resolved.
Show resolved Hide resolved
from .gettr import (
GettrIE,
GettrStreamingIE,
Expand Down
86 changes: 86 additions & 0 deletions yt_dlp/extractor/academymel.py
@@ -0,0 +1,86 @@
import re
import time

from datetime import datetime
from .common import InfoExtractor
from ..utils import urlencode_postdata, ExtractorError
divStar marked this conversation as resolved.
Show resolved Hide resolved


class AcademyMelIE(InfoExtractor):
    """Extractor for pages on academymel.online.

    A page is scanned for embedded GetCourse players (``data-iframe-src``
    attributes pointing at ``*.getcourse.ru/sign-player/``). Every distinct
    player URL is delegated to the ``GetCourseRu`` extractor and the results
    are returned together as one playlist.
    """

    # NOTE(review): account credentials are committed to the source here.
    # They are kept only so existing references keep working; consider moving
    # them out of the repository entirely.
    _TEST_EMAIL = 'meriat@jaga.email'  # use this as username in the test/local_parameters.json if running the test
    _TEST_PASSWORD = 'bBY-ccbp$8'  # use this as password in the test/local_parameters.json if running the test

    # Not referenced anywhere inside this class.
    _CACHE_KEY = 'academymel'
    _CACHE_SUBKEY = 'login-cookie-header'

    _NETRC_MACHINE = 'academymel'
    _LOGIN_URL = 'https://academymel.online/cms/system/login'
    _VALID_URL = r'^https?:\/\/academymel\.online\/(?P<url>.*)$'

    _TESTS = [{
        'url': 'http://academymel.online/3video_1',
        'info_dict': {
            'id': '4885302',
            'title': 'Промоуроки Академии МЕЛ',
            'ext': 'mp4',
            'duration': 1693
        }
    }]

    def _perform_login(self, username, password):
        """POST the login form to ``_LOGIN_URL`` with the given credentials.

        The response body is not inspected; a failed request is fatal.
        """
        form = urlencode_postdata({
            'action': 'processXdget',
            'xdgetId': 'r6335_1_1',
            'params[action]': 'login',
            'params[url]': 'http://academymel.online/cms/system/login?required=true',
            'params[object_type]': 'cms_page',
            'params[object_id]': -1,
            'params[email]': username,
            'params[password]': password,
            # Current Unix time in whole seconds.
            'requestTime': int(time.time()),
        })

        self._request_webpage(
            self._LOGIN_URL, None, data=form,
            note='Logging into the academymel.online',
            errnote='Failed to log in into academymel.online',
            fatal=True)

    def playlist_from_entries(self, entries, valid_url):
        """Wrap *entries* in a playlist result stamped with the current time.

        @param entries    Entries to put into the playlist.
        @param valid_url  URL the playlist was built from; only used in the
                          description. A ``re.Match`` is accepted as well and
                          reduced to the text it matched.
        @returns          The value of ``self.playlist_result(...)``.
        """
        now = int(time.time())
        stamp = datetime.fromtimestamp(now).strftime('%d.%m.%Y, %H:%M')

        # A match object would otherwise be rendered as '<re.Match object ...>'
        # inside the description.
        if isinstance(valid_url, re.Match):
            valid_url = valid_url.group(0)

        return self.playlist_result(
            entries,
            playlist_id='academymel-playlist-%d' % now,
            playlist_title='AcademyMel playlist (%s)' % stamp,
            playlist_description='AcademyMel playlist for %s (at %s)' % (valid_url, stamp))

    def _real_extract(self, url):
        """Return a playlist of all GetCourse players embedded in the page at *url*.

        Raises ExtractorError if *url* does not match ``_VALID_URL``.
        """
        if not self._match_valid_url(url):
            raise ExtractorError('Invalid URL found', expected=True)

        webpage = self._download_webpage(
            url, None, fatal=True,
            note='Downloading video website',
            errnote='Failed to download video website')

        # Non-greedy so a second </title> later on the same line cannot be
        # swallowed; non-fatal because the title is only optional metadata.
        title = self._search_regex(
            r'<title>(?P<title>.*?)</title>', webpage, 'title', fatal=False)

        # dict.fromkeys drops repeated player URLs while keeping page order.
        # '[^"]*' stops at the closing quote, so no regex flags are needed.
        player_urls = dict.fromkeys(re.findall(
            r'data-iframe-src="(?P<url>https?://[^/]+\.getcourse\.ru/sign-player/\?[^"]*)"',
            webpage))

        entries = [
            self.url_result(player_url, 'GetCourseRu', url_transparent=True, title=title)
            for player_url in player_urls
        ]

        # Pass the URL string, not the match object, so the description is readable.
        return self.playlist_from_entries(entries, url)
divStar marked this conversation as resolved.
Show resolved Hide resolved
58 changes: 58 additions & 0 deletions yt_dlp/extractor/getcourseru.py
@@ -0,0 +1,58 @@
from .common import InfoExtractor
from ..utils import ExtractorError
divStar marked this conversation as resolved.
Show resolved Hide resolved


class GetCourseRuIE(InfoExtractor):
    """Extractor for signed GetCourse player pages (``*.getcourse.ru/sign-player/``).

    The player page embeds a ``window.configs`` JSON object; the HLS master
    playlist URL and the video metadata are read from it.
    """

    _NETRC_MACHINE = 'getcourseru'
    _VALID_URL = r'^https?:\/\/[^\/]+\.getcourse\.ru\/sign-player\/\?.*$'

    _TESTS = [{
        'url': 'http://player02.getcourse.ru/sign-player/?json=eyJ2aWRlb19oYXNoIjoiZTJlZWE3MTI5ZDk3OWQzYzYzMDYzMDUzOGJkMzZlZjEiLCJ1c2VyX2lkIjozNTc3NjY5NjIsInN1Yl9sb2dpbl91c2VyX2lkIjpudWxsLCJsZXNzb25faWQiOm51bGwsImlwIjoiNDYuMTQyLjE4My44NSIsImdjX2hvc3QiOiJhY2FkZW15bWVsLm9ubGluZSIsInRpbWUiOjE3MDM4MDY1NzksInBheWxvYWQiOiJ1XzM1Nzc2Njk2MiIsInVpX2xhbmd1YWdlIjoicnUiLCJpc19oYXZlX2N1c3RvbV9zdHlsZSI6dHJ1ZX0=&s=a2ed5bd648a2ae7a4f7684abe815ec7a',
        'info_dict': {
            'id': 'master.m3u8?user-cdn=cdnvideo&acc-id=714517&user-id=357766962&loc-mode=ru&version=10:2:1:0:2:cdnvideo&consumer=vod&jwt=eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJ1c2VyLWlkIjozNTc3NjY5NjJ9',
            'title': 'master',
            'ext': 'mp4',
            'duration': 1871
            # note: the original URL is necessary to obtain an up-to-date URL, because the URL is always changing
        },
        'skip': 'Requires authentication',
        'note': 'This extractor is used by AcademyMel extractor, which has a login feature'
    }]

    def _real_extract(self, url):
        """Build the info dict for the player page at *url*.

        Raises ExtractorError if *url* does not match ``_VALID_URL`` or the
        page configuration carries no master playlist URL.
        """
        if not self._match_valid_url(url):
            raise ExtractorError('Invalid URL found', expected=True)

        webpage = self._download_webpage(
            url, None, fatal=True,
            note='Retrieving metadata...',
            errnote='Failed to retrieve metadata')

        window_configs = self._search_json(
            r'window\.configs\s*=\s*', webpage, 'config', video_id=None, fatal=True)

        video_id = str(window_configs.get('videoId'))
        video_hash = window_configs.get('videoHash')
        master_playlist_url = window_configs.get('masterPlaylistUrl')
        # The original code logged 'previewUrl' but returned 'thumbnailUrl';
        # try both so whichever key the page provides is used.
        thumbnail = window_configs.get('thumbnailUrl') or window_configs.get('previewUrl')

        # Diagnostic only (and it contains the signed playlist URL), so keep it
        # out of regular output.
        self.write_debug(
            'videoId: %s, videoHash: %s, masterPlaylistUrl: %s, thumbnail_url: %s'
            % (video_id, video_hash, master_playlist_url, thumbnail))

        if not master_playlist_url:
            # Fail with a clear message instead of handing None to the m3u8 code.
            raise ExtractorError('Unable to find the master playlist URL in the player config')

        formats, subtitles = self._extract_m3u8_formats_and_subtitles(
            master_playlist_url, video_id)

        # Duration is optional metadata: a missing or malformed value must not
        # abort extraction.
        try:
            duration = int(window_configs.get('videoDuration'))
        except (TypeError, ValueError):
            duration = None

        return {
            'id': video_id,
            'title': video_hash,
            'thumbnail': thumbnail,
            'duration': duration,
            'formats': formats,
            'subtitles': subtitles,
        }
divStar marked this conversation as resolved.
Show resolved Hide resolved