From 19aa313a37eb803a2d497b9ca128cce4b3116902 Mon Sep 17 00:00:00 2001 From: sepro <4618135+seproDev@users.noreply.github.com> Date: Thu, 19 Oct 2023 14:26:27 +0200 Subject: [PATCH] Rewrite OnDemandKorea Extractor --- yt_dlp/extractor/_extractors.py | 5 +- yt_dlp/extractor/ondemandkorea.py | 192 ++++++++++++++++++++---------- 2 files changed, 131 insertions(+), 66 deletions(-) diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index ca457118282..2f0523a6de4 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -1387,7 +1387,10 @@ from .oktoberfesttv import OktoberfestTVIE from .olympics import OlympicsReplayIE from .on24 import On24IE -from .ondemandkorea import OnDemandKoreaIE +from .ondemandkorea import ( + OnDemandKoreaIE, + OnDemandKoreaProgramIE, +) from .onefootball import OneFootballIE from .onenewsnz import OneNewsNZIE from .oneplace import OnePlacePodcastIE diff --git a/yt_dlp/extractor/ondemandkorea.py b/yt_dlp/extractor/ondemandkorea.py index dd7d1d7dead..de9599b140d 100644 --- a/yt_dlp/extractor/ondemandkorea.py +++ b/yt_dlp/extractor/ondemandkorea.py @@ -1,87 +1,149 @@ +import functools import re from .common import InfoExtractor +from ..networking import HEADRequest from ..utils import ( ExtractorError, - js_to_json, + float_or_none, + int_or_none, + join_nonempty, + OnDemandPagedList, + parse_age_limit, + parse_qs, + random_uuidv4, + unified_strdate, + url_or_none, ) +from ..utils.traversal import traverse_obj class OnDemandKoreaIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?ondemandkorea\.com/(?P<id>[^/]+)\.html' + _VALID_URL = r'https?://(?:www\.)?ondemandkorea\.com/(?:en/)?player/vod/[a-z0-9-]+\?(?:[^#]+&)?contentId=(?P<id>\d+)' _GEO_COUNTRIES = ['US', 'CA'] + _TESTS = [{ - 'url': 'https://www.ondemandkorea.com/ask-us-anything-e351.html', + 'url': 'https://www.ondemandkorea.com/player/vod/ask-us-anything?contentId=686471', + 'md5': 'e2ff77255d989e3135bde0c5889fbce8', 'info_dict': { - 
'id': 'ask-us-anything-e351', + 'id': '686471', 'ext': 'mp4', - 'title': 'Ask Us Anything : Jung Sung-ho, Park Seul-gi, Kim Bo-min, Yang Seung-won - 09/24/2022', - 'description': 'A talk show/game show with a school theme where celebrity guests appear as “transfer students.”', - 'thumbnail': r're:^https?://.*\.jpg$', + 'title': 'Ask Us Anything: Jung Sung-ho, Park Seul-gi, Kim Bo-min, Yang Seung-won', + 'thumbnail': 'https://sp.ondemandkorea.com/wp-content/themes/ondemandkorea/uploads/thumbnail/1891035_20220924_1.jpg', + 'duration': 5486.955, + 'release_date': '20220924', + 'series': 'Ask Us Anything', + 'series_id': 11790, + 'episode_number': 351, + 'episode': 'Jung Sung-ho, Park Seul-gi, Kim Bo-min, Yang Seung-won', }, - 'params': { - 'skip_download': 'm3u8 download' - } }, { - 'url': 'https://www.ondemandkorea.com/work-later-drink-now-e1.html', + 'url': 'https://www.ondemandkorea.com/en/player/vod/joint-security-area?contentId=464622', + 'md5': '44e274d2b04977e03fc7f3941fbcb355', 'info_dict': { - 'id': 'work-later-drink-now-e1', + 'id': '464622', 'ext': 'mp4', - 'title': 'Work Later, Drink Now : E01', - 'description': 'Work Later, Drink First follows three women who find solace in a glass of liquor at the end of the day. So-hee, who gets comfort from a cup of soju af', - 'thumbnail': r're:^https?://.*\.png$', - 'subtitles': { - 'English': 'mincount:1', - }, + 'title': 'Joint Security Area: Main Movie', + 'thumbnail': 'https://sp.ondemandkorea.com/wp-content/themes/ondemandkorea/uploads/thumbnail/jsa.1080p.4896k_3410.901645.jpg', + 'age_limit': 15, + 'duration': 6525.0, + 'release_date': '20200114', }, - 'params': { - 'skip_download': 'm3u8 download' - } }] def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id, fatal=False) - - if not webpage: - # Page sometimes returns captcha page with HTTP 403 - raise ExtractorError( - 'Unable to access page. 
You may have been blocked.', - expected=True) - - if 'msg_block_01.png' in webpage: - self.raise_geo_restricted( - msg='This content is not available in your region', - countries=self._GEO_COUNTRIES) - - if 'This video is only available to ODK PLUS members.' in webpage: - raise ExtractorError( - 'This video is only available to ODK PLUS members.', - expected=True) - - if 'ODK PREMIUM Members Only' in webpage: - raise ExtractorError( - 'This video is only available to ODK PREMIUM members.', - expected=True) - - title = self._search_regex( - r'class=["\']episode_title["\'][^>]*>([^<]+)', - webpage, 'episode_title', fatal=False) or self._og_search_title(webpage) - - jw_config = self._parse_json( - self._search_regex(( - r'(?P<options>{\s*[\'"]tracks[\'"].*?})[)\];]+$', - r'playlist\s*=\s*\[(?P<options>.+)];?$', - r'odkPlayer\.init.*?(?P<options>{[^;]+}).*?;', - ), webpage, 'jw config', flags=re.MULTILINE | re.DOTALL, group='options'), - video_id, transform_source=js_to_json) - info = self._parse_jwplayer_data( - jw_config, video_id, require_title=False, m3u8_id='hls', - base_url=url) - - info.update({ - 'title': title, - 'description': self._og_search_description(webpage), - 'thumbnail': self._og_search_thumbnail(webpage) - }) - return info + + data = self._download_json(f'https://odkmedia.io/odx/api/v3/playback/{video_id}/', video_id, + fatal=False, headers={'service-name': 'odk'}, + query={'did': random_uuidv4()}, expected_status=(200, 403)) + if not data.get('result'): + raise ExtractorError(traverse_obj(data, ('messages', '__default'), 'title'), expected=True) + + potential_urls = traverse_obj(data, ('result', 'sources', ..., 'url'), ('result', 'manifests', ..., 'url')) + # Try to bypass geo-restricted ad proxy + potential_urls = [ + alt_url if (alt_url := traverse_obj(url, ({parse_qs}, 'stream_url', 0, {url_or_none}))) else url + for url in potential_urls + ] + # Try to upgrade quality + potential_urls = [ + mod_url if self._request_webpage( + HEADRequest(mod_url := 
re.sub(r'_720(p?)\.m3u8', r'_1080\1.m3u8', url)), video_id, + note='Checking if higher quality format is available', fatal=False) else url + for url in potential_urls + ] + + formats = [] + for url in potential_urls: + formats.extend(self._extract_m3u8_formats(url, video_id, fatal=False)) + + subtitles = {} + for track in traverse_obj(data, ('result', 'text_tracks', lambda _, v: url_or_none(v['url']))): + subtitles.setdefault(track.get('language', 'und'), []).append({ + 'url': track['url'], + 'ext': track.get('codec'), + 'name': track.get('label'), + }) + + return { + 'id': video_id, + 'title': join_nonempty( + ('result', 'episode', 'program', 'title'), + ('result', 'episode', 'title'), from_dict=data, delim=': '), + **traverse_obj(data, ('result', { + 'thumbnail': ('episode', 'images', 'thumbnail', {url_or_none}), + 'release_date': ('episode', 'release_date', {lambda x: x.replace('-', '')}, {unified_strdate}), + 'duration': ('duration', {functools.partial(float_or_none, scale=1000)}), + 'age_limit': ('age_rating', 'name', {lambda x: x.replace('R', '')}, {parse_age_limit}), + 'series': ('episode', {lambda x: x['program'] if x['kind'] == 'series' else None}, 'title'), + 'series_id': ('episode', {lambda x: x['program'] if x['kind'] == 'series' else None}, 'id'), + 'episode': ('episode', {lambda x: x['title'] if x['kind'] == 'series' else None},), + 'episode_number': ('episode', {lambda x: x['number'] if x['kind'] == 'series' else None}, {int_or_none}), + }), get_all=False), + 'formats': formats, + 'subtitles': subtitles, + } + + +class OnDemandKoreaProgramIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?ondemandkorea\.com/(?:en/)?player/vod/(?P<id>[a-z0-9-]+)(?:$|[?#])' + _GEO_COUNTRIES = ['US', 'CA'] + + _TESTS = [{ + 'url': 'https://www.ondemandkorea.com/player/vod/uskn-news', + 'info_dict': { + 'id': 'uskn-news', + }, + 'playlist_count': 755, + }, { + 'url': 'https://www.ondemandkorea.com/en/player/vod/joint-security-area', + 'info_dict': { + 'id': 
'joint-security-area', + }, + 'playlist_count': 2, + }] + + _PAGE_SIZE = 100 + + def _fetch_page(self, display_id, page): + page += 1 + page_data = self._download_json( + f'https://odkmedia.io/odx/api/v3/program/{display_id}/episodes/', display_id, + headers={'service-name': 'odk'}, query={ + 'page': page, + 'page_size': self._PAGE_SIZE, + }, note=f'Downloading page {page}') + for episode in traverse_obj(page_data, ('result', 'results')): + yield self.url_result( + f'https://www.ondemandkorea.com/player/vod/{display_id}?contentId={episode["id"]}', + ie=OnDemandKoreaIE, video_title=episode.get('title') + ) + + def _real_extract(self, url): + display_id = self._match_id(url) + + entries = OnDemandPagedList(functools.partial( + self._fetch_page, display_id), self._PAGE_SIZE) + + return self.playlist_result(entries, display_id)