From db51f0f9bb5d122d6619a97d050918dc0fbbce9f Mon Sep 17 00:00:00 2001 From: Maxence Date: Wed, 3 Jan 2024 21:15:15 -0500 Subject: [PATCH 01/11] Fix turbo extractor --- yt_dlp/extractor/turbo.py | 78 +++++++++++++++++---------------------- 1 file changed, 34 insertions(+), 44 deletions(-) diff --git a/yt_dlp/extractor/turbo.py b/yt_dlp/extractor/turbo.py index cdb7dcff85a..936937bf5cf 100644 --- a/yt_dlp/extractor/turbo.py +++ b/yt_dlp/extractor/turbo.py @@ -1,64 +1,54 @@ import re from .common import InfoExtractor -from ..compat import compat_str from ..utils import ( - ExtractorError, - int_or_none, - qualities, - xpath_text, + get_element_html_by_class, ) class TurboIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?turbo\.fr/videos-voiture/(?P[0-9]+)-' - _API_URL = 'http://www.turbo.fr/api/tv/xml.php?player_generique=player_generique&id={0:}' + _VALID_URL = r'https?://(?:www\.)?turbo\.fr/((?P[_\d\w]+).xml)?' + _API_URL = 'https://www.viously.com/video/hls/{0:}/index.m3u8' _TEST = { 'url': 'http://www.turbo.fr/videos-voiture/454443-turbo-du-07-09-2014-renault-twingo-3-bentley-continental-gt-speed-ces-guide-achat-dacia.html', - 'md5': '33f4b91099b36b5d5a91f84b5bcba600', + 'md5': '37a6c3381599381ff53a7e1e0575c0bc', 'info_dict': { - 'id': '454443', + 'id': 'F_xQzS2jwb3', 'ext': 'mp4', - 'duration': 3715, - 'title': 'Turbo du 07/09/2014 : Renault Twingo 3, Bentley Continental GT Speed, CES, Guide Achat Dacia... ', + 'title': 'Turbo du 07/09/2014 : Renault Twingo 3, Bentley Continental GT Speed, CES, Guide Achat Dacia...', 'description': 'Turbo du 07/09/2014 : Renault Twingo 3, Bentley Continental GT Speed, CES, Guide Achat Dacia...', - 'thumbnail': r're:^https?://.*\.jpg$', } } - def _real_extract(self, url): - mobj = self._match_valid_url(url) - video_id = mobj.group('id') - - webpage = self._download_webpage(url, video_id) - - playlist = self._download_xml(self._API_URL.format(video_id), video_id) - item = playlist.find('./channel/item') - if item is None: - raise ExtractorError('Playlist item was not found', expected=True) - - title = xpath_text(item, './title', 'title') - duration = int_or_none(xpath_text(item, './durate', 'duration')) - thumbnail = xpath_text(item, './visuel_clip', 'thumbnail') - description = self._html_search_meta('description', webpage) - - formats = [] - get_quality = qualities(['3g', 'sd', 'hq']) - for child in item: - m = re.search(r'url_video_(?P.+)', child.tag) - if m: - quality = compat_str(m.group('quality')) - formats.append({ - 'format_id': quality, - 'url': child.text, - 'quality': get_quality(quality), - }) - + def _entries(self, playlist): + items = playlist.findall('./channel/item') + for item in items: + if item is None or item.find('./link').text is None: + continue + yield self._extract_video(item.find('./link').text) + + def _extract_video(self, url): + webpage = self._download_webpage(url, None) + viously_player = get_element_html_by_class('viously-player-wrapper', webpage) + video_id = self._html_search_regex(r'id="([-_\w]+)"', viously_player, 'video_id') + title = self._html_extract_title(webpage) return { 'id': video_id, 'title': title, - 'duration': duration, - 'thumbnail': thumbnail, - 'description': description, - 'formats': formats, + 'description': title, + 'formats': self._extract_m3u8_formats(self._API_URL.format(video_id), video_id), } + + def _real_extract(self, url): + mobj = self._match_valid_url(url) + playlist_id = mobj.group('playlist') + + if playlist_id and self._yes_playlist(playlist_id, None): + playlist = self._download_xml(url, playlist_id) + return self.playlist_result( + self._entries(playlist), + playlist_id, + playlist_title=playlist.find('./channel/title').text, + ) + + return self._extract_video(url) From d317430cb807cacb7cbc13008126be82a001bd79 Mon Sep 17 00:00:00 2001 From: Maxence Date: Wed, 3 Jan 2024 21:33:46 -0500 Subject: [PATCH 02/11] lint --- yt_dlp/extractor/turbo.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/yt_dlp/extractor/turbo.py b/yt_dlp/extractor/turbo.py index 936937bf5cf..f9c90f046fc 100644 --- a/yt_dlp/extractor/turbo.py +++ b/yt_dlp/extractor/turbo.py @@ -1,5 +1,3 @@ -import re - from .common import InfoExtractor from ..utils import ( get_element_html_by_class, From 8a98f168e4ebcfaf580eee3b2a54b7840e1fae2f Mon Sep 17 00:00:00 2001 From: Maxence Date: Sun, 7 Jan 2024 16:06:46 -0500 Subject: [PATCH 03/11] Add viously embedded extractor --- supportedsites.md | 1 + yt_dlp/extractor/_extractors.py | 1 + yt_dlp/extractor/viously.py | 38 +++++++++++++++++++++++++++++++++ 3 files changed, 40 insertions(+) create mode 100644 yt_dlp/extractor/viously.py diff --git a/supportedsites.md b/supportedsites.md index 96681c16b99..765f50e74e1 100644 --- a/supportedsites.md +++ b/supportedsites.md @@ -1601,6 +1601,7 @@ - **ViMP:Playlist** - **Vine** - **vine:user** + - **viously** - **Viqeo** - **Viu** - **viu:ott**: [*viu*](## "netrc machine") diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index 6f7a1e4f106..2c74af6714b 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -2223,6 +2223,7 @@ VikiIE, VikiChannelIE, ) +from .viously import ViouslyIE from .viqeo import ViqeoIE from .viu import ( ViuIE, diff --git a/yt_dlp/extractor/viously.py b/yt_dlp/extractor/viously.py new file mode 100644 index 00000000000..7761a04bd43 --- /dev/null +++ b/yt_dlp/extractor/viously.py @@ -0,0 +1,38 @@ +from .common import InfoExtractor +from ..utils import ( + get_element_html_by_class, + get_elements_html_by_class, +) + + +class ViouslyIE(InfoExtractor): + _VALID_URL = False + _API_URL = 'https://www.viously.com/video/hls/{0:}/index.m3u8' + _TEST = { + 'url': 'http://www.turbo.fr/videos-voiture/454443-turbo-du-07-09-2014-renault-twingo-3-bentley-continental-gt-speed-ces-guide-achat-dacia.html', + 'md5': '37a6c3381599381ff53a7e1e0575c0bc', + 'info_dict': { + 'id': 'F_xQzS2jwb3', + 'ext': 'mp4', + 'title': 'Turbo du 07/09/2014 : Renault Twingo 3, Bentley Continental GT Speed, CES, Guide Achat Dacia...', + 'description': 'Turbo du 07/09/2014 : Renault Twingo 3, Bentley Continental GT Speed, CES, Guide Achat Dacia...', + 'age_limit': 0, + 'upload_date': str, + 'timestamp': float, + } + } + + def _extract_from_webpage(self, url, webpage): + has_vously_player = get_element_html_by_class('viously-player', webpage) or get_element_html_by_class('vsly-player', webpage) + if not has_vously_player: + return + viously_players = get_elements_html_by_class('viously-player', webpage) + get_elements_html_by_class('vsly-player', webpage) + for viously_player in viously_players: + video_id = self._html_search_regex(r'id="([-_\w]+)"', viously_player, 'video_id') + title = self._html_extract_title(webpage) + yield { + 'id': video_id, + 'title': title, + 'description': title, + 'formats': self._extract_m3u8_formats(self._API_URL.format(video_id), video_id), + } From b657473c27d5fe61e6592b8c4997f0adced9a992 Mon Sep 17 00:00:00 2001 From: Maxence Date: Sun, 7 Jan 2024 16:06:59 -0500 Subject: [PATCH 04/11] Remove Turbo extractor --- supportedsites.md | 1 - yt_dlp/extractor/_extractors.py | 1 - yt_dlp/extractor/turbo.py | 52 --------------------------------- 3 files changed, 54 deletions(-) delete mode 100644 yt_dlp/extractor/turbo.py diff --git a/supportedsites.md b/supportedsites.md index 765f50e74e1..321e93b2d75 100644 --- a/supportedsites.md +++ b/supportedsites.md @@ -1471,7 +1471,6 @@ - **TuneInPodcast** - **TuneInPodcastEpisode** - **TuneInStation** - - **Turbo** - **tv.dfb.de** - **TV2** - **TV2Article** diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index 2c74af6714b..557ff944704 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -2019,7 +2019,6 @@ TuneInPodcastEpisodeIE, TuneInShortenerIE, ) -from .turbo import TurboIE from .tv2 import ( TV2IE, TV2ArticleIE, diff --git a/yt_dlp/extractor/turbo.py b/yt_dlp/extractor/turbo.py deleted file mode 100644 index f9c90f046fc..00000000000 --- a/yt_dlp/extractor/turbo.py +++ /dev/null @@ -1,52 +0,0 @@ -from .common import InfoExtractor -from ..utils import ( - get_element_html_by_class, -) - - -class TurboIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?turbo\.fr/((?P[_\d\w]+).xml)?' - _API_URL = 'https://www.viously.com/video/hls/{0:}/index.m3u8' - _TEST = { - 'url': 'http://www.turbo.fr/videos-voiture/454443-turbo-du-07-09-2014-renault-twingo-3-bentley-continental-gt-speed-ces-guide-achat-dacia.html', - 'md5': '37a6c3381599381ff53a7e1e0575c0bc', - 'info_dict': { - 'id': 'F_xQzS2jwb3', - 'ext': 'mp4', - 'title': 'Turbo du 07/09/2014 : Renault Twingo 3, Bentley Continental GT Speed, CES, Guide Achat Dacia...', - 'description': 'Turbo du 07/09/2014 : Renault Twingo 3, Bentley Continental GT Speed, CES, Guide Achat Dacia...', - } - } - - def _entries(self, playlist): - items = playlist.findall('./channel/item') - for item in items: - if item is None or item.find('./link').text is None: - continue - yield self._extract_video(item.find('./link').text) - - def _extract_video(self, url): - webpage = self._download_webpage(url, None) - viously_player = get_element_html_by_class('viously-player-wrapper', webpage) - video_id = self._html_search_regex(r'id="([-_\w]+)"', viously_player, 'video_id') - title = self._html_extract_title(webpage) - return { - 'id': video_id, - 'title': title, - 'description': title, - 'formats': self._extract_m3u8_formats(self._API_URL.format(video_id), video_id), - } - - def _real_extract(self, url): - mobj = self._match_valid_url(url) - playlist_id = mobj.group('playlist') - - if playlist_id and self._yes_playlist(playlist_id, None): - playlist = self._download_xml(url, playlist_id) - return self.playlist_result( - self._entries(playlist), - playlist_id, - playlist_title=playlist.find('./channel/title').text, - ) - - return self._extract_video(url) From 9a17e3a919a25a97d57d2f7499e69212988c3712 Mon Sep 17 00:00:00 2001 From: Maxence Date: Sun, 7 Jan 2024 16:23:43 -0500 Subject: [PATCH 05/11] fix tests --- yt_dlp/extractor/viously.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/yt_dlp/extractor/viously.py b/yt_dlp/extractor/viously.py index 7761a04bd43..0f9b8ca7a05 100644 --- a/yt_dlp/extractor/viously.py +++ b/yt_dlp/extractor/viously.py @@ -8,7 +8,7 @@ class ViouslyIE(InfoExtractor): _VALID_URL = False _API_URL = 'https://www.viously.com/video/hls/{0:}/index.m3u8' - _TEST = { + _WEBPAGE_TESTS = [{ 'url': 'http://www.turbo.fr/videos-voiture/454443-turbo-du-07-09-2014-renault-twingo-3-bentley-continental-gt-speed-ces-guide-achat-dacia.html', 'md5': '37a6c3381599381ff53a7e1e0575c0bc', 'info_dict': { @@ -20,7 +20,7 @@ class ViouslyIE(InfoExtractor): 'upload_date': str, 'timestamp': float, } - } + }] def _extract_from_webpage(self, url, webpage): has_vously_player = get_element_html_by_class('viously-player', webpage) or get_element_html_by_class('vsly-player', webpage) From 78583331111bf26017d56cc294d508f802eb2c05 Mon Sep 17 00:00:00 2001 From: Maxence Date: Sun, 7 Jan 2024 23:30:30 -0500 Subject: [PATCH 06/11] Revert supportedsites.md --- supportedsites.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/supportedsites.md b/supportedsites.md index 321e93b2d75..96681c16b99 100644 --- a/supportedsites.md +++ b/supportedsites.md @@ -1471,6 +1471,7 @@ - **TuneInPodcast** - **TuneInPodcastEpisode** - **TuneInStation** + - **Turbo** - **tv.dfb.de** - **TV2** - **TV2Article** @@ -1600,7 +1601,6 @@ - **ViMP:Playlist** - **Vine** - **vine:user** - - **viously** - **Viqeo** - **Viu** - **viu:ott**: [*viu*](## "netrc machine") From d7c1751bb67151ed26611f2bd7cac52ad092a9d7 Mon Sep 17 00:00:00 2001 From: Maxence Date: Sun, 7 Jan 2024 23:40:56 -0500 Subject: [PATCH 07/11] Use viously api to get video info --- yt_dlp/extractor/viously.py | 42 +++++++++++++++++++++++++++---------- 1 file changed, 31 insertions(+), 11 deletions(-) diff --git a/yt_dlp/extractor/viously.py b/yt_dlp/extractor/viously.py index 0f9b8ca7a05..4138965a058 100644 --- a/yt_dlp/extractor/viously.py +++ b/yt_dlp/extractor/viously.py @@ -1,13 +1,17 @@ +import base64 + from .common import InfoExtractor from ..utils import ( - get_element_html_by_class, + extract_attributes, get_elements_html_by_class, + int_or_none, + parse_iso8601, ) +from ..utils.traversal import traverse_obj class ViouslyIE(InfoExtractor): _VALID_URL = False - _API_URL = 'https://www.viously.com/video/hls/{0:}/index.m3u8' _WEBPAGE_TESTS = [{ 'url': 'http://www.turbo.fr/videos-voiture/454443-turbo-du-07-09-2014-renault-twingo-3-bentley-continental-gt-speed-ces-guide-achat-dacia.html', 'md5': '37a6c3381599381ff53a7e1e0575c0bc', @@ -23,16 +27,32 @@ class ViouslyIE(InfoExtractor): }] def _extract_from_webpage(self, url, webpage): - has_vously_player = get_element_html_by_class('viously-player', webpage) or get_element_html_by_class('vsly-player', webpage) - if not has_vously_player: - return viously_players = get_elements_html_by_class('viously-player', webpage) + get_elements_html_by_class('vsly-player', webpage) - for viously_player in viously_players: - video_id = self._html_search_regex(r'id="([-_\w]+)"', viously_player, 'video_id') - title = self._html_extract_title(webpage) + if not viously_players: + return + + def custom_decode(text): + STANDARD_ALPHABET = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/=' + CUSTOM_ALPHABET = 'VIOUSLYABCDEFGHJKMNPQRTWXZviouslyabcdefghjkmnpqrtwxz9876543210+/=' + data = base64.b64decode(text.translate(str.maketrans(CUSTOM_ALPHABET, STANDARD_ALPHABET))) + return data.decode('utf-8').strip('\x00') + + for video_id in traverse_obj(viously_players, (..., {extract_attributes}, 'id')): + formats = self._extract_m3u8_formats( + f'https://www.viously.com/video/hls/{video_id}/index.m3u8', video_id, fatal=False) + data = self._download_json( + f'https://www.viously.com/export/json/{video_id}', video_id, + transform_source=custom_decode, fatal=False) + if not formats or not data: + continue yield { 'id': video_id, - 'title': title, - 'description': title, - 'formats': self._extract_m3u8_formats(self._API_URL.format(video_id), video_id), + 'formats': formats, + **traverse_obj(data, ('video', { + 'title': 'title', + 'description': 'description', + 'duration': ('duration', {int_or_none}), + 'timestamp': ('iso_date', {parse_iso8601}), + 'categories': ('category', {lambda x: [x['name']]}), + })), } From 1d07a96cf5d0c7ddf0ff1156a351e8af1fc50cd5 Mon Sep 17 00:00:00 2001 From: Maxence Date: Mon, 8 Jan 2024 19:07:42 -0500 Subject: [PATCH 08/11] use re.findall to find players --- yt_dlp/extractor/viously.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/yt_dlp/extractor/viously.py b/yt_dlp/extractor/viously.py index 4138965a058..43d9960590c 100644 --- a/yt_dlp/extractor/viously.py +++ b/yt_dlp/extractor/viously.py @@ -1,9 +1,9 @@ import base64 +import re from .common import InfoExtractor from ..utils import ( extract_attributes, - get_elements_html_by_class, int_or_none, parse_iso8601, ) @@ -27,7 +27,7 @@ class ViouslyIE(InfoExtractor): }] def _extract_from_webpage(self, url, webpage): - viously_players = get_elements_html_by_class('viously-player', webpage) + get_elements_html_by_class('vsly-player', webpage) + viously_players = re.findall(r']*class="(?:[^"]*\s)?v(?:iou)?sly-player(?:\s[^"]*)?"[^>]*>', webpage) if not viously_players: return From 9e3beb1bfa7f44b0682643af3eb9f4fb30e6204a Mon Sep 17 00:00:00 2001 From: Maxence Date: Mon, 8 Jan 2024 19:08:33 -0500 Subject: [PATCH 09/11] Update tests --- yt_dlp/extractor/viously.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/yt_dlp/extractor/viously.py b/yt_dlp/extractor/viously.py index 43d9960590c..3fe41cd4db3 100644 --- a/yt_dlp/extractor/viously.py +++ b/yt_dlp/extractor/viously.py @@ -18,11 +18,13 @@ class ViouslyIE(InfoExtractor): 'info_dict': { 'id': 'F_xQzS2jwb3', 'ext': 'mp4', - 'title': 'Turbo du 07/09/2014 : Renault Twingo 3, Bentley Continental GT Speed, CES, Guide Achat Dacia...', - 'description': 'Turbo du 07/09/2014 : Renault Twingo 3, Bentley Continental GT Speed, CES, Guide Achat Dacia...', + 'title': 'Turbo du 07/09/2014\xa0: Renault Twingo 3, Bentley Continental GT Speed, CES, Guide Achat Dacia...', + 'description': 'Turbo du 07/09/2014\xa0: Renault Twingo 3, Bentley Continental GT Speed, CES, Guide Achat Dacia...', 'age_limit': 0, - 'upload_date': str, - 'timestamp': float, + 'upload_date': '20230328', + 'timestamp': 1680037507, + 'duration': 3716, + 'categories': ['motors'], } }] From b2575bcb1124837e19f84dbcd0c6fc42be0d0930 Mon Sep 17 00:00:00 2001 From: Maxence Date: Mon, 8 Jan 2024 20:53:02 -0500 Subject: [PATCH 10/11] do extract if metadata not available --- yt_dlp/extractor/viously.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/yt_dlp/extractor/viously.py b/yt_dlp/extractor/viously.py index 3fe41cd4db3..ce2f5ee01ee 100644 --- a/yt_dlp/extractor/viously.py +++ b/yt_dlp/extractor/viously.py @@ -42,11 +42,11 @@ def custom_decode(text): for video_id in traverse_obj(viously_players, (..., {extract_attributes}, 'id')): formats = self._extract_m3u8_formats( f'https://www.viously.com/video/hls/{video_id}/index.m3u8', video_id, fatal=False) + if not formats: + continue data = self._download_json( f'https://www.viously.com/export/json/{video_id}', video_id, transform_source=custom_decode, fatal=False) - if not formats or not data: - continue yield { 'id': video_id, 'formats': formats, From acafd165ef92469db1ffff93828087610a9aec23 Mon Sep 17 00:00:00 2001 From: Maxence Date: Mon, 8 Jan 2024 20:54:11 -0500 Subject: [PATCH 11/11] Fix returned dict --- yt_dlp/extractor/viously.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/yt_dlp/extractor/viously.py b/yt_dlp/extractor/viously.py index ce2f5ee01ee..9ec7ed35f5d 100644 --- a/yt_dlp/extractor/viously.py +++ b/yt_dlp/extractor/viously.py @@ -51,10 +51,10 @@ def custom_decode(text): 'id': video_id, 'formats': formats, **traverse_obj(data, ('video', { - 'title': 'title', - 'description': 'description', + 'title': ('title', {str}), + 'description': ('description', {str}), 'duration': ('duration', {int_or_none}), 'timestamp': ('iso_date', {parse_iso8601}), - 'categories': ('category', {lambda x: [x['name']]}), + 'categories': ('category', 'name', {str}, {lambda x: [x] if x else None}), })), }