From 0a52904073961d2fc21dcdd75ec0b8c36302fb5a Mon Sep 17 00:00:00 2001 From: pasha Date: Wed, 6 Sep 2023 07:03:20 -0700 Subject: [PATCH 1/3] fixed funker530 rumble embed link extraction and video download --- yt_dlp/extractor/funker530.py | 123 +++++++++++++++++++--------------- 1 file changed, 70 insertions(+), 53 deletions(-) diff --git a/yt_dlp/extractor/funker530.py b/yt_dlp/extractor/funker530.py index ba5ab7d4eeb..7637bbb6ccb 100644 --- a/yt_dlp/extractor/funker530.py +++ b/yt_dlp/extractor/funker530.py @@ -1,79 +1,96 @@ from .common import InfoExtractor -from .rumble import RumbleEmbedIE +from .generic import GenericIE from .youtube import YoutubeIE from ..utils import ExtractorError, clean_html, get_element_by_class, strip_or_none +import re + + +def extract_video_id(response): + # Define a regular expression pattern to match "Rumble" followed by parentheses and content inside them. + pattern = r'Rumble\s*\((.*?)\)' + + # Use re.search() to find the first occurrence of the pattern in the response. + match = re.search(pattern, response) + + # Check if a match is found. + if match: + # Extract the content inside parentheses using group(1). + content_inside_parentheses = match.group(1) + + # Use another regular expression to extract the video ID. + video_id_match = re.search(r'video:\s*"([^"]+)"', content_inside_parentheses) + + if video_id_match: + # Extract the video ID using group(1). + video_id = video_id_match.group(1) + return video_id + return None # Return None if "Rumble" followed by parentheses or video ID is not found. + + +def url_clean(self, display_id, video_id): + url = f"https://rumble.com/embedJS/{video_id}" + + webpage = self._download_webpage(url, display_id) + + # Extract the link after "url" from the response using regex. + link_match = re.search(r'"url":"(.*?)",', webpage) + + if link_match: + link = link_match.group(1) + + # Remove all "\" symbols from the link. + cleaned_link = link.replace('/', '') + cleaned_link = cleaned_link.replace('\\', '/') + return cleaned_link class Funker530IE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?funker530\.com/video/(?P[^/?#]+)' _TESTS = [{ 'url': 'https://funker530.com/video/azov-patrol-caught-in-open-under-automatic-grenade-launcher-fire/', - 'md5': '085f50fea27523a388bbc22e123e09c8', - 'info_dict': { - 'id': 'v2qbmu4', - 'ext': 'mp4', - 'title': 'Azov Patrol Caught In Open Under Automatic Grenade Launcher Fire', - 'thumbnail': r're:^https?://.*\.jpg$', - 'uploader': 'Funker530', - 'channel': 'Funker530', - 'channel_url': 'https://rumble.com/c/c-1199543', - 'width': 1280, - 'height': 720, - 'fps': 25, - 'duration': 27, - 'upload_date': '20230608', - 'timestamp': 1686241321, - 'live_status': 'not_live', - 'description': 'md5:bea2e1f458095414e04b5ac189c2f980', - } - }, { - 'url': 'https://funker530.com/video/my-friends-joined-the-russians-civdiv/', - 'md5': 'a42c2933391210662e93e867d7124b70', + 'md5': 'fcb1880a5703f5c17e9191bab27fb822', 'info_dict': { - 'id': 'k-pk4bOvoac', + 'id': 'c1Mgk.caa', 'ext': 'mp4', - 'view_count': int, - 'channel': 'Civ Div', - 'comment_count': int, - 'channel_follower_count': int, - 'thumbnail': 'https://i.ytimg.com/vi/k-pk4bOvoac/maxresdefault.jpg', - 'uploader_id': '@CivDiv', - 'duration': 357, - 'channel_url': 'https://www.youtube.com/channel/UCgsCiwJ88up-YyMHo7hL5-A', - 'tags': [], - 'uploader_url': 'https://www.youtube.com/@CivDiv', - 'channel_id': 'UCgsCiwJ88up-YyMHo7hL5-A', - 'like_count': int, - 'description': 'md5:aef75ec3f59c07a0e39400f609b24429', - 'live_status': 'not_live', - 'age_limit': 0, - 'uploader': 'Civ Div', - 'categories': ['People & Blogs'], - 'title': 'My “Friends” joined the Russians.', - 'availability': 'public', + 'title': 'c1Mgk.caa', 'upload_date': '20230608', - 'playable_in_embed': True, - 'heatmap': 'count:100', + 'timestamp': 1686241352.0, + 'direct': True } }] def _real_extract(self, url): + info = {} display_id = self._match_id(url) webpage = self._download_webpage(url, display_id) - rumble_url = list(RumbleEmbedIE._extract_embed_urls(url, webpage)) + + video_id = extract_video_id(webpage) + cleaned_link = url_clean(self, display_id, video_id) + rumble_url = cleaned_link + + youtube_url = list(YoutubeIE._extract_embed_urls(url, webpage)) + if rumble_url: - info = {'url': rumble_url[0], 'ie_key': RumbleEmbedIE.ie_key()} - else: - youtube_url = list(YoutubeIE._extract_embed_urls(url, webpage)) - if youtube_url: - info = {'url': youtube_url[0], 'ie_key': YoutubeIE.ie_key()} - if not info: + info = { + 'url': rumble_url, + 'info_dict': { + 'ext': 'mp4', + 'direct': True + }, + + 'ie_key': GenericIE.ie_key() + } + elif youtube_url: + info = {'url': youtube_url[0], 'ie_key': YoutubeIE.ie_key()} + if info == {}: raise ExtractorError('No videos found on webpage', expected=True) return { **info, - '_type': 'url_transparent', + '_type': 'url', 'description': strip_or_none(self._search_regex( r'(?s)(.+)About the Author', clean_html(get_element_by_class('video-desc-paragraph', webpage)), - 'description', default=None)) + 'description', default=None)), + + } From 7a6203acdc00c43511eee4b23158b300abb84100 Mon Sep 17 00:00:00 2001 From: pasha Date: Sun, 10 Sep 2023 08:36:34 -0700 Subject: [PATCH 2/3] instead of doing the rumble embed link extraction inside the funker extractor modify the _extract_embed_url method in the rumble extractor --- yt_dlp/extractor/funker530.py | 124 +++++++++++++++------------------- yt_dlp/extractor/rumble.py | 2 +- 2 files changed, 55 insertions(+), 71 deletions(-) diff --git a/yt_dlp/extractor/funker530.py b/yt_dlp/extractor/funker530.py index 7637bbb6ccb..ecf40a89bcd 100644 --- a/yt_dlp/extractor/funker530.py +++ b/yt_dlp/extractor/funker530.py @@ -1,96 +1,80 @@ from .common import InfoExtractor -from .generic import GenericIE +from .rumble import RumbleEmbedIE from .youtube import YoutubeIE from ..utils import ExtractorError, clean_html, get_element_by_class, strip_or_none -import re - - -def extract_video_id(response): - # Define a regular expression pattern to match "Rumble" followed by parentheses and content inside them. - pattern = r'Rumble\s*\((.*?)\)' - - # Use re.search() to find the first occurrence of the pattern in the response. - match = re.search(pattern, response) - - # Check if a match is found. - if match: - # Extract the content inside parentheses using group(1). - content_inside_parentheses = match.group(1) - - # Use another regular expression to extract the video ID. - video_id_match = re.search(r'video:\s*"([^"]+)"', content_inside_parentheses) - - if video_id_match: - # Extract the video ID using group(1). - video_id = video_id_match.group(1) - return video_id - return None # Return None if "Rumble" followed by parentheses or video ID is not found. - - -def url_clean(self, display_id, video_id): - url = f"https://rumble.com/embedJS/{video_id}" - - webpage = self._download_webpage(url, display_id) - - # Extract the link after "url" from the response using regex. - link_match = re.search(r'"url":"(.*?)",', webpage) - - if link_match: - link = link_match.group(1) - - # Remove all "\" symbols from the link. - cleaned_link = link.replace('/', '') - cleaned_link = cleaned_link.replace('\\', '/') - return cleaned_link class Funker530IE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?funker530\.com/video/(?P[^/?#]+)' _TESTS = [{ 'url': 'https://funker530.com/video/azov-patrol-caught-in-open-under-automatic-grenade-launcher-fire/', - 'md5': 'fcb1880a5703f5c17e9191bab27fb822', + 'md5': '085f50fea27523a388bbc22e123e09c8', 'info_dict': { - 'id': 'c1Mgk.caa', + 'id': 'v2qbmu4', 'ext': 'mp4', - 'title': 'c1Mgk.caa', + 'title': 'Azov Patrol Caught In Open Under Automatic Grenade Launcher Fire', + 'thumbnail': r're:^https?://.*\.jpg$', + 'uploader': 'Funker530', + 'channel': 'Funker530', + 'channel_url': 'https://rumble.com/c/c-1199543', + 'width': 1280, + 'height': 720, + 'fps': 25, + 'duration': 27, 'upload_date': '20230608', - 'timestamp': 1686241352.0, - 'direct': True + 'timestamp': 1686241321, + 'live_status': 'not_live', + 'description': 'md5:bea2e1f458095414e04b5ac189c2f980', + } + }, { + 'url': 'https://funker530.com/video/my-friends-joined-the-russians-civdiv/', + 'md5': 'a42c2933391210662e93e867d7124b70', + 'info_dict': { + 'id': 'k-pk4bOvoac', + 'ext': 'mp4', + 'view_count': int, + 'channel': 'Civ Div', + 'comment_count': int, + 'channel_follower_count': int, + 'thumbnail': 'https://i.ytimg.com/vi/k-pk4bOvoac/maxresdefault.jpg', + 'uploader_id': '@CivDiv', + 'duration': 357, + 'channel_url': 'https://www.youtube.com/channel/UCgsCiwJ88up-YyMHo7hL5-A', + 'tags': [], + 'uploader_url': 'https://www.youtube.com/@CivDiv', + 'channel_id': 'UCgsCiwJ88up-YyMHo7hL5-A', + 'like_count': int, + 'description': 'md5:aef75ec3f59c07a0e39400f609b24429', + 'live_status': 'not_live', + 'age_limit': 0, + 'uploader': 'Civ Div', + 'categories': ['People & Blogs'], + 'title': 'My “Friends” joined the Russians.', + 'availability': 'public', + 'upload_date': '20230608', + 'playable_in_embed': True, + 'heatmap': 'count:100', } }] def _real_extract(self, url): - info = {} display_id = self._match_id(url) webpage = self._download_webpage(url, display_id) - - video_id = extract_video_id(webpage) - cleaned_link = url_clean(self, display_id, video_id) - rumble_url = cleaned_link - - youtube_url = list(YoutubeIE._extract_embed_urls(url, webpage)) - + rumble_url = list(RumbleEmbedIE._extract_embed_urls(url, webpage)) + info = {} if rumble_url: - info = { - 'url': rumble_url, - 'info_dict': { - 'ext': 'mp4', - 'direct': True - }, - - 'ie_key': GenericIE.ie_key() - } - elif youtube_url: - info = {'url': youtube_url[0], 'ie_key': YoutubeIE.ie_key()} - if info == {}: + info = {'url': rumble_url[0], 'ie_key': RumbleEmbedIE.ie_key()} + else: + youtube_url = list(YoutubeIE._extract_embed_urls(url, webpage)) + if youtube_url: + info = {'url': youtube_url[0], 'ie_key': YoutubeIE.ie_key()} + if not info: raise ExtractorError('No videos found on webpage', expected=True) return { **info, - '_type': 'url', + '_type': 'url_transparent', 'description': strip_or_none(self._search_regex( r'(?s)(.+)About the Author', clean_html(get_element_by_class('video-desc-paragraph', webpage)), - 'description', default=None)), - - + 'description', default=None)) } diff --git a/yt_dlp/extractor/rumble.py b/yt_dlp/extractor/rumble.py index f8bf4a18258..96c192581d8 100644 --- a/yt_dlp/extractor/rumble.py +++ b/yt_dlp/extractor/rumble.py @@ -144,7 +144,7 @@ def _extract_embed_urls(cls, url, webpage): if embeds: return embeds return [f'https://rumble.com/embed/{mobj.group("id")}' for mobj in re.finditer( - r'