Skip to content

Commit

Permalink
Update to ytdl-commit-a08f2b7 (#10012)
Browse files Browse the repository at this point in the history
[ie] Rework JWPlayer extraction
- ytdl-org/youtube-dl@f663724
[ie/gbnews] Add extractor
- ytdl-org/youtube-dl@70f230f
[ie/caffeinetv] Add extractor
- ytdl-org/youtube-dl@40bd5c1
[ie/youporn] Improve extraction
- ytdl-org/youtube-dl@0b2ce36
[ie/youporn] Add playlist extractors
- ytdl-org/youtube-dl@668332b

Closes #9188, Closes #9523
Authored by: Grub4K, bashonly
  • Loading branch information
Grub4K committed May 26, 2024
1 parent e897bd8 commit a4da9db
Show file tree
Hide file tree
Showing 6 changed files with 588 additions and 45 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -2123,7 +2123,7 @@ with yt_dlp.YoutubeDL(ydl_opts) as ydl:

### New features

* Forked from [**yt-dlc@f9401f2**](https://github.com/blackjack4494/yt-dlc/commit/f9401f2a91987068139c5f757b12fc711d4c0cee) and merged with [**youtube-dl@be008e6**](https://github.com/ytdl-org/youtube-dl/commit/be008e657d79832642e2158557c899249c9e31cd) ([exceptions](https://github.com/yt-dlp/yt-dlp/issues/21))
* Forked from [**yt-dlc@f9401f2**](https://github.com/blackjack4494/yt-dlc/commit/f9401f2a91987068139c5f757b12fc711d4c0cee) and merged with [**youtube-dl@a08f2b7**](https://github.com/ytdl-org/youtube-dl/commit/a08f2b7e4567cdc50c0614ee0a4ffdff49b8b6e6) ([exceptions](https://github.com/yt-dlp/yt-dlp/issues/21))

* **[SponsorBlock Integration](#sponsorblock-options)**: You can mark/remove sponsor sections in YouTube videos by utilizing the [SponsorBlock](https://sponsor.ajay.app) API

Expand Down
12 changes: 11 additions & 1 deletion yt_dlp/extractor/_extractors.py
Original file line number Diff line number Diff line change
Expand Up @@ -308,6 +308,7 @@
from .buzzfeed import BuzzFeedIE
from .byutv import BYUtvIE
from .c56 import C56IE
from .caffeinetv import CaffeineTVIE
from .callin import CallinIE
from .caltrans import CaltransIE
from .cam4 import CAM4IE
Expand Down Expand Up @@ -720,6 +721,7 @@
from .gamestar import GameStarIE
from .gaskrank import GaskrankIE
from .gazeta import GazetaIE
from .gbnews import GBNewsIE
from .gdcvault import GDCVaultIE
from .gedidigital import GediDigitalIE
from .generic import GenericIE
Expand Down Expand Up @@ -2501,7 +2503,15 @@
YouNowLiveIE,
YouNowMomentIE,
)
from .youporn import YouPornIE
from .youporn import (
YouPornCategoryIE,
YouPornChannelIE,
YouPornCollectionIE,
YouPornIE,
YouPornStarIE,
YouPornTagIE,
YouPornVideosIE,
)
from .zaiko import (
ZaikoETicketIE,
ZaikoIE,
Expand Down
74 changes: 74 additions & 0 deletions yt_dlp/extractor/caffeinetv.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
from .common import InfoExtractor
from ..utils import (
determine_ext,
int_or_none,
parse_iso8601,
traverse_obj,
urljoin,
)


class CaffeineTVIE(InfoExtractor):
    # Handles caffeine.tv URLs of the form /<user>/video/<hex-and-dash id>.
    _VALID_URL = r'https?://(?:www\.)?caffeine\.tv/[^/?#]+/video/(?P<id>[\da-f-]+)'
    _TESTS = [{
        'url': 'https://www.caffeine.tv/TsuSurf/video/cffc0a00-e73f-11ec-8080-80017d29f26e',
        'info_dict': {
            'id': 'cffc0a00-e73f-11ec-8080-80017d29f26e',
            'ext': 'mp4',
            'title': 'GOOOOD MORNINNNNN #highlights',
            'timestamp': 1654702180,
            'upload_date': '20220608',
            'uploader': 'RahJON Wicc',
            'uploader_id': 'TsuSurf',
            'duration': 3145,
            'age_limit': 17,
            'thumbnail': 'https://www.caffeine.tv/broadcasts/776b6f84-9cd5-42e3-af1d-4a776eeed697/replay/lobby.jpg',
            'comment_count': int,
            'view_count': int,
            'like_count': int,
            'tags': ['highlights', 'battlerap'],
        },
        'params': {
            'skip_download': 'm3u8',
        },
    }]

    def _real_extract(self, url):
        """Build the info dict for one video from the public activity API.

        The activity JSON is fetched by video id; its ``broadcast_info``
        object supplies the media URL, title, duration, start time,
        preview image and content rating, while the top-level object
        supplies counters, tags and uploader fields.

        Raises KeyError when ``broadcast_info`` has no ``video_url``.
        """
        video_id = self._match_id(url)
        activity = self._download_json(
            f'https://api.caffeine.tv/social/public/activity/{video_id}', video_id)
        # Fall back to an empty dict when broadcast_info is absent or not a dict
        broadcast = traverse_obj(activity, ('broadcast_info', {dict})) or {}

        # Indexed directly (not .get): nothing can be extracted without it
        media_url = broadcast['video_url']
        formats = (
            self._extract_m3u8_formats(media_url, video_id, 'mp4')
            if determine_ext(media_url) == 'm3u8'
            else [{'url': media_url}])

        # assume Apple Store ratings: https://en.wikipedia.org/wiki/Mobile_software_content_rating_system
        age_by_rating = {
            'FOUR_PLUS': 0,
            'NINE_PLUS': 9,
            'TWELVE_PLUS': 12,
            'SEVENTEEN_PLUS': 17,
        }

        info = {
            'id': video_id,
            'formats': formats,
        }
        info.update(traverse_obj(activity, {
            'like_count': ('like_count', {int_or_none}),
            'view_count': ('view_count', {int_or_none}),
            'comment_count': ('comment_count', {int_or_none}),
            'tags': ('tags', ..., {str}, {lambda x: x or None}),
            'uploader': ('user', 'name', {str}),
            # 'username' is looked up on the top-level object and under 'user'
            'uploader_id': (((None, 'user'), 'username'), {str}, any),
            'is_live': ('is_live', {bool}),
        }))
        info.update(traverse_obj(broadcast, {
            'title': ('broadcast_title', {str}),
            'duration': ('content_duration', {int_or_none}),
            'timestamp': ('broadcast_start_time', {parse_iso8601}),
            # preview path is resolved against the page URL
            'thumbnail': ('preview_image_path', {lambda x: urljoin(url, x)}),
        }))
        # Missing or unrecognised ratings default to 17
        info['age_limit'] = age_by_rating.get(broadcast.get('content_rating'), 17)
        return info
47 changes: 16 additions & 31 deletions yt_dlp/extractor/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -3384,23 +3384,16 @@ def manifest_url(manifest):
return formats

def _find_jwplayer_data(self, webpage, video_id=None, transform_source=js_to_json):
mobj = re.search(
r'''(?s)jwplayer\s*\(\s*(?P<q>'|")(?!(?P=q)).+(?P=q)\s*\)(?!</script>).*?\.\s*setup\s*\(\s*(?P<options>(?:\([^)]*\)|[^)])+)\s*\)''',
webpage)
if mobj:
try:
jwplayer_data = self._parse_json(mobj.group('options'),
video_id=video_id,
transform_source=transform_source)
except ExtractorError:
pass
else:
if isinstance(jwplayer_data, dict):
return jwplayer_data
return self._search_json(
r'''(?<!-)\bjwplayer\s*\(\s*(?P<q>'|")(?!(?P=q)).+(?P=q)\s*\)(?:(?!</script>).)*?\.\s*(?:setup\s*\(|(?P<load>load)\s*\(\s*\[)''',
webpage, 'JWPlayer data', video_id,
# must be a {...} or sequence, ending
contains_pattern=r'\{(?s:.*)}(?(load)(?:\s*,\s*\{(?s:.*)})*)', end_pattern=r'(?(load)\]|\))',
transform_source=transform_source, default=None)

def _extract_jwplayer_data(self, webpage, video_id, *args, **kwargs):
def _extract_jwplayer_data(self, webpage, video_id, *args, transform_source=js_to_json, **kwargs):
jwplayer_data = self._find_jwplayer_data(
webpage, video_id, transform_source=js_to_json)
webpage, video_id, transform_source=transform_source)
return self._parse_jwplayer_data(
jwplayer_data, video_id, *args, **kwargs)

Expand Down Expand Up @@ -3432,22 +3425,14 @@ def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True,
mpd_id=mpd_id, rtmp_params=rtmp_params, base_url=base_url)

subtitles = {}
tracks = video_data.get('tracks')
if tracks and isinstance(tracks, list):
for track in tracks:
if not isinstance(track, dict):
continue
track_kind = track.get('kind')
if not track_kind or not isinstance(track_kind, str):
continue
if track_kind.lower() not in ('captions', 'subtitles'):
continue
track_url = urljoin(base_url, track.get('file'))
if not track_url:
continue
subtitles.setdefault(track.get('label') or 'en', []).append({
'url': self._proto_relative_url(track_url)
})
for track in traverse_obj(video_data, (
'tracks', lambda _, v: v['kind'].lower() in ('captions', 'subtitles'))):
track_url = urljoin(base_url, track.get('file'))
if not track_url:
continue
subtitles.setdefault(track.get('label') or 'en', []).append({
'url': self._proto_relative_url(track_url)
})

entry = {
'id': this_video_id,
Expand Down
107 changes: 107 additions & 0 deletions yt_dlp/extractor/gbnews.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
import functools

from .common import InfoExtractor
from ..utils import (
ExtractorError,
extract_attributes,
get_elements_html_by_class,
url_or_none,
)
from ..utils.traversal import traverse_obj


class GBNewsIE(InfoExtractor):
    IE_DESC = 'GB News clips, features and live streams'
    _VALID_URL = r'https?://(?:www\.)?gbnews\.(?:uk|com)/(?:\w+/)?(?P<id>[^#?]+)'

    # Sent as the 'platform' query parameter of the stream request
    _PLATFORM = 'safari'
    # Queried to discover the API host used for stream requests
    _SSMP_URL = 'https://mm-v2.simplestream.com/ssmp/api.php'
    _TESTS = [{
        'url': 'https://www.gbnews.com/news/bbc-claudine-gay-harvard-university-antisemitism-row',
        'info_dict': {
            'id': '52264136',
            'ext': 'mp4',
            'thumbnail': r're:https?://www\.gbnews\.\w+/.+\.(?:jpe?g|png|webp)',
            'display_id': 'bbc-claudine-gay-harvard-university-antisemitism-row',
            'description': 'The post was criticised by former employers of the broadcaster',
            'title': 'BBC deletes post after furious backlash over headline downplaying antisemitism',
        },
    }, {
        'url': 'https://www.gbnews.com/royal/prince-harry-in-love-with-kate-meghan-markle-jealous-royal',
        'info_dict': {
            'id': '52328390',
            'ext': 'mp4',
            'thumbnail': r're:https?://www\.gbnews\.\w+/.+\.(?:jpe?g|png|webp)',
            'display_id': 'prince-harry-in-love-with-kate-meghan-markle-jealous-royal',
            'description': 'Ingrid Seward has published 17 books documenting the highs and lows of the Royal Family',
            'title': 'Royal author claims Prince Harry was \'in love\' with Kate - Meghan was \'jealous\'',
        }
    }, {
        'url': 'https://www.gbnews.uk/watchlive',
        'info_dict': {
            'id': '1069',
            'ext': 'mp4',
            'thumbnail': r're:https?://www\.gbnews\.\w+/.+\.(?:jpe?g|png|webp)',
            'display_id': 'watchlive',
            'live_status': 'is_live',
            'title': r're:^GB News Live',
        },
        'params': {'skip_download': 'm3u8'},
    }]

    # NOTE(review): lru_cache on an instance method keys on `self` as well and
    # keeps the instance referenced by the cache; confirm this is acceptable.
    @functools.lru_cache
    def _get_ss_endpoint(self, data_id, data_env):
        """Return the API host reported by the Simplestream metadata service.

        Falsy ``data_id`` / ``data_env`` are replaced by 'GB003' and
        'production' respectively before the request is made.

        Raises ExtractorError when the response carries no valid URL at
        ``response.api_hostname``.
        """
        json_data = self._download_json(
            self._SSMP_URL, None, 'Downloading Simplestream JSON metadata', query={
                'id': data_id or 'GB003',
                'env': data_env or 'production',
            })
        api_host = traverse_obj(json_data, ('response', 'api_hostname', {url_or_none}))
        if api_host:
            return api_host
        raise ExtractorError('No API host found')

    def _real_extract(self, url):
        """Extract formats and metadata for a GB News page.

        Player attributes are read from the page's 'simplestream' element,
        the stream JSON is requested from the discovered API host, and the
        title/description/thumbnail come from the page's OpenGraph tags.

        Raises ExtractorError when no usable player element is found and
        KeyError when the element lacks ``data-uvid``.
        """
        display_id = self._match_id(url).rpartition('/')[2]
        webpage = self._download_webpage(url, display_id)

        # Keep every 'simplestream' element whose class does not mention
        # 'sidebar'; the last such element is the one that gets used.
        candidates = [
            attrs
            for attrs in map(extract_attributes, get_elements_html_by_class('simplestream', webpage))
            if 'sidebar' not in (attrs.get('class') or '')
        ]
        video_data = candidates[-1] if candidates else None
        if not video_data:
            raise ExtractorError('Could not find video element', expected=True)

        endpoint_url = self._get_ss_endpoint(video_data.get('data-id'), video_data.get('data-env'))

        uvid = video_data['data-uvid']
        # A missing/empty type is treated like 'vod', and 'vod' maps to the 'show' API path
        video_type = video_data.get('data-type') or 'vod'
        if video_type == 'vod':
            video_type = 'show'

        stream_query = {
            'key': video_data.get('data-key'),
            'platform': self._PLATFORM,
        }
        stream_data = self._download_json(
            f'{endpoint_url}/api/{video_type}/stream/{uvid}',
            uvid, 'Downloading stream JSON', query=stream_query)
        if traverse_obj(stream_data, 'drm'):
            self.report_drm(uvid)

        stream_url = traverse_obj(stream_data, ('response', 'stream', {url_or_none}))
        return {
            'id': uvid,
            'display_id': display_id,
            'title': self._og_search_title(webpage, default=None),
            'description': self._og_search_description(webpage, default=None),
            'formats': self._extract_m3u8_formats(stream_url, uvid, 'mp4'),
            'thumbnail': self._og_search_thumbnail(webpage, default=None),
            'is_live': video_type == 'live',
        }
Loading

0 comments on commit a4da9db

Please sign in to comment.