Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[extractor/goplay] Fix GoPlay downloads #6654

Merged
merged 20 commits into from Feb 19, 2024
Merged
Show file tree
Hide file tree
Changes from 15 commits
Commits
Show all changes
20 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
3 changes: 2 additions & 1 deletion yt_dlp/YoutubeDL.py
Expand Up @@ -3483,7 +3483,8 @@ def ffmpeg_fixup(cndn, msg, cls):
or info_dict.get('is_live') and self.params.get('hls_use_mpegts') is None,
'Possible MPEG-TS in MP4 container or malformed AAC timestamps',
FFmpegFixupM3u8PP)
ffmpeg_fixup(info_dict.get('is_live') and downloader == 'dashsegments',
ffmpeg_fixup(downloader == 'dashsegments'
and (info_dict.get('is_live') or info_dict.get('is_dash_periods')),
'Possible duplicate MOOV atoms', FFmpegFixupDuplicateMoovPP)

ffmpeg_fixup(downloader == 'web_socket_fragment', 'Malformed timestamps detected', FFmpegFixupTimestampPP)
Expand Down
66 changes: 53 additions & 13 deletions yt_dlp/extractor/common.py
Expand Up @@ -247,6 +247,9 @@ class InfoExtractor:
(For internal use only)
* http_chunk_size Chunk size for HTTP downloads
* ffmpeg_args Extra arguments for ffmpeg downloader
MPD formats can have the additional field:
* is_dash_periods True if the format is the result of merging
multiple DASH periods.
pukkandan marked this conversation as resolved.
Show resolved Hide resolved
RTMP formats can also have the additional fields: page_url,
app, play_path, tc_url, flash_version, rtmp_live, rtmp_conn,
rtmp_protocol, rtmp_real_time
Expand Down Expand Up @@ -2530,7 +2533,11 @@ def _extract_mpd_formats(self, *args, **kwargs):
self._report_ignoring_subs('DASH')
return fmts

def _extract_mpd_formats_and_subtitles(
def _extract_mpd_formats_and_subtitles(self, *args, **kwargs):
periods = self._extract_mpd_periods(*args, **kwargs)
return self._merge_mpd_periods(periods)

def _extract_mpd_periods(
self, mpd_url, video_id, mpd_id=None, note=None, errnote=None,
fatal=True, data=None, headers={}, query={}):

Expand All @@ -2543,26 +2550,55 @@ def _extract_mpd_formats_and_subtitles(
errnote='Failed to download MPD manifest' if errnote is None else errnote,
fatal=fatal, data=data, headers=headers, query=query)
if res is False:
return [], {}
return []
mpd_doc, urlh = res
if mpd_doc is None:
return [], {}
return []

# We could have been redirected to a new url when we retrieved our mpd file.
mpd_url = urlh.url
mpd_base_url = base_url(mpd_url)

return self._parse_mpd_formats_and_subtitles(
mpd_doc, mpd_id, mpd_base_url, mpd_url)
return self._parse_mpd_periods(mpd_doc, mpd_id, mpd_base_url, mpd_url)

def _parse_mpd_formats(self, *args, **kwargs):
fmts, subs = self._parse_mpd_formats_and_subtitles(*args, **kwargs)
if subs:
self._report_ignoring_subs('DASH')
return fmts

def _parse_mpd_formats_and_subtitles(
self, mpd_doc, mpd_id=None, mpd_base_url='', mpd_url=None):
def _parse_mpd_formats_and_subtitles(self, *args, **kwargs):
periods = self._parse_mpd_periods(*args, **kwargs)
return self._merge_mpd_periods(periods)

def _merge_mpd_periods(self, periods):
"""
Combine all formats and subtitles from an MPD manifest into a single list,
by concatenate streams with similar formats.
"""
# create merged formats
formats = {}
subtitles = {}
for period in periods:
for f in period['formats']:
# ignore period-specific parameters when grouping periods into formats
format_key = tuple(v for k, v in f.items() if k not in (
('format_id', 'fragments', 'manifest_stream_number')))
assert 'is_dash_periods' not in f, 'format already processed'
f['is_dash_periods'] = True
if format_key not in formats:
# first period of this format
formats[format_key] = f
elif 'fragments' in f:
# follow-up period of a mergeable format
formats[format_key].setdefault('fragments', []).extend(f['fragments'])

for sub_lang, sub_info in period['subtitles'].items():
subtitles.setdefault(sub_lang, []).extend(sub_info)

return list(formats.values()), subtitles
pukkandan marked this conversation as resolved.
Show resolved Hide resolved

def _parse_mpd_periods(self, mpd_doc, mpd_id=None, mpd_base_url='', mpd_url=None):
"""
Parse formats from MPD manifest.
References:
Expand Down Expand Up @@ -2641,9 +2677,13 @@ def extract_Initialization(source):
return ms_info

mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
formats, subtitles = [], {}
stream_numbers = collections.defaultdict(int)
for period in mpd_doc.findall(_add_ns('Period')):
for period_idx, period in enumerate(mpd_doc.findall(_add_ns('Period'))):
period_entry = {
'id': period.get('id', f'period-{period_idx}'),
'formats': [],
'subtitles': collections.defaultdict(list),
}
period_duration = parse_duration(period.get('duration')) or mpd_duration
period_ms_info = extract_multisegment_info(period, {
'start_number': 1,
Expand Down Expand Up @@ -2893,11 +2933,11 @@ def add_segment_url():
if content_type in ('video', 'audio', 'image/jpeg'):
f['manifest_stream_number'] = stream_numbers[f['url']]
stream_numbers[f['url']] += 1
formats.append(f)
period_entry['formats'].append(f)
elif content_type == 'text':
subtitles.setdefault(lang or 'und', []).append(f)

return formats, subtitles
period_entry['subtitles'][lang or 'und'].append(f)
if period_entry['formats'] or period_entry['subtitles']:
yield period_entry
alard marked this conversation as resolved.
Show resolved Hide resolved

def _extract_ism_formats(self, *args, **kwargs):
fmts, subs = self._extract_ism_formats_and_subtitles(*args, **kwargs)
Expand Down
47 changes: 43 additions & 4 deletions yt_dlp/extractor/goplay.py
Expand Up @@ -40,6 +40,22 @@ class GoPlayIE(InfoExtractor):
'title': 'A Family for the Holidays',
},
'skip': 'This video is only available for registered users'
}, {
'url': 'https://www.goplay.be/video/de-mol/de-mol-s11/de-mol-s11-aflevering-1#autoplay',
'info_dict': {
'id': '03eb8f2f-153e-41cb-9805-0d3a29dab656',
'ext': 'mp4',
'title': 'S11 - Aflevering 1',
'episode': 'Episode 1',
'series': 'De Mol',
'season_number': 11,
'episode_number': 1,
'season': 'Season 11'
},
'params': {
'skip_download': True
},
'skip': 'This video is only available for registered users'
}]

_id_token = None
Expand Down Expand Up @@ -77,16 +93,39 @@ def _real_extract(self, url):

api = self._download_json(
f'https://api.goplay.be/web/v1/videos/long-form/{video_id}',
video_id, headers={'Authorization': 'Bearer %s' % self._id_token})
video_id, headers={
'Authorization': 'Bearer %s' % self._id_token,
**self.geo_verification_headers(),
})

if 'manifestUrls' in api:
formats, subtitles = self._extract_m3u8_formats_and_subtitles(
api['manifestUrls']['hls'], video_id, ext='mp4', m3u8_id='HLS')

formats, subs = self._extract_m3u8_formats_and_subtitles(
api['manifestUrls']['hls'], video_id, ext='mp4', m3u8_id='HLS')
else:
if 'ssai' not in api:
raise ExtractorError('expecting Google SSAI stream')

ssai_content_source_id = api['ssai']['contentSourceID']
ssai_video_id = api['ssai']['videoID']

dai = self._download_json(
f'https://dai.google.com/ondemand/dash/content/{ssai_content_source_id}/vid/{ssai_video_id}/streams',
video_id, data=b'{"api-key":"null"}',
headers={'content-type': 'application/json'})

periods = self._extract_mpd_periods(dai['stream_manifest'], video_id)

# skip pre-roll and mid-roll ads
periods = [p for p in periods if '-ad-' not in p['id']]

formats, subtitles = self._merge_mpd_periods(periods)

info_dict.update({
'id': video_id,
'formats': formats,
alard marked this conversation as resolved.
Show resolved Hide resolved
'subtitles': subtitles,
})

return info_dict


Expand Down