Skip to content

Commit

Permalink
[ie/francetv] Fix extractors (#9333)
Browse files Browse the repository at this point in the history
Closes #9323
Authored by: bashonly
  • Loading branch information
bashonly committed Mar 2, 2024
1 parent 413d367 commit 9749ac7
Show file tree
Hide file tree
Showing 2 changed files with 47 additions and 30 deletions.
70 changes: 44 additions & 26 deletions yt_dlp/extractor/francetv.py
@@ -1,21 +1,31 @@
import urllib.parse

from .common import InfoExtractor
from .dailymotion import DailymotionIE
from ..networking import HEADRequest
from ..utils import (
ExtractorError,
determine_ext,
filter_dict,
format_field,
int_or_none,
join_nonempty,
parse_iso8601,
parse_qs,
smuggle_url,
unsmuggle_url,
url_or_none,
)
from ..utils.traversal import traverse_obj


class FranceTVBaseInfoExtractor(InfoExtractor):
def _make_url_result(self, video_or_full_id, catalog=None):
def _make_url_result(self, video_or_full_id, catalog=None, url=None):
full_id = 'francetv:%s' % video_or_full_id
if '@' not in video_or_full_id and catalog:
full_id += '@%s' % catalog
if url:
full_id = smuggle_url(full_id, {'hostname': urllib.parse.urlparse(url).hostname})
return self.url_result(
full_id, ie=FranceTVIE.ie_key(),
video_id=video_or_full_id.split('@')[0])
Expand All @@ -35,6 +45,8 @@ class FranceTVIE(InfoExtractor):
)
'''
_EMBED_REGEX = [r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?://)?embed\.francetv\.fr/\?ue=.+?)\1']
_GEO_COUNTRIES = ['FR']
_GEO_BYPASS = False

_TESTS = [{
# without catalog
Expand Down Expand Up @@ -76,10 +88,8 @@ class FranceTVIE(InfoExtractor):
'only_matching': True,
}]

def _extract_video(self, video_id, catalogue=None):
# Videos are identified by idDiffusion so catalogue part is optional.
# However when provided, some extra formats may be returned so we pass
# it if available.
def _extract_video(self, video_id, catalogue=None, hostname=None):
# TODO: Investigate/remove 'catalogue'/'catalog'; it has not been used since 2021
is_live = None
videos = []
title = None
Expand All @@ -94,15 +104,16 @@ def _extract_video(self, video_id, catalogue=None):
for device_type in ('desktop', 'mobile'):
dinfo = self._download_json(
'https://player.webservices.francetelevisions.fr/v1/videos/%s' % video_id,
video_id, 'Downloading %s video JSON' % device_type, query={
video_id, f'Downloading {device_type} video JSON', query=filter_dict({
'device_type': device_type,
'browser': 'chrome',
}, fatal=False)
'domain': hostname,
}), fatal=False)

if not dinfo:
continue

video = dinfo.get('video')
video = traverse_obj(dinfo, ('video', {dict}))
if video:
videos.append(video)
if duration is None:
Expand All @@ -112,7 +123,7 @@ def _extract_video(self, video_id, catalogue=None):
if spritesheets is None:
spritesheets = video.get('spritesheets')

meta = dinfo.get('meta')
meta = traverse_obj(dinfo, ('meta', {dict}))
if meta:
if title is None:
title = meta.get('title')
Expand All @@ -126,22 +137,21 @@ def _extract_video(self, video_id, catalogue=None):
if timestamp is None:
timestamp = parse_iso8601(meta.get('broadcasted_at'))

formats = []
subtitles = {}
for video in videos:
formats, subtitles, video_url = [], {}, None
for video in traverse_obj(videos, lambda _, v: url_or_none(v['url'])):
video_url = video['url']
format_id = video.get('format')

video_url = None
if video.get('workflow') == 'token-akamai':
token_url = video.get('token')
if token_url:
token_json = self._download_json(
token_url, video_id,
'Downloading signed %s manifest URL' % format_id)
if token_json:
video_url = token_json.get('url')
if not video_url:
video_url = video.get('url')
token_url = url_or_none(video.get('token'))
if token_url and video.get('workflow') == 'token-akamai':
tokenized_url = traverse_obj(self._download_json(
token_url, video_id, f'Downloading signed {format_id} manifest URL',
fatal=False, query={
'format': 'json',
'url': video_url,
}), ('url', {url_or_none}))
if tokenized_url:
video_url = tokenized_url

ext = determine_ext(video_url)
if ext == 'f4m':
Expand Down Expand Up @@ -174,6 +184,13 @@ def _extract_video(self, video_id, catalogue=None):

# XXX: what is video['captions']?

if not formats and video_url:
urlh = self._request_webpage(
HEADRequest(video_url), video_id, 'Checking for geo-restriction',
fatal=False, expected_status=403)
if urlh and urlh.headers.get('x-errortype') == 'geo':
self.raise_geo_restricted(countries=self._GEO_COUNTRIES, metadata_available=True)

for f in formats:
if f.get('acodec') != 'none' and f.get('language') in ('qtz', 'qad'):
f['language_preference'] = -10
Expand Down Expand Up @@ -213,6 +230,7 @@ def _extract_video(self, video_id, catalogue=None):
}

def _real_extract(self, url):
url, smuggled_data = unsmuggle_url(url, {})
mobj = self._match_valid_url(url)
video_id = mobj.group('id')
catalog = mobj.group('catalog')
Expand All @@ -224,7 +242,7 @@ def _real_extract(self, url):
if not video_id:
raise ExtractorError('Invalid URL', expected=True)

return self._extract_video(video_id, catalog)
return self._extract_video(video_id, catalog, hostname=smuggled_data.get('hostname'))


class FranceTVSiteIE(FranceTVBaseInfoExtractor):
Expand Down Expand Up @@ -314,7 +332,7 @@ def _real_extract(self, url):
r'(?:href=|player\.setVideo\(\s*)"http://videos?\.francetv\.fr/video/([^@]+@[^"]+)"',
webpage, 'video ID').split('@')

return self._make_url_result(video_id, catalogue)
return self._make_url_result(video_id, catalogue, url=url)


class FranceTVInfoIE(FranceTVBaseInfoExtractor):
Expand Down Expand Up @@ -405,4 +423,4 @@ def _real_extract(self, url):
r'(?:data-id|<figure[^<]+\bid)=["\']([\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})'),
webpage, 'video id')

return self._make_url_result(video_id)
return self._make_url_result(video_id, url=url)
7 changes: 3 additions & 4 deletions yt_dlp/extractor/lumni.py
@@ -1,8 +1,7 @@
from .common import InfoExtractor
from .francetv import FranceTVIE
from .francetv import FranceTVBaseInfoExtractor


class LumniIE(InfoExtractor):
class LumniIE(FranceTVBaseInfoExtractor):
_VALID_URL = r'https?://(?:www\.)?lumni\.fr/video/(?P<id>[\w-]+)'
_TESTS = [{
'url': 'https://www.lumni.fr/video/l-homme-et-son-environnement-dans-la-revolution-industrielle',
Expand All @@ -21,4 +20,4 @@ def _real_extract(self, url):
webpage = self._download_webpage(url, display_id)
video_id = self._html_search_regex(
r'<div[^>]+data-factoryid\s*=\s*["\']([^"\']+)', webpage, 'video id')
return self.url_result(f'francetv:{video_id}', FranceTVIE, video_id)
return self._make_url_result(video_id, url=url)

15 comments on commit 9749ac7

@boulderob
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hey @bashonly

I want to thank you for this timely commit. If you have time I have a question about how you solved this problem please.

This IE obviously relies on the hidden url 'https://player.webservices.francetelevisions.fr/v1/videos/... to extract the real video information. However, using my browser devtools, no matter which france.tv video I choose, or if I refresh/reload, I never see this JSON URI fetched in the NETWORK tab, which makes no sense to me. How can I see this URL fetched in devtools? Put another way, how did whoever created this IE find that JSON url in the first place?

I ask because if you hadn't come along this might have taken longer to get fixed. I'd like to be able to solve this myself in the future. My python / javascript skills are quite high. I'm just obviously missing some key ingredient into the underlying JSON fetch here that is blocking me from being effective in providing fixes.

You look like a very active contributor here so I'm hoping this is an easy question for you to answer and won't take up much of your time.

Thanks in advance!

@boulderob
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

OK i figured it out. The IE probably needs to be updated because that hard-coded JSON url used in the IE looks like it's being redirected to a different URL. That's why it still works as well as why I don't see the expected hard coded url being fetched in the NETWORK TAB of devtools.

The new redirect is to: https://k7.ftven.fr/videos/

Followup. Is there a way to quickly and easily spit out the underlying real video ID from the CLI for any site url? The same goes for the JSON url that is fetched to get the info extract?

I ask because it would make "discovery" of the underlying JSON a lot easier when creating a fix, rather than having to revisit the CODE and figure these out every time you come back to an IE, since different IEs may have different authors and implementations.

Thanks!

@bashonly
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The new redirect is to: https://k7.ftven.fr/videos/

Noted in case of breakage. But since I am geo-blocked from many links on these sites, I thought it unwise to change something that still worked; best leave that to someone who is in-region.

Followup. Is there a way to quickly and easily spit out the underlying real video ID from the CLI for any site url? The same goes for the JSON url that is fetched to get the info extract?

--print id will print what yt-dlp extracts as the id if extraction is successful.
--print-traffic will show you all URLs being requested and redirects followed.
--write-pages will write all responses received during extraction to files

@boulderob
Copy link

@boulderob boulderob commented on 9749ac7 Mar 3, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks for the rapid reply! Those work great! FYI --print-traffic shows the actual redirect that I had to figure out by manually pasting links into the browser bar.

Some followups.

  1. Your commit added a new hostname parameter to the _extract_video function and uses a helper function called smuggled_data to find it (see below). Can you give me any insight to why this was needed and what the "smuggle" is all about generally speaking?
return self._extract_video(video_id, catalog, hostname=smuggled_data.get('hostname'))
  1. I think some IEs in ytdlp have this notion of using self._make_url_result to turn site video URLs to simplified URNs that consist of an IE name key separated by a colon and the real vid. FranceTv seems to use this as well. I gather the rationale for this is to allow distinct IEs to accommodate new site valid url matches that require different extraction of the real vid but then turn around and hand them off to an existing IE that can actually use the same "backend" to DL the extract info. Does that sound right?

  2. Speaking of that URN, I can see from the ytdlp cli output that your commit added what looks to be a #__youtubedl_smuggle=... fragment to the new URN scheme and evidently it's need to make things work now per what I said in bullet 2 above...

[FranceTV] Extracting URL: francetv:da90cdba-57c4-4376-adfe-4172aba574ae#__youtubedl_smuggle=%7B%22hostname%22%3A+%22www.france.tv%22%7D

So along with bullet 1 above: What do smuggle_url and the fragment above do, and why is the fragment needed in the URN at all? Is this needed by a lot of sites / IEs, or why only some? I've checked the related helper function declarations in utilities.py and it's not clear what their purpose is.

Thanks!

Pasting the URN creation method definition here for reference:

# Base class shared by the France TV site extractors; it delegates the actual
# media extraction to FranceTVIE via an internal 'francetv:' pseudo-URL.
class FranceTVBaseInfoExtractor(InfoExtractor):
    def _make_url_result(self, video_or_full_id, catalog=None, url=None):
        # video_or_full_id: either a bare video id, or 'id@catalog' with the
        #     catalog already embedded
        # catalog: appended as '@catalog' only when the id does not already
        #     carry one
        # url: the referring page URL; when given, its hostname is smuggled
        #     into the pseudo-URL so the resolved extractor can recover it
        #     (used as the 'domain' query param of the video API request)
        full_id = 'francetv:%s' % video_or_full_id
        if '@' not in video_or_full_id and catalog:
            full_id += '@%s' % catalog
        if url:
            # smuggle_url encodes the dict into the URL fragment; the
            # receiving extractor restores it with unsmuggle_url
            full_id = smuggle_url(full_id, {'hostname': urllib.parse.urlparse(url).hostname})
        return self.url_result(
            full_id, ie=FranceTVIE.ie_key(),
            # strip any '@catalog' suffix so the reported video_id is the bare id
            video_id=video_or_full_id.split('@')[0])
@@ -35,6 +45,8 @@ class FranceTVIE(InfoExtractor):

@boulderob
Copy link

@boulderob boulderob commented on 9749ac7 Mar 3, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Separate note. Up until your new commit, francetv primarily had HLS streams. Or at least that's what I have been using for years. After the your new commit, -F reports exclusively MPD/DASH.

Back in the day with the old youtube_dl there was a break in france.tv that resulted in a fix that resulted in only all DASH extracts as well similar to your recent commit. The DASH downloads were very slow to the point of being unusable compared to HLS. They also result in QSM subtitle files which don't work vs the VTT files associated and downloaded with HLS.

Around the time of the old youtube_dl DASH-only fix, ytdlp forked and eventually some new commit fixes were added to bring back the M3U/HLS + VTT streams. I had been using these ytdlp HLS/VTT streams for francetv up until the break about a week ago.

My initial use of francetv with your fix commit seems to be very similar to what I described with the old youtube_dl fix years ago. The MPD/DASH-only downloads are very slow, and the accompanying QSM subtitle files do NOT work, vs the VTT files that accompanied the old HLS. Is it possible that M3U/HLS streams and VTT subtitles are out there and just not being extracted? For whatever reason DASH / QSM is just too slow to be usable and the QSM subs just don't work.

FYI: the subtitles get downloaded like this but aren't actually subtitle files.

p3 yt_dlp/__main__.py --write-subs --skip-download https://www.france.tv/france-5/les-merveilles-de-l-asie-du-sud-est/5635959-le-nord.html  
[FranceTVSite] Extracting URL: https://www.france.tv/france-5/les-merveilles-de-l-asie-du-sud-est/5635959-le-nord.html
[FranceTVSite] 5635959-le-nord: Downloading webpage
[FranceTV] Extracting URL: francetv:46467ed3-ef4c-4154-a8ac-d980b0d43fab#__youtubedl_smuggle=%7B%22hostname%22%3A+%22www.france.tv%22%7D
[FranceTV] 46467ed3-ef4c-4154-a8ac-d980b0d43fab: Downloading desktop video JSON
[FranceTV] 46467ed3-ef4c-4154-a8ac-d980b0d43fab: Downloading mobile video JSON
[FranceTV] 46467ed3-ef4c-4154-a8ac-d980b0d43fab: Downloading signed dash manifest URL
[FranceTV] 46467ed3-ef4c-4154-a8ac-d980b0d43fab: Downloading MPD manifest
[FranceTV] 46467ed3-ef4c-4154-a8ac-d980b0d43fab: Downloading signed dash manifest URL
[FranceTV] 46467ed3-ef4c-4154-a8ac-d980b0d43fab: Downloading MPD manifest
[info] 46467ed3-ef4c-4154-a8ac-d980b0d43fab: Downloading subtitles: qsm
[info] 46467ed3-ef4c-4154-a8ac-d980b0d43fab: Downloading 1 format(s): dash-video=5000000-1+dash-audio_fre=96000-1
[info] Writing video subtitles to: Les merveilles de l'Asie du Sud-Est - Le nord [46467ed3-ef4c-4154-a8ac-d980b0d43fab].qsm.mp4
[dashsegments] Total fragments: 294
[download] Destination: Les merveilles de l'Asie du Sud-Est - Le nord [46467ed3-ef4c-4154-a8ac-d980b0d43fab].qsm.mp4
[download] 100% of  152.61KiB in 00:00:41 at 3.72KiB/s


file Les\ merveilles\ de\ l\'Asie\ du\ Sud-Est\ -\ Le\ nord\ \[46467ed3-ef4c-4154-a8ac-d980b0d43fab\].qsm.mp4 
Les merveilles de l'Asie du Sud-Est - Le nord [46467ed3-ef4c-4154-a8ac-d980b0d43fab].qsm.mp4: ISO Media, MP4 Base Media v6 

Thanks!

@bashonly
Copy link
Member Author

@bashonly bashonly commented on 9749ac7 Mar 3, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Up until your new commit, francetv primarily had HLS streams.

@boulderob see and/or try out #9347

Your other 3 questions are all related:

The FranceTV video API endpoint is now requiring the domain query parameter, which the extractor had not been including previously -- this was the actual cause of the "HTTP Error 400"s. Before my commit, the other francetv extractors only handed over the video IDs to FranceTVIE. They used the francetv: prefix, because while the actual URLs were not suitable for FranceTVIE (e.g. https://francetvinfo.fr/etcetc was not matched by FranceTVIE._VALID_URL), the extraction needed to be performed by FranceTVIE. We use smuggle_url to append the hostname to the URL in the URL fragment, because it's currently the only way to hand over arbitrary data (such as the referring URL/hostname) to the resolved extractor.

@boulderob
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@bashonly as usual thanks for the rapid turnaround time. I'll test #9347 today and reply any findings to that PR itself. But initial inspection of code looks good. Love the cleanup of the old code. It's more manageable and understandable this way.

Regarding the HLS. I was on the right track. I thought by throttling the browser in devtools it might force the k7 URI to fetch in mobile mode. I presume my thinking there was right in terms of testing? (This must be something france.tv pages detect when making the call to k7 URL themselves though I'm not sure how or what they test for, perhaps HEADER AGENT or just pure speed?). But I missed that their k7 url was capable of accommodating different browser types as well.

Speaking of, I also presume that the only way to discover the HLS option is through trial-and-error inspection of, and changes to, the k7 URL query string parameters themselves, yes? And the only way to discover that HLS lives under the Safari browser type is to actually run devtools in Safari and see the different m3u8 vs MPD results revealed in the JSON, yes? And this same logic applies to any IE, not just this france.tv-specific one?

I think if i can figure out the ytdlp / github automated testing protocol (https://github.com/yt-dlp/yt-dlp/pull/9347/checks) I'll be in pretty good shape with the required PR process. One thing that would make things easier is the ability to "code jump" from the PR commits to definitions in other parts of the code base, especially the utility functions. "Go to definition" usually works in the repo proper in github but not always. Unfortunately, I haven't got "Go to definition" to work in the PR commits for some reason outside of "jump within the same source file".

The FranceTV video API endpoint is now requiring the domain query parameter, which the extractor had not been including previously -- this was the actual cause of the "HTTP Error 400"s. Before my commit, the other francetv extractors only handed over the video IDs to FranceTVIE. They used the francetv: prefix, because while the actual URLs were not suitable for FranceTVIE (e.g. https://francetvinfo.fr/etcetc was not matched by FranceTVIE._VALID_URL), the extraction needed to be performed by FranceTVIE. We use smuggle_url to append the hostname to the URL in the URL fragment, because it's currently the only way to hand over arbitrary data (such as the referring URL/hostname) to the resolved extractor.

Thanks for the confirmation. There might be something a bit more semantic than smuggle_url as far as utility function names go though.

@bashonly
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Mobile Safari does not support DASH (and HLS was developed by Apple)

@Jean-Daniel
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Speaking of, I also presume that the only way to know the HLS discovery is thru trial and error inspection of and changing of the k47 URL query string parameters itself, yes? And the only way to test this is and discover that HLS lives under safari browser type is to actually run devtools on safari to see the different M3u8 vs MPD results revealed in the JSON, yes ? And this same logic applies in any IE not just this france.tv specific one?

@boulderob For this kind of reverse engineering, browser dev tools are not the only way.
You can always intercept the https traffic and trace the exchange between the browser or a mobile app and the server using a tool like mitmproxy. This is very helpful, especially when there is a dedicated mobile app that fetches video information and files from a simple API endpoint, vs using a complex API built into the web page.

@pokemaster974
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks @bashonly for this new fix.
About the mp4 subtitles "linked" to dash (compared to vtt "linked" to hls), like @boulderob I also noticed that there is an issue with them.

With this command yt-dlp.exe --allow-u --restrict-filenames --no-overwrites --no-continue -f "hls-522+hls-audio-aacl-96-Audio_Description+hls-audio-aacl-96-Audio_Français" --audio-multistreams -o "TEMP\%(title)s_%(id)s.%(ext)s" --ffmpeg-location "C:\ffmpeg\bin\ffmpeg.exe" --write-subs --sub-format "srt/vtt/ass/best" --sub-langs "qsm" --convert-subs "srt" "https://www.france.tv/france-2/les-petits-meurtres-d-agatha-christie/les-petits-meurtres-d-agatha-christie-saison-3/5699088-mortel-karma.html"

This is the yt-dlp log :

WARNING: You have asked for UNPLAYABLE formats to be listed/downloaded. This is a developer option intended for debugging. 
         If you experience any issues while using this option, DO NOT open a bug report
[FranceTVSite] Extracting URL: https://www.france.tv/france-2/les-petits-meurtres-d-agatha-christie/les-petits-meurtres-d-agatha...88-mortel-karma.html
[FranceTVSite] 5699088-mortel-karma: Downloading webpage
[FranceTV] Extracting URL: francetv:169b36ff-5be9-48e1-b575-a86b2dd6e95d#__youtubedl_smuggle=%7B%22hostname%22%3A+%22www.france.tv%22%7D
[FranceTV] 169b36ff-5be9-48e1-b575-a86b2dd6e95d: Downloading desktop video JSON
[FranceTV] 169b36ff-5be9-48e1-b575-a86b2dd6e95d: Downloading mobile video JSON
[FranceTV] 169b36ff-5be9-48e1-b575-a86b2dd6e95d: Downloading signed dash manifest URL
[FranceTV] 169b36ff-5be9-48e1-b575-a86b2dd6e95d: Downloading MPD manifest
[FranceTV] 169b36ff-5be9-48e1-b575-a86b2dd6e95d: Downloading signed dash manifest URL
[FranceTV] 169b36ff-5be9-48e1-b575-a86b2dd6e95d: Downloading MPD manifest
[info] 169b36ff-5be9-48e1-b575-a86b2dd6e95d: Downloading subtitles: qsm
[info] 169b36ff-5be9-48e1-b575-a86b2dd6e95d: Downloading 1 format(s): dash-video=400000-1+dash-audio_qad=96000-1
[info] Writing video subtitles to: TEMP\Les_petits_meurtres_d_Agatha_Christie_-_Mortel_karma_169b36ff-5be9-48e1-b575-a86b2dd6e95d.qsm.mp4
[dashsegments] Total fragments: 518
[download] Destination: TEMP\Les_petits_meurtres_d_Agatha_Christie_-_Mortel_karma_169b36ff-5be9-48e1-b575-a86b2dd6e95d.qsm.mp4

[download]   0.2% of ~ 333.87KiB at      0.00B/s ETA Unknown (frag 0/518)
[download]   0.1% of ~ 667.73KiB at      0.00B/s ETA Unknown (frag 1/518)
[download] 100.0% of ~ 277.45KiB at   26.33KiB/s ETA 00:00 (frag 517/518)
[download]  99.9% of ~ 277.72KiB at   26.33KiB/s ETA 00:00 (frag 518/518)
[download] 100% of  277.45KiB in 00:00:10 at 27.27KiB/s                  
[SubtitlesConvertor] Converting subtitles
ERROR: Preprocessing: Error opening output files: Invalid argument

This is what I get with ffmpeg :

ffmpeg version 2024-02-15-git-a2cfd6062c-full_build-www.gyan.dev Copyright (c) 2000-2024 the FFmpeg developers
  built with gcc 12.2.0 (Rev10, Built by MSYS2 project)
  configuration: --enable-gpl --enable-version3 --enable-static --pkg-config=pkgconf --disable-w32threads --disable-autodetect --enable-fontconfig --enable-iconv --enable-gnutls --enable-libxml2 --enable-gmp --enable-bzlib --enable-lzma --enable-libsnappy --enable-zlib --enable-librist --enable-libsrt --enable-libssh --enable-libzmq --enable-avisynth --enable-libbluray --enable-libcaca --enable-sdl2 --enable-libaribb24 --enable-libaribcaption --enable-libdav1d --enable-libdavs2 --enable-libuavs3d --enable-libzvbi --enable-librav1e --enable-libsvtav1 --enable-libwebp --enable-libx264 --enable-libx265 --enable-libxavs2 --enable-libxvid --enable-libaom --enable-libjxl --enable-libopenjpeg --enable-libvpx --enable-mediafoundation --enable-libass --enable-frei0r --enable-libfreetype --enable-libfribidi --enable-libharfbuzz --enable-liblensfun --enable-libvidstab --enable-libvmaf --enable-libzimg --enable-amf --enable-cuda-llvm --enable-cuvid --enable-ffnvcodec --enable-nvdec --enable-nvenc --enable-dxva2 --enable-d3d11va --enable-libvpl --enable-libshaderc --enable-vulkan --enable-libplacebo --enable-opencl --enable-libcdio --enable-libgme --enable-libmodplug --enable-libopenmpt --enable-libopencore-amrwb 
--enable-libmp3lame --enable-libshine --enable-libtheora --enable-libtwolame --enable-libvo-amrwbenc --enable-libcodec2 --enable-libilbc --enable-libgsm --enable-libopencore-amrnb --enable-libopus --enable-libspeex --enable-libvorbis --enable-ladspa --enable-libbs2b --enable-libflite --enable-libmysofa --enable-librubberband --enable-libsoxr --enable-chromaprint
  libavutil      58. 38.100 / 58. 38.100
  libavcodec     60. 39.100 / 60. 39.100
  libavformat    60. 21.100 / 60. 21.100
  libavdevice    60.  4.100 / 60.  4.100
  libavfilter     9. 17.100 /  9. 17.100
  libswscale      7.  6.100 /  7.  6.100
  libswresample   4. 13.100 /  4. 13.100
  libpostproc    57.  4.100 / 57.  4.100
Input #0, mov,mp4,m4a,3gp,3g2,mj2, from 'Les_petits_meurtres_d_Agatha_Christie_-_Mortel_karma_169b36ff-5be9-48e1-b575-a86b2dd6e95d.qsm.mp4':
  Metadata:
    major_brand     : iso6
    minor_version   : 0
    compatible_brands: iso6dash
  Duration: 01:26:04.84, bitrate: 0 kb/s
  Stream #0:0[0x1](qsm): Data: none (wvtt / 0x74747677), 0 kb/s (default)
      Metadata:
        handler_name    : USP Text Handler
Output #0, srt, to 'subs.srt':
[out#0/srt @ 0000016194d8ec40] Output file does not contain any stream   
Error opening output file subs.srt.
Error opening output files: Invalid argument

It's not a big deal as I used hls streams and I will come back to them now, but if you need tests for this subtitle error, I can do them.

Regards.

@bashonly
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The mp4 subtitles are actual mp4 files with only a ttml subtitles stream (no video or audio streams). yt-dlp does not know what to do with them. But you could manually merge it with your video/audio mp4 using ffmpeg

@boulderob
Copy link

@boulderob boulderob commented on 9749ac7 Mar 4, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@bashonly

The mp4 subtitles are actual mp4 files with only a ttml subtitles stream (no video or audio streams). yt-dlp does not know what to do with them. But you could manually merge it with your video/audio mp4 using ffmpeg

Is there a way to convert the mp4 subtitles to a usable subtitle text format/type without merging it into the video container so that media players can play it separately and automatically? I use the subtitle text to study foreign languages. The convenience of having them just work separately like hls mp4 + vtt does is of great benefit. Merging them requires more post download work and you lose the separation of concerns.

Also, since the qsm.mp4 ttml container is what the DASH mpd has been configured to return for the subtitles, how come a media player in the browser can resolve and play the subtitles back but a standalone media player can't? What is different between the two?

@boulderob
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@Jean-Daniel

9749ac7#commitcomment-139309111

Thanks for the mitmproxy tip. Much appreciated.

@bashonly
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

IMO this is getting off-topic for a commit comment. Please open a new issue if there is something that needs to be fixed, or open a question issue if you have a question.

But to resolve the remaining Q:

Is there a way to convert the mp4 subtitles to a usable subtitle text format/type without merging it into the video container

Not with yt-dlp. It doesn't know what to do with subtitles-only mp4 files. But you can use MP4Box to extract the subtitle stream as webvtt:

MP4Box -raw "0:output=output.vtt" "input.mp4"

I use the subtitle text to study foreign languages.

It should be noted that the actual subtitles provided by the DASH manifest in mp4 are the same as the webvtt provided by HLS. And with the latest nightly, yt-dlp will download the vtt format by default.

@bashonly
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@boulderob support for subs in mp4 container is being tracked in #5833

Please sign in to comment.