Skip to content

Commit

Permalink
[extractor/common] Improve m3u8 extraction (closes #12211)
Browse files Browse the repository at this point in the history
* Extract m3u8 parsing to separate method
* Improve rendition groups extraction
* Build stream name according stream GROUP-ID
* Ignore reference to AUDIO group without URI when stream has no CODECS
+ Add test coverage for parsing m3u8 from #11507, #11995, #12211 and twitch vod
  • Loading branch information
dstftw committed Apr 22, 2017
1 parent f779958 commit cb25208
Show file tree
Hide file tree
Showing 7 changed files with 466 additions and 60 deletions.
1 change: 0 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,6 @@ updates_key.pem
*.swf
*.part
*.swp
test/testdata
test/local_parameters.json
.tox
youtube-dl.zsh
Expand Down
308 changes: 307 additions & 1 deletion test/test_InfoExtractor.py

Large diffs are not rendered by default.

14 changes: 14 additions & 0 deletions test/testdata/m3u8/pluzz_francetv_11507.m3u8
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
#EXTM3U
#EXT-X-VERSION:5
#EXT-X-MEDIA:TYPE=SUBTITLES,GROUP-ID="subs",NAME="Francais",DEFAULT=NO,FORCED=NO,URI="http://replayftv-pmd.francetv.fr/subtitles/2017/16/156589847-1492488987.m3u8",LANGUAGE="fra"
#EXT-X-MEDIA:TYPE=AUDIO,GROUP-ID="aac",LANGUAGE="fra",NAME="Francais",DEFAULT=YES, AUTOSELECT=YES
#EXT-X-STREAM-INF:SUBTITLES="subs",AUDIO="aac",PROGRAM-ID=1,BANDWIDTH=180000,RESOLUTION=256x144,CODECS="avc1.66.30, mp4a.40.2"
http://replayftv-vh.akamaihd.net/i/streaming-adaptatif_france-dom-tom/2017/S16/J2/156589847-58f59130c1f52-,standard1,standard2,standard3,standard4,standard5,.mp4.csmil/index_0_av.m3u8?null=0
#EXT-X-STREAM-INF:SUBTITLES="subs",AUDIO="aac",PROGRAM-ID=1,BANDWIDTH=303000,RESOLUTION=320x180,CODECS="avc1.66.30, mp4a.40.2"
http://replayftv-vh.akamaihd.net/i/streaming-adaptatif_france-dom-tom/2017/S16/J2/156589847-58f59130c1f52-,standard1,standard2,standard3,standard4,standard5,.mp4.csmil/index_1_av.m3u8?null=0
#EXT-X-STREAM-INF:SUBTITLES="subs",AUDIO="aac",PROGRAM-ID=1,BANDWIDTH=575000,RESOLUTION=512x288,CODECS="avc1.66.30, mp4a.40.2"
http://replayftv-vh.akamaihd.net/i/streaming-adaptatif_france-dom-tom/2017/S16/J2/156589847-58f59130c1f52-,standard1,standard2,standard3,standard4,standard5,.mp4.csmil/index_2_av.m3u8?null=0
#EXT-X-STREAM-INF:SUBTITLES="subs",AUDIO="aac",PROGRAM-ID=1,BANDWIDTH=831000,RESOLUTION=704x396,CODECS="avc1.77.30, mp4a.40.2"
http://replayftv-vh.akamaihd.net/i/streaming-adaptatif_france-dom-tom/2017/S16/J2/156589847-58f59130c1f52-,standard1,standard2,standard3,standard4,standard5,.mp4.csmil/index_3_av.m3u8?null=0
#EXT-X-STREAM-INF:SUBTITLES="subs",AUDIO="aac",PROGRAM-ID=1,BANDWIDTH=1467000,RESOLUTION=1024x576,CODECS="avc1.77.30, mp4a.40.2"
http://replayftv-vh.akamaihd.net/i/streaming-adaptatif_france-dom-tom/2017/S16/J2/156589847-58f59130c1f52-,standard1,standard2,standard3,standard4,standard5,.mp4.csmil/index_4_av.m3u8?null=0
Expand Down
16 changes: 16 additions & 0 deletions test/testdata/m3u8/teamcoco_11995.m3u8
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
#EXTM3U
#EXT-X-MEDIA:TYPE=AUDIO,GROUP-ID="audio-0",NAME="Default",AUTOSELECT=YES,DEFAULT=YES,URI="hls/CONAN_020217_Highlight_show-audio-160k_v4.m3u8"
#EXT-X-MEDIA:TYPE=AUDIO,GROUP-ID="audio-1",NAME="Default",AUTOSELECT=YES,DEFAULT=YES,URI="hls/CONAN_020217_Highlight_show-audio-64k_v4.m3u8"
#EXT-X-I-FRAME-STREAM-INF:PROGRAM-ID=1,BANDWIDTH=37862000,CODECS="avc1.4d001f",URI="hls/CONAN_020217_Highlight_show-2m_iframe.m3u8"
#EXT-X-I-FRAME-STREAM-INF:PROGRAM-ID=1,BANDWIDTH=18750000,CODECS="avc1.4d001e",URI="hls/CONAN_020217_Highlight_show-1m_iframe.m3u8"
#EXT-X-I-FRAME-STREAM-INF:PROGRAM-ID=1,BANDWIDTH=6535000,CODECS="avc1.42001e",URI="hls/CONAN_020217_Highlight_show-400k_iframe.m3u8"
#EXT-X-STREAM-INF:PROGRAM-ID=1,BANDWIDTH=2374000,RESOLUTION=1024x576,CODECS="avc1.4d001f,mp4a.40.2",AUDIO="audio-0"
hls/CONAN_020217_Highlight_show-2m_v4.m3u8
#EXT-X-STREAM-INF:PROGRAM-ID=1,BANDWIDTH=1205000,RESOLUTION=640x360,CODECS="avc1.4d001e,mp4a.40.2",AUDIO="audio-0"
hls/CONAN_020217_Highlight_show-1m_v4.m3u8
#EXT-X-STREAM-INF:PROGRAM-ID=1,BANDWIDTH=522000,RESOLUTION=400x224,CODECS="avc1.42001e,mp4a.40.2",AUDIO="audio-0"
hls/CONAN_020217_Highlight_show-400k_v4.m3u8
#EXT-X-STREAM-INF:PROGRAM-ID=1,BANDWIDTH=413000,RESOLUTION=400x224,CODECS="avc1.42001e,mp4a.40.5",AUDIO="audio-1"
hls/CONAN_020217_Highlight_show-400k_v4.m3u8
#EXT-X-STREAM-INF:PROGRAM-ID=1,BANDWIDTH=71000,CODECS="mp4a.40.5",AUDIO="audio-1"
hls/CONAN_020217_Highlight_show-audio-64k_v4.m3u8
13 changes: 13 additions & 0 deletions test/testdata/m3u8/toggle_mobile_12211.m3u8
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
#EXTM3U
#EXT-X-VERSION:4
#EXT-X-MEDIA:TYPE=AUDIO,GROUP-ID="audio",LANGUAGE="eng",NAME="English",URI="http://k.toggle.sg/fhls/p/2082311/sp/208231100/serveFlavor/entryId/0_89q6e8ku/v/2/pv/1/flavorId/0_sa2ntrdg/name/a.mp4/index.m3u8"
#EXT-X-MEDIA:TYPE=AUDIO,GROUP-ID="audio",LANGUAGE="und",NAME="Undefined",URI="http://k.toggle.sg/fhls/p/2082311/sp/208231100/serveFlavor/entryId/0_89q6e8ku/v/2/pv/1/flavorId/0_r7y0nitg/name/a.mp4/index.m3u8"

#EXT-X-STREAM-INF:PROGRAM-ID=1,BANDWIDTH=155648,RESOLUTION=320x180,AUDIO="audio"
http://k.toggle.sg/fhls/p/2082311/sp/208231100/serveFlavor/entryId/0_89q6e8ku/v/2/pv/1/flavorId/0_qlk9hlzr/name/a.mp4/index.m3u8
#EXT-X-STREAM-INF:PROGRAM-ID=1,BANDWIDTH=502784,RESOLUTION=480x270,AUDIO="audio"
http://k.toggle.sg/fhls/p/2082311/sp/208231100/serveFlavor/entryId/0_89q6e8ku/v/2/pv/1/flavorId/0_oefackmi/name/a.mp4/index.m3u8
#EXT-X-STREAM-INF:PROGRAM-ID=1,BANDWIDTH=827392,RESOLUTION=640x360,AUDIO="audio"
http://k.toggle.sg/fhls/p/2082311/sp/208231100/serveFlavor/entryId/0_89q6e8ku/v/12/pv/1/flavorId/0_vyg9pj7k/name/a.mp4/index.m3u8
#EXT-X-STREAM-INF:PROGRAM-ID=1,BANDWIDTH=1396736,RESOLUTION=854x480,AUDIO="audio"
http://k.toggle.sg/fhls/p/2082311/sp/208231100/serveFlavor/entryId/0_89q6e8ku/v/12/pv/1/flavorId/0_50n4psvx/name/a.mp4/index.m3u8
20 changes: 20 additions & 0 deletions test/testdata/m3u8/twitch_vod.m3u8
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
#EXTM3U
#EXT-X-TWITCH-INFO:ORIGIN="s3",CLUSTER="edgecast_vod",REGION="EU",MANIFEST-CLUSTER="edgecast_vod",USER-IP="109.171.17.81"
#EXT-X-MEDIA:TYPE=VIDEO,GROUP-ID="chunked",NAME="Source",AUTOSELECT=YES,DEFAULT=YES
#EXT-X-STREAM-INF:PROGRAM-ID=1,BANDWIDTH=3214134,CODECS="avc1.100.31,mp4a.40.2",RESOLUTION="1280x720",VIDEO="chunked"
https://vod.edgecast.hls.ttvnw.net/e5da31ab49_riotgames_15001215120_261543898/chunked/index-muted-HM49I092CC.m3u8
#EXT-X-MEDIA:TYPE=VIDEO,GROUP-ID="high",NAME="High",AUTOSELECT=YES,DEFAULT=YES
#EXT-X-STREAM-INF:PROGRAM-ID=1,BANDWIDTH=1603789,CODECS="avc1.42C01F,mp4a.40.2",RESOLUTION="1280x720",VIDEO="high"
https://vod.edgecast.hls.ttvnw.net/e5da31ab49_riotgames_15001215120_261543898/high/index-muted-HM49I092CC.m3u8
#EXT-X-MEDIA:TYPE=VIDEO,GROUP-ID="medium",NAME="Medium",AUTOSELECT=YES,DEFAULT=YES
#EXT-X-STREAM-INF:PROGRAM-ID=1,BANDWIDTH=893387,CODECS="avc1.42C01E,mp4a.40.2",RESOLUTION="852x480",VIDEO="medium"
https://vod.edgecast.hls.ttvnw.net/e5da31ab49_riotgames_15001215120_261543898/medium/index-muted-HM49I092CC.m3u8
#EXT-X-MEDIA:TYPE=VIDEO,GROUP-ID="low",NAME="Low",AUTOSELECT=YES,DEFAULT=YES
#EXT-X-STREAM-INF:PROGRAM-ID=1,BANDWIDTH=628347,CODECS="avc1.42C01E,mp4a.40.2",RESOLUTION="640x360",VIDEO="low"
https://vod.edgecast.hls.ttvnw.net/e5da31ab49_riotgames_15001215120_261543898/low/index-muted-HM49I092CC.m3u8
#EXT-X-MEDIA:TYPE=VIDEO,GROUP-ID="mobile",NAME="Mobile",AUTOSELECT=YES,DEFAULT=YES
#EXT-X-STREAM-INF:PROGRAM-ID=1,BANDWIDTH=280474,CODECS="avc1.42C00D,mp4a.40.2",RESOLUTION="400x226",VIDEO="mobile"
https://vod.edgecast.hls.ttvnw.net/e5da31ab49_riotgames_15001215120_261543898/mobile/index-muted-HM49I092CC.m3u8
#EXT-X-MEDIA:TYPE=VIDEO,GROUP-ID="audio_only",NAME="Audio Only",AUTOSELECT=NO,DEFAULT=NO
#EXT-X-STREAM-INF:PROGRAM-ID=1,BANDWIDTH=182725,CODECS="mp4a.40.2",VIDEO="audio_only"
https://vod.edgecast.hls.ttvnw.net/e5da31ab49_riotgames_15001215120_261543898/audio_only/index-muted-HM49I092CC.m3u8
154 changes: 96 additions & 58 deletions youtube_dl/extractor/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -1312,17 +1312,25 @@ def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
entry_protocol='m3u8', preference=None,
m3u8_id=None, note=None, errnote=None,
fatal=True, live=False):

res = self._download_webpage_handle(
m3u8_url, video_id,
note=note or 'Downloading m3u8 information',
errnote=errnote or 'Failed to download m3u8 information',
fatal=fatal)

if res is False:
return []

m3u8_doc, urlh = res
m3u8_url = urlh.geturl()

return self._parse_m3u8_formats(
m3u8_doc, m3u8_url, ext=ext, entry_protocol=entry_protocol,
preference=preference, m3u8_id=m3u8_id, live=live)

def _parse_m3u8_formats(self, m3u8_doc, m3u8_url, ext=None,
entry_protocol='m3u8', preference=None,
m3u8_id=None, live=False):
if '#EXT-X-FAXS-CM:' in m3u8_doc: # Adobe Flash Access
return []

Expand All @@ -1333,19 +1341,21 @@ def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
if re.match(r'^https?://', u)
else compat_urlparse.urljoin(m3u8_url, u))

# We should try extracting formats only from master playlists [1], i.e.
# playlists that describe available qualities. On the other hand media
# playlists [2] should be returned as is since they contain just the media
# without qualities renditions.
# References:
# 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-21
# 2. https://github.com/rg3/youtube-dl/issues/12211

# We should try extracting formats only from master playlists [1, 4.3.4],
# i.e. playlists that describe available qualities. On the other hand
# media playlists [1, 4.3.3] should be returned as is since they contain
# just the media without qualities renditions.
# Fortunately, master playlist can be easily distinguished from media
# playlist based on particular tags availability. As of [1, 2] master
# playlist tags MUST NOT appear in a media playist and vice versa.
# As of [3] #EXT-X-TARGETDURATION tag is REQUIRED for every media playlist
# and MUST NOT appear in master playlist thus we can clearly detect media
# playlist with this criterion.
# 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.4
# 2. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3
# 3. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3.1
# playlist based on particular tags availability. As of [1, 4.3.3, 4.3.4]
# master playlist tags MUST NOT appear in a media playist and vice versa.
# As of [1, 4.3.3.1] #EXT-X-TARGETDURATION tag is REQUIRED for every
# media playlist and MUST NOT appear in master playlist thus we can
# clearly detect media playlist with this criterion.

if '#EXT-X-TARGETDURATION' in m3u8_doc: # media playlist, return as is
return [{
'url': m3u8_url,
Expand All @@ -1354,52 +1364,67 @@ def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
'protocol': entry_protocol,
'preference': preference,
}]
audio_in_video_stream = {}
last_info = {}
last_media = {}

groups = {}
last_stream_inf = {}

def extract_media(x_media_line):
media = parse_m3u8_attributes(x_media_line)
# As per [1, 4.3.4.1] TYPE, GROUP-ID and NAME are REQUIRED
media_type, group_id, name = media.get('TYPE'), media.get('GROUP-ID'), media.get('NAME')
if not (media_type and group_id and name):
return
groups.setdefault(group_id, []).append(media)
if media_type not in ('VIDEO', 'AUDIO'):
return
media_url = media.get('URI')
if media_url:
format_id = []
for v in (group_id, name):
if v:
format_id.append(v)
f = {
'format_id': '-'.join(format_id),
'url': format_url(media_url),
'language': media.get('LANGUAGE'),
'ext': ext,
'protocol': entry_protocol,
'preference': preference,
}
if media_type == 'AUDIO':
f['vcodec'] = 'none'
formats.append(f)

def build_stream_name():
# Despite specification does not mention NAME attribute for
# EXT-X-STREAM-INF it still sometimes may be present
stream_name = last_stream_inf.get('NAME')
if stream_name:
return stream_name
# If there is no NAME in EXT-X-STREAM-INF it will be obtained
# from corresponding rendition group
stream_group_id = last_stream_inf.get('VIDEO')
if not stream_group_id:
return
stream_group = groups.get(stream_group_id)
if not stream_group:
return stream_group_id
rendition = stream_group[0]
return rendition.get('NAME') or stream_group_id

for line in m3u8_doc.splitlines():
if line.startswith('#EXT-X-STREAM-INF:'):
last_info = parse_m3u8_attributes(line)
last_stream_inf = parse_m3u8_attributes(line)
elif line.startswith('#EXT-X-MEDIA:'):
media = parse_m3u8_attributes(line)
media_type = media.get('TYPE')
if media_type in ('VIDEO', 'AUDIO'):
group_id = media.get('GROUP-ID')
media_url = media.get('URI')
if media_url:
format_id = []
for v in (group_id, media.get('NAME')):
if v:
format_id.append(v)
f = {
'format_id': '-'.join(format_id),
'url': format_url(media_url),
'language': media.get('LANGUAGE'),
'ext': ext,
'protocol': entry_protocol,
'preference': preference,
}
if media_type == 'AUDIO':
f['vcodec'] = 'none'
if group_id and not audio_in_video_stream.get(group_id):
audio_in_video_stream[group_id] = False
formats.append(f)
else:
# When there is no URI in EXT-X-MEDIA let this tag's
# data be used by regular URI lines below
last_media = media
if media_type == 'AUDIO' and group_id:
audio_in_video_stream[group_id] = True
extract_media(line)
elif line.startswith('#') or not line.strip():
continue
else:
tbr = int_or_none(last_info.get('AVERAGE-BANDWIDTH') or last_info.get('BANDWIDTH'), scale=1000)
tbr = int_or_none(last_stream_inf.get('AVERAGE-BANDWIDTH') or last_stream_inf.get('BANDWIDTH'), scale=1000)
format_id = []
if m3u8_id:
format_id.append(m3u8_id)
# Despite specification does not mention NAME attribute for
# EXT-X-STREAM-INF it still sometimes may be present
stream_name = last_info.get('NAME') or last_media.get('NAME')
stream_name = build_stream_name()
# Bandwidth of live streams may differ over time thus making
# format_id unpredictable. So it's better to keep provided
# format_id intact.
Expand All @@ -1412,11 +1437,11 @@ def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
'manifest_url': manifest_url,
'tbr': tbr,
'ext': ext,
'fps': float_or_none(last_info.get('FRAME-RATE')),
'fps': float_or_none(last_stream_inf.get('FRAME-RATE')),
'protocol': entry_protocol,
'preference': preference,
}
resolution = last_info.get('RESOLUTION')
resolution = last_stream_inf.get('RESOLUTION')
if resolution:
mobj = re.search(r'(?P<width>\d+)[xX](?P<height>\d+)', resolution)
if mobj:
Expand All @@ -1432,13 +1457,26 @@ def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
'vbr': vbr,
'abr': abr,
})
f.update(parse_codecs(last_info.get('CODECS')))
if audio_in_video_stream.get(last_info.get('AUDIO')) is False and f['vcodec'] != 'none':
# TODO: update acodec for audio only formats with the same GROUP-ID
f['acodec'] = 'none'
codecs = parse_codecs(last_stream_inf.get('CODECS'))
f.update(codecs)
audio_group_id = last_stream_inf.get('AUDIO')
# As per [1, 4.3.4.1.1] any EXT-X-STREAM-INF tag which
# references a rendition group MUST have a CODECS attribute.
# However, this is not always respected, for example, [2]
# contains EXT-X-STREAM-INF tag which references AUDIO
# rendition group but does not have CODECS and despite
# referencing audio group an audio group, it represents
# a complete (with audio and video) format. So, for such cases
# we will ignore references to rendition groups and treat them
# as complete formats.
if audio_group_id and codecs and f.get('vcodec') != 'none':
audio_group = groups.get(audio_group_id)
if audio_group and audio_group[0].get('URI'):
# TODO: update acodec for audio only formats with
# the same GROUP-ID
f['acodec'] = 'none'
formats.append(f)
last_info = {}
last_media = {}
last_stream_inf = {}
return formats

@staticmethod
Expand Down

0 comments on commit cb25208

Please sign in to comment.