From 2089ea50c4b13a50595183dec8bf4e3b4b1197a8 Mon Sep 17 00:00:00 2001 From: felix Date: Sun, 23 May 2021 18:34:49 +0200 Subject: [PATCH 01/11] [downloader/mhtml] New downloader This downloader is intended to be used for streams that consist of a timed sequence of stand-alone images, such as slideshows or thumbnail streams. --- yt_dlp/downloader/__init__.py | 2 + yt_dlp/downloader/mhtml.py | 220 ++++++++++++++++++++++++++++++++++ 2 files changed, 222 insertions(+) create mode 100644 yt_dlp/downloader/mhtml.py diff --git a/yt_dlp/downloader/__init__.py b/yt_dlp/downloader/__init__.py index c7ba918627a..82d7623f626 100644 --- a/yt_dlp/downloader/__init__.py +++ b/yt_dlp/downloader/__init__.py @@ -22,6 +22,7 @@ def _get_real_downloader(info_dict, protocol=None, *args, **kwargs): from .rtmp import RtmpFD from .rtsp import RtspFD from .ism import IsmFD +from .mhtml import MhtmlFD from .niconico import NiconicoDmcFD from .youtube_live_chat import YoutubeLiveChatReplayFD from .external import ( @@ -39,6 +40,7 @@ def _get_real_downloader(info_dict, protocol=None, *args, **kwargs): 'f4m': F4mFD, 'http_dash_segments': DashSegmentsFD, 'ism': IsmFD, + 'mhtml': MhtmlFD, 'niconico_dmc': NiconicoDmcFD, 'youtube_live_chat_replay': YoutubeLiveChatReplayFD, } diff --git a/yt_dlp/downloader/mhtml.py b/yt_dlp/downloader/mhtml.py new file mode 100644 index 00000000000..365e7ddb77a --- /dev/null +++ b/yt_dlp/downloader/mhtml.py @@ -0,0 +1,220 @@ +# coding: utf-8 +from __future__ import unicode_literals +from .fragment import FragmentFD +import quopri +import uuid +import io +import re +from ..version import __version__ as YT_DLP_VERSION +from ..utils import ( + urljoin, + srt_subtitles_timecode, + formatSeconds, +) + + +class MhtmlFD(FragmentFD): + FD_NAME = 'mhtml' + _EXTENSION = 'mhtml' + + _STYLESHEET = """\ +html, body { + margin: 0; + padding: 0; + height: 100vh; +} + +html { + overflow-y: scroll; + scroll-snap-type: y mandatory; +} + +body { + scroll-snap-type: y mandatory; + 
display: flex; + flex-flow: column; +} + +body > figure { + max-width: 100vw; + max-height: 100vh; + scroll-snap-align: center; +} + +body > figure > figcaption { + text-align: center; + height: 2.5em; +} + +body > figure > img { + display: block; + margin: auto; + max-width: 100%; + max-height: calc(100vh - 5em); +} +""" + _STYLESHEET = re.sub(r'\s+', ' ', _STYLESHEET) + _STYLESHEET = re.sub(r'\B \B|(?<=[\w\-]) (?=[^\w\-])|(?<=[^\w\-]) (?=[\w\-])', '', _STYLESHEET) + + @staticmethod + def _escape_html(text): + return ( + text + .replace('&', '&') + .replace('<', '<') + .replace('>', '>') + .replace('"', '"') + .replace("'", ''') + ) + + @staticmethod + def _escape_mime(s): + return '=?utf-8?Q?' + (b''.join( + bytes((b,)) if b >= 0x20 else b'=%02X' % b + for b in quopri.encodestring(s.encode('utf-8'), header=True) + )).decode('us-ascii') + '?=' + + def _gen_cid(self, i, fragment, frag_boundary): + return '%u.%s@yt-dlp.github.io.invalid' % (i, frag_boundary) + + def _gen_stub(self, *, fragments, frag_boundary, title): + output = io.StringIO() + + output.write(( + '' + '' + '' + '' '' + '' '{title}' + '' '' + '' + ).format( + version=self._escape_html(YT_DLP_VERSION), + styles=self._STYLESHEET, + title=self._escape_html(title) + )) + + t0 = 0 + for i, frag in enumerate(fragments): + output.write('
') + try: + t1 = t0 + frag['duration'] + output.write(( + '
Slide #{num}: {t0} – {t1} (duration: {duration})
' + ).format( + num=i + 1, + t0=srt_subtitles_timecode(t0), + t1=srt_subtitles_timecode(t1), + duration=formatSeconds(frag['duration']) + )) + except (KeyError, ValueError, TypeError): + t1 = None + output.write(( + '
Slide #{num}
' + ).format(num=i + 1)) + output.write(''.format( + cid=self._gen_cid(i, frag, frag_boundary))) + output.write('
') + t0 = t1 + + return output.getvalue() + + def real_download(self, filename, info_dict): + fragment_base_url = info_dict.get('fragment_base_url') + fragments = info_dict['fragments'][:1] if self.params.get( + 'test', False) else info_dict['fragments'] + title = info_dict['_download_params'].get('title', fragment_base_url) + origin = info_dict['_download_params'].get('origin') + + ctx = { + 'filename': filename, + 'total_frags': len(fragments), + } + + self._prepare_and_start_frag_download(ctx) + + extra_state = ctx.setdefault('extra_state', { + 'header_written': False, + 'mime_boundary': str(uuid.uuid4()).replace('-', ''), + }) + + frag_boundary = extra_state['mime_boundary'] + + if not extra_state['header_written']: + stub = self._gen_stub( + fragments=fragments, + frag_boundary=frag_boundary, + title=title + ) + + ctx['dest_stream'].write(( + 'MIME-Version: 1.0\r\n' + 'From: \r\n' + 'To: \r\n' + 'Subject: {title}\r\n' + 'Content-type: multipart/related; ' + '' 'boundary="{boundary}"; ' + '' 'type="text/html"\r\n' + 'X.yt-dlp.Origin: {origin}\r\n' + '\r\n' + '--{boundary}\r\n' + 'Content-Type: text/html; charset=utf-8\r\n' + 'Content-Length: {length}\r\n' + '\r\n' + '{stub}\r\n' + ).format( + origin=origin, + boundary=frag_boundary, + length=len(stub), + title=self._escape_mime(title), + stub=stub + ).encode('utf-8')) + extra_state['header_written'] = True + + for i, fragment in enumerate(fragments): + if (i + 1) <= ctx['fragment_index']: + continue + + fragment_url = urljoin(fragment_base_url, fragment['path']) + success, frag_content = self._download_fragment(ctx, fragment_url, info_dict) + if not success: + continue + + mime_type = b'image/jpeg' + if frag_content.startswith(b'\x89PNG\r\n\x1a\n'): + mime_type = b'image/png' + if frag_content.startswith((b'GIF87a', b'GIF89a')): + mime_type = b'image/gif' + if frag_content.startswith(b'RIFF') and frag_content[8:12] == 'WEBP': + mime_type = b'image/webp' + + frag_header = io.BytesIO() + frag_header.write( + 
b'--%b\r\n' + % (frag_boundary.encode('us-ascii'),)) + frag_header.write( + b'Content-ID: <%b>\r\n' + % (self._gen_cid(i, fragment, frag_boundary).encode('us-ascii'),)) + frag_header.write( + b'Content-type: %b\r\n' + % (mime_type,)) + frag_header.write( + b'Content-length: %u\r\n' + % (len(frag_content),)) + frag_header.write( + b'Content-location: %b\r\n' + % (fragment_url.encode('us-ascii'),)) + try: + frag_header.write( + b'X.yt-dlp.Duration: %f s\r\n' + % (fragment['duration'],)) + except KeyError: + pass + frag_header.write(b'\r\n') + self._append_fragment( + ctx, frag_header.getvalue() + frag_content + b'\r\n') + + ctx['dest_stream'].write( + b'--%b--\r\n\r\n' + % (frag_boundary.encode('us-ascii'),)) + self._finish_frag_download(ctx) From c5df915b3ce0ccbcba30b192775b64ba0a670d17 Mon Sep 17 00:00:00 2001 From: felix Date: Sun, 23 May 2021 18:34:53 +0200 Subject: [PATCH 02/11] [mediasite] Extract slides --- yt_dlp/extractor/mediasite.py | 67 ++++++++++++++++++++++++++++++++--- 1 file changed, 63 insertions(+), 4 deletions(-) diff --git a/yt_dlp/extractor/mediasite.py b/yt_dlp/extractor/mediasite.py index 5d083a1cd1c..f1956c65e0d 100644 --- a/yt_dlp/extractor/mediasite.py +++ b/yt_dlp/extractor/mediasite.py @@ -122,6 +122,58 @@ def _extract_urls(webpage): r'(?xi)]+\bsrc=(["\'])(?P(?:(?:https?:)?//[^/]+)?/Mediasite/Play/%s(?:\?.*?)?)\1' % _ID_RE, webpage)] + def __extract_slides(self, *, stream_id, snum, Stream, duration, images, title, origin): + slide_base_url = Stream['SlideBaseUrl'] + + fname_template = Stream['SlideImageFileNameTemplate'] + if fname_template != 'slide_{0:D4}.jpg': + self.report_warning('Unusual slide file name template; report a bug if slide downloading fails') + fname_template = re.sub(r'\{0:D([0-9]+)\}', r'{0:0\1}', fname_template) + + fragments = [] + for i, slide in enumerate(Stream['Slides']): + if i == 0: + if slide['Time'] > 0: + default_slide = images.get('DefaultSlide') + if default_slide is None: + default_slide = 
images.get('DefaultStreamImage') + if default_slide is not None: + default_slide = default_slide['ImageFilename'] + if default_slide is not None: + fragments.append({ + 'path': default_slide, + 'duration': slide['Time'] / 1000, + }) + + next_time = try_get(None, [ + lambda _: Stream['Slides'][i + 1]['Time'], + lambda _: duration, + lambda _: slide['Time'], + ], expected_type=(int, float)) + + fragments.append({ + 'path': fname_template.format(slide.get('Number', i + 1)), + 'duration': (next_time - slide['Time']) / 1000 + }) + + return { + 'format_id': '%s-%u.slides' % (stream_id, snum), + 'ext': 'mhtml', + 'url': slide_base_url, + 'protocol': 'mhtml', + 'acodec': 'none', + 'vcodec': 'jpeg', + 'quality': -12, + 'format_note': 'Slides', + 'fragments': fragments, + 'fragment_base_url': slide_base_url, + '_download_params': { + 'duration': duration, + 'title': title, + 'origin': origin, + }, + } + def _real_extract(self, url): url, data = unsmuggle_url(url, {}) mobj = re.match(self._VALID_URL, url) @@ -198,10 +250,17 @@ def _real_extract(self, url): 'ext': mimetype2ext(VideoUrl.get('MimeType')), }) - # TODO: if Stream['HasSlideContent']: - # synthesise an MJPEG video stream '%s-%u.slides' % (stream_type, snum) - # from Stream['Slides'] - # this will require writing a custom downloader... 
+ if Stream.get('HasSlideContent', False): + images = player_options['PlayerLayoutOptions']['Images'] + stream_formats.append(self.__extract_slides( + stream_id=stream_id, + snum=snum, + Stream=Stream, + duration=presentation.get('Duration'), + images=images, + title=title, + origin=url + )) # disprefer 'secondary' streams if stream_type != 0: From 89124e63a906b06ef08a19980832ec2493d4d545 Mon Sep 17 00:00:00 2001 From: felix Date: Sun, 23 May 2021 20:54:00 +0200 Subject: [PATCH 03/11] [common] Extract thumbnail stream from DASH manifests --- yt_dlp/extractor/common.py | 28 +++++++++++++++++++++++----- 1 file changed, 23 insertions(+), 5 deletions(-) diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py index 64ab8f7062f..74c63fb23a3 100644 --- a/yt_dlp/extractor/common.py +++ b/yt_dlp/extractor/common.py @@ -2122,6 +2122,7 @@ def extract_media(x_media_line): format_id.append(str(format_index)) f = { 'format_id': '-'.join(format_id), + 'format_note': name, 'format_index': format_index, 'url': manifest_url, 'manifest_url': m3u8_url, @@ -2633,7 +2634,7 @@ def extract_Initialization(source): mime_type = representation_attrib['mimeType'] content_type = representation_attrib.get('contentType', mime_type.split('/')[0]) - if content_type in ('video', 'audio', 'text'): + if content_type in ('video', 'audio', 'text') or mime_type == 'image/jpeg': base_url = '' for element in (representation, adaptation_set, period, mpd_doc): base_url_e = element.find(_add_ns('BaseURL')) @@ -2650,9 +2651,15 @@ def extract_Initialization(source): url_el = representation.find(_add_ns('BaseURL')) filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None) bandwidth = int_or_none(representation_attrib.get('bandwidth')) + if representation_id is not None: + format_id = representation_id + else: + format_id = content_type + if mpd_id: + format_id = mpd_id + '-' + format_id if content_type in ('video', 'audio'): f = { - 
'format_id': '%s-%s' % (mpd_id, representation_id) if mpd_id else representation_id, + 'format_id': format_id, 'manifest_url': mpd_url, 'ext': mimetype2ext(mime_type), 'width': int_or_none(representation_attrib.get('width')), @@ -2672,6 +2679,16 @@ def extract_Initialization(source): 'manifest_url': mpd_url, 'filesize': filesize, } + elif mime_type == 'image/jpeg': + f = { + 'format_id': format_id, + 'ext': 'mhtml', + 'manifest_url': mpd_url, + 'format_note': 'DASH thumbnail', + 'preference': -10, + 'acodec': 'none', + 'vcodec': 'jpeg', + } representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info) def prepare_template(template_name, identifiers): @@ -2690,7 +2707,8 @@ def prepare_template(template_name, identifiers): t += c # Next, $...$ templates are translated to their # %(...) counterparts to be used with % operator - t = t.replace('$RepresentationID$', representation_id) + if representation_id is not None: + t = t.replace('$RepresentationID$', representation_id) t = re.sub(r'\$(%s)\$' % '|'.join(identifiers), r'%(\1)d', t) t = re.sub(r'\$(%s)%%([^$]+)\$' % '|'.join(identifiers), r'%(\1)\2', t) t.replace('$$', '$') @@ -2807,7 +2825,7 @@ def add_segment_url(): 'url': mpd_url or base_url, 'fragment_base_url': base_url, 'fragments': [], - 'protocol': 'http_dash_segments', + 'protocol': 'http_dash_segments' if mime_type != 'image/jpeg' else 'mhtml', }) if 'initialization_url' in representation_ms_info: initialization_url = representation_ms_info['initialization_url'] @@ -2818,7 +2836,7 @@ def add_segment_url(): else: # Assuming direct URL to unfragmented media. 
f['url'] = base_url - if content_type in ('video', 'audio'): + if content_type in ('video', 'audio') or mime_type == 'image/jpeg': formats.append(f) elif content_type == 'text': subtitles.setdefault(lang or 'und', []).append(f) From 407878b327ec53c386d406ad696166d122f34a28 Mon Sep 17 00:00:00 2001 From: pukkandan Date: Mon, 7 Jun 2021 13:07:02 +0530 Subject: [PATCH 04/11] Remove "Unknown MIME" warnings from tests --- yt_dlp/extractor/canvas.py | 6 +++--- yt_dlp/extractor/common.py | 2 ++ yt_dlp/extractor/viki.py | 5 +---- 3 files changed, 6 insertions(+), 7 deletions(-) diff --git a/yt_dlp/extractor/canvas.py b/yt_dlp/extractor/canvas.py index 1b7c1d2ff7f..575f3d25cbb 100644 --- a/yt_dlp/extractor/canvas.py +++ b/yt_dlp/extractor/canvas.py @@ -24,7 +24,7 @@ class CanvasIE(InfoExtractor): _VALID_URL = r'https?://mediazone\.vrt\.be/api/v1/(?Pcanvas|een|ketnet|vrt(?:video|nieuws)|sporza|dako)/assets/(?P[^/?#&]+)' _TESTS = [{ 'url': 'https://mediazone.vrt.be/api/v1/ketnet/assets/md-ast-4ac54990-ce66-4d00-a8ca-9eac86f4c475', - 'md5': '68993eda72ef62386a15ea2cf3c93107', + 'md5': '37b2b7bb9b3dcaa05b67058dc3a714a9', 'info_dict': { 'id': 'md-ast-4ac54990-ce66-4d00-a8ca-9eac86f4c475', 'display_id': 'md-ast-4ac54990-ce66-4d00-a8ca-9eac86f4c475', @@ -32,9 +32,9 @@ class CanvasIE(InfoExtractor): 'title': 'Nachtwacht: De Greystook', 'description': 'Nachtwacht: De Greystook', 'thumbnail': r're:^https?://.*\.jpg$', - 'duration': 1468.04, + 'duration': 1468.02, }, - 'expected_warnings': ['is not a supported codec', 'Unknown MIME type'], + 'expected_warnings': ['is not a supported codec'], }, { 'url': 'https://mediazone.vrt.be/api/v1/canvas/assets/mz-ast-5e5f90b6-2d72-4c40-82c2-e134f884e93e', 'only_matching': True, diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py index 74c63fb23a3..f2ddb186aa5 100644 --- a/yt_dlp/extractor/common.py +++ b/yt_dlp/extractor/common.py @@ -2680,6 +2680,8 @@ def extract_Initialization(source): 'filesize': filesize, } elif mime_type == 
'image/jpeg': + # See test case in VikiIE + # https://www.viki.com/videos/1175236v-choosing-spouse-by-lottery-episode-1 f = { 'format_id': format_id, 'ext': 'mhtml', diff --git a/yt_dlp/extractor/viki.py b/yt_dlp/extractor/viki.py index 98d16f4d134..19bcf1d7be7 100644 --- a/yt_dlp/extractor/viki.py +++ b/yt_dlp/extractor/viki.py @@ -142,6 +142,7 @@ class VikiIE(VikiBaseIE): IE_NAME = 'viki' _VALID_URL = r'%s(?:videos|player)/(?P[0-9]+v)' % VikiBaseIE._VALID_URL_BASE _TESTS = [{ + 'note': 'Free non-DRM video with storyboards in MPD', 'url': 'https://www.viki.com/videos/1175236v-choosing-spouse-by-lottery-episode-1', 'info_dict': { 'id': '1175236v', @@ -155,7 +156,6 @@ class VikiIE(VikiBaseIE): 'params': { 'format': 'bestvideo', }, - 'expected_warnings': ['Unknown MIME type image/jpeg in DASH manifest'], }, { 'url': 'http://www.viki.com/videos/1023585v-heirs-episode-14', 'info_dict': { @@ -173,7 +173,6 @@ class VikiIE(VikiBaseIE): 'format': 'bestvideo', }, 'skip': 'Blocked in the US', - 'expected_warnings': ['Unknown MIME type image/jpeg in DASH manifest'], }, { # clip 'url': 'http://www.viki.com/videos/1067139v-the-avengers-age-of-ultron-press-conference', @@ -225,7 +224,6 @@ class VikiIE(VikiBaseIE): 'params': { 'format': 'bestvideo', }, - 'expected_warnings': ['Unknown MIME type image/jpeg in DASH manifest'], }, { # youtube external 'url': 'http://www.viki.com/videos/50562v-poor-nastya-complete-episode-1', @@ -264,7 +262,6 @@ class VikiIE(VikiBaseIE): 'params': { 'format': 'bestvideo', }, - 'expected_warnings': ['Unknown MIME type image/jpeg in DASH manifest'], }] def _real_extract(self, url): From 5f323f1f512f497fb4ca4fcd2d063f76a57dd933 Mon Sep 17 00:00:00 2001 From: pukkandan Date: Mon, 7 Jun 2021 14:12:57 +0530 Subject: [PATCH 05/11] Remove `_download_params` --- yt_dlp/downloader/mhtml.py | 4 ++-- yt_dlp/extractor/mediasite.py | 9 +-------- 2 files changed, 3 insertions(+), 10 deletions(-) diff --git a/yt_dlp/downloader/mhtml.py b/yt_dlp/downloader/mhtml.py 
index 365e7ddb77a..e141ac52f43 100644 --- a/yt_dlp/downloader/mhtml.py +++ b/yt_dlp/downloader/mhtml.py @@ -123,8 +123,8 @@ def real_download(self, filename, info_dict): fragment_base_url = info_dict.get('fragment_base_url') fragments = info_dict['fragments'][:1] if self.params.get( 'test', False) else info_dict['fragments'] - title = info_dict['_download_params'].get('title', fragment_base_url) - origin = info_dict['_download_params'].get('origin') + title = info_dict['title'] + origin = info_dict['webpage_url'] ctx = { 'filename': filename, diff --git a/yt_dlp/extractor/mediasite.py b/yt_dlp/extractor/mediasite.py index f1956c65e0d..6ec2e414f3e 100644 --- a/yt_dlp/extractor/mediasite.py +++ b/yt_dlp/extractor/mediasite.py @@ -122,7 +122,7 @@ def _extract_urls(webpage): r'(?xi)]+\bsrc=(["\'])(?P(?:(?:https?:)?//[^/]+)?/Mediasite/Play/%s(?:\?.*?)?)\1' % _ID_RE, webpage)] - def __extract_slides(self, *, stream_id, snum, Stream, duration, images, title, origin): + def __extract_slides(self, *, stream_id, snum, Stream, duration, images): slide_base_url = Stream['SlideBaseUrl'] fname_template = Stream['SlideImageFileNameTemplate'] @@ -167,11 +167,6 @@ def __extract_slides(self, *, stream_id, snum, Stream, duration, images, title, 'format_note': 'Slides', 'fragments': fragments, 'fragment_base_url': slide_base_url, - '_download_params': { - 'duration': duration, - 'title': title, - 'origin': origin, - }, } def _real_extract(self, url): @@ -258,8 +253,6 @@ def _real_extract(self, url): Stream=Stream, duration=presentation.get('Duration'), images=images, - title=title, - origin=url )) # disprefer 'secondary' streams From bda743f52ebd11874e131a5ffb207176eab16fc9 Mon Sep 17 00:00:00 2001 From: pukkandan Date: Mon, 7 Jun 2021 13:16:25 +0530 Subject: [PATCH 06/11] Add `msec` to duration --- yt_dlp/downloader/mhtml.py | 2 +- yt_dlp/utils.py | 9 +++++---- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/yt_dlp/downloader/mhtml.py b/yt_dlp/downloader/mhtml.py index 
def formatSeconds(secs, delim=':', msec=False):
    """Format a duration in seconds as '[H<delim>]M<delim>SS' text.

    @param secs   duration in seconds (int or float)
    @param delim  separator between the time components (default ':')
    @param msec   when True, append the fractional part as '.mmm'
    """
    if secs > 3600:
        ret = '%d%s%02d%s%02d' % (secs // 3600, delim, (secs % 3600) // 60, delim, secs % 60)
    elif secs > 60:
        ret = '%d%s%02d' % (secs // 60, delim, secs % 60)
    else:
        ret = '%d' % secs
    # FIX: '%03d' truncates its argument, so the fractional part must be
    # scaled to whole milliseconds first — 'secs % 1' alone is always < 1
    # and would render as '.000' for every input.
    return '%s.%03d' % (ret, secs % 1 * 1000) if msec else ret
def escapeHTML(text):
    """Escape *text* for safe embedding in HTML text or attribute values.

    FIX: the pasted patch was corrupted by entity decoding, leaving every
    replacement a no-op (e.g. ``.replace('&', '&')``); restore the intended
    entities.  '&' is replaced first so the entities introduced by the later
    replacements are not double-escaped.
    """
    return (
        text
        .replace('&', '&amp;')
        .replace('<', '&lt;')
        .replace('>', '&gt;')
        .replace('"', '&quot;')
        .replace("'", '&#39;')
    )
frag_boundary.encode('us-ascii')) frag_header.write( - b'Content-ID: <%b>\r\n' - % (self._gen_cid(i, fragment, frag_boundary).encode('us-ascii'),)) + b'Content-ID: <%b>\r\n' % self._gen_cid(i, fragment, frag_boundary).encode('us-ascii')) frag_header.write( - b'Content-type: %b\r\n' - % (mime_type,)) + b'Content-type: %b\r\n' % mime_type) frag_header.write( - b'Content-length: %u\r\n' - % (len(frag_content),)) + b'Content-length: %u\r\n' % len(frag_content)) frag_header.write( - b'Content-location: %b\r\n' - % (fragment_url.encode('us-ascii'),)) - try: - frag_header.write( - b'X.yt-dlp.Duration: %f s\r\n' - % (fragment['duration'],)) - except KeyError: - pass + b'Content-location: %b\r\n' % fragment_url.encode('us-ascii')) + frag_header.write( + b'X.yt-dlp.Duration: %f\r\n' % fragment['duration']) frag_header.write(b'\r\n') self._append_fragment( ctx, frag_header.getvalue() + frag_content + b'\r\n') ctx['dest_stream'].write( - b'--%b--\r\n\r\n' - % (frag_boundary.encode('us-ascii'),)) + b'--%b--\r\n\r\n' % frag_boundary.encode('us-ascii')) self._finish_frag_download(ctx) diff --git a/yt_dlp/utils.py b/yt_dlp/utils.py index 0e6c8151443..8bc4c27e5de 100644 --- a/yt_dlp/utils.py +++ b/yt_dlp/utils.py @@ -2241,6 +2241,7 @@ def unescapeHTML(s): return re.sub( r'&([^&;]+;)', lambda m: _htmlentity_transform(m.group(1)), s) + def escapeHTML(text): return ( text @@ -2251,6 +2252,7 @@ def escapeHTML(text): .replace("'", ''') ) + def process_communicate_or_kill(p, *args, **kwargs): try: return p.communicate(*args, **kwargs) From 58ca4c9d256afee128b1b4b6ab716eaea2cbe71e Mon Sep 17 00:00:00 2001 From: pukkandan Date: Sun, 13 Jun 2021 02:33:00 +0530 Subject: [PATCH 09/11] [mhtml] `real_download` should return `True` on success --- yt_dlp/downloader/mhtml.py | 1 + 1 file changed, 1 insertion(+) diff --git a/yt_dlp/downloader/mhtml.py b/yt_dlp/downloader/mhtml.py index 9ebf3654986..81d95c7cbef 100644 --- a/yt_dlp/downloader/mhtml.py +++ b/yt_dlp/downloader/mhtml.py @@ -199,3 +199,4 
@@ def real_download(self, filename, info_dict): ctx['dest_stream'].write( b'--%b--\r\n\r\n' % frag_boundary.encode('us-ascii')) self._finish_frag_download(ctx) + return True From a662714561db1800ceea0d2bfcc165238234fcb3 Mon Sep 17 00:00:00 2001 From: pukkandan Date: Sun, 13 Jun 2021 02:34:14 +0530 Subject: [PATCH 10/11] Remove `vcodec` --- yt_dlp/extractor/common.py | 5 ++--- yt_dlp/extractor/mediasite.py | 5 ++--- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py index f2ddb186aa5..8f916e1b0ba 100644 --- a/yt_dlp/extractor/common.py +++ b/yt_dlp/extractor/common.py @@ -2686,10 +2686,9 @@ def extract_Initialization(source): 'format_id': format_id, 'ext': 'mhtml', 'manifest_url': mpd_url, - 'format_note': 'DASH thumbnail', - 'preference': -10, + 'format_note': 'DASH storyboards (jpeg)', 'acodec': 'none', - 'vcodec': 'jpeg', + 'vcodec': 'none', } representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info) diff --git a/yt_dlp/extractor/mediasite.py b/yt_dlp/extractor/mediasite.py index 6ec2e414f3e..a18cfc689f0 100644 --- a/yt_dlp/extractor/mediasite.py +++ b/yt_dlp/extractor/mediasite.py @@ -162,9 +162,8 @@ def __extract_slides(self, *, stream_id, snum, Stream, duration, images): 'url': slide_base_url, 'protocol': 'mhtml', 'acodec': 'none', - 'vcodec': 'jpeg', - 'quality': -12, - 'format_note': 'Slides', + 'vcodec': 'none', + 'format_note': 'Slides (jpeg)', 'fragments': fragments, 'fragment_base_url': slide_base_url, } From b4fa676a591bb4cf65323c3e8161a12ab48c1191 Mon Sep 17 00:00:00 2001 From: pukkandan Date: Sun, 13 Jun 2021 17:37:37 +0530 Subject: [PATCH 11/11] The images are not necessarily jpeg --- yt_dlp/extractor/mediasite.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/yt_dlp/extractor/mediasite.py b/yt_dlp/extractor/mediasite.py index a18cfc689f0..c62233ab7c7 100644 --- a/yt_dlp/extractor/mediasite.py +++ b/yt_dlp/extractor/mediasite.py @@ 
-163,7 +163,7 @@ def __extract_slides(self, *, stream_id, snum, Stream, duration, images): 'protocol': 'mhtml', 'acodec': 'none', 'vcodec': 'none', - 'format_note': 'Slides (jpeg)', + 'format_note': 'Slides', 'fragments': fragments, 'fragment_base_url': slide_base_url, }