From 70fad8de8fc18cdd186ee431bbd433bbd4d440cc Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C3=81lvaro=20Velad=20Galv=C3=A1n?=
Date: Mon, 28 Nov 2022 18:56:22 +0100
Subject: [PATCH] feat(cea): Add CEA parser for TS (#4697)

Closes https://github.com/shaka-project/shaka-player/issues/3674

Co-authored-by: Joey Parrish
---
 README.md                                     |   6 +-
 build/types/cea                               |   1 +
 externs/mux.js                                | 106 +---------
 externs/shaka/player.js                       |   6 +-
 lib/cea/cea608_data_channel.js                |  25 ++-
 lib/cea/cea_decoder.js                        |  13 ++
 lib/cea/ts_cea_parser.js                      |  64 ++++++
 lib/media/closed_caption_parser.js            |   9 +-
 lib/media/media_source_engine.js              |  44 +---
 lib/media/transmuxer.js                       |  23 +--
 lib/text/text_engine.js                       |  17 --
 lib/util/ts_parser.js                         | 188 +++++++++++++++---
 test/media/media_source_engine_integration.js |  83 ++++++--
 test/media/media_source_engine_unit.js        |  37 +---
 test/media/transmuxer_integration.js          |  21 +-
 15 files changed, 367 insertions(+), 276 deletions(-)
 create mode 100644 lib/cea/ts_cea_parser.js

diff --git a/README.md b/README.md
index aa5e989cf4..65d9e925b7 100644
--- a/README.md
+++ b/README.md
@@ -225,11 +225,9 @@ Shaka Player supports:
   - TTML
     - Supported in both XML form and embedded in MP4
   - CEA-608
-    - Supported embedded in MP4
-    - With help from [mux.js][] v6.2.0+, supported embedded in TS
+    - Supported embedded in MP4 and TS
   - CEA-708
-    - Supported embedded in MP4
-    - With help from [mux.js][] v6.2.0+, supported embedded in TS
+    - Supported embedded in MP4 and TS
   - SubRip (SRT)
     - UTF-8 encoding only
   - LyRiCs (LRC)

diff --git a/build/types/cea b/build/types/cea
index 31f1660566..0f5c3cc88d 100644
--- a/build/types/cea
+++ b/build/types/cea
@@ -12,3 +12,4 @@
 +../../lib/cea/i_cea_parser.js
 +../../lib/cea/mp4_cea_parser.js
 +../../lib/cea/sei_processor.js
++../../lib/cea/ts_cea_parser.js

diff --git a/externs/mux.js b/externs/mux.js
index efdb20207a..aecf0a5961 100644
--- a/externs/mux.js
+++ b/externs/mux.js
@@ -23,29 +23,6 @@ var muxjs = {};
 muxjs.mp4 = {};
 
-/** @const */
-muxjs.mp4.probe = class {
-  /**
-   * Parses an MP4 initialization segment and extracts the timescale
-   * values for any declared tracks.
-   *
-   * @param {Uint8Array} init The bytes of the init segment
-   * @return {!Object.<number, number>} a hash of track ids to timescale
-   *     values or null if the init segment is malformed.
-   */
-  static timescale(init) {}
-
-  /**
-   * Find the trackIds of the video tracks in this source.
-   * Found by parsing the Handler Reference and Track Header Boxes:
-   *
-   * @param {Uint8Array} init The bytes of the init segment for this source
-   * @return {!Array.<number>} A list of trackIds
-   **/
-  static videoTrackIds(init) {}
-};
-
-
 muxjs.mp4.Transmuxer = class {
   /** @param {Object=} options */
   constructor(options) {}
@@ -74,100 +51,19 @@ muxjs.mp4.Transmuxer = class {
 
   /** Remove all handlers and clean up. */
   dispose() {}
-
-  /** Reset captions. */
-  resetCaptions() {}
 };
 
 
 /**
  * @typedef {{
  *   initSegment: !Uint8Array,
- *   data: !Uint8Array,
- *   captions: !Array
+ *   data: !Uint8Array
  * }}
  *
  * @description Transmuxed data from mux.js.
  * @property {!Uint8Array} initSegment
  * @property {!Uint8Array} data
- * @property {!Array} captions
  * @exportDoc
 */
 muxjs.mp4.Transmuxer.Segment;
-
-
-muxjs.mp4.CaptionParser = class {
-  /**
-   * Parser for CEA closed captions embedded in video streams for Dash.
-   * @constructor
-   * @struct
-   */
-  constructor() {}
-
-  /** Initializes the closed caption parser. */
-  init() {}
-
-  /**
-   * Return true if a new video track is selected or if the timescale is
-   * changed.
-   * @param {!Array.<number>} videoTrackIds A list of video tracks found in the
-   *     init segment.
-   * @param {!Object.<number, number>} timescales The map of track Ids and the
-   *     tracks' timescales in the init segment.
-   * @return {boolean}
-   */
-  isNewInit(videoTrackIds, timescales) {}
-
-  /**
-   * Parses embedded CEA closed captions and interacts with the underlying
-   * CaptionStream, and return the parsed captions.
-   * @param {!Uint8Array} segment The fmp4 segment containing embedded captions
-   * @param {!Array.<number>} videoTrackIds A list of video tracks found in the
-   *     init segment.
-   * @param {!Object.<number, number>} timescales The timescales found in the
-   *     init segment.
-   * @return {muxjs.mp4.ParsedClosedCaptions}
-   */
-  parse(segment, videoTrackIds, timescales) {}
-
-  /** Clear the parsed closed captions data for new data. */
-  clearParsedCaptions() {}
-
-  /** Reset the captions stream. */
-  resetCaptionStream() {}
-};
-
-
-/**
- * @typedef {{
- *   captionStreams: Object.<string, boolean>,
- *   captions: !Array.<muxjs.mp4.ClosedCaption>
- * }}
- *
- * @description closed captions data parsed from mux.js caption parser.
- * @property {Object.<string, boolean>} captionStreams
- * @property {Array.<muxjs.mp4.ClosedCaption>} captions
- */
-muxjs.mp4.ParsedClosedCaptions;
-
-
-/**
- * @typedef {{
- *   startPts: number,
- *   endPts: number,
- *   startTime: number,
- *   endTime: number,
- *   stream: string,
- *   text: string
- * }}
- *
- * @description closed caption parsed from mux.js caption parser.
- * @property {number} startPts
- * @property {number} endPts
- * @property {number} startTime
- * @property {number} endTime
- * @property {string} stream The channel id of the closed caption.
- * @property {string} text The content of the closed caption.
- */
-muxjs.mp4.ClosedCaption;
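
Note: with these externs gone, caption data no longer round-trips through the
mux.js CaptionParser; Shaka's own CEA stack consumes the raw segment bytes
directly. A minimal usage sketch of that stack, assuming `segmentBytes` is a
hypothetical Uint8Array holding one TS segment (the class and method names
are the ones that appear in this patch):

```js
// Pick the CEA parser by MIME type ('video/mp2t' selects the new TsCeaParser)
// and decode caption packets from the raw segment bytes.
const captionParser = new shaka.media.ClosedCaptionParser('video/mp2t');
const closedCaptions = captionParser.parseFrom(segmentBytes);
for (const caption of closedCaptions) {
  // Each entry pairs a CEA channel id ('CC1', 'CC3', ...) with a text cue.
  console.log(caption.stream, caption.cue.startTime, caption.cue.endTime);
}
```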
diff --git a/externs/shaka/player.js b/externs/shaka/player.js
index e7dc898dd4..0b8ba42101 100644
--- a/externs/shaka/player.js
+++ b/externs/shaka/player.js
@@ -1012,10 +1012,8 @@ shaka.extern.ManifestConfiguration;
  *   the default value unless you have a good reason not to.
  * @property {boolean} forceTransmux
  *   If this is true, we will transmux AAC and TS content even if
- *   not strictly necessary for the assets to be played. Shaka Player
- *   currently only supports CEA 708 captions by transmuxing, so this value is
- *   necessary for enabling them on platforms with native TS support like Edge
- *   or Chromecast. This value defaults to false.
+ *   not strictly necessary for the assets to be played.
+ *   This value defaults to false.
  * @property {number} safeSeekOffset
  *   The amount of seconds that should be added when repositioning the playhead
  *   after falling out of the availability window or seek. This gives the player
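
Note: the doc change above reflects that embedded captions no longer depend on
transmuxing. A sketch of what this means for applications, assuming the option
lives at `streaming.forceTransmux` as it does in this era of the player (the
flag itself is unchanged by this patch; the URL is hypothetical):

```js
// Captions embedded in TS now work even when the platform plays TS natively,
// so forcing transmux purely for CEA-608/708 is no longer needed.
player.configure('streaming.forceTransmux', false);  // the default
await player.load('https://example.com/stream.m3u8');
player.setTextTrackVisibility(true);  // embedded captions still appear
```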
diff --git a/lib/cea/cea608_data_channel.js b/lib/cea/cea608_data_channel.js
index a140e6c006..36263a8b18 100644
--- a/lib/cea/cea608_data_channel.js
+++ b/lib/cea/cea608_data_channel.js
@@ -54,7 +54,7 @@ shaka.cea.Cea608DataChannel = class {
      * Points to current buffer.
      * @private {!shaka.cea.Cea608Memory}
      */
-    this.curbuf_ = this.displayedMemory_;
+    this.curbuf_ = this.nonDisplayedMemory_;
 
     /**
      * End time of the previous caption, serves as start time of next caption.
@@ -73,14 +73,25 @@
    * Resets channel state.
    */
   reset() {
-    this.type_ = shaka.cea.Cea608DataChannel.CaptionType.PAINTON;
-    this.curbuf_ = this.displayedMemory_;
+    this.type_ = shaka.cea.Cea608DataChannel.CaptionType.NONE;
+    this.curbuf_ = this.nonDisplayedMemory_;
     this.lastcp_ = null;
     this.displayedMemory_.reset();
     this.nonDisplayedMemory_.reset();
     this.text_.reset();
   }
 
+  /**
+   * Set the initial PTS, which may not be 0 if we start decoding at a later
+   * point in the stream.  Without this, the first cue's startTime can be way
+   * off.
+   *
+   * @param {number} pts
+   */
+  firstPts(pts) {
+    this.prevEndTime_ = pts;
+  }
+
   /**
    * Gets the row index from a Preamble Address Code byte pair.
    * @param {number} b1 Byte 1.
@@ -155,12 +166,12 @@
     }
     buf.setRow(row);
 
-    this.curbuf_.setUnderline(underline);
-    this.curbuf_.setItalics(italics);
-    this.curbuf_.setTextColor(textColor);
+    buf.setUnderline(underline);
+    buf.setItalics(italics);
+    buf.setTextColor(textColor);
 
     // Clear the background color, since new row (PAC) should reset ALL styles.
-    this.curbuf_.setBackgroundColor(shaka.cea.CeaUtils.DEFAULT_BG_COLOR);
+    buf.setBackgroundColor(shaka.cea.CeaUtils.DEFAULT_BG_COLOR);
   }

diff --git a/lib/cea/cea_decoder.js b/lib/cea/cea_decoder.js
index dbf3552703..1075a13c1b 100644
--- a/lib/cea/cea_decoder.js
+++ b/lib/cea/cea_decoder.js
@@ -77,6 +77,11 @@ shaka.cea.CeaDecoder = class {
      */
     this.serviceNumberToService_ = new Map();
 
+    /**
+     * @private {boolean}
+     */
+    this.waitingForFirstPacket_ = true;
+
     this.reset();
   }
 
@@ -106,6 +111,7 @@
     for (const stream of this.cea608ModeToStream_.values()) {
       stream.reset();
     }
+    this.waitingForFirstPacket_ = true;
   }
 
   /**
@@ -114,6 +120,13 @@
    * @override
    */
   extract(userDataSeiMessage, pts) {
+    if (this.waitingForFirstPacket_) {
+      for (const stream of this.cea608ModeToStream_.values()) {
+        stream.firstPts(pts);
+      }
+      this.waitingForFirstPacket_ = false;
+    }
+
     const reader = new shaka.util.DataViewReader(
         userDataSeiMessage, shaka.util.DataViewReader.Endianness.BIG_ENDIAN);
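
Note: the decoder change above seeds every 608 stream with the PTS of the
first packet it sees. A worked sketch of the effect (values hypothetical;
`seiPayload` stands in for a real user-data SEI message):

```js
const decoder = new shaka.cea.CeaDecoder();
// A live TS capture might begin at PTS 1,230,000 in the 90 kHz clock:
const firstPts = 1230000 / 90000;  // 13.666... seconds
decoder.extract(seiPayload, firstPts);  // first packet seeds prevEndTime_
// Without firstPts(), the first decoded cue would start at 0 and span the
// whole 13.6 s gap; with it, the cue starts at the stream's real start time.
```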
diff --git a/lib/cea/ts_cea_parser.js b/lib/cea/ts_cea_parser.js
new file mode 100644
index 0000000000..eeab6d3ab8
--- /dev/null
+++ b/lib/cea/ts_cea_parser.js
@@ -0,0 +1,64 @@
+/*! @license
+ * Shaka Player
+ * Copyright 2016 Google LLC
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+goog.provide('shaka.cea.TsCeaParser');
+
+goog.require('shaka.cea.ICeaParser');
+goog.require('shaka.cea.SeiProcessor');
+goog.require('shaka.util.BufferUtils');
+goog.require('shaka.util.TsParser');
+
+/**
+ * MPEG TS CEA parser.
+ * @implements {shaka.cea.ICeaParser}
+ */
+shaka.cea.TsCeaParser = class {
+  /** */
+  constructor() {
+    /**
+     * SEI data processor.
+     * @private
+     * @const {!shaka.cea.SeiProcessor}
+     */
+    this.seiProcessor_ = new shaka.cea.SeiProcessor();
+  }
+
+  /**
+   * @override
+   */
+  init(initSegment) {
+    // TS streams do not have init segments, so this is a no-op.
+  }
+
+  /**
+   * @override
+   */
+  parse(mediaSegment) {
+    const ICeaParser = shaka.cea.ICeaParser;
+
+    /** @type {!Array.<!shaka.cea.ICeaParser.CaptionPacket>} **/
+    const captionPackets = [];
+
+    const uint8ArrayData = shaka.util.BufferUtils.toUint8(mediaSegment);
+    if (!shaka.util.TsParser.probe(uint8ArrayData)) {
+      return captionPackets;
+    }
+    const tsParser = new shaka.util.TsParser().parse(uint8ArrayData);
+    const videoNalus = tsParser.getVideoNalus();
+    for (const nalu of videoNalus) {
+      if (nalu.type == ICeaParser.H264_NALU_TYPE_SEI &&
+          nalu.time != null) {
+        for (const packet of this.seiProcessor_.process(nalu.data)) {
+          captionPackets.push({
+            packet: packet,
+            pts: nalu.time,
+          });
+        }
+      }
+    }
+    return captionPackets;
+  }
+};

diff --git a/lib/media/closed_caption_parser.js b/lib/media/closed_caption_parser.js
index 79e82d5d9e..a657bb2d92 100644
--- a/lib/media/closed_caption_parser.js
+++ b/lib/media/closed_caption_parser.js
@@ -10,6 +10,7 @@ goog.provide('shaka.media.IClosedCaptionParser');
 goog.require('shaka.cea.CeaDecoder');
 goog.require('shaka.cea.DummyCeaParser');
 goog.require('shaka.cea.Mp4CeaParser');
+goog.require('shaka.cea.TsCeaParser');
 goog.require('shaka.util.BufferUtils');
 goog.requireType('shaka.cea.ICaptionDecoder');
 goog.requireType('shaka.cea.ICeaParser');
@@ -59,10 +60,14 @@
     /** @private {!shaka.cea.ICeaParser} */
     this.ceaParser_ = new shaka.cea.DummyCeaParser();
 
-    if (mimeType.includes('video/mp4')) {
-      // MP4 Parser to extract closed caption packets from H.264 video.
+    if (mimeType.toLowerCase().includes('video/mp4')) {
+      // MP4 Parser to extract closed caption packets from H.264/H.265 video.
       this.ceaParser_ = new shaka.cea.Mp4CeaParser();
     }
+    if (mimeType.toLowerCase().includes('video/mp2t')) {
+      // TS Parser to extract closed caption packets from H.264 video.
+      this.ceaParser_ = new shaka.cea.TsCeaParser();
+    }
 
     /**
      * Decoder for decoding CEA-X08 data from closed caption packets.
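
Note: a short sketch of how the new parser is driven; this mirrors the decode
loop that ClosedCaptionParser runs internally for 'video/mp2t'. `tsSegment` is
a hypothetical Uint8Array and `decoder` a shaka.cea.CeaDecoder:

```js
const ceaParser = new shaka.cea.TsCeaParser();
ceaParser.init(new Uint8Array(0));  // no-op: TS has no init segment
// parse() probes the bytes, demuxes video PES packets, and returns the
// caption packets found in SEI NALUs, each tagged with its PTS.
for (const {packet, pts} of ceaParser.parse(tsSegment)) {
  decoder.extract(packet, pts);
}
const cues = decoder.decode();  // closed captions ready for the text engine
```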
diff --git a/lib/media/media_source_engine.js b/lib/media/media_source_engine.js
index bfa902f8b5..285632964a 100644
--- a/lib/media/media_source_engine.js
+++ b/lib/media/media_source_engine.js
@@ -580,6 +580,8 @@ shaka.media.MediaSourceEngine = class {
       return;
     }
 
+    let timestampOffset = this.sourceBuffers_[contentType].timestampOffset;
+
     const uint8ArrayData = shaka.util.BufferUtils.toUint8(data);
     let mimeType = this.sourceBufferTypes_[contentType];
     if (this.transmuxers_[contentType]) {
@@ -590,7 +592,7 @@
       // The SourceBuffer timestampOffset may or may not be set yet, so this is
       // the timestamp offset that would eventually compute for this segment
       // either way.
-      const timestampOffset =
+      timestampOffset =
           reference.startTime - (tsParser.getStartTime()[contentType] || 0);
       const metadata = tsParser.getMetadata();
       if (metadata.length) {
@@ -613,37 +615,7 @@
         }
       }
     }
-
-    if (this.transmuxers_[contentType]) {
-      // When seeked we should reset the transmuxer captionstreams
-      // so it does not ignores the captions from previous segments
-      if (seeked) {
-        this.transmuxers_[contentType].resetCaptions();
-      }
-
-      const transmuxedData =
-          await this.transmuxers_[contentType].transmux(data);
-      // For HLS CEA-608/708 CLOSED-CAPTIONS, text data is embedded in
-      // the video stream, so textEngine may not have been initialized.
-      if (!this.textEngine_) {
-        this.reinitText('text/vtt', this.sequenceMode_);
-      }
-      // This doesn't work for native TS support (ex. Edge/Chromecast),
-      // since no transmuxing is needed for native TS.
-      if (transmuxedData.captions && transmuxedData.captions.length) {
-        const videoOffset =
-            this.sourceBuffers_[ContentType.VIDEO].timestampOffset;
-        const closedCaptions = this.textEngine_
-            .convertMuxjsCaptionsToShakaCaptions(transmuxedData.captions);
-        this.textEngine_.storeAndAppendClosedCaptions(
-            closedCaptions,
-            reference ? reference.startTime : null,
-            reference ? reference.endTime : null,
-            videoOffset);
-      }
-
-      data = transmuxedData.data;
-    } else if (hasClosedCaptions && contentType == ContentType.VIDEO) {
+    if (hasClosedCaptions && contentType == ContentType.VIDEO) {
       if (!this.textEngine_) {
         this.reinitText('text/vtt', this.sequenceMode_);
       }
@@ -657,17 +629,19 @@
       } else {
         const closedCaptions = this.captionParser_.parseFrom(data);
         if (closedCaptions.length) {
-          const videoOffset =
-              this.sourceBuffers_[ContentType.VIDEO].timestampOffset;
           this.textEngine_.storeAndAppendClosedCaptions(
               closedCaptions,
               reference.startTime,
               reference.endTime,
-              videoOffset);
+              timestampOffset);
         }
       }
     }
 
+    if (this.transmuxers_[contentType]) {
+      data = await this.transmuxers_[contentType].transmux(data);
+    }
+
     data = this.workAroundBrokenPlatforms_(
         data, reference ? reference.startTime : null, contentType);
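
Note: the net effect of the media_source_engine.js changes is a reordering.
Closed captions are now always extracted from the original segment bytes, and
transmuxing (when enabled) happens afterwards and deals in bytes only. A
simplified sketch of the new appendBuffer() shape, not the full method:

```js
let timestampOffset = sourceBuffer.timestampOffset;      // captured up front
if (hasClosedCaptions && contentType == ContentType.VIDEO) {
  const closedCaptions = captionParser.parseFrom(data);  // raw TS or MP4 bytes
  if (closedCaptions.length) {
    textEngine.storeAndAppendClosedCaptions(
        closedCaptions, reference.startTime, reference.endTime,
        timestampOffset);
  }
}
if (transmuxer) {
  data = await transmuxer.transmux(data);                // bytes in, bytes out
}
```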
@@ -256,7 +244,6 @@ shaka.media.Transmuxer = class { * @private */ onTransmuxed_(segment) { - this.captions_ = segment.captions; this.transmuxedData_.push( shaka.util.Uint8ArrayUtils.concat(segment.initSegment, segment.data)); } @@ -268,12 +255,8 @@ shaka.media.Transmuxer = class { * @private */ onTransmuxDone_() { - const output = { - data: shaka.util.Uint8ArrayUtils.concat(...this.transmuxedData_), - captions: this.captions_, - }; - - this.transmuxPromise_.resolve(output); + const data = shaka.util.Uint8ArrayUtils.concat(...this.transmuxedData_); + this.transmuxPromise_.resolve(data); this.isTransmuxing_ = false; } }; diff --git a/lib/text/text_engine.js b/lib/text/text_engine.js index 0deebeea5e..6d30b0b07f 100644 --- a/lib/text/text_engine.js +++ b/lib/text/text_engine.js @@ -344,23 +344,6 @@ shaka.text.TextEngine = class { } } - /** - * @param {!Array} closedCaptions - * @return {!Array} - */ - convertMuxjsCaptionsToShakaCaptions(closedCaptions) { - const cues = []; - for (const caption of closedCaptions) { - const cue = new shaka.text.Cue( - caption.startTime, caption.endTime, caption.text); - cues.push({ - stream: caption.stream, - cue, - }); - } - return cues; - } - /** * @param {!shaka.text.Cue} cue the cue to apply the timestamp to recursively * @param {number} videoTimestampOffset the timestamp offset of the video diff --git a/lib/util/ts_parser.js b/lib/util/ts_parser.js index a0fa85404c..fc07a6591d 100644 --- a/lib/util/ts_parser.js +++ b/lib/util/ts_parser.js @@ -79,7 +79,7 @@ shaka.util.TsParser = class { // loop through TS packets for (let start = syncOffset; start < length; start += packetLength) { - if (data[start] === 0x47) { + if (data[start] == 0x47) { const payloadUnitStartIndicator = !!(data[start + 1] & 0x40); // pid is a 13-bit field starting at the last 5 bits of TS[1] const pid = ((data[start + 1] & 0x1f) << 8) + data[start + 2]; @@ -91,7 +91,7 @@ shaka.util.TsParser = class { if (adaptationFieldControl > 1) { offset = start + 5 + data[start + 4]; // continue if there is only adaptation field - if (offset === start + packetLength) { + if (offset == start + packetLength) { continue; } } else { @@ -113,7 +113,7 @@ shaka.util.TsParser = class { offset += data[offset] + 1; } - const parsedPIDs = this.parsePMT(data, offset); + const parsedPIDs = this.parsePMT_(data, offset); // only update track id if track PID found while parsing PMT // this is to avoid resetting the PID to -1 in case @@ -146,10 +146,12 @@ shaka.util.TsParser = class { } case this.videoPid_: { const videoData = data.subarray(offset, start + packetLength); - if (this.videoStartTime_ == null) { - const pes = this.parsePES(videoData); - if (pes && pes.pts != null) { - this.videoStartTime_ = pes.pts / timescale; + const pes = this.parsePES_(videoData); + if (pes && pes.pts != null) { + const startTime = Math.min(pes.dts, pes.pts) / timescale; + if (this.videoStartTime_ == null || + this.videoStartTime_ > startTime) { + this.videoStartTime_ = startTime; } } this.videoData_.push(videoData); @@ -157,10 +159,12 @@ shaka.util.TsParser = class { } case this.audioPid_: { const audioData = data.subarray(offset, start + packetLength); - if (this.audioStartTime_ == null) { - const pes = this.parsePES(audioData); - if (pes && pes.pts != null) { - this.audioStartTime_ = pes.pts / timescale; + const pes = this.parsePES_(audioData); + if (pes && pes.pts != null) { + const startTime = Math.min(pes.dts, pes.pts) / timescale; + if (this.audioStartTime_ == null || + this.audioStartTime_ > startTime) { + 
diff --git a/lib/util/ts_parser.js b/lib/util/ts_parser.js
index a0fa85404c..fc07a6591d 100644
--- a/lib/util/ts_parser.js
+++ b/lib/util/ts_parser.js
@@ -79,7 +79,7 @@ shaka.util.TsParser = class {
 
     // loop through TS packets
     for (let start = syncOffset; start < length; start += packetLength) {
-      if (data[start] === 0x47) {
+      if (data[start] == 0x47) {
         const payloadUnitStartIndicator = !!(data[start + 1] & 0x40);
         // pid is a 13-bit field starting at the last 5 bits of TS[1]
         const pid = ((data[start + 1] & 0x1f) << 8) + data[start + 2];
@@ -91,7 +91,7 @@
         if (adaptationFieldControl > 1) {
           offset = start + 5 + data[start + 4];
           // continue if there is only adaptation field
-          if (offset === start + packetLength) {
+          if (offset == start + packetLength) {
             continue;
           }
         } else {
@@ -113,7 +113,7 @@
             offset += data[offset] + 1;
           }
 
-          const parsedPIDs = this.parsePMT(data, offset);
+          const parsedPIDs = this.parsePMT_(data, offset);
 
           // only update track id if track PID found while parsing PMT
           // this is to avoid resetting the PID to -1 in case
@@ -146,10 +146,12 @@
           }
           case this.videoPid_: {
             const videoData = data.subarray(offset, start + packetLength);
-            if (this.videoStartTime_ == null) {
-              const pes = this.parsePES(videoData);
-              if (pes && pes.pts != null) {
-                this.videoStartTime_ = pes.pts / timescale;
+            const pes = this.parsePES_(videoData);
+            if (pes && pes.pts != null) {
+              const startTime = Math.min(pes.dts, pes.pts) / timescale;
+              if (this.videoStartTime_ == null ||
+                  this.videoStartTime_ > startTime) {
+                this.videoStartTime_ = startTime;
               }
             }
             this.videoData_.push(videoData);
@@ -157,10 +159,12 @@
           }
           case this.audioPid_: {
             const audioData = data.subarray(offset, start + packetLength);
-            if (this.audioStartTime_ == null) {
-              const pes = this.parsePES(audioData);
-              if (pes && pes.pts != null) {
-                this.audioStartTime_ = pes.pts / timescale;
+            const pes = this.parsePES_(audioData);
+            if (pes && pes.pts != null) {
+              const startTime = Math.min(pes.dts, pes.pts) / timescale;
+              if (this.audioStartTime_ == null ||
+                  this.audioStartTime_ > startTime) {
+                this.audioStartTime_ = startTime;
               }
             }
             this.audioData_.push(audioData);
@@ -199,8 +203,9 @@
    * @param {Uint8Array} data
    * @param {number} offset
    * @return {!shaka.util.TsParser.PMT}
+   * @private
    */
-  parsePMT(data, offset) {
+  parsePMT_(data, offset) {
     const result = {
       audio: -1,
       video: -1,
@@ -224,14 +229,14 @@
           break;
         // ISO/IEC 13818-7 ADTS AAC (MPEG-2 lower bit-rate audio)
         case 0x0f:
-          if (result.audio === -1) {
+          if (result.audio == -1) {
             result.audio = pid;
             result.audioCodec = 'aac';
           }
           break;
         // Packetized metadata (ID3)
         case 0x15:
-          if (result.id3 === -1) {
+          if (result.id3 == -1) {
             result.id3 = pid;
           }
           break;
@@ -240,7 +245,7 @@
           break;
         // ITU-T Rec. H.264 and ISO/IEC 14496-10 (lower bit-rate video)
         case 0x1b:
-          if (result.video === -1) {
+          if (result.video == -1) {
             result.video = pid;
             result.videoCodec = 'avc';
           }
@@ -249,14 +254,14 @@
         // or ISO/IEC 13818-3 (MPEG-2 halved sample rate audio)
         case 0x03:
         case 0x04:
-          if (result.audio === -1) {
+          if (result.audio == -1) {
             result.audio = pid;
             result.audioCodec = 'mp3';
           }
           break;
         // HEVC
         case 0x24:
-          if (result.video === -1) {
+          if (result.video == -1) {
             result.video = pid;
             result.videoCodec = 'hvc';
           }
@@ -277,8 +282,9 @@
    *
    * @param {Uint8Array} data
    * @return {?shaka.util.TsParser.PES}
+   * @private
    */
-  parsePES(data) {
+  parsePES_(data) {
     const startPrefix = (data[0] << 16) | (data[1] << 8) | data[2];
     // In certain live streams, the start of a TS fragment has ts packets
     // that are frame data that is continuing from the previous fragment. This
@@ -337,6 +343,95 @@
     return pes;
   }
 
+  /**
+   * Parse AVC NALUs.
+   *
+   * The code is based on hls.js
+   * Credit to https://github.com/video-dev/hls.js/blob/master/src/demux/tsdemuxer.ts
+   *
+   * @param {shaka.util.TsParser.PES} pes
+   * @return {!Array.<shaka.util.TsParser.AvcNalu>}
+   * @private
+   */
+  parseAvcNalus_(pes) {
+    const timescale = shaka.util.TsParser.Timescale_;
+    const time = pes.pts ? pes.pts / timescale : null;
+    const data = pes.data;
+    const len = data.byteLength;
+
+    // A NALU does not contain its size.
+    // The Annex B specification solves this by requiring ‘Start Codes’ to
+    // precede each NALU. A start code is 2 or 3 0x00 bytes followed with a
+    // 0x01 byte. e.g. 0x000001 or 0x00000001.
+    // More info in: https://stackoverflow.com/questions/24884827/possible-locations-for-sequence-picture-parameter-sets-for-h-264-stream/24890903#24890903
+    let numZeros = 0;
+
+    /** @type {!Array.<shaka.util.TsParser.AvcNalu>} */
+    const nalus = [];
+
+    // Start position includes the first byte where we read the type.
+    // The data we extract begins at the next byte.
+    let lastNaluStart = -1;
+    // Extracted from the first byte.
+    let lastNaluType = 0;
+
+    for (let i = 0; i < len; ++i) {
+      const value = data[i];
+      if (!value) {
+        numZeros++;
+      } else if (numZeros >= 2 && value == 1) {
+        // We just read a start code. Consume the NALU we passed, if any.
+        if (lastNaluStart >= 0) {
+          // Because the start position includes the type, skip the first byte.
+          const firstByteToKeep = lastNaluStart + 1;
+
+          // Compute the last byte to keep. The start code is at most 3 zeros.
+          // Any earlier zeros are not part of the start code.
+          const startCodeSize = (numZeros > 3 ? 3 : numZeros) + 1;
+          const lastByteToKeep = i - startCodeSize;
+
+          /** @type {shaka.util.TsParser.AvcNalu} */
+          const nalu = {
+            // subarray's end position is exclusive, so add one.
+            data: data.subarray(firstByteToKeep, lastByteToKeep + 1),
+            type: lastNaluType,
+            time: time,
+          };
+          nalus.push(nalu);
+        }
+
+        // We just read a start code, so there should be another byte here, at
+        // least, for the NALU type. Check just in case.
+        if (i >= len - 1) {
+          shaka.log.warning('Malformed TS, incomplete NALU, ignoring.');
+          return nalus;
+        }
+
+        // Advance and read the type of the next NALU.
+        i++;
+        lastNaluStart = i;
+        lastNaluType = data[i] & 0x1f;
+        numZeros = 0;
+      } else {
+        numZeros = 0;
+      }
+    }
+
+    if (lastNaluStart >= 0 && numZeros >= 0) {
+      // The rest of the buffer was a NALU.
+      // Because the start position includes the type, skip the first byte.
+      const firstByteToKeep = lastNaluStart + 1;
+      /** @type {shaka.util.TsParser.AvcNalu} */
+      const nalu = {
+        data: data.subarray(firstByteToKeep, len),
+        type: lastNaluType,
+        time: time,
+      };
+      nalus.push(nalu);
+    }
+    return nalus;
+  }
+
   /**
    * Return the ID3 metadata
    *
@@ -347,7 +442,7 @@
     const Uint8ArrayUtils = shaka.util.Uint8ArrayUtils;
     const metadata = [];
     let prevId3Data = new Uint8Array(0);
-    // parsePES() only works if the data begins on a PES boundary.
+    // parsePES_() only works if the data begins on a PES boundary.
     // Try the last data blob first, and if it doesn't begin on a
     // PES boundary, prepend the previous blob and try again.
     // This way, a successful parse will always begin and end on
@@ -356,7 +451,7 @@
       const data = this.id3Data_[i];
       goog.asserts.assert(data, 'We should have a data');
       const id3Data = Uint8ArrayUtils.concat(data, prevId3Data);
-      const pes = this.parsePES(id3Data);
+      const pes = this.parsePES_(id3Data);
       if (pes) {
         metadata.unshift({
           cueTime: pes.pts ? pes.pts / timescale : null,
@@ -397,6 +492,38 @@
     };
   }
 
+  /**
+   * Return the video data
+   *
+   * @return {!Array.<shaka.util.TsParser.AvcNalu>}
+   */
+  getVideoNalus() {
+    const Uint8ArrayUtils = shaka.util.Uint8ArrayUtils;
+    let nalus = [];
+    let prevVideoData = new Uint8Array(0);
+    // parsePES_() only works if the data begins on a PES boundary.
+    // Try the last data blob first, and if it doesn't begin on a
+    // PES boundary, prepend the previous blob and try again.
+    // This way, a successful parse will always begin and end on
+    // the correct boundary, and no data will be skipped.
+    for (let i = this.videoData_.length - 1; i >= 0; i--) {
+      const data = this.videoData_[i];
+      goog.asserts.assert(data, 'We should have a data');
+      const videoData = Uint8ArrayUtils.concat(data, prevVideoData);
+      const pes = this.parsePES_(videoData);
+      if (pes) {
+        if (this.videoCodec_ == 'avc') {
+          nalus = nalus.concat(this.parseAvcNalus_(pes));
+        }
+        prevVideoData = new Uint8Array(0);
+      } else {
+        prevVideoData = videoData;
+      }
+    }
+    // We need to invert the array to return it in the correct order.
+    return nalus.reverse();
+  }
+
   /**
    * Check if the passed data corresponds to an MPEG2-TS
    *
@@ -430,9 +557,9 @@
     while (i < scanwindow) {
       // a TS fragment should contain at least 3 TS packets, a PAT, a PMT, and
       // one PID, each starting with 0x47
-      if (data[i] === 0x47 &&
-          data[i + packetLength] === 0x47 &&
-          data[i + 2 * packetLength] === 0x47) {
+      if (data[i] == 0x47 &&
+          data[i + packetLength] == 0x47 &&
+          data[i + 2 * packetLength] == 0x47) {
         return i;
       } else {
         i++;
@@ -497,3 +624,18 @@
  */
 
 shaka.util.TsParser.PES;
+
+/**
+ * @typedef {{
+ *   data: !Uint8Array,
+ *   type: number,
+ *   time: ?number
+ * }}
+ *
+ * @summary AvcNalu.
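 * @property {!Uint8Array} data
 * @property {number} type
 * @property {?number} time
 */
shaka.util.TsParser.AvcNalu;

Note: a usage sketch of the extended parser, tying the new methods together
(`tsBytes` is a hypothetical Uint8Array; the SEI NALU type value 6 matches the
H264_NALU_TYPE_SEI constant used by TsCeaParser):

```js
if (shaka.util.TsParser.probe(tsBytes)) {
  const tsParser = new shaka.util.TsParser().parse(tsBytes);
  const startTime = tsParser.getStartTime();  // per-content-type start times
  const metadata = tsParser.getMetadata();    // ID3 cues, if any
  const nalus = tsParser.getVideoNalus();     // AVC NALUs with type and time
  const seiNalus = nalus.filter((nalu) => nalu.type == 6);
}
```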
diff --git a/test/media/media_source_engine_integration.js b/test/media/media_source_engine_integration.js
index 27c1cb40cb..a917d8ff97 100644
--- a/test/media/media_source_engine_integration.js
+++ b/test/media/media_source_engine_integration.js
@@ -38,21 +38,51 @@ describe('MediaSourceEngine', () => {
   const tsCeaCue0 = jasmine.objectContaining({
     startTime: Util.closeTo(0.767, 0.001),
     endTime: Util.closeTo(4.972, 0.001),
     textAlign: Cue.textAlign.CENTER,
-    payload: 'These are 608 captions\n(top left)',
+    nestedCues: [
+      jasmine.objectContaining({
+        payload: 'These are 608 captions',
+        textAlign: Cue.textAlign.CENTER,
+      }),
+      jasmine.objectContaining({lineBreak: true}),
+      jasmine.objectContaining({
+        payload: '(top left)',
+        textAlign: Cue.textAlign.CENTER,
+      }),
+    ],
   });
 
   const tsCeaCue1 = jasmine.objectContaining({
     startTime: Util.closeTo(5.305, 0.001),
     endTime: Util.closeTo(11.979, 0.001),
     textAlign: Cue.textAlign.CENTER,
-    payload: 'These are 608 captions\n(middle)',
+    nestedCues: [
+      jasmine.objectContaining({
+        payload: 'These are 608 captions',
+        textAlign: Cue.textAlign.CENTER,
+      }),
+      jasmine.objectContaining({lineBreak: true}),
+      jasmine.objectContaining({
+        payload: '(middle)',
+        textAlign: Cue.textAlign.CENTER,
+      }),
+    ],
   });
 
   const tsCeaCue2 = jasmine.objectContaining({
     startTime: Util.closeTo(12.312, 0.001),
     endTime: Util.closeTo(19.319, 0.001),
     textAlign: Cue.textAlign.CENTER,
-    payload: 'These are 608 captions\n(bottom left)',
+    nestedCues: [
+      jasmine.objectContaining({
+        payload: 'These are 608 captions',
+        textAlign: Cue.textAlign.CENTER,
+      }),
+      jasmine.objectContaining({lineBreak: true}),
+      jasmine.objectContaining({
+        payload: '(bottom left)',
+        textAlign: Cue.textAlign.CENTER,
+      }),
+    ],
   });
 
@@ -59,21 +89,51 @@
   // The same segments as above, but offset by 40 seconds (yes, 40), which is
   const tsCeaCue3 = jasmine.objectContaining({
     startTime: Util.closeTo(40.767, 0.001),
     endTime: Util.closeTo(44.972, 0.001),
     textAlign: Cue.textAlign.CENTER,
-    payload: 'These are 608 captions\n(top left)',
+    nestedCues: [
+      jasmine.objectContaining({
+        payload: 'These are 608 captions',
+        textAlign: Cue.textAlign.CENTER,
+      }),
+      jasmine.objectContaining({lineBreak: true}),
+      jasmine.objectContaining({
+        payload: '(top left)',
+        textAlign: Cue.textAlign.CENTER,
+      }),
+    ],
   });
 
   const tsCeaCue4 = jasmine.objectContaining({
     startTime: Util.closeTo(45.305, 0.001),
     endTime: Util.closeTo(51.979, 0.001),
     textAlign: Cue.textAlign.CENTER,
-    payload: 'These are 608 captions\n(middle)',
+    nestedCues: [
+      jasmine.objectContaining({
+        payload: 'These are 608 captions',
+        textAlign: Cue.textAlign.CENTER,
+      }),
+      jasmine.objectContaining({lineBreak: true}),
+      jasmine.objectContaining({
+        payload: '(middle)',
+        textAlign: Cue.textAlign.CENTER,
+      }),
+    ],
   });
 
   const tsCeaCue5 = jasmine.objectContaining({
     startTime: Util.closeTo(52.312, 0.001),
     endTime: Util.closeTo(59.319, 0.001),
     textAlign: Cue.textAlign.CENTER,
-    payload: 'These are 608 captions\n(bottom left)',
+    nestedCues: [
+      jasmine.objectContaining({
+        payload: 'These are 608 captions',
+        textAlign: Cue.textAlign.CENTER,
+      }),
+      jasmine.objectContaining({lineBreak: true}),
+      jasmine.objectContaining({
+        payload: '(bottom left)',
+        textAlign: Cue.textAlign.CENTER,
+      }),
+    ],
   });
 
   /**
@@ -137,15 +197,15 @@
         type, segment, reference, /* hasClosedCaptions= */ false);
   }
 
-  function appendWithSeek(type, segmentNumber) {
+  function appendWithSeekAndClosedCaptions(type, segmentNumber) {
     const segment = generators[type]
         .getSegment(segmentNumber, Date.now() / 1000);
     const reference = dummyReference(type, segmentNumber);
     return mediaSourceEngine.appendBuffer(
         type,
         segment,
         reference,
-        /* hasClosedCaptions= */ false,
+        /* hasClosedCaptions= */ true,
         /* seeked= */ true);
   }
 
@@ -455,10 +515,9 @@
     await mediaSourceEngine.init(initObject, /* forceTransmux= */ true);
     mediaSourceEngine.setSelectedClosedCaptionId('CC1');
-    await append(ContentType.VIDEO, 0);
+    await appendWithClosedCaptions(ContentType.VIDEO, 0);
 
     expect(textDisplayer.appendSpy).toHaveBeenCalledTimes(3);
-
     expect(textDisplayer.appendSpy).toHaveBeenCalledWith([tsCeaCue0]);
     expect(textDisplayer.appendSpy).toHaveBeenCalledWith([tsCeaCue1]);
     expect(textDisplayer.appendSpy).toHaveBeenCalledWith([tsCeaCue2]);
 
@@ -477,15 +536,15 @@
     await mediaSourceEngine.init(initObject, /* forceTransmux= */ true);
     mediaSourceEngine.setSelectedClosedCaptionId('CC1');
-    await append(ContentType.VIDEO, 2);
+    await appendWithClosedCaptions(ContentType.VIDEO, 2);
 
     expect(textDisplayer.appendSpy).toHaveBeenCalledTimes(3);
     expect(textDisplayer.appendSpy).toHaveBeenCalledWith([tsCeaCue3]);
     expect(textDisplayer.appendSpy).toHaveBeenCalledWith([tsCeaCue4]);
     expect(textDisplayer.appendSpy).toHaveBeenCalledWith([tsCeaCue5]);
 
     textDisplayer.appendSpy.calls.reset();
-    await appendWithSeek(ContentType.VIDEO, 0);
+    await appendWithSeekAndClosedCaptions(ContentType.VIDEO, 0);
 
     expect(textDisplayer.appendSpy).toHaveBeenCalledTimes(3);
     expect(textDisplayer.appendSpy).toHaveBeenCalledWith([tsCeaCue0]);
     expect(textDisplayer.appendSpy).toHaveBeenCalledWith([tsCeaCue1]);
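
Note: the test changes above reflect that the 608 decoder emits structured
cues: a wrapping cue whose nestedCues carry each caption line plus an explicit
line break, rather than a flat payload containing '\n'. Roughly, for the first
cue checked here:

```js
const cue = new shaka.text.Cue(0.767, 4.972, /* payload= */ '');
const line1 = new shaka.text.Cue(0.767, 4.972, 'These are 608 captions');
const lineBreak = new shaka.text.Cue(0.767, 4.972, '');
lineBreak.lineBreak = true;
const line2 = new shaka.text.Cue(0.767, 4.972, '(top left)');
cue.nestedCues = [line1, lineBreak, line2];
```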
diff --git a/test/media/media_source_engine_unit.js b/test/media/media_source_engine_unit.js
index 31db649d8a..cd3665df46 100644
--- a/test/media/media_source_engine_unit.js
+++ b/test/media/media_source_engine_unit.js
@@ -533,7 +533,7 @@ describe('MediaSourceEngine', () => {
           data, 0, 10);
     });
 
-    it('appends transmuxed data and captions', async () => {
+    it('appends transmuxed data', async () => {
       const initObject = new Map();
       initObject.set(ContentType.VIDEO, fakeTransportStream);
 
@@ -548,7 +548,6 @@
         await mediaSourceEngine.appendBuffer(
             ContentType.VIDEO, buffer, null,
             /* hasClosedCaptions= */ false);
-        expect(mockTextEngine.storeAndAppendClosedCaptions).toHaveBeenCalled();
         expect(videoSourceBuffer.appendBuffer).toHaveBeenCalled();
       };
 
@@ -563,38 +562,6 @@
       await Promise.all([init(), delay()]);
     });
 
-    it('appends only transmuxed data without embedded text', async () => {
-      const initObject = new Map();
-      initObject.set(ContentType.VIDEO, fakeTransportStream);
-
-      const output = {
-        data: new Uint8Array(1),
-        captions: [],
-      };
-      mockTransmuxer.transmux.and.returnValue(Promise.resolve(output));
-
-      const init = async () => {
-        await mediaSourceEngine.init(initObject, false);
-        await mediaSourceEngine.appendBuffer(
-            ContentType.VIDEO, buffer, null,
-            /* hasClosedCaptions= */ false);
-        expect(mockTextEngine.storeAndAppendClosedCaptions)
-            .not.toHaveBeenCalled();
-        expect(videoSourceBuffer.appendBuffer)
-            .toHaveBeenCalledWith(output.data);
-      };
-
-      // The 'updateend' event fires once the data is done appending to the
-      // media source. We only append to the media source once transmuxing is
-      // done. Since transmuxing is done using Promises, we need to delay the
-      // event until MediaSourceEngine calls appendBuffer.
- const delay = async () => { - await Util.shortDelay(); - videoSourceBuffer.updateend(); - }; - await Promise.all([init(), delay()]); - }); - it('appends parsed closed captions from CaptionParser', async () => { const initObject = new Map(); initObject.set(ContentType.VIDEO, fakeVideoStream); @@ -1298,7 +1265,7 @@ describe('MediaSourceEngine', () => { mockTextEngine = jasmine.createSpyObj('TextEngine', [ 'initParser', 'destroy', 'appendBuffer', 'remove', 'setTimestampOffset', 'setAppendWindow', 'bufferStart', 'bufferEnd', 'bufferedAheadOf', - 'storeAndAppendClosedCaptions', 'convertMuxjsCaptionsToShakaCaptions', + 'storeAndAppendClosedCaptions', ]); const resolve = () => Promise.resolve(); diff --git a/test/media/transmuxer_integration.js b/test/media/transmuxer_integration.js index 4cb0bf69b6..63df2ed105 100644 --- a/test/media/transmuxer_integration.js +++ b/test/media/transmuxer_integration.js @@ -111,43 +111,40 @@ describe('Transmuxer', () => { let sawMDAT = false; const transmuxedData = await transmuxer.transmux(videoSegment); - expect(transmuxedData.data).toEqual(jasmine.any(Uint8Array)); - expect(transmuxedData.data.length).toBeGreaterThan(0); - expect(transmuxedData.captions).toEqual(jasmine.any(Array)); + expect(transmuxedData).toEqual(jasmine.any(Uint8Array)); + expect(transmuxedData.length).toBeGreaterThan(0); new shaka.util.Mp4Parser() .box('mdat', shaka.util.Mp4Parser.allData((data) => { sawMDAT = true; expect(data.byteLength).toBeGreaterThan(0); })) - .parse(transmuxedData.data); + .parse(transmuxedData); expect(sawMDAT).toBeTruthy(); }); it('transmux audio from TS to MP4', async () => { let sawMDAT = false; const transmuxedData = await transmuxer.transmux(audioSegment); - expect(transmuxedData.data).toEqual(jasmine.any(Uint8Array)); - expect(transmuxedData.data.length).toBeGreaterThan(0); - expect(transmuxedData.captions).toEqual(jasmine.any(Array)); + expect(transmuxedData).toEqual(jasmine.any(Uint8Array)); + expect(transmuxedData.length).toBeGreaterThan(0); new shaka.util.Mp4Parser() .box('mdat', shaka.util.Mp4Parser.allData((data) => { sawMDAT = true; expect(data.byteLength).toBeGreaterThan(0); })) - .parse(transmuxedData.data); + .parse(transmuxedData); expect(sawMDAT).toBeTruthy(); }); it('transmux empty video from TS to MP4', async () => { let sawMDAT = false; const transmuxedData = await transmuxer.transmux(emptySegment); - expect(transmuxedData.data).toEqual(jasmine.any(Uint8Array)); - expect(transmuxedData.captions).toEqual([]); + expect(transmuxedData).toEqual(jasmine.any(Uint8Array)); new shaka.util.Mp4Parser() .box('mdat', shaka.util.Mp4Parser.allData((data) => { sawMDAT = true; })) - .parse(transmuxedData.data); + .parse(transmuxedData); expect(sawMDAT).toBeFalsy(); }); @@ -171,7 +168,7 @@ describe('Transmuxer', () => { mp4Timestamp = parsedTFDTBox.baseMediaDecodeTime; parsed = true; }) - .parse(transmuxedData.data); + .parse(transmuxedData); expect(parsed).toBe(true); expect(mp4Timestamp).toBe(expectedMp4Timestamp);