From 70fad8de8fc18cdd186ee431bbd433bbd4d440cc Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C3=81lvaro=20Velad=20Galv=C3=A1n?=
Date: Mon, 28 Nov 2022 18:56:22 +0100
Subject: [PATCH] feat(cea): Add CEA parser for TS (#4697)

Closes https://github.com/shaka-project/shaka-player/issues/3674

Co-authored-by: Joey Parrish
---
 README.md                                     |   6 +-
 build/types/cea                               |   1 +
 externs/mux.js                                | 106 +---------
 externs/shaka/player.js                       |   6 +-
 lib/cea/cea608_data_channel.js                |  25 ++-
 lib/cea/cea_decoder.js                        |  13 ++
 lib/cea/ts_cea_parser.js                      |  64 ++++++
 lib/media/closed_caption_parser.js            |   9 +-
 lib/media/media_source_engine.js              |  44 +---
 lib/media/transmuxer.js                       |  23 +--
 lib/text/text_engine.js                       |  17 --
 lib/util/ts_parser.js                         | 188 +++++++++++++++---
 test/media/media_source_engine_integration.js |  83 ++++++--
 test/media/media_source_engine_unit.js        |  37 +---
 test/media/transmuxer_integration.js          |  21 +-
 15 files changed, 367 insertions(+), 276 deletions(-)
 create mode 100644 lib/cea/ts_cea_parser.js

diff --git a/README.md b/README.md
index aa5e989cf4..65d9e925b7 100644
--- a/README.md
+++ b/README.md
@@ -225,11 +225,9 @@ Shaka Player supports:
   - TTML
     - Supported in both XML form and embedded in MP4
   - CEA-608
-    - Supported embedded in MP4
-    - With help from [mux.js][] v6.2.0+, supported embedded in TS
+    - Supported embedded in MP4 and TS
   - CEA-708
-    - Supported embedded in MP4
-    - With help from [mux.js][] v6.2.0+, supported embedded in TS
+    - Supported embedded in MP4 and TS
   - SubRip (SRT)
     - UTF-8 encoding only
   - LyRiCs (LRC)

diff --git a/build/types/cea b/build/types/cea
index 31f1660566..0f5c3cc88d 100644
--- a/build/types/cea
+++ b/build/types/cea
@@ -12,3 +12,4 @@
 +../../lib/cea/i_cea_parser.js
 +../../lib/cea/mp4_cea_parser.js
 +../../lib/cea/sei_processor.js
++../../lib/cea/ts_cea_parser.js

diff --git a/externs/mux.js b/externs/mux.js
index efdb20207a..aecf0a5961 100644
--- a/externs/mux.js
+++ b/externs/mux.js
@@ -23,29 +23,6 @@ var muxjs = {};
 muxjs.mp4 = {};
 
-/** @const */
-muxjs.mp4.probe = class {
-  /**
-   * Parses an MP4 initialization segment and extracts the timescale
-   * values for any declared tracks.
-   *
-   * @param {Uint8Array} init The bytes of the init segment
-   * @return {!Object.<number, number>} a hash of track ids to timescale
-   *     values or null if the init segment is malformed.
-   */
-  static timescale(init) {}
-
-  /**
-   * Find the trackIds of the video tracks in this source.
-   * Found by parsing the Handler Reference and Track Header Boxes:
-   *
-   * @param {Uint8Array} init The bytes of the init segment for this source
-   * @return {!Array.<number>} A list of trackIds
-   **/
-  static videoTrackIds(init) {}
-};
-
-
 muxjs.mp4.Transmuxer = class {
   /** @param {Object=} options */
   constructor(options) {}
@@ -74,100 +51,19 @@ muxjs.mp4.Transmuxer = class {
 
   /** Remove all handlers and clean up. */
   dispose() {}
-
-  /** Reset captions. */
-  resetCaptions() {}
 };
 
 
 /**
  * @typedef {{
  *   initSegment: !Uint8Array,
- *   data: !Uint8Array,
- *   captions: !Array
+ *   data: !Uint8Array
  * }}
  *
  * @description Transmuxed data from mux.js.
  * @property {!Uint8Array} initSegment
  * @property {!Uint8Array} data
- * @property {!Array} captions
  * @exportDoc
 */
 muxjs.mp4.Transmuxer.Segment;
-
-
-muxjs.mp4.CaptionParser = class {
-  /**
-   * Parser for CEA closed captions embedded in video streams for Dash.
-   * @constructor
-   * @struct
-   */
-  constructor() {}
-
-  /** Initializes the closed caption parser. */
-  init() {}
-
-  /**
-   * Return true if a new video track is selected or if the timescale is
-   * changed.
-   * @param {!Array.<number>} videoTrackIds A list of video tracks found in the
-   *     init segment.
-   * @param {!Object.<number, number>} timescales The map of track Ids and the
-   *     tracks' timescales in the init segment.
-   * @return {boolean}
-   */
-  isNewInit(videoTrackIds, timescales) {}
-
-  /**
-   * Parses embedded CEA closed captions and interacts with the underlying
-   * CaptionStream, and return the parsed captions.
-   * @param {!Uint8Array} segment The fmp4 segment containing embedded captions
-   * @param {!Array.<number>} videoTrackIds A list of video tracks found in the
-   *     init segment.
-   * @param {!Object.<number, number>} timescales The timescales found in the
-   *     init segment.
-   * @return {muxjs.mp4.ParsedClosedCaptions}
-   */
-  parse(segment, videoTrackIds, timescales) {}
-
-  /** Clear the parsed closed captions data for new data. */
-  clearParsedCaptions() {}
-
-  /** Reset the captions stream. */
-  resetCaptionStream() {}
-};
-
-
-/**
- * @typedef {{
- *   captionStreams: Object.<string, boolean>,
- *   captions: !Array.<muxjs.mp4.ClosedCaption>
- * }}
- *
- * @description closed captions data parsed from mux.js caption parser.
- * @property {Object.<string, boolean>} captionStreams
- * @property {Array.<muxjs.mp4.ClosedCaption>} captions
- */
-muxjs.mp4.ParsedClosedCaptions;
-
-
-/**
- * @typedef {{
- *   startPts: number,
- *   endPts: number,
- *   startTime: number,
- *   endTime: number,
- *   stream: string,
- *   text: string
- * }}
- *
- * @description closed caption parsed from mux.js caption parser.
- * @property {number} startPts
- * @property {number} endPts
- * @property {number} startTime
- * @property {number} endTime
- * @property {string} stream The channel id of the closed caption.
- * @property {string} text The content of the closed caption.
- */
-muxjs.mp4.ClosedCaption;
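
Note: with these externs gone, caption data no longer round-trips through the
mux.js CaptionParser; Shaka's own CEA stack consumes the raw segment bytes
directly. A minimal usage sketch of that stack, assuming `segmentBytes` is a
hypothetical Uint8Array holding one TS segment (the class and method names
are the ones that appear in this patch):

```js
// Pick the CEA parser by MIME type ('video/mp2t' selects the new TsCeaParser)
// and decode caption packets from the raw segment bytes.
const captionParser = new shaka.media.ClosedCaptionParser('video/mp2t');
const closedCaptions = captionParser.parseFrom(segmentBytes);
for (const caption of closedCaptions) {
  // Each entry pairs a CEA channel id ('CC1', 'CC3', ...) with a text cue.
  console.log(caption.stream, caption.cue.startTime, caption.cue.endTime);
}
```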
diff --git a/externs/shaka/player.js b/externs/shaka/player.js
index e7dc898dd4..0b8ba42101 100644
--- a/externs/shaka/player.js
+++ b/externs/shaka/player.js
@@ -1012,10 +1012,8 @@ shaka.extern.ManifestConfiguration;
  *   the default value unless you have a good reason not to.
  * @property {boolean} forceTransmux
  *   If this is true, we will transmux AAC and TS content even if
- *   not strictly necessary for the assets to be played. Shaka Player
- *   currently only supports CEA 708 captions by transmuxing, so this value is
- *   necessary for enabling them on platforms with native TS support like Edge
- *   or Chromecast. This value defaults to false.
+ *   not strictly necessary for the assets to be played.
+ *   This value defaults to false.
  * @property {number} safeSeekOffset
  *   The amount of seconds that should be added when repositioning the playhead
  *   after falling out of the availability window or seek. This gives the player
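
Note: the doc change above reflects that embedded captions no longer depend on
transmuxing. A sketch of what this means for applications, assuming the option
lives at `streaming.forceTransmux` as it does in this era of the player (the
flag itself is unchanged by this patch; the URL is hypothetical):

```js
// Captions embedded in TS now work even when the platform plays TS natively,
// so forcing transmux purely for CEA-608/708 is no longer needed.
player.configure('streaming.forceTransmux', false);  // the default
await player.load('https://example.com/stream.m3u8');
player.setTextTrackVisibility(true);  // embedded captions still appear
```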
diff --git a/lib/cea/cea608_data_channel.js b/lib/cea/cea608_data_channel.js
index a140e6c006..36263a8b18 100644
--- a/lib/cea/cea608_data_channel.js
+++ b/lib/cea/cea608_data_channel.js
@@ -54,7 +54,7 @@ shaka.cea.Cea608DataChannel = class {
      * Points to current buffer.
      * @private {!shaka.cea.Cea608Memory}
      */
-    this.curbuf_ = this.displayedMemory_;
+    this.curbuf_ = this.nonDisplayedMemory_;
 
     /**
      * End time of the previous caption, serves as start time of next caption.
@@ -73,14 +73,25 @@
    * Resets channel state.
    */
   reset() {
-    this.type_ = shaka.cea.Cea608DataChannel.CaptionType.PAINTON;
-    this.curbuf_ = this.displayedMemory_;
+    this.type_ = shaka.cea.Cea608DataChannel.CaptionType.NONE;
+    this.curbuf_ = this.nonDisplayedMemory_;
     this.lastcp_ = null;
     this.displayedMemory_.reset();
     this.nonDisplayedMemory_.reset();
     this.text_.reset();
   }
 
+  /**
+   * Set the initial PTS, which may not be 0 if we start decoding at a later
+   * point in the stream.  Without this, the first cue's startTime can be way
+   * off.
+   *
+   * @param {number} pts
+   */
+  firstPts(pts) {
+    this.prevEndTime_ = pts;
+  }
+
   /**
    * Gets the row index from a Preamble Address Code byte pair.
    * @param {number} b1 Byte 1.
@@ -155,12 +166,12 @@
     }
     buf.setRow(row);
 
-    this.curbuf_.setUnderline(underline);
-    this.curbuf_.setItalics(italics);
-    this.curbuf_.setTextColor(textColor);
+    buf.setUnderline(underline);
+    buf.setItalics(italics);
+    buf.setTextColor(textColor);
 
     // Clear the background color, since new row (PAC) should reset ALL styles.
-    this.curbuf_.setBackgroundColor(shaka.cea.CeaUtils.DEFAULT_BG_COLOR);
+    buf.setBackgroundColor(shaka.cea.CeaUtils.DEFAULT_BG_COLOR);
   }

diff --git a/lib/cea/cea_decoder.js b/lib/cea/cea_decoder.js
index dbf3552703..1075a13c1b 100644
--- a/lib/cea/cea_decoder.js
+++ b/lib/cea/cea_decoder.js
@@ -77,6 +77,11 @@ shaka.cea.CeaDecoder = class {
      */
     this.serviceNumberToService_ = new Map();
 
+    /**
+     * @private {boolean}
+     */
+    this.waitingForFirstPacket_ = true;
+
     this.reset();
   }
 
@@ -106,6 +111,7 @@
     for (const stream of this.cea608ModeToStream_.values()) {
       stream.reset();
     }
+    this.waitingForFirstPacket_ = true;
   }
 
   /**
@@ -114,6 +120,13 @@
    * @override
    */
   extract(userDataSeiMessage, pts) {
+    if (this.waitingForFirstPacket_) {
+      for (const stream of this.cea608ModeToStream_.values()) {
+        stream.firstPts(pts);
+      }
+      this.waitingForFirstPacket_ = false;
+    }
+
     const reader = new shaka.util.DataViewReader(
         userDataSeiMessage, shaka.util.DataViewReader.Endianness.BIG_ENDIAN);
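
Note: the decoder change above seeds every 608 stream with the PTS of the
first packet it sees. A worked sketch of the effect (values hypothetical;
`seiPayload` stands in for a real user-data SEI message):

```js
const decoder = new shaka.cea.CeaDecoder();
// A live TS capture might begin at PTS 1,230,000 in the 90 kHz clock:
const firstPts = 1230000 / 90000;  // 13.666... seconds
decoder.extract(seiPayload, firstPts);  // first packet seeds prevEndTime_
// Without firstPts(), the first decoded cue would start at 0 and span the
// whole 13.6 s gap; with it, the cue starts at the stream's real start time.
```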
diff --git a/lib/cea/ts_cea_parser.js b/lib/cea/ts_cea_parser.js
new file mode 100644
index 0000000000..eeab6d3ab8
--- /dev/null
+++ b/lib/cea/ts_cea_parser.js
@@ -0,0 +1,64 @@
+/*! @license
+ * Shaka Player
+ * Copyright 2016 Google LLC
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+goog.provide('shaka.cea.TsCeaParser');
+
+goog.require('shaka.cea.ICeaParser');
+goog.require('shaka.cea.SeiProcessor');
+goog.require('shaka.util.BufferUtils');
+goog.require('shaka.util.TsParser');
+
+/**
+ * MPEG TS CEA parser.
+ * @implements {shaka.cea.ICeaParser}
+ */
+shaka.cea.TsCeaParser = class {
+  /** */
+  constructor() {
+    /**
+     * SEI data processor.
+     * @private
+     * @const {!shaka.cea.SeiProcessor}
+     */
+    this.seiProcessor_ = new shaka.cea.SeiProcessor();
+  }
+
+  /**
+   * @override
+   */
+  init(initSegment) {
+    // TS streams do not have init segments, so this is a no-op.
+  }
+
+  /**
+   * @override
+   */
+  parse(mediaSegment) {
+    const ICeaParser = shaka.cea.ICeaParser;
+
+    /** @type {!Array.<!shaka.cea.ICeaParser.CaptionPacket>} **/
+    const captionPackets = [];
+
+    const uint8ArrayData = shaka.util.BufferUtils.toUint8(mediaSegment);
+    if (!shaka.util.TsParser.probe(uint8ArrayData)) {
+      return captionPackets;
+    }
+    const tsParser = new shaka.util.TsParser().parse(uint8ArrayData);
+    const videoNalus = tsParser.getVideoNalus();
+    for (const nalu of videoNalus) {
+      if (nalu.type == ICeaParser.H264_NALU_TYPE_SEI &&
+          nalu.time != null) {
+        for (const packet of this.seiProcessor_.process(nalu.data)) {
+          captionPackets.push({
+            packet: packet,
+            pts: nalu.time,
+          });
+        }
+      }
+    }
+    return captionPackets;
+  }
+};

diff --git a/lib/media/closed_caption_parser.js b/lib/media/closed_caption_parser.js
index 79e82d5d9e..a657bb2d92 100644
--- a/lib/media/closed_caption_parser.js
+++ b/lib/media/closed_caption_parser.js
@@ -10,6 +10,7 @@ goog.provide('shaka.media.IClosedCaptionParser');
 goog.require('shaka.cea.CeaDecoder');
 goog.require('shaka.cea.DummyCeaParser');
 goog.require('shaka.cea.Mp4CeaParser');
+goog.require('shaka.cea.TsCeaParser');
 goog.require('shaka.util.BufferUtils');
 goog.requireType('shaka.cea.ICaptionDecoder');
 goog.requireType('shaka.cea.ICeaParser');
@@ -59,10 +60,14 @@
     /** @private {!shaka.cea.ICeaParser} */
     this.ceaParser_ = new shaka.cea.DummyCeaParser();
 
-    if (mimeType.includes('video/mp4')) {
-      // MP4 Parser to extract closed caption packets from H.264 video.
+    if (mimeType.toLowerCase().includes('video/mp4')) {
+      // MP4 Parser to extract closed caption packets from H.264/H.265 video.
       this.ceaParser_ = new shaka.cea.Mp4CeaParser();
     }
+    if (mimeType.toLowerCase().includes('video/mp2t')) {
+      // TS Parser to extract closed caption packets from H.264 video.
+      this.ceaParser_ = new shaka.cea.TsCeaParser();
+    }
 
     /**
      * Decoder for decoding CEA-X08 data from closed caption packets.
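
Note: a short sketch of how the new parser is driven; this mirrors the decode
loop that ClosedCaptionParser runs internally for 'video/mp2t'. `tsSegment` is
a hypothetical Uint8Array and `decoder` a shaka.cea.CeaDecoder:

```js
const ceaParser = new shaka.cea.TsCeaParser();
ceaParser.init(new Uint8Array(0));  // no-op: TS has no init segment
// parse() probes the bytes, demuxes video PES packets, and returns the
// caption packets found in SEI NALUs, each tagged with its PTS.
for (const {packet, pts} of ceaParser.parse(tsSegment)) {
  decoder.extract(packet, pts);
}
const cues = decoder.decode();  // closed captions ready for the text engine
```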
diff --git a/lib/media/media_source_engine.js b/lib/media/media_source_engine.js
index bfa902f8b5..285632964a 100644
--- a/lib/media/media_source_engine.js
+++ b/lib/media/media_source_engine.js
@@ -580,6 +580,8 @@ shaka.media.MediaSourceEngine = class {
       return;
     }
 
+    let timestampOffset = this.sourceBuffers_[contentType].timestampOffset;
+
     const uint8ArrayData = shaka.util.BufferUtils.toUint8(data);
     let mimeType = this.sourceBufferTypes_[contentType];
     if (this.transmuxers_[contentType]) {
@@ -590,7 +592,7 @@
       // The SourceBuffer timestampOffset may or may not be set yet, so this is
       // the timestamp offset that would eventually compute for this segment
       // either way.
-      const timestampOffset =
+      timestampOffset =
           reference.startTime - (tsParser.getStartTime()[contentType] || 0);
       const metadata = tsParser.getMetadata();
       if (metadata.length) {
@@ -613,37 +615,7 @@
         }
       }
     }
-
-    if (this.transmuxers_[contentType]) {
-      // When seeked we should reset the transmuxer captionstreams
-      // so it does not ignores the captions from previous segments
-      if (seeked) {
-        this.transmuxers_[contentType].resetCaptions();
-      }
-
-      const transmuxedData =
-          await this.transmuxers_[contentType].transmux(data);
-      // For HLS CEA-608/708 CLOSED-CAPTIONS, text data is embedded in
-      // the video stream, so textEngine may not have been initialized.
-      if (!this.textEngine_) {
-        this.reinitText('text/vtt', this.sequenceMode_);
-      }
-      // This doesn't work for native TS support (ex. Edge/Chromecast),
-      // since no transmuxing is needed for native TS.
-      if (transmuxedData.captions && transmuxedData.captions.length) {
-        const videoOffset =
-            this.sourceBuffers_[ContentType.VIDEO].timestampOffset;
-        const closedCaptions = this.textEngine_
-            .convertMuxjsCaptionsToShakaCaptions(transmuxedData.captions);
-        this.textEngine_.storeAndAppendClosedCaptions(
-            closedCaptions,
-            reference ? reference.startTime : null,
-            reference ? reference.endTime : null,
-            videoOffset);
-      }
-
-      data = transmuxedData.data;
-    } else if (hasClosedCaptions && contentType == ContentType.VIDEO) {
+    if (hasClosedCaptions && contentType == ContentType.VIDEO) {
       if (!this.textEngine_) {
         this.reinitText('text/vtt', this.sequenceMode_);
       }
@@ -657,17 +629,19 @@
       } else {
         const closedCaptions = this.captionParser_.parseFrom(data);
         if (closedCaptions.length) {
-          const videoOffset =
-              this.sourceBuffers_[ContentType.VIDEO].timestampOffset;
           this.textEngine_.storeAndAppendClosedCaptions(
               closedCaptions,
               reference.startTime,
               reference.endTime,
-              videoOffset);
+              timestampOffset);
         }
       }
     }
 
+    if (this.transmuxers_[contentType]) {
+      data = await this.transmuxers_[contentType].transmux(data);
+    }
+
     data = this.workAroundBrokenPlatforms_(
         data, reference ? reference.startTime : null, contentType);
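
Note: the net effect of the media_source_engine.js changes is a reordering.
Closed captions are now always extracted from the original segment bytes, and
transmuxing (when enabled) happens afterwards and deals in bytes only. A
simplified sketch of the new appendBuffer() shape, not the full method:

```js
let timestampOffset = sourceBuffer.timestampOffset;      // captured up front
if (hasClosedCaptions && contentType == ContentType.VIDEO) {
  const closedCaptions = captionParser.parseFrom(data);  // raw TS or MP4 bytes
  if (closedCaptions.length) {
    textEngine.storeAndAppendClosedCaptions(
        closedCaptions, reference.startTime, reference.endTime,
        timestampOffset);
  }
}
if (transmuxer) {
  data = await transmuxer.transmux(data);                // bytes in, bytes out
}
```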
@@ -256,7 +244,6 @@ shaka.media.Transmuxer = class { * @private */ onTransmuxed_(segment) { - this.captions_ = segment.captions; this.transmuxedData_.push( shaka.util.Uint8ArrayUtils.concat(segment.initSegment, segment.data)); } @@ -268,12 +255,8 @@ shaka.media.Transmuxer = class { * @private */ onTransmuxDone_() { - const output = { - data: shaka.util.Uint8ArrayUtils.concat(...this.transmuxedData_), - captions: this.captions_, - }; - - this.transmuxPromise_.resolve(output); + const data = shaka.util.Uint8ArrayUtils.concat(...this.transmuxedData_); + this.transmuxPromise_.resolve(data); this.isTransmuxing_ = false; } }; diff --git a/lib/text/text_engine.js b/lib/text/text_engine.js index 0deebeea5e..6d30b0b07f 100644 --- a/lib/text/text_engine.js +++ b/lib/text/text_engine.js @@ -344,23 +344,6 @@ shaka.text.TextEngine = class { } } - /** - * @param {!Array} closedCaptions - * @return {!Array} - */ - convertMuxjsCaptionsToShakaCaptions(closedCaptions) { - const cues = []; - for (const caption of closedCaptions) { - const cue = new shaka.text.Cue( - caption.startTime, caption.endTime, caption.text); - cues.push({ - stream: caption.stream, - cue, - }); - } - return cues; - } - /** * @param {!shaka.text.Cue} cue the cue to apply the timestamp to recursively * @param {number} videoTimestampOffset the timestamp offset of the video diff --git a/lib/util/ts_parser.js b/lib/util/ts_parser.js index a0fa85404c..fc07a6591d 100644 --- a/lib/util/ts_parser.js +++ b/lib/util/ts_parser.js @@ -79,7 +79,7 @@ shaka.util.TsParser = class { // loop through TS packets for (let start = syncOffset; start < length; start += packetLength) { - if (data[start] === 0x47) { + if (data[start] == 0x47) { const payloadUnitStartIndicator = !!(data[start + 1] & 0x40); // pid is a 13-bit field starting at the last 5 bits of TS[1] const pid = ((data[start + 1] & 0x1f) << 8) + data[start + 2]; @@ -91,7 +91,7 @@ shaka.util.TsParser = class { if (adaptationFieldControl > 1) { offset = start + 5 + data[start + 4]; // continue if there is only adaptation field - if (offset === start + packetLength) { + if (offset == start + packetLength) { continue; } } else { @@ -113,7 +113,7 @@ shaka.util.TsParser = class { offset += data[offset] + 1; } - const parsedPIDs = this.parsePMT(data, offset); + const parsedPIDs = this.parsePMT_(data, offset); // only update track id if track PID found while parsing PMT // this is to avoid resetting the PID to -1 in case @@ -146,10 +146,12 @@ shaka.util.TsParser = class { } case this.videoPid_: { const videoData = data.subarray(offset, start + packetLength); - if (this.videoStartTime_ == null) { - const pes = this.parsePES(videoData); - if (pes && pes.pts != null) { - this.videoStartTime_ = pes.pts / timescale; + const pes = this.parsePES_(videoData); + if (pes && pes.pts != null) { + const startTime = Math.min(pes.dts, pes.pts) / timescale; + if (this.videoStartTime_ == null || + this.videoStartTime_ > startTime) { + this.videoStartTime_ = startTime; } } this.videoData_.push(videoData); @@ -157,10 +159,12 @@ shaka.util.TsParser = class { } case this.audioPid_: { const audioData = data.subarray(offset, start + packetLength); - if (this.audioStartTime_ == null) { - const pes = this.parsePES(audioData); - if (pes && pes.pts != null) { - this.audioStartTime_ = pes.pts / timescale; + const pes = this.parsePES_(audioData); + if (pes && pes.pts != null) { + const startTime = Math.min(pes.dts, pes.pts) / timescale; + if (this.audioStartTime_ == null || + this.audioStartTime_ > startTime) { + 
diff --git a/lib/util/ts_parser.js b/lib/util/ts_parser.js
index a0fa85404c..fc07a6591d 100644
--- a/lib/util/ts_parser.js
+++ b/lib/util/ts_parser.js
@@ -79,7 +79,7 @@ shaka.util.TsParser = class {
 
     // loop through TS packets
     for (let start = syncOffset; start < length; start += packetLength) {
-      if (data[start] === 0x47) {
+      if (data[start] == 0x47) {
         const payloadUnitStartIndicator = !!(data[start + 1] & 0x40);
         // pid is a 13-bit field starting at the last 5 bits of TS[1]
         const pid = ((data[start + 1] & 0x1f) << 8) + data[start + 2];
@@ -91,7 +91,7 @@
         if (adaptationFieldControl > 1) {
           offset = start + 5 + data[start + 4];
           // continue if there is only adaptation field
-          if (offset === start + packetLength) {
+          if (offset == start + packetLength) {
             continue;
           }
         } else {
@@ -113,7 +113,7 @@
             offset += data[offset] + 1;
           }
 
-          const parsedPIDs = this.parsePMT(data, offset);
+          const parsedPIDs = this.parsePMT_(data, offset);
 
           // only update track id if track PID found while parsing PMT
           // this is to avoid resetting the PID to -1 in case
@@ -146,10 +146,12 @@
           }
           case this.videoPid_: {
             const videoData = data.subarray(offset, start + packetLength);
-            if (this.videoStartTime_ == null) {
-              const pes = this.parsePES(videoData);
-              if (pes && pes.pts != null) {
-                this.videoStartTime_ = pes.pts / timescale;
+            const pes = this.parsePES_(videoData);
+            if (pes && pes.pts != null) {
+              const startTime = Math.min(pes.dts, pes.pts) / timescale;
+              if (this.videoStartTime_ == null ||
+                  this.videoStartTime_ > startTime) {
+                this.videoStartTime_ = startTime;
               }
             }
             this.videoData_.push(videoData);
@@ -157,10 +159,12 @@
           }
           case this.audioPid_: {
             const audioData = data.subarray(offset, start + packetLength);
-            if (this.audioStartTime_ == null) {
-              const pes = this.parsePES(audioData);
-              if (pes && pes.pts != null) {
-                this.audioStartTime_ = pes.pts / timescale;
+            const pes = this.parsePES_(audioData);
+            if (pes && pes.pts != null) {
+              const startTime = Math.min(pes.dts, pes.pts) / timescale;
+              if (this.audioStartTime_ == null ||
+                  this.audioStartTime_ > startTime) {
+                this.audioStartTime_ = startTime;
               }
             }
             this.audioData_.push(audioData);
@@ -199,8 +203,9 @@
    * @param {Uint8Array} data
    * @param {number} offset
    * @return {!shaka.util.TsParser.PMT}
+   * @private
    */
-  parsePMT(data, offset) {
+  parsePMT_(data, offset) {
     const result = {
       audio: -1,
       video: -1,
@@ -224,14 +229,14 @@
           break;
         // ISO/IEC 13818-7 ADTS AAC (MPEG-2 lower bit-rate audio)
         case 0x0f:
-          if (result.audio === -1) {
+          if (result.audio == -1) {
             result.audio = pid;
             result.audioCodec = 'aac';
           }
           break;
         // Packetized metadata (ID3)
         case 0x15:
-          if (result.id3 === -1) {
+          if (result.id3 == -1) {
             result.id3 = pid;
           }
           break;
@@ -240,7 +245,7 @@
           break;
         // ITU-T Rec. H.264 and ISO/IEC 14496-10 (lower bit-rate video)
         case 0x1b:
-          if (result.video === -1) {
+          if (result.video == -1) {
             result.video = pid;
             result.videoCodec = 'avc';
           }
@@ -249,14 +254,14 @@
         // or ISO/IEC 13818-3 (MPEG-2 halved sample rate audio)
         case 0x03:
         case 0x04:
-          if (result.audio === -1) {
+          if (result.audio == -1) {
             result.audio = pid;
             result.audioCodec = 'mp3';
           }
           break;
         // HEVC
         case 0x24:
-          if (result.video === -1) {
+          if (result.video == -1) {
             result.video = pid;
             result.videoCodec = 'hvc';
           }
@@ -277,8 +282,9 @@
    *
    * @param {Uint8Array} data
    * @return {?shaka.util.TsParser.PES}
+   * @private
    */
-  parsePES(data) {
+  parsePES_(data) {
     const startPrefix = (data[0] << 16) | (data[1] << 8) | data[2];
     // In certain live streams, the start of a TS fragment has ts packets
     // that are frame data that is continuing from the previous fragment. This
@@ -337,6 +343,95 @@
     return pes;
   }
 
+  /**
+   * Parse AVC NALUs.
+   *
+   * The code is based on hls.js
+   * Credit to https://github.com/video-dev/hls.js/blob/master/src/demux/tsdemuxer.ts
+   *
+   * @param {shaka.util.TsParser.PES} pes
+   * @return {!Array.<shaka.util.TsParser.AvcNalu>}
+   * @private
+   */
+  parseAvcNalus_(pes) {
+    const timescale = shaka.util.TsParser.Timescale_;
+    const time = pes.pts ? pes.pts / timescale : null;
+    const data = pes.data;
+    const len = data.byteLength;
+
+    // A NALU does not contain its size.
+    // The Annex B specification solves this by requiring ‘Start Codes’ to
+    // precede each NALU. A start code is 2 or 3 0x00 bytes followed with a
+    // 0x01 byte. e.g. 0x000001 or 0x00000001.
+    // More info in: https://stackoverflow.com/questions/24884827/possible-locations-for-sequence-picture-parameter-sets-for-h-264-stream/24890903#24890903
+    let numZeros = 0;
+
+    /** @type {!Array.<shaka.util.TsParser.AvcNalu>} */
+    const nalus = [];
+
+    // Start position includes the first byte where we read the type.
+    // The data we extract begins at the next byte.
+    let lastNaluStart = -1;
+    // Extracted from the first byte.
+    let lastNaluType = 0;
+
+    for (let i = 0; i < len; ++i) {
+      const value = data[i];
+      if (!value) {
+        numZeros++;
+      } else if (numZeros >= 2 && value == 1) {
+        // We just read a start code. Consume the NALU we passed, if any.
+        if (lastNaluStart >= 0) {
+          // Because the start position includes the type, skip the first byte.
+          const firstByteToKeep = lastNaluStart + 1;
+
+          // Compute the last byte to keep. The start code is at most 3 zeros.
+          // Any earlier zeros are not part of the start code.
+          const startCodeSize = (numZeros > 3 ? 3 : numZeros) + 1;
+          const lastByteToKeep = i - startCodeSize;
+
+          /** @type {shaka.util.TsParser.AvcNalu} */
+          const nalu = {
+            // subarray's end position is exclusive, so add one.
+            data: data.subarray(firstByteToKeep, lastByteToKeep + 1),
+            type: lastNaluType,
+            time: time,
+          };
+          nalus.push(nalu);
+        }
+
+        // We just read a start code, so there should be another byte here, at
+        // least, for the NALU type. Check just in case.
+        if (i >= len - 1) {
+          shaka.log.warning('Malformed TS, incomplete NALU, ignoring.');
+          return nalus;
+        }
+
+        // Advance and read the type of the next NALU.
+        i++;
+        lastNaluStart = i;
+        lastNaluType = data[i] & 0x1f;
+        numZeros = 0;
+      } else {
+        numZeros = 0;
+      }
+    }
+
+    if (lastNaluStart >= 0 && numZeros >= 0) {
+      // The rest of the buffer was a NALU.
+      // Because the start position includes the type, skip the first byte.
+      const firstByteToKeep = lastNaluStart + 1;
+      /** @type {shaka.util.TsParser.AvcNalu} */
+      const nalu = {
+        data: data.subarray(firstByteToKeep, len),
+        type: lastNaluType,
+        time: time,
+      };
+      nalus.push(nalu);
+    }
+    return nalus;
+  }
+
   /**
    * Return the ID3 metadata
    *
@@ -347,7 +442,7 @@
     const Uint8ArrayUtils = shaka.util.Uint8ArrayUtils;
     const metadata = [];
     let prevId3Data = new Uint8Array(0);
-    // parsePES() only works if the data begins on a PES boundary.
+    // parsePES_() only works if the data begins on a PES boundary.
     // Try the last data blob first, and if it doesn't begin on a
     // PES boundary, prepend the previous blob and try again.
     // This way, a successful parse will always begin and end on
@@ -356,7 +451,7 @@
       const data = this.id3Data_[i];
       goog.asserts.assert(data, 'We should have a data');
       const id3Data = Uint8ArrayUtils.concat(data, prevId3Data);
-      const pes = this.parsePES(id3Data);
+      const pes = this.parsePES_(id3Data);
       if (pes) {
         metadata.unshift({
           cueTime: pes.pts ? pes.pts / timescale : null,
@@ -397,6 +492,38 @@
     };
   }
 
+  /**
+   * Return the video data
+   *
+   * @return {!Array.<shaka.util.TsParser.AvcNalu>}
+   */
+  getVideoNalus() {
+    const Uint8ArrayUtils = shaka.util.Uint8ArrayUtils;
+    let nalus = [];
+    let prevVideoData = new Uint8Array(0);
+    // parsePES_() only works if the data begins on a PES boundary.
+    // Try the last data blob first, and if it doesn't begin on a
+    // PES boundary, prepend the previous blob and try again.
+    // This way, a successful parse will always begin and end on
+    // the correct boundary, and no data will be skipped.
+    for (let i = this.videoData_.length - 1; i >= 0; i--) {
+      const data = this.videoData_[i];
+      goog.asserts.assert(data, 'We should have a data');
+      const videoData = Uint8ArrayUtils.concat(data, prevVideoData);
+      const pes = this.parsePES_(videoData);
+      if (pes) {
+        if (this.videoCodec_ == 'avc') {
+          nalus = nalus.concat(this.parseAvcNalus_(pes));
+        }
+        prevVideoData = new Uint8Array(0);
+      } else {
+        prevVideoData = videoData;
+      }
+    }
+    // We need to invert the array to return it in the correct order.
+    return nalus.reverse();
+  }
+
   /**
    * Check if the passed data corresponds to an MPEG2-TS
    *
@@ -430,9 +557,9 @@
     while (i < scanwindow) {
       // a TS fragment should contain at least 3 TS packets, a PAT, a PMT, and
       // one PID, each starting with 0x47
-      if (data[i] === 0x47 &&
-          data[i + packetLength] === 0x47 &&
-          data[i + 2 * packetLength] === 0x47) {
+      if (data[i] == 0x47 &&
+          data[i + packetLength] == 0x47 &&
+          data[i + 2 * packetLength] == 0x47) {
         return i;
       } else {
         i++;
@@ -497,3 +624,18 @@
  */
 
 shaka.util.TsParser.PES;
+
+/**
+ * @typedef {{
+ *   data: !Uint8Array,
+ *   type: number,
+ *   time: ?number
+ * }}
+ *
+ * @summary AvcNalu.
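 * @property {!Uint8Array} data
 * @property {number} type
 * @property {?number} time
 */
shaka.util.TsParser.AvcNalu;

Note: a usage sketch of the extended parser, tying the new methods together
(`tsBytes` is a hypothetical Uint8Array; the SEI NALU type value 6 matches the
H264_NALU_TYPE_SEI constant used by TsCeaParser):

```js
if (shaka.util.TsParser.probe(tsBytes)) {
  const tsParser = new shaka.util.TsParser().parse(tsBytes);
  const startTime = tsParser.getStartTime();  // per-content-type start times
  const metadata = tsParser.getMetadata();    // ID3 cues, if any
  const nalus = tsParser.getVideoNalus();     // AVC NALUs with type and time
  const seiNalus = nalus.filter((nalu) => nalu.type == 6);
}
```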
diff --git a/test/media/media_source_engine_integration.js b/test/media/media_source_engine_integration.js
index 27c1cb40cb..a917d8ff97 100644
--- a/test/media/media_source_engine_integration.js
+++ b/test/media/media_source_engine_integration.js
@@ -38,21 +38,51 @@ describe('MediaSourceEngine', () => {
   const tsCeaCue0 = jasmine.objectContaining({
     startTime: Util.closeTo(0.767, 0.001),
     endTime: Util.closeTo(4.972, 0.001),
     textAlign: Cue.textAlign.CENTER,
-    payload: 'These are 608 captions\n(top left)',
+    nestedCues: [
+      jasmine.objectContaining({
+        payload: 'These are 608 captions',
+        textAlign: Cue.textAlign.CENTER,
+      }),
+      jasmine.objectContaining({lineBreak: true}),
+      jasmine.objectContaining({
+        payload: '(top left)',
+        textAlign: Cue.textAlign.CENTER,
+      }),
+    ],
   });
 
   const tsCeaCue1 = jasmine.objectContaining({
     startTime: Util.closeTo(5.305, 0.001),
     endTime: Util.closeTo(11.979, 0.001),
     textAlign: Cue.textAlign.CENTER,
-    payload: 'These are 608 captions\n(middle)',
+    nestedCues: [
+      jasmine.objectContaining({
+        payload: 'These are 608 captions',
+        textAlign: Cue.textAlign.CENTER,
+      }),
+      jasmine.objectContaining({lineBreak: true}),
+      jasmine.objectContaining({
+        payload: '(middle)',
+        textAlign: Cue.textAlign.CENTER,
+      }),
+    ],
   });
 
   const tsCeaCue2 = jasmine.objectContaining({
     startTime: Util.closeTo(12.312, 0.001),
     endTime: Util.closeTo(19.319, 0.001),
     textAlign: Cue.textAlign.CENTER,
-    payload: 'These are 608 captions\n(bottom left)',
+    nestedCues: [
+      jasmine.objectContaining({
+        payload: 'These are 608 captions',
+        textAlign: Cue.textAlign.CENTER,
+      }),
+      jasmine.objectContaining({lineBreak: true}),
+      jasmine.objectContaining({
+        payload: '(bottom left)',
+        textAlign: Cue.textAlign.CENTER,
+      }),
+    ],
   });
 
@@ -59,21 +89,51 @@
   // The same segments as above, but offset by 40 seconds (yes, 40), which is
   const tsCeaCue3 = jasmine.objectContaining({
     startTime: Util.closeTo(40.767, 0.001),
     endTime: Util.closeTo(44.972, 0.001),
     textAlign: Cue.textAlign.CENTER,
-    payload: 'These are 608 captions\n(top left)',
+    nestedCues: [
+      jasmine.objectContaining({
+        payload: 'These are 608 captions',
+        textAlign: Cue.textAlign.CENTER,
+      }),
+      jasmine.objectContaining({lineBreak: true}),
+      jasmine.objectContaining({
+        payload: '(top left)',
+        textAlign: Cue.textAlign.CENTER,
+      }),
+    ],
   });
 
   const tsCeaCue4 = jasmine.objectContaining({
     startTime: Util.closeTo(45.305, 0.001),
     endTime: Util.closeTo(51.979, 0.001),
     textAlign: Cue.textAlign.CENTER,
-    payload: 'These are 608 captions\n(middle)',
+    nestedCues: [
+      jasmine.objectContaining({
+        payload: 'These are 608 captions',
+        textAlign: Cue.textAlign.CENTER,
+      }),
+      jasmine.objectContaining({lineBreak: true}),
+      jasmine.objectContaining({
+        payload: '(middle)',
+        textAlign: Cue.textAlign.CENTER,
+      }),
+    ],
   });
 
   const tsCeaCue5 = jasmine.objectContaining({
     startTime: Util.closeTo(52.312, 0.001),
     endTime: Util.closeTo(59.319, 0.001),
     textAlign: Cue.textAlign.CENTER,
-    payload: 'These are 608 captions\n(bottom left)',
+    nestedCues: [
+      jasmine.objectContaining({
+        payload: 'These are 608 captions',
+        textAlign: Cue.textAlign.CENTER,
+      }),
+      jasmine.objectContaining({lineBreak: true}),
+      jasmine.objectContaining({
+        payload: '(bottom left)',
+        textAlign: Cue.textAlign.CENTER,
+      }),
+    ],
   });
 
   /**
@@ -137,15 +197,15 @@
         type, segment, reference, /* hasClosedCaptions= */ false);
   }
 
-  function appendWithSeek(type, segmentNumber) {
+  function appendWithSeekAndClosedCaptions(type, segmentNumber) {
     const segment = generators[type]
         .getSegment(segmentNumber, Date.now() / 1000);
     const reference = dummyReference(type, segmentNumber);
     return mediaSourceEngine.appendBuffer(
         type,
         segment,
         reference,
-        /* hasClosedCaptions= */ false,
+        /* hasClosedCaptions= */ true,
         /* seeked= */ true);
   }
 
@@ -455,10 +515,9 @@
     await mediaSourceEngine.init(initObject, /* forceTransmux= */ true);
     mediaSourceEngine.setSelectedClosedCaptionId('CC1');
-    await append(ContentType.VIDEO, 0);
+    await appendWithClosedCaptions(ContentType.VIDEO, 0);
 
     expect(textDisplayer.appendSpy).toHaveBeenCalledTimes(3);
-
     expect(textDisplayer.appendSpy).toHaveBeenCalledWith([tsCeaCue0]);
     expect(textDisplayer.appendSpy).toHaveBeenCalledWith([tsCeaCue1]);
     expect(textDisplayer.appendSpy).toHaveBeenCalledWith([tsCeaCue2]);
 
@@ -477,15 +536,15 @@
     await mediaSourceEngine.init(initObject, /* forceTransmux= */ true);
     mediaSourceEngine.setSelectedClosedCaptionId('CC1');
-    await append(ContentType.VIDEO, 2);
+    await appendWithClosedCaptions(ContentType.VIDEO, 2);
 
     expect(textDisplayer.appendSpy).toHaveBeenCalledTimes(3);
     expect(textDisplayer.appendSpy).toHaveBeenCalledWith([tsCeaCue3]);
     expect(textDisplayer.appendSpy).toHaveBeenCalledWith([tsCeaCue4]);
     expect(textDisplayer.appendSpy).toHaveBeenCalledWith([tsCeaCue5]);
 
     textDisplayer.appendSpy.calls.reset();
-    await appendWithSeek(ContentType.VIDEO, 0);
+    await appendWithSeekAndClosedCaptions(ContentType.VIDEO, 0);
 
     expect(textDisplayer.appendSpy).toHaveBeenCalledTimes(3);
     expect(textDisplayer.appendSpy).toHaveBeenCalledWith([tsCeaCue0]);
     expect(textDisplayer.appendSpy).toHaveBeenCalledWith([tsCeaCue1]);
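
Note: the test changes above reflect that the 608 decoder emits structured
cues: a wrapping cue whose nestedCues carry each caption line plus an explicit
line break, rather than a flat payload containing '\n'. Roughly, for the first
cue checked here:

```js
const cue = new shaka.text.Cue(0.767, 4.972, /* payload= */ '');
const line1 = new shaka.text.Cue(0.767, 4.972, 'These are 608 captions');
const lineBreak = new shaka.text.Cue(0.767, 4.972, '');
lineBreak.lineBreak = true;
const line2 = new shaka.text.Cue(0.767, 4.972, '(top left)');
cue.nestedCues = [line1, lineBreak, line2];
```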
diff --git a/test/media/media_source_engine_unit.js b/test/media/media_source_engine_unit.js
index 31db649d8a..cd3665df46 100644
--- a/test/media/media_source_engine_unit.js
+++ b/test/media/media_source_engine_unit.js
@@ -533,7 +533,7 @@ describe('MediaSourceEngine', () => {
           data, 0, 10);
     });
 
-    it('appends transmuxed data and captions', async () => {
+    it('appends transmuxed data', async () => {
       const initObject = new Map();
       initObject.set(ContentType.VIDEO, fakeTransportStream);
 
@@ -548,7 +548,6 @@
         await mediaSourceEngine.appendBuffer(
             ContentType.VIDEO, buffer, null,
             /* hasClosedCaptions= */ false);
-        expect(mockTextEngine.storeAndAppendClosedCaptions).toHaveBeenCalled();
         expect(videoSourceBuffer.appendBuffer).toHaveBeenCalled();
       };
 
@@ -563,38 +562,6 @@
       await Promise.all([init(), delay()]);
     });
 
-    it('appends only transmuxed data without embedded text', async () => {
-      const initObject = new Map();
-      initObject.set(ContentType.VIDEO, fakeTransportStream);
-
-      const output = {
-        data: new Uint8Array(1),
-        captions: [],
-      };
-      mockTransmuxer.transmux.and.returnValue(Promise.resolve(output));
-
-      const init = async () => {
-        await mediaSourceEngine.init(initObject, false);
-        await mediaSourceEngine.appendBuffer(
-            ContentType.VIDEO, buffer, null,
-            /* hasClosedCaptions= */ false);
-        expect(mockTextEngine.storeAndAppendClosedCaptions)
-            .not.toHaveBeenCalled();
-        expect(videoSourceBuffer.appendBuffer)
-            .toHaveBeenCalledWith(output.data);
-      };
-
-      // The 'updateend' event fires once the data is done appending to the
-      // media source. We only append to the media source once transmuxing is
-      // done. Since transmuxing is done using Promises, we need to delay the
-      // event until MediaSourceEngine calls appendBuffer.
- const delay = async () => { - await Util.shortDelay(); - videoSourceBuffer.updateend(); - }; - await Promise.all([init(), delay()]); - }); - it('appends parsed closed captions from CaptionParser', async () => { const initObject = new Map(); initObject.set(ContentType.VIDEO, fakeVideoStream); @@ -1298,7 +1265,7 @@ describe('MediaSourceEngine', () => { mockTextEngine = jasmine.createSpyObj('TextEngine', [ 'initParser', 'destroy', 'appendBuffer', 'remove', 'setTimestampOffset', 'setAppendWindow', 'bufferStart', 'bufferEnd', 'bufferedAheadOf', - 'storeAndAppendClosedCaptions', 'convertMuxjsCaptionsToShakaCaptions', + 'storeAndAppendClosedCaptions', ]); const resolve = () => Promise.resolve(); diff --git a/test/media/transmuxer_integration.js b/test/media/transmuxer_integration.js index 4cb0bf69b6..63df2ed105 100644 --- a/test/media/transmuxer_integration.js +++ b/test/media/transmuxer_integration.js @@ -111,43 +111,40 @@ describe('Transmuxer', () => { let sawMDAT = false; const transmuxedData = await transmuxer.transmux(videoSegment); - expect(transmuxedData.data).toEqual(jasmine.any(Uint8Array)); - expect(transmuxedData.data.length).toBeGreaterThan(0); - expect(transmuxedData.captions).toEqual(jasmine.any(Array)); + expect(transmuxedData).toEqual(jasmine.any(Uint8Array)); + expect(transmuxedData.length).toBeGreaterThan(0); new shaka.util.Mp4Parser() .box('mdat', shaka.util.Mp4Parser.allData((data) => { sawMDAT = true; expect(data.byteLength).toBeGreaterThan(0); })) - .parse(transmuxedData.data); + .parse(transmuxedData); expect(sawMDAT).toBeTruthy(); }); it('transmux audio from TS to MP4', async () => { let sawMDAT = false; const transmuxedData = await transmuxer.transmux(audioSegment); - expect(transmuxedData.data).toEqual(jasmine.any(Uint8Array)); - expect(transmuxedData.data.length).toBeGreaterThan(0); - expect(transmuxedData.captions).toEqual(jasmine.any(Array)); + expect(transmuxedData).toEqual(jasmine.any(Uint8Array)); + expect(transmuxedData.length).toBeGreaterThan(0); new shaka.util.Mp4Parser() .box('mdat', shaka.util.Mp4Parser.allData((data) => { sawMDAT = true; expect(data.byteLength).toBeGreaterThan(0); })) - .parse(transmuxedData.data); + .parse(transmuxedData); expect(sawMDAT).toBeTruthy(); }); it('transmux empty video from TS to MP4', async () => { let sawMDAT = false; const transmuxedData = await transmuxer.transmux(emptySegment); - expect(transmuxedData.data).toEqual(jasmine.any(Uint8Array)); - expect(transmuxedData.captions).toEqual([]); + expect(transmuxedData).toEqual(jasmine.any(Uint8Array)); new shaka.util.Mp4Parser() .box('mdat', shaka.util.Mp4Parser.allData((data) => { sawMDAT = true; })) - .parse(transmuxedData.data); + .parse(transmuxedData); expect(sawMDAT).toBeFalsy(); }); @@ -171,7 +168,7 @@ describe('Transmuxer', () => { mp4Timestamp = parsedTFDTBox.baseMediaDecodeTime; parsed = true; }) - .parse(transmuxedData.data); + .parse(transmuxedData); expect(parsed).toBe(true); expect(mp4Timestamp).toBe(expectedMp4Timestamp);