From c878bea7c83df2b3911fbf5f36afd35020c4d802 Mon Sep 17 00:00:00 2001 From: Steve Anton Date: Wed, 4 Sep 2019 15:19:54 -0700 Subject: [PATCH] Add detailed design discussion for codec configuration --- explainer.md | 59 ++++++++++++ webidl.txt | 263 ++++++++++++++++++++++++++++++--------------------- 2 files changed, 216 insertions(+), 106 deletions(-) diff --git a/explainer.md b/explainer.md index a16fc570..6377429c 100644 --- a/explainer.md +++ b/explainer.md @@ -293,6 +293,65 @@ input.readable.pipeInto(demuxer.writable); muxer.readable.pipeInto(output.writable); ``` +## Detailed design discussion + +### Codec configuration + +Many codecs and encoder/decoder implementations are highly configurable. WebCodecs intends to support most of the configuration options available in codecs today to efficiently allow for advanced use cases. + +Configuration options are classified into two types: +- **Parameters** are metadata required to construct a compliant bitstream. These are required when constructing the encoder/decoder and cannot be changed. For example, the VP9 profile. +- **Settings** are configuration options that influence the behavior of the encoder but do not change the type of bitstream produced. For example, target bitrate. + +Settings are further classified into three types: +- **Static codec settings** must be specified when constructing the encoder and cannot be changed. +- **Dynamic codec settings** apply to the lifetime of the encoder and can be changed at any point. +- **Frame settings** apply only to specific input frames. + +WebCodecs will maintain a standard definition of parameters for each supported codec. Additionally, the specification will establish common encoder settings that apply across codecs and implementations. However, we expect many settings will be implementation-specific. These will be available behind a feature detection and configuration API (TODO: sketch this). 
+ +#### Configuration examples + +Both encoder and decoder constructors take in the codec name and required parameters. Encoders additionally take in a dictionary of codec settings. + +```javascript +const encoder = new VideoEncoder({ + codec: 'VP9', + profile: '1', + settings: { + targetBitRate: 80_000, + }, +}); +``` + +Codec settings can be changed on-the-fly by bundling the changed settings with the next input image. The changed settings will be applied before encoding the image and apply to subsequent images. + +```javascript +const encoder = new VideoEncoder(...); +const writer = encoder.writable.getWriter(); +writer.write({ + imageData: ..., + timestamp: ..., + changeCodecSettings: { + targetBitRate: 50_000, + }, +}); +``` + +Frame settings are also bundled with the next input image. These settings do not persist beyond encoding for the image on which they appear. + +```javascript +const encoder = new VideoEncoder(...); +const writer = encoder.writable.getWriter(); +writer.write({ + imageData: ..., + timestamp: ..., + settings: { + forceKeyFrame: true, + }, +}); +``` + ## Alternative designs considered Media Source Extensions (MSE) is already used widely for low-latency streaming. However, there are some problems: diff --git a/webidl.txt b/webidl.txt index e7e1d1af..9a800b9b 100644 --- a/webidl.txt +++ b/webidl.txt @@ -3,64 +3,38 @@ // TODO(when writing spec): // - Specify that encoding and decoding must happen off the main thread. -[Constructor(MediaStreamTrack track)] -interface AudioTrackReader { - readonly attribute ReadableStream readable; // of DecodedAudioPacket -} +// Common definitions used for both audio and video. 
-interface DecodedAudioPacket { - readonly attribute MediaTime timestamp; - // Sample count == duration.value - // Sample rate == duration.scale - readonly attribute MediaTime duration; - readonly attribute unsigned long channelCount -} - -[Constructor(AudioEncoderParams params)] -interface AudioEncoder { - void setParameters(AudioEncoderParams params); - readonly attribute WritableStream writable; // DecodedAudioPacket - readonly attribute ReadableStream readable; // EncodedAudioPacket +[Constructor(unsigned long long value, unsigned long long scale)] +interface MediaTime { + readonly attribute unsigned long long value; + readonly attribute unsigned long long scale; } -dictionary AudioEncoderParams { - DOMString mimeType; - // not supported by all codecs - // null/unset means use the codec default - unsigned long? bitsPerSecond; - - // codec-specific - // null/unset means use the codec default - unsigned long? complexity; +// Audio encoder and decoder interfaces. - // probably opus-specific - bool fec = false; // enabled or not - bool dtx = false; // enabled or not - bool cbr = false; // cbr or not (vbr if not) - bool speechMode = false; // speech-specific mode or not -} +dictionary AudioCodecParameters { + DOMString codec; -[Constructor(BufferSource data, MediaTime timestamp)] -interface EncodedAudioPacket { - readonly attribute MediaTime timestamp; - readonly attribute Uint8Array data; + // Defaults are codec-specific + unsigned long? sampleRate; + unsigned long? 
channelCount; } -[Constructor(AudioDecoderParams params)] +[Constructor(AudioDecoderInit init)] interface AudioDecoder { - readonly attribute WritableStream writable; // EncodedAudioPacket - readonly attribute ReadableStream readable; // DecodedAudioPacket - attribute EventHandler onerror; + readonly attribute WritableStream writable; // AudioDecoderInput + readonly attribute ReadableStream readable; // AudioDecoderOutput } -dictionary AudioDecoderParams { - DOMString codec; // For example, "opus" - - // Defaults are codec-specific - unsigned long? sampleRate; - unsigned long? channelCount; +[Constructor(AudioEncoderInit init)] +interface AudioEncoder { + readonly attribute WritableStream writable; // AudioEncoderInput + readonly attribute ReadableStream readable; // AudioEncoderOutput +} +dictionary AudioDecoderInit : AudioCodecParameters { // Optional byte data required to initialize audio decoders // such as Vorbis codebooks. BufferSource? extraData; @@ -72,50 +46,104 @@ dictionary AudioDecoderParams { MediaTime? codecDelay; } -[Constructor()] -interface AudioTrackWriter { - readonly attribute WritableStream writable; // of DecodedAudioPacket - readonly attribute MediaStreamTrack track; +dictionary AudioDecoderInput { + Uint8Array data; + MediaTime timestamp; } +dictionary AudioDecoderOutput { + AudioBuffer buffer; + // TODO: decode stats. +} -[Constructor(MediaStreamTrack track)] -interface VideoTrackReader { - readonly attribute ReadableStream readable; // of DecodedVideoFrame +dictionary AudioEncoderStaticSettings { + // probably opus-specific + boolean fec = false; // enabled or not + boolean dtx = false; // enabled or not + boolean cbr = false; // cbr or not (vbr if not) + boolean speechMode = false; // speech-specific mode or not +} + +dictionary AudioEncoderDynamicSettings { + // not supported by all codecs + // null/unset means use the codec default + unsigned long? bitsPerSecond; + + // codec-specific + // null/unset means use the codec default + unsigned long? 
complexity; } -interface DecodedVideoFrame { +dictionary AudioEncoderSettings : AudioEncoderStaticSettings, AudioEncoderDynamicSettings { +} + +dictionary AudioEncoderInit : AudioCodecParameters { + AudioEncoderSettings? settings; +} + +dictionary AudioEncoderFrameSettings { +} + +dictionary AudioEncoderInput { + MediaTime timestamp; + ArrayBuffer buffer; + AudioEncoderFrameSettings settings; + AudioEncoderDynamicSettings changeCodecSettings; +} + +dictionary AudioEncoderOutput { + Uint8Array data; + MediaTime timestamp; + // TODO: encode stats. +} + + +// Video encoder and decoder interfaces. + +interface VideoFrame { readonly attribute MediaTime timestamp; readonly attribute ImageData imageData; } -[Constructor(VideoEncoderParams params)] -interface VideoEncoder { - void setParameters(VideoEncoderParams params); - void generateKeyFrame(optional sequence layerIds); - readonly attribute WritableStream writable; // DecodedVideoFrame - readonly attribute ReadableStream readable; // EncodedVideoFrame - attribute EventHandler onerror; +dictionary VideoCodecParameters { + DOMString codec; + + // For VP9: + DOMString? profile; } -dictionary VideoEncoderParams { - // Cannot be changed once set - DOMString mimeType; +[Constructor(VideoDecoderInit init)] +interface VideoDecoder { + readonly attribute WritableStream writable; // VideoDecoderInput + readonly attribute ReadableStream readable; // VideoDecoderOutput +} - // Can be used to initialize the encoder faster - // than waiting for the first frame - unsigned long? expectedWidth; - unsigned long? 
expectedHeight; +[Constructor(VideoEncoderInit init)] +interface VideoEncoder { + readonly attribute WritableStream writable; // VideoEncoderInput + readonly attribute ReadableStream readable; // VideoEncoderOutput +} - // unset/null means the encoder will pick - // target will be exceeded for key frames - unsigned long bitsPerSecond; +dictionary VideoDecoderInit : VideoCodecParameters { + // Optional byte data required to initialize video decoders + // such as H264 with SPS and PPS. + BufferSource? extraData; - VideoEncodeContentMode contentMode; + // Can be used to initialize the decoder faster + // than waiting for the first frame + unsigned long long? expectedWidth; + unsigned long long? expectedHeight; +} - sequence<VideoEncodeLayer> layers; -} +dictionary VideoDecoderInput { + Uint8Array data; + MediaTime timestamp; +} +dictionary VideoDecoderOutput { + VideoFrame frame; + // TODO: add decode stats. +} enum VideoEncodeContentMode { "screen" // For screen sharing/recording @@ -148,51 +176,74 @@ dictionary VideoEncodeLayer { unsigned long? bitsPerSecond; } -[Constructor(BufferSource data, MediaTime timestamp)] -interface EncodedVideoFrame { - readonly attribute Uint8Array data; - readonly attribute MediaTime timestamp; - // Info provided as a result from the encoder - // Not needed as input to a decoder - readonly attribute VideoEncodeResult? encoded; +dictionary VideoEncoderStaticSettings { + // Can be used to initialize the encoder faster + // than waiting for the first frame + unsigned long? expectedWidth; + unsigned long? expectedHeight; + + VideoEncodeContentMode contentMode; + + sequence<VideoEncodeLayer> layers; +} + +dictionary VideoEncoderDynamicSettings { + // unset/null means the encoder will pick + // target will be exceeded for key frames + unsigned long long? 
targetBitRate; +} + +dictionary VideoEncoderSettings : VideoEncoderStaticSettings, VideoEncoderDynamicSettings { } -interface VideoEncodeResult { +dictionary VideoEncoderInit : VideoCodecParameters { + VideoEncoderSettings settings; +} + +dictionary VideoEncoderFrameSettings { + boolean? forceKeyFrame; +} + +dictionary VideoEncoderInput { + VideoFrame frame; + VideoEncoderFrameSettings settings; + VideoEncoderDynamicSettings changeCodecSettings; +} + +dictionary VideoEncoderOutput { + Uint8Array data; + MediaTime timestamp; + // If using multiple layers, which layer is it? - readonly attribute DOMString? layerId; + DOMString? layerId; // Whether or not it's a key frame meaning it depends on // no other frames - readonly attribute bool keyFrame; -} + boolean keyFrame; -[Constructor(VideoDecoderParams params)] -interface VideoDecoder { - readonly attribute WritableStream writable; // EncodedVideoFrame - readonly attribute ReadableStream readable; // DecodedVideoFrame - attribute EventHandler onerror; + // TODO: per-frame encode stats. } -dictionary VideoDecoderParams { - DOMString mimeType; - // Can be used to initialize the decoder faster - // than waiting for the first frame - unsigned long long? expectedWidth; - unsigned long long? expectedHeight; +// MediaStreamTrack integration. - // Optional byte data required to initialize video decoders - // such as H264 with SPS and PPS. - BufferSource? 
extraData; -} +[Constructor(MediaStreamTrack track)] +interface AudioTrackReader { + readonly attribute ReadableStream readable; // of DecodedAudioPacket +} [Constructor()] -interface VideoTrackWriter { - readonly attribute WritableStream writable; // of DecodedVideoFrame +interface AudioTrackWriter { + readonly attribute WritableStream writable; // of DecodedAudioPacket readonly attribute MediaStreamTrack track; } -[Constructor(unsigned long long value, unsigned long long scale)] -interface MediaTime { - readonly attribute unsigned long long value; - readonly attribute unsigned long long scale; +[Constructor(MediaStreamTrack track)] +interface VideoTrackReader { + readonly attribute ReadableStream readable; // of VideoFrame +} + +[Constructor()] +interface VideoTrackWriter { + readonly attribute WritableStream writable; // VideoFrame + readonly attribute MediaStreamTrack track; }