// binary/utf8.js

/**
 * @fileoverview UTF8 encoding and decoding routines
 */
goog.provide('jspb.binary.utf8');

goog.require('jspb.asserts');


/**
 * Whether to use the browser based `TextEncoder` and `TextDecoder` APIs for
 * handling utf8.
 *
 * <p>Enabled by default for `goog.FEATURESET_YEAR >= 2020`.  The code in this
 * file also feature-detects these APIs at load time and always uses them when
 * they are available; the purpose of this flag is to allow the polyfill to be
 * omitted from the build.
 *
 * <p>Note that the define is registered under the name
 * `jspb.binary.USE_TEXTENCODING` (no underscore between TEXT and ENCODING).
 *
 * <p>See http://go/jscompiler-flags#browser-featureset-year-options for the
 * behavior here.
 *
 * @define {boolean}
 */
const USE_TEXT_ENCODING =
  goog.define('jspb.binary.USE_TEXTENCODING', goog.FEATURESET_YEAR >= 2020);

// UTF-16 surrogate code unit ranges.  High (leading) surrogates occupy
// 0xD800-0xDBFF and low (trailing) surrogates occupy 0xDC00-0xDFFF; the
// combined surrogate range spans both.
const /** number */ MIN_HIGH_SURROGATE = 0xD800;
const /** number */ MAX_HIGH_SURROGATE = 0xDBFF;
const /** number */ MIN_LOW_SURROGATE = 0xDC00;
const /** number */ MAX_LOW_SURROGATE = 0xDFFF;
const /** number */ MIN_SURROGATE = MIN_HIGH_SURROGATE;
const /** number */ MAX_SURROGATE = MAX_LOW_SURROGATE;

/**
 * Reports whether `byte` fails to match the UTF-8 continuation byte pattern
 * '10XXXXXX'.
 * @return {boolean} true when the two most significant bits of the low octet
 *     are anything other than '10'.
 */
function isNotTrailingByte(/** number */ byte) {
  // Mask selecting the two most significant bits of an octet: '11000000'.
  const topTwoBitsMask = 0xC0;
  // Value of those two bits for a continuation byte: '10000000'.
  const continuationMarker = 0x80;
  const topTwoBits = byte & topTwoBitsMask;
  return topTwoBits !== continuationMarker;
}


/**
 * Reports an invalid UTF-8 sequence: in fatal mode an error is thrown,
 * otherwise a replacement code unit is appended to `codeUnits`.
 */
function invalid(
    /** boolean */ parsingErrorsAreFatal, /** !Array<number> */ codeUnits) {
  if (!parsingErrorsAreFatal) {
    // U+FFFD is the Unicode replacement character.
    codeUnits.push(0xFFFD);
    return;
  }
  throw new Error('Invalid UTF8');
}

/**
 * Converts `utf16CodeUnits` to a string and appends it to `accum`.  A
 * null/undefined `accum` means nothing has been accumulated yet, in which case
 * only the newly converted text is returned.
 * @return {string}
 */
function codeUnitsToString(
    /** string? */ accum, /** !Array<number> */ utf16CodeUnits) {
  const converted = String.fromCharCode.apply(null, utf16CodeUnits);
  if (accum == null) {
    return converted;
  }
  return accum + converted;
}

/**
 * Our handwritten UTF8 decoder.
 *
 * Decodes the `length` bytes of `bytes` that start at index `offset` into a
 * JavaScript (UTF-16) string.
 *
 * https://en.wikipedia.org/wiki/UTF-8#Encoding describes the bit layout
 *
 * https://en.wikipedia.org/wiki/UTF-8#Invalid_sequences_and_error_handling
 * describes important cases to check for which are namely:
 * - overlong encodings, meaning a value expressible in N bytes could have been
 * expressed in fewer bytes
 * - invalid bytes, meaning bytes that are generally out of range
 * - surrogate codepoints, utf8 never encodes directly a utf16 surrogate value
 * - underflow where there aren't enough bytes for the sequence we are parsing
 * - out of range codepoints.
 *
 * Parameters:
 * - `bytes`: the buffer holding the UTF-8 data.
 * - `offset`: index of the first byte to decode.
 * - `length`: number of bytes to decode.
 * - `parsingErrorsAreFatal`: when true, the first invalid sequence throws
 *   `Error('Invalid UTF8')`.  When false, one U+FFFD replacement is emitted
 *   for the rejected bytes and decoding resumes at the first byte that was
 *   not accepted as part of the sequence.
 *
 * @return {string} the decoded text.
 * @throws {Error} if `parsingErrorsAreFatal` is true and invalid UTF-8 is
 *     encountered.
 */
jspb.binary.utf8.polyfillDecodeUtf8 = function (
    /** !Uint8Array */ bytes, /** number */ offset, /** number */ length,
    /** boolean */ parsingErrorsAreFatal) {
  let cursor = offset;
  const end = cursor + length;
  // Pending UTF-16 code units that have not yet been flushed into `result`.
  const codeUnits = [];
  // Text flushed so far; stays null until the first flush.
  let result = null;

  // This is significantly slower than the TextDecoder implementation.
  // Ideas for improving performance:
  // 1. Reduce branching with non-short-circuiting operators, e.g.
  // https://stackoverflow.com/q/5652363
  // 2. improve isNotTrailingByte using xor?
  // 3. consider having a dedicated ascii loop (java impls do this)
  let c1, c2, c3, c4;
  while (cursor < end) {
    c1 = bytes[cursor++];
    if (c1 < 0x80) {  // Regular 7-bit ASCII.
      codeUnits.push(c1);
    } else if (c1 < 0xE0) {  // UTF-8 with two bytes.
      // Note that stray continuation bytes (0x80-0xBF) also land in this
      // branch; they are rejected by the `c1 < 0xC2` check below or by the
      // underflow check when they are the last byte.
      if (cursor >= end) {
        invalid(parsingErrorsAreFatal, codeUnits);
      } else {
        c2 = bytes[cursor++];
        // Make sure that c1 is a valid leading byte and c2 is a valid
        // trailing byte
        // 0xC2 is '11000010', if c1 is less than this then we have an overlong
        // encoding because there would only be 7 significant bits.
        if (c1 < 0xC2 || isNotTrailingByte(c2)) {
          cursor--;  // push c2 back since it isn't 'accepted'
          invalid(parsingErrorsAreFatal, codeUnits);
        } else {
          // The codeUnit is the lower 6 bits from c2 and the lower 5 bits from
          // c1
          const codeUnit = ((c1 & 0x1F) << 6) | (c2 & 0x3F);
          // Consistency check that the computed code is in range for a 2 byte
          // sequence.
          jspb.asserts.assert(codeUnit >= 0x80 && codeUnit <= 0x07FF);
          codeUnits.push(codeUnit);
        }
      }
    } else if (c1 < 0xF0) {  // UTF-8 with three bytes.
      // Underflow: fewer than two bytes remain after the leading byte.
      if (cursor >= end - 1) {
        invalid(parsingErrorsAreFatal, codeUnits);
      } else {
        c2 = bytes[cursor++];
        if (isNotTrailingByte(c2) ||
          // These checks were taken from
          // java/com/google/protobuf/Utf8.java
          // overlong? 5 most significant bits must not all be zero
          (c1 === 0xE0 && c2 < 0xA0)
          // check for illegal surrogate codepoints
          || (c1 === 0xED && c2 >= 0xA0) ||
          // We delay reading c3 until now so that an error in c2 or c1 will
          // preserve c3 for the next loop iteration
          isNotTrailingByte(c3 = bytes[cursor++])) {
          cursor--;  // push back c2 or c3, depending on how far we made it
          invalid(parsingErrorsAreFatal, codeUnits);
        } else {
          // 4 bits from the first byte
          // 6 bits from each of the two lower bytes
          // == 16 bits total
          const codeUnit =
            ((c1 & 0xF) << 12) | ((c2 & 0x3F) << 6) | (c3 & 0x3F);
          // Consistency check, this is the valid range for a 3 byte character
          jspb.asserts.assert(codeUnit >= 0x800 && codeUnit <= 0xFFFF);
          // And that Utf16 surrogates are disallowed
          jspb.asserts.assert(codeUnit < MIN_SURROGATE || codeUnit > MAX_SURROGATE);
          codeUnits.push(codeUnit);
        }
      }
    } else if (c1 <= 0xF4) {  // UTF-8 with 4 bytes.
      // Bytes 0xF0-0xF7 match the bit pattern for utf8 with 4 bytes, but
      // leading bytes > 0xF4 can only encode values beyond the valid codepoint
      // range, so they are handled by the final `else` below.
      // Underflow: fewer than three bytes remain after the leading byte.
      if (cursor >= end - 2) {
        invalid(parsingErrorsAreFatal, codeUnits);
      } else {
        c2 = bytes[cursor++];
        if (isNotTrailingByte(c2) ||
          // This check was inspired by
          // java/com/google/protobuf/Utf8.java
          // Tricky optimized form of:
          //   valid 4-byte leading byte?
          // if (byte1 > (byte) 0xF4 ||
          //   overlong? 4 most significant bits must not all be zero
          //     byte1 == (byte) 0xF0 && byte2 < (byte) 0x90 ||
          //   codepoint larger than the highest code point (U+10FFFF)?
          //     byte1 == (byte) 0xF4 && byte2 > (byte) 0x8F)
          (((c1 << 28) + (c2 - 0x90)) >> 30) !== 0 ||
          // We delay reading c3 and c4 until now so that an error in c2 or c1
          // will preserve them for the next loop iteration.
          isNotTrailingByte(c3 = bytes[cursor++]) ||
          isNotTrailingByte(c4 = bytes[cursor++])) {
          cursor--;  // push back c2, c3 or c4 depending on how far we made it
          invalid(parsingErrorsAreFatal, codeUnits);
        } else {
          // Characters written on 4 bytes have 21 bits for a codepoint.
          // We can't fit that on 16bit characters, so we use surrogates.
          // 3 bits from the uppermost byte, 6 bits from each of the lower 3
          // bytes. This is 21 bits which is too big for a 16 bit utf16 code
          // unit so we use surrogates.
          let codepoint = ((c1 & 0x7) << 18) | ((c2 & 0x3F) << 12) |
            ((c3 & 0x3F) << 6) | (c4 & 0x3F);
          // Consistency check, this is the valid range for a 4 byte character.
          jspb.asserts.assert(codepoint >= 0x10000 && codepoint <= 0x10FFFF);
          // Surrogates formula from wikipedia.
          // 1. Subtract 0x10000 from codepoint
          codepoint -= 0x10000;
          // 2. Split this into the high 10-bit value and the low 10-bit value
          // 3. Add 0xD800 to the high value to form the high surrogate
          // 4. Add 0xDC00 to the low value to form the low surrogate:
          const low = (codepoint & 0x3FF) + MIN_LOW_SURROGATE;
          const high = ((codepoint >> 10) & 0x3FF) + MIN_HIGH_SURROGATE;
          codeUnits.push(high, low);
        }
      }
    } else {
      // initial byte is too large for utf8
      invalid(parsingErrorsAreFatal, codeUnits);
    }
    // Accumulate as we go to avoid exceeding the maximum stack size when
    // calling `apply`.
    if (codeUnits.length >= 8192) {
      result = codeUnitsToString(result, codeUnits);
      codeUnits.length = 0;
    }
  }
  // ensure we don't overflow or underflow
  jspb.asserts.assert(cursor === end, `expected ${cursor} === ${end}`);
  // Flush whatever is still pending and return the complete text.
  return codeUnitsToString(result, codeUnits);
}


/**
 * Cached answer for `isFatalTextDecoderCachableAfterThrowing`; `undefined`
 * means the probe has not been run yet.
 * @type {boolean|undefined}
 */
let isFatalTextDecoderCachableAfterThrowing_ =
  // chrome version >= 2020 are not subject to https://crbug.com/910292
  goog.FEATURESET_YEAR >= 2020 ? true : undefined;

/**
 * Determines whether a fatal `TextDecoder` can keep being reused after one of
 * its decodes has thrown.  Chrome versions affected by
 * https://crbug.com/910292 make every later decode throw once a single decode
 * has failed.  The probe runs at most once; its outcome is cached.
 * @return {boolean}
 */
function isFatalTextDecoderCachableAfterThrowing(/** !TextDecoder */ decoder) {
  if (isFatalTextDecoderCachableAfterThrowing_ !== undefined) {
    return isFatalTextDecoderCachableAfterThrowing_;
  }

  // In theory no extra failure should be needed since this function is only
  // called after a failed decode.  However the buggy chrome versions do not
  // corrupt their internal state consistently; it depends on where in the
  // stream the error occurs.  Based on manual testing, this particular input
  // does trigger the bug consistently.
  try {
    // A lonely continuation byte
    decoder.decode(new Uint8Array([0x80]));
  } catch (e) {
    // expected
  }

  let stillUsable;
  try {
    // 'a' in hex
    decoder.decode(new Uint8Array([0x61]));
    stillUsable = true;
  } catch (e) {
    // Decoding plain ASCII must not throw.  If it does, this chrome version is
    // buggy and the cached decoder has to be discarded whenever a failure
    // occurs.
    stillUsable = false;
  }
  isFatalTextDecoderCachableAfterThrowing_ = stillUsable;
  return stillUsable;
}

/**
 * Lazily created, shared decoder that throws on malformed input.
 * @type {!TextDecoder|undefined}
 */
let fatalDecoderInstance;

/**
 * Returns the shared fatal utf-8 `TextDecoder`, creating it on first use.
 * @return {!TextDecoder}
 */
function getFatalDecoderInstance() {
  const cached = fatalDecoderInstance;
  if (cached) {
    return cached;
  }
  const created = new TextDecoder('utf-8', { fatal: true });
  fatalDecoderInstance = created;
  return created;
}

/**
 * Lazily created, shared decoder that does not throw on malformed input.
 * @type {!TextDecoder|undefined}
 */
let nonFatalDecoderInstance;

/**
 * Returns the shared non-fatal utf-8 `TextDecoder`, creating it on first use.
 * @return {!TextDecoder}
 */
function getNonFatalDecoderInstance() {
  const cached = nonFatalDecoderInstance;
  if (cached) {
    return cached;
  }
  const created = new TextDecoder('utf-8', { fatal: false });
  nonFatalDecoderInstance = created;
  return created;
}

/**
 * Returns a view over `bytes` from `offset` (inclusive) to `end` (exclusive),
 * skipping the `subarray` call when the requested range is the whole array.
 *
 * `subarray` tends to be surprisingly slow.
 * @return {!Uint8Array}
 */
function subarray(
    /** !Uint8Array*/ bytes, /** number */ offset, /** number */ end) {
  const coversWholeArray = offset === 0 && end === bytes.length;
  if (coversWholeArray) {
    return bytes;
  }
  return bytes.subarray(offset, end);
}

/**
 * Decodes `length` bytes of `bytes` starting at `offset` with a cached
 * `TextDecoder`.  Any error thrown by the decoder is rethrown to the caller;
 * if the fatal decoder turns out not to be reusable after throwing, the cached
 * fatal decoder is dropped first so that a fresh one is created next time.
 * @return {string}
 */
jspb.binary.utf8.textDecoderDecodeUtf8 = function (
    /** !Uint8Array*/ bytes, /** number */ offset, /** number */ length,
    /** boolean*/ parsingErrorsAreFatal) {
  const /** !TextDecoder */ decoder =
    parsingErrorsAreFatal ? getFatalDecoderInstance() :
                            getNonFatalDecoderInstance();
  const view = subarray(bytes, offset, offset + length);

  try {
    return decoder.decode(view);
  } catch (err) {
    const mustDiscardCachedDecoder = parsingErrorsAreFatal &&
      !isFatalTextDecoderCachableAfterThrowing(decoder);
    if (mustDiscardCachedDecoder) {
      fatalDecoderInstance = undefined;
    }
    throw err;
  }
};

/**
 * True when decoding should go through `TextDecoder`: either forced on by the
 * build flag or detected at load time.
 * @const {boolean}
 */
const useTextDecoderDecode =
  USE_TEXT_ENCODING || typeof TextDecoder !== 'undefined';

/**
 * Decodes utf8 using the TextDecoder based routine when it is available and
 * the polyfill implementation otherwise.
 * @return {string}
 */
jspb.binary.utf8.decodeUtf8 = function (
    /** !Uint8Array*/ bytes, /** number */ offset, /** number */ length,
    /** boolean*/ parsingErrorsAreFatal) {
  if (useTextDecoderDecode) {
    return jspb.binary.utf8.textDecoderDecodeUtf8(
      bytes, offset, length, parsingErrorsAreFatal);
  }
  return jspb.binary.utf8.polyfillDecodeUtf8(
    bytes, offset, length, parsingErrorsAreFatal);
};

/**
 * Lazily created, shared encoder.
 * @type {!TextEncoder|undefined}
 */
let textEncoderInstance;

/**
 * Encodes `s` as utf8 with a cached `TextEncoder`.  When
 * `rejectUnpairedSurrogates` is set the string is first validated with
 * `checkWellFormed`, which throws on unpaired surrogates.
 * @return {!Uint8Array}
 */
jspb.binary.utf8.textEncoderEncode = function (
    /** string */ s, /** boolean */ rejectUnpairedSurrogates) {
  if (rejectUnpairedSurrogates) {
    jspb.binary.utf8.checkWellFormed(s);
  }

  let encoder = textEncoderInstance;
  if (!encoder) {
    encoder = new TextEncoder();
    textEncoderInstance = encoder;
  }
  return encoder.encode(s);
};

// `String.prototype.isWellFormed` landed in major browsers in early 2023, so
// it can only be assumed to exist from 2024 onwards; otherwise it is detected
// at load time.  See
// http://go/mdn/JavaScript/Reference/Global_Objects/String/isWellFormed
const /** boolean */ HAS_WELL_FORMED_METHOD = goog.FEATURESET_YEAR > 2023 ||
  typeof String.prototype.isWellFormed === 'function';

/**
 * Throws if `text` contains an unpaired UTF-16 surrogate.  Uses the native
 * `isWellFormed` method when available and a regular expression otherwise.
 */
jspb.binary.utf8.checkWellFormed = function (/** string */ text) {
  let hasUnpairedSurrogate;
  if (HAS_WELL_FORMED_METHOD) {
    // Externs don't contain the definition of this function yet.
    // http://go/mdn/JavaScript/Reference/Global_Objects/String/isWellFormed
    const wellFormed = /** @type{{isWellFormed:function():boolean}}*/ (
        /** @type {?} */ (text))
      .isWellFormed();
    hasUnpairedSurrogate = !wellFormed;
  } else {
    // Matches a low surrogate that is not preceded by a high surrogate, or a
    // high surrogate that is not followed by a low surrogate.
    hasUnpairedSurrogate =
      /(?:[^\uD800-\uDBFF]|^)[\uDC00-\uDFFF]|[\uD800-\uDBFF](?![\uDC00-\uDFFF])/
        .test(text);
  }
  if (hasUnpairedSurrogate) {
    throw new Error('Found an unpaired surrogate');
  }
};


/**
 * Handwritten utf8 encoder.
 *
 * Walks the UTF-16 code units of `s` and writes the corresponding utf8 bytes.
 * A valid surrogate pair is combined into one 4 byte sequence.  An unpaired
 * surrogate either throws (when `rejectUnpairedSurrogates` is true) or is
 * encoded as the replacement character U+FFFD.
 *
 * @return {!Uint8Array} a view over the encoded bytes.  The view may be
 *     backed by a larger buffer than the number of bytes produced.
 * @throws {Error} if `rejectUnpairedSurrogates` is true and `s` contains an
 *     unpaired surrogate.
 */
jspb.binary.utf8.polyfillEncode = function (
    /** string */ s, /** boolean */ rejectUnpairedSurrogates) {
  let bi = 0;
  // The worst case is that every code unit requires 3 output bytes (a
  // surrogate pair is 2 code units and produces 4 bytes), so we allocate for
  // this.  This assumes that the buffer will be short lived.
  // Callers can always `slice` if needed
  const buffer = new Uint8Array(3 * s.length);
  for (let ci = 0; ci < s.length; ci++) {
    let c = s.charCodeAt(ci);
    if (c < 0x80) {
      buffer[bi++] = c;
    } else if (c < 0x800) {
      buffer[bi++] = (c >> 6) | 0xC0;
      buffer[bi++] = (c & 63) | 0x80;
    } else {
      jspb.asserts.assert(c < 65536);
      // Look for surrogates
      // First check if it is surrogate range
      if (c >= MIN_SURROGATE && c <= MAX_SURROGATE) {
        // A high surrogate can only be paired when another code unit follows
        // it, so peek at the next code unit without consuming it yet.
        if (c <= MAX_HIGH_SURROGATE && ci + 1 < s.length) {
          const c2 = s.charCodeAt(ci + 1);
          if (c2 >= MIN_LOW_SURROGATE && c2 <= MAX_LOW_SURROGATE) {
            ci++;  // the low surrogate is consumed as part of this pair
            // http://mathiasbynens.be/notes/javascript-encoding#surrogate-formulae
            const codePoint =
              (c - MIN_SURROGATE) * 0x400 + c2 - MIN_LOW_SURROGATE + 0x10000;
            buffer[bi++] = (codePoint >> 18) | 0xF0;
            buffer[bi++] = ((codePoint >> 12) & 63) | 0x80;
            buffer[bi++] = ((codePoint >> 6) & 63) | 0x80;
            buffer[bi++] = (codePoint & 63) | 0x80;
            continue;
          }
          // else c2 is not a low surrogate: treat c as a lone surrogate and
          // let the next loop iteration process c2 as an independent
          // character.
        }  // else c is a low surrogate or the last code unit of the string
        if (rejectUnpairedSurrogates) {
          throw new Error('Found an unpaired surrogate');
        }
        c = 0xFFFD;  // Error! Unpaired surrogate
      }
      buffer[bi++] = (c >> 12) | 0xE0;
      buffer[bi++] = ((c >> 6) & 63) | 0x80;
      buffer[bi++] = (c & 63) | 0x80;
    }
  }
  return subarray(buffer, 0, bi);
};

/**
 * True when encoding should go through `TextEncoder`: either forced on by the
 * build flag or detected at load time.
 * @const {boolean}
 */
const useTextEncoderEncode =
  (USE_TEXT_ENCODING || typeof TextEncoder !== 'undefined');

/**
 * Encodes a string as utf8 using the TextEncoder based routine when it is
 * available and the polyfill implementation otherwise.
 * @return {!Uint8Array}
 */
jspb.binary.utf8.encodeUtf8 = function (
    /**string*/ string, /** boolean=*/ rejectUnpairedSurrogates = false) {
  jspb.asserts.assertString(string);
  if (useTextEncoderEncode) {
    return jspb.binary.utf8.textEncoderEncode(string, rejectUnpairedSurrogates);
  }
  return jspb.binary.utf8.polyfillEncode(string, rejectUnpairedSurrogates);
};