Skip to content

Commit 6209fc0

Browse files
committed
Chore: fix bugs with surrogate pairs in character references
1 parent be63050 commit 6209fc0

File tree

4 files changed

+559
-1
lines changed

4 files changed

+559
-1
lines changed

src/html/tokenizer.ts

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1705,10 +1705,19 @@ export class Tokenizer {
17051705
* @returns The next state.
17061706
*/
17071707
protected CHARACTER_REFERENCE_END(_cp: number): TokenizerState {
1708+
assert(this.currentToken != null)
1709+
1710+
// this.buffer.length is not the length actually appended, because the appended value may contain surrogate pairs.
1711+
// Calculate new length.
1712+
const token = this.currentToken as Token
1713+
const len0 = token.value.length
17081714
for (const cp1 of this.buffer) {
17091715
this.appendTokenValue(cp1, null)
17101716
}
1711-
for (let i = this.crStartOffset + this.buffer.length; i < this.offset; ++i) {
1717+
const newLength = token.value.length - len0
1718+
1719+
// Record a gap for every offset between the end of the appended text and the current offset.
1720+
for (let i = this.crStartOffset + newLength; i < this.offset; ++i) {
17121721
this.gaps.push(i)
17131722
}
17141723

0 commit comments

Comments
 (0)