From 9beefaa1d5e711b63293d52a6c77e8aaf5e8f4c1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Donny/=EA=B0=95=EB=8F=99=EC=9C=A4?= Date: Sat, 11 Mar 2023 21:13:00 +0900 Subject: [PATCH] perf(es/lexer): Use jump table for `read_token` (#7058) --- crates/swc_ecma_parser/src/lexer/mod.rs | 259 ++++++++-------------- crates/swc_ecma_parser/src/lexer/table.rs | 175 +++++++++++++++ 2 files changed, 265 insertions(+), 169 deletions(-) create mode 100644 crates/swc_ecma_parser/src/lexer/table.rs diff --git a/crates/swc_ecma_parser/src/lexer/mod.rs b/crates/swc_ecma_parser/src/lexer/mod.rs index 11efbc5d3132..05b855f18a2b 100644 --- a/crates/swc_ecma_parser/src/lexer/mod.rs +++ b/crates/swc_ecma_parser/src/lexer/mod.rs @@ -9,7 +9,12 @@ use swc_atoms::{Atom, AtomGenerator}; use swc_common::{comments::Comments, input::StringInput, BytePos, Span}; use swc_ecma_ast::{op, EsVersion}; -use self::{comments_buffer::CommentsBuffer, state::State, util::*}; +use self::{ + comments_buffer::CommentsBuffer, + state::State, + table::{ByteHandler, BYTE_HANDLERS}, + util::*, +}; pub use self::{ input::Input, state::{TokenContext, TokenContexts}, @@ -25,6 +30,7 @@ pub mod input; mod jsx; mod number; mod state; +mod table; #[cfg(test)] mod tests; pub mod util; @@ -161,178 +167,24 @@ impl<'a> Lexer<'a> { /// babel: `getTokenFromCode` fn read_token(&mut self) -> LexResult> { - let c = self.input.cur_as_ascii(); - - match c { - None => {} - Some(c) => { - match c { - b'#' => return self.read_token_number_sign(), - - // - b'.' => return self.read_token_dot().map(Some), - - b'(' | b')' | b';' | b',' | b'[' | b']' | b'{' | b'}' | b'@' | b'`' | b'~' => { - // These tokens are emitted directly. - self.input.bump(); - return Ok(Some(match c { - b'(' => LParen, - b')' => RParen, - b';' => Semi, - b',' => Comma, - b'[' => LBracket, - b']' => RBracket, - b'{' => LBrace, - b'}' => RBrace, - b'@' => At, - b'`' => tok!('`'), - b'~' => tok!('~'), - - _ => unreachable!(), - })); - } - - b'?' 
=> return self.read_token_question_mark().map(Some), - - b':' => return self.read_token_colon().map(Some), - - b'0' => return self.read_token_zero().map(Some), - - b'1'..=b'9' => { - return self - .read_number(false) - .map(|v| match v { - Left((value, raw)) => Num { value, raw }, - Right((value, raw)) => BigInt { value, raw }, - }) - .map(Some); - } - - b'"' | b'\'' => return self.read_str_lit().map(Some), - - b'/' => return self.read_slash(), - - b'%' | b'*' => return self.read_token_mul_mod(c).map(Some), - - // Logical operators - b'|' | b'&' => return self.read_token_logical(c).map(Some), - b'^' => { - // Bitwise xor - self.input.bump(); - return Ok(Some(if self.input.cur() == Some('=') { - self.input.bump(); - AssignOp(BitXorAssign) - } else { - BinOp(BitXor) - })); - } - - b'+' | b'-' => { - let start = self.cur_pos(); - - self.input.bump(); - - // '++', '--' - return Ok(Some(if self.input.cur() == Some(c as char) { - self.input.bump(); - - // Handle --> - if self.state.had_line_break && c == b'-' && self.eat(b'>') { - self.emit_module_mode_error( - start, - SyntaxError::LegacyCommentInModule, - ); - self.skip_line_comment(0); - self.skip_space::()?; - return self.read_token(); - } - - if c == b'+' { - PlusPlus - } else { - MinusMinus - } - } else if self.input.eat_byte(b'=') { - AssignOp(if c == b'+' { AddAssign } else { SubAssign }) - } else { - BinOp(if c == b'+' { Add } else { Sub }) - })); - } - - b'<' | b'>' => return self.read_token_lt_gt(), - - b'!' | b'=' => { - let start = self.cur_pos(); - let had_line_break_before_last = self.had_line_break_before_last(); - - self.input.bump(); - - return Ok(Some(if self.input.eat_byte(b'=') { - // "==" - - if self.input.eat_byte(b'=') { - if c == b'!' 
{ - BinOp(NotEqEq) - } else { - // ======= - // ^ - if had_line_break_before_last && self.is_str("====") { - self.emit_error_span( - fixed_len_span(start, 7), - SyntaxError::TS1185, - ); - self.skip_line_comment(4); - self.skip_space::()?; - return self.read_token(); - } - - BinOp(EqEqEq) - } - } else if c == b'!' { - BinOp(NotEq) - } else { - BinOp(EqEq) - } - } else if c == b'=' && self.input.eat_byte(b'>') { - // "=>" - - Arrow - } else if c == b'!' { - Bang - } else { - AssignOp(Assign) - })); - } + let byte = match self.input.as_str().as_bytes().first() { + Some(&v) => v, + None => return Ok(None), + }; - b'a'..=b'z' | b'A'..=b'Z' | b'$' | b'_' | b'\\' => { - // Fast path for ascii identifiers. - return self.read_ident_or_keyword().map(Some); - } - _ => {} - } - } - } + let handler = unsafe { *(&BYTE_HANDLERS as *const ByteHandler).offset(byte as isize) }; - let c = match self.input.cur() { - Some(c) => c, + match handler { + Some(handler) => handler(self), None => { - return Ok(None); - } - }; - - let token = { - // Identifier or keyword. '\uXXXX' sequences are allowed in - // identifiers, so '\' also dispatches to that. - if c == '\\' || c.is_ident_start() { - return self.read_ident_or_keyword().map(Some); + let start = self.cur_pos(); + self.input.bump_bytes(1); + self.error_span( + pos_span(start), + SyntaxError::UnexpectedChar { c: byte as _ }, + ) } - - let start = self.cur_pos(); - self.input.bump(); - self.error_span(pos_span(start), SyntaxError::UnexpectedChar { c })? 
- }; - - Ok(Some(token)) + } } /// `#` @@ -695,6 +547,75 @@ impl<'a> Lexer<'a> { Ok(Some(vec![c.into()])) } + + fn read_token_plus_minus(&mut self, c: u8) -> LexResult> { + let start = self.cur_pos(); + + self.input.bump(); + + // '++', '--' + Ok(Some(if self.input.cur() == Some(c as char) { + self.input.bump(); + + // Handle --> + if self.state.had_line_break && c == b'-' && self.eat(b'>') { + self.emit_module_mode_error(start, SyntaxError::LegacyCommentInModule); + self.skip_line_comment(0); + self.skip_space::()?; + return self.read_token(); + } + + if c == b'+' { + PlusPlus + } else { + MinusMinus + } + } else if self.input.eat_byte(b'=') { + AssignOp(if c == b'+' { AddAssign } else { SubAssign }) + } else { + BinOp(if c == b'+' { Add } else { Sub }) + })) + } + + fn read_token_bang_or_eq(&mut self, c: u8) -> LexResult> { + let start = self.cur_pos(); + let had_line_break_before_last = self.had_line_break_before_last(); + + self.input.bump(); + + Ok(Some(if self.input.eat_byte(b'=') { + // "==" + + if self.input.eat_byte(b'=') { + if c == b'!' { + BinOp(NotEqEq) + } else { + // ======= + // ^ + if had_line_break_before_last && self.is_str("====") { + self.emit_error_span(fixed_len_span(start, 7), SyntaxError::TS1185); + self.skip_line_comment(4); + self.skip_space::()?; + return self.read_token(); + } + + BinOp(EqEqEq) + } + } else if c == b'!' { + BinOp(NotEq) + } else { + BinOp(EqEq) + } + } else if c == b'=' && self.input.eat_byte(b'>') { + // "=>" + + Arrow + } else if c == b'!' { + Bang + } else { + AssignOp(Assign) + })) + } } impl<'a> Lexer<'a> { diff --git a/crates/swc_ecma_parser/src/lexer/table.rs b/crates/swc_ecma_parser/src/lexer/table.rs new file mode 100644 index 000000000000..2e7072e6d523 --- /dev/null +++ b/crates/swc_ecma_parser/src/lexer/table.rs @@ -0,0 +1,175 @@ +//! Lookup table for byte handlers. +//! +//! Idea is taken from ratel. +//! +//! 
https://github.com/ratel-rust/ratel-core/blob/e55a1310ba69a3f5ce2a9a6eef643feced02ac08/ratel/src/lexer/mod.rs#L665
+
+use either::Either;
+use swc_common::input::Input;
+
+use super::{pos_span, util::CharExt, LexResult, Lexer};
+use crate::{
+    error::SyntaxError,
+    token::{AssignOpToken, BinOpToken, Token},
+};
+
+pub(super) type ByteHandler = Option<for<'aa> fn(&mut Lexer<'aa>) -> LexResult<Option<Token>>>;
+
+/// Lookup table mapping any incoming byte to a handler function defined below.
+pub(super) static BYTE_HANDLERS: [ByteHandler; 256] = [
+    //   0    1    2    3    4    5    6    7    8    9    A    B    C    D    E    F   //
+    EOF, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, // 0
+    ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, // 1
+    ___, EXL, QOT, HSH, IDT, PRC, AMP, QOT, PNO, PNC, ATR, PLS, COM, MIN, PRD, SLH, // 2
+    ZER, DIG, DIG, DIG, DIG, DIG, DIG, DIG, DIG, DIG, COL, SEM, LSS, EQL, MOR, QST, // 3
+    AT_, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, // 4
+    IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, BTO, IDT, BTC, CRT, IDT, // 5
+    TPL, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, // 6
+    IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, BEO, PIP, BEC, TLD, ERR, // 7
+    UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, // 8
+    UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, // 9
+    UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, // A
+    UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, // B
+    UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, // C
+    UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, // D
+    UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, // E
+    UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, // F
+];
+
+const ___: ByteHandler = None;
+
+const EOF: ByteHandler = Some(|lexer| {
+    lexer.input.bump_bytes(1);
+
+    Ok(None)
+});
+
+const ERR: ByteHandler = Some(|lexer| {
+    let c = unsafe {
+        // Safety: Byte handler is only called for non-last characters
+        lexer.input.cur().unwrap_unchecked()
+    };
+
+    let start = lexer.cur_pos();
+    lexer.input.bump();
+    lexer.error_span(pos_span(start), SyntaxError::UnexpectedChar { c })?
+});
+
+/// Identifier
+const IDT: ByteHandler = Some(|lexer| lexer.read_ident_or_keyword().map(Some));
+
+/// `0`
+const ZER: ByteHandler = Some(|lexer| lexer.read_token_zero().map(Some));
+
+/// Numbers
+const DIG: ByteHandler = Some(|lexer| {
+    lexer
+        .read_number(false)
+        .map(|v| match v {
+            Either::Left((value, raw)) => Token::Num { value, raw },
+            Either::Right((value, raw)) => Token::BigInt { value, raw },
+        })
+        .map(Some)
+});
+
+/// String literals with `'` or `"`
+const QOT: ByteHandler = Some(|lexer| lexer.read_str_lit().map(Some));
+
+/// Unicode
+const UNI: ByteHandler = Some(|lexer| {
+    let c = unsafe {
+        // Safety: Byte handler is only called for non-last characters
+        lexer.input.cur().unwrap_unchecked()
+    };
+
+    // Identifier or keyword. '\uXXXX' sequences are allowed in
+    // identifiers, so '\' also dispatches to that.
+    if c == '\\' || c.is_ident_start() {
+        return lexer.read_ident_or_keyword().map(Some);
+    }
+
+    let start = lexer.cur_pos();
+    lexer.input.bump();
+    lexer.error_span(pos_span(start), SyntaxError::UnexpectedChar { c })?
+});
+
+/// `:`
+const COL: ByteHandler = Some(|lexer| lexer.read_token_colon().map(Some));
+
+/// `%`
+const PRC: ByteHandler = Some(|lexer| lexer.read_token_mul_mod(b'%').map(Some));
+
+/// `*`
+const ATR: ByteHandler = Some(|lexer| lexer.read_token_mul_mod(b'*').map(Some));
+
+/// `?`
+const QST: ByteHandler = Some(|lexer| lexer.read_token_question_mark().map(Some));
+
+/// `&`
+const AMP: ByteHandler = Some(|lexer| lexer.read_token_logical(b'&').map(Some));
+
+/// `|`
+const PIP: ByteHandler = Some(|lexer| lexer.read_token_logical(b'|').map(Some));
+
+macro_rules! single_char {
+    ($name:ident, $c:literal, $token:ident) => {
+        const $name: ByteHandler = Some(|lexer| {
+            lexer.input.bump_bytes(1);
+            Ok(Some(Token::$token))
+        });
+    };
+}
+
+single_char!(SEM, b';', Semi);
+single_char!(COM, b',', Comma);
+single_char!(TPL, b'`', BackQuote);
+single_char!(TLD, b'~', Tilde);
+single_char!(AT_, b'@', At);
+
+single_char!(PNO, b'(', LParen);
+single_char!(PNC, b')', RParen);
+
+single_char!(BTO, b'[', LBracket);
+single_char!(BTC, b']', RBracket);
+
+single_char!(BEO, b'{', LBrace);
+single_char!(BEC, b'}', RBrace);
+
+/// `^`
+const CRT: ByteHandler = Some(|lexer| {
+    // Bitwise xor
+    lexer.input.bump_bytes(1);
+    Ok(Some(if lexer.input.cur_as_ascii() == Some(b'=') {
+        lexer.input.bump_bytes(1);
+        Token::AssignOp(AssignOpToken::BitXorAssign)
+    } else {
+        Token::BinOp(BinOpToken::BitXor)
+    }))
+});
+
+/// `+`
+const PLS: ByteHandler = Some(|lexer| lexer.read_token_plus_minus(b'+'));
+
+/// `-`
+const MIN: ByteHandler = Some(|lexer| lexer.read_token_plus_minus(b'-'));
+
+/// `!`
+const EXL: ByteHandler = Some(|lexer| lexer.read_token_bang_or_eq(b'!'));
+
+/// `=`
+const EQL: ByteHandler = Some(|lexer| lexer.read_token_bang_or_eq(b'='));
+
+/// `.`
+const PRD: ByteHandler = Some(|lexer| lexer.read_token_dot().map(Some));
+
+/// `<`
+const LSS: ByteHandler = Some(|lexer| lexer.read_token_lt_gt());
+
+/// `>`
+const MOR: ByteHandler = Some(|lexer| lexer.read_token_lt_gt());
+
+/// `/`
+const SLH: ByteHandler = Some(|lexer| lexer.read_slash());
+
+/// `#`
+const HSH: ByteHandler = Some(|lexer| lexer.read_token_number_sign());