From 9beefaa1d5e711b63293d52a6c77e8aaf5e8f4c1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Donny/=EA=B0=95=EB=8F=99=EC=9C=A4?= Date: Sat, 11 Mar 2023 21:13:00 +0900 Subject: [PATCH] perf(es/lexer): Use jump table for `read_token` (#7058) --- crates/swc_ecma_parser/src/lexer/mod.rs | 259 ++++++++-------------- crates/swc_ecma_parser/src/lexer/table.rs | 175 +++++++++++++++ 2 files changed, 265 insertions(+), 169 deletions(-) create mode 100644 crates/swc_ecma_parser/src/lexer/table.rs diff --git a/crates/swc_ecma_parser/src/lexer/mod.rs b/crates/swc_ecma_parser/src/lexer/mod.rs index 11efbc5d3132..05b855f18a2b 100644 --- a/crates/swc_ecma_parser/src/lexer/mod.rs +++ b/crates/swc_ecma_parser/src/lexer/mod.rs @@ -9,7 +9,12 @@ use swc_atoms::{Atom, AtomGenerator}; use swc_common::{comments::Comments, input::StringInput, BytePos, Span}; use swc_ecma_ast::{op, EsVersion}; -use self::{comments_buffer::CommentsBuffer, state::State, util::*}; +use self::{ + comments_buffer::CommentsBuffer, + state::State, + table::{ByteHandler, BYTE_HANDLERS}, + util::*, +}; pub use self::{ input::Input, state::{TokenContext, TokenContexts}, @@ -25,6 +30,7 @@ pub mod input; mod jsx; mod number; mod state; +mod table; #[cfg(test)] mod tests; pub mod util; @@ -161,178 +167,24 @@ impl<'a> Lexer<'a> { /// babel: `getTokenFromCode` fn read_token(&mut self) -> LexResult> { - let c = self.input.cur_as_ascii(); - - match c { - None => {} - Some(c) => { - match c { - b'#' => return self.read_token_number_sign(), - - // - b'.' => return self.read_token_dot().map(Some), - - b'(' | b')' | b';' | b',' | b'[' | b']' | b'{' | b'}' | b'@' | b'`' | b'~' => { - // These tokens are emitted directly. - self.input.bump(); - return Ok(Some(match c { - b'(' => LParen, - b')' => RParen, - b';' => Semi, - b',' => Comma, - b'[' => LBracket, - b']' => RBracket, - b'{' => LBrace, - b'}' => RBrace, - b'@' => At, - b'`' => tok!('`'), - b'~' => tok!('~'), - - _ => unreachable!(), - })); - } - - b'?' 
=> return self.read_token_question_mark().map(Some), - - b':' => return self.read_token_colon().map(Some), - - b'0' => return self.read_token_zero().map(Some), - - b'1'..=b'9' => { - return self - .read_number(false) - .map(|v| match v { - Left((value, raw)) => Num { value, raw }, - Right((value, raw)) => BigInt { value, raw }, - }) - .map(Some); - } - - b'"' | b'\'' => return self.read_str_lit().map(Some), - - b'/' => return self.read_slash(), - - b'%' | b'*' => return self.read_token_mul_mod(c).map(Some), - - // Logical operators - b'|' | b'&' => return self.read_token_logical(c).map(Some), - b'^' => { - // Bitwise xor - self.input.bump(); - return Ok(Some(if self.input.cur() == Some('=') { - self.input.bump(); - AssignOp(BitXorAssign) - } else { - BinOp(BitXor) - })); - } - - b'+' | b'-' => { - let start = self.cur_pos(); - - self.input.bump(); - - // '++', '--' - return Ok(Some(if self.input.cur() == Some(c as char) { - self.input.bump(); - - // Handle --> - if self.state.had_line_break && c == b'-' && self.eat(b'>') { - self.emit_module_mode_error( - start, - SyntaxError::LegacyCommentInModule, - ); - self.skip_line_comment(0); - self.skip_space::()?; - return self.read_token(); - } - - if c == b'+' { - PlusPlus - } else { - MinusMinus - } - } else if self.input.eat_byte(b'=') { - AssignOp(if c == b'+' { AddAssign } else { SubAssign }) - } else { - BinOp(if c == b'+' { Add } else { Sub }) - })); - } - - b'<' | b'>' => return self.read_token_lt_gt(), - - b'!' | b'=' => { - let start = self.cur_pos(); - let had_line_break_before_last = self.had_line_break_before_last(); - - self.input.bump(); - - return Ok(Some(if self.input.eat_byte(b'=') { - // "==" - - if self.input.eat_byte(b'=') { - if c == b'!' 
{ - BinOp(NotEqEq) - } else { - // ======= - // ^ - if had_line_break_before_last && self.is_str("====") { - self.emit_error_span( - fixed_len_span(start, 7), - SyntaxError::TS1185, - ); - self.skip_line_comment(4); - self.skip_space::()?; - return self.read_token(); - } - - BinOp(EqEqEq) - } - } else if c == b'!' { - BinOp(NotEq) - } else { - BinOp(EqEq) - } - } else if c == b'=' && self.input.eat_byte(b'>') { - // "=>" - - Arrow - } else if c == b'!' { - Bang - } else { - AssignOp(Assign) - })); - } + let byte = match self.input.as_str().as_bytes().first() { + Some(&v) => v, + None => return Ok(None), + }; - b'a'..=b'z' | b'A'..=b'Z' | b'$' | b'_' | b'\\' => { - // Fast path for ascii identifiers. - return self.read_ident_or_keyword().map(Some); - } - _ => {} - } - } - } + let handler = unsafe { *(&BYTE_HANDLERS as *const ByteHandler).offset(byte as isize) }; - let c = match self.input.cur() { - Some(c) => c, + match handler { + Some(handler) => handler(self), None => { - return Ok(None); - } - }; - - let token = { - // Identifier or keyword. '\uXXXX' sequences are allowed in - // identifiers, so '\' also dispatches to that. - if c == '\\' || c.is_ident_start() { - return self.read_ident_or_keyword().map(Some); + let start = self.cur_pos(); + self.input.bump_bytes(1); + self.error_span( + pos_span(start), + SyntaxError::UnexpectedChar { c: byte as _ }, + ) } - - let start = self.cur_pos(); - self.input.bump(); - self.error_span(pos_span(start), SyntaxError::UnexpectedChar { c })? 
- }; - - Ok(Some(token)) + } } /// `#` @@ -695,6 +547,75 @@ impl<'a> Lexer<'a> { Ok(Some(vec![c.into()])) } + + fn read_token_plus_minus(&mut self, c: u8) -> LexResult> { + let start = self.cur_pos(); + + self.input.bump(); + + // '++', '--' + Ok(Some(if self.input.cur() == Some(c as char) { + self.input.bump(); + + // Handle --> + if self.state.had_line_break && c == b'-' && self.eat(b'>') { + self.emit_module_mode_error(start, SyntaxError::LegacyCommentInModule); + self.skip_line_comment(0); + self.skip_space::()?; + return self.read_token(); + } + + if c == b'+' { + PlusPlus + } else { + MinusMinus + } + } else if self.input.eat_byte(b'=') { + AssignOp(if c == b'+' { AddAssign } else { SubAssign }) + } else { + BinOp(if c == b'+' { Add } else { Sub }) + })) + } + + fn read_token_bang_or_eq(&mut self, c: u8) -> LexResult> { + let start = self.cur_pos(); + let had_line_break_before_last = self.had_line_break_before_last(); + + self.input.bump(); + + Ok(Some(if self.input.eat_byte(b'=') { + // "==" + + if self.input.eat_byte(b'=') { + if c == b'!' { + BinOp(NotEqEq) + } else { + // ======= + // ^ + if had_line_break_before_last && self.is_str("====") { + self.emit_error_span(fixed_len_span(start, 7), SyntaxError::TS1185); + self.skip_line_comment(4); + self.skip_space::()?; + return self.read_token(); + } + + BinOp(EqEqEq) + } + } else if c == b'!' { + BinOp(NotEq) + } else { + BinOp(EqEq) + } + } else if c == b'=' && self.input.eat_byte(b'>') { + // "=>" + + Arrow + } else if c == b'!' { + Bang + } else { + AssignOp(Assign) + })) + } } impl<'a> Lexer<'a> { diff --git a/crates/swc_ecma_parser/src/lexer/table.rs b/crates/swc_ecma_parser/src/lexer/table.rs new file mode 100644 index 000000000000..2e7072e6d523 --- /dev/null +++ b/crates/swc_ecma_parser/src/lexer/table.rs @@ -0,0 +1,175 @@ +//! Lookup table for byte handlers. +//! +//! Idea is taken from ratel. +//! +//! 
https://github.com/ratel-rust/ratel-core/blob/e55a1310ba69a3f5ce2a9a6eef643feced02ac08/ratel/src/lexer/mod.rs#L665
+
+use either::Either;
+use swc_common::input::Input;
+
+use super::{pos_span, util::CharExt, LexResult, Lexer};
+use crate::{
+    error::SyntaxError,
+    token::{AssignOpToken, BinOpToken, Token},
+};
+
+pub(super) type ByteHandler = Option<for<'aa> fn(&mut Lexer<'aa>) -> LexResult<Option<Token>>>;
+
+/// Lookup table mapping any incoming byte to a handler function defined below.
+pub(super) static BYTE_HANDLERS: [ByteHandler; 256] = [
+    //   0    1    2    3    4    5    6    7    8    9    A    B    C    D    E    F   //
+    EOF, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, // 0
+    ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, // 1
+    ___, EXL, QOT, HSH, IDT, PRC, AMP, QOT, PNO, PNC, ATR, PLS, COM, MIN, PRD, SLH, // 2
+    ZER, DIG, DIG, DIG, DIG, DIG, DIG, DIG, DIG, DIG, COL, SEM, LSS, EQL, MOR, QST, // 3
+    AT_, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, // 4
+    IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, BTO, IDT, BTC, CRT, IDT, // 5
+    TPL, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, // 6
+    IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, BEO, PIP, BEC, TLD, ERR, // 7
+    UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, // 8
+    UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, // 9
+    UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, // A
+    UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, // B
+    UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, // C
+    UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, // D
+    UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, // E
+    UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, // F
+];
+
+const ___: ByteHandler = None;
+
+const EOF: ByteHandler = Some(|lexer| {
+    lexer.input.bump_bytes(1);
+
+    Ok(None)
+});
+
+const ERR: ByteHandler = Some(|lexer| {
+    let c = unsafe {
+        // Safety: Byte handler is only called for non-last characters
+        lexer.input.cur().unwrap_unchecked()
+    };
+
+    let start = lexer.cur_pos();
+    lexer.input.bump();
+    lexer.error_span(pos_span(start), SyntaxError::UnexpectedChar { c })?
+});
+
+/// Identifier
+const IDT: ByteHandler = Some(|lexer| lexer.read_ident_or_keyword().map(Some));
+
+/// `0`
+const ZER: ByteHandler = Some(|lexer| lexer.read_token_zero().map(Some));
+
+/// Numbers
+const DIG: ByteHandler = Some(|lexer| {
+    lexer
+        .read_number(false)
+        .map(|v| match v {
+            Either::Left((value, raw)) => Token::Num { value, raw },
+            Either::Right((value, raw)) => Token::BigInt { value, raw },
+        })
+        .map(Some)
+});
+
+/// String literals with `'` or `"`
+const QOT: ByteHandler = Some(|lexer| lexer.read_str_lit().map(Some));
+
+/// Unicode
+const UNI: ByteHandler = Some(|lexer| {
+    let c = unsafe {
+        // Safety: Byte handler is only called for non-last characters
+        lexer.input.cur().unwrap_unchecked()
+    };
+
+    // Identifier or keyword. '\uXXXX' sequences are allowed in
+    // identifiers, so '\' also dispatches to that.
+    if c == '\\' || c.is_ident_start() {
+        return lexer.read_ident_or_keyword().map(Some);
+    }
+
+    let start = lexer.cur_pos();
+    lexer.input.bump();
+    lexer.error_span(pos_span(start), SyntaxError::UnexpectedChar { c })?
+});
+
+/// `:`
+const COL: ByteHandler = Some(|lexer| lexer.read_token_colon().map(Some));
+
+/// `%`
+const PRC: ByteHandler = Some(|lexer| lexer.read_token_mul_mod(b'%').map(Some));
+
+/// `*`
+const ATR: ByteHandler = Some(|lexer| lexer.read_token_mul_mod(b'*').map(Some));
+
+/// `?`
+const QST: ByteHandler = Some(|lexer| lexer.read_token_question_mark().map(Some));
+
+/// `&`
+const AMP: ByteHandler = Some(|lexer| lexer.read_token_logical(b'&').map(Some));
+
+/// `|`
+const PIP: ByteHandler = Some(|lexer| lexer.read_token_logical(b'|').map(Some));
+
+macro_rules! single_char {
+    ($name:ident, $c:literal, $token:ident) => {
+        const $name: ByteHandler = Some(|lexer| {
+            lexer.input.bump_bytes(1);
+            Ok(Some(Token::$token))
+        });
+    };
+}
+
+single_char!(SEM, b';', Semi);
+single_char!(COM, b',', Comma);
+single_char!(TPL, b'`', BackQuote);
+single_char!(TLD, b'~', Tilde);
+single_char!(AT_, b'@', At);
+
+single_char!(PNO, b'(', LParen);
+single_char!(PNC, b')', RParen);
+
+single_char!(BTO, b'[', LBracket);
+single_char!(BTC, b']', RBracket);
+
+single_char!(BEO, b'{', LBrace);
+single_char!(BEC, b'}', RBrace);
+
+/// `^`
+const CRT: ByteHandler = Some(|lexer| {
+    // Bitwise xor
+    lexer.input.bump_bytes(1);
+    Ok(Some(if lexer.input.cur_as_ascii() == Some(b'=') {
+        lexer.input.bump_bytes(1);
+        Token::AssignOp(AssignOpToken::BitXorAssign)
+    } else {
+        Token::BinOp(BinOpToken::BitXor)
+    }))
+});
+
+/// `+`
+const PLS: ByteHandler = Some(|lexer| lexer.read_token_plus_minus(b'+'));
+
+/// `-`
+const MIN: ByteHandler = Some(|lexer| lexer.read_token_plus_minus(b'-'));
+
+/// `!`
+const EXL: ByteHandler = Some(|lexer| lexer.read_token_bang_or_eq(b'!'));
+
+/// `=`
+const EQL: ByteHandler = Some(|lexer| lexer.read_token_bang_or_eq(b'='));
+
+/// `.`
+const PRD: ByteHandler = Some(|lexer| lexer.read_token_dot().map(Some));
+
+/// `<`
+const LSS: ByteHandler = Some(|lexer| lexer.read_token_lt_gt());
+
+/// `>`
+const MOR: ByteHandler = Some(|lexer| lexer.read_token_lt_gt());
+
+/// `/`
+const SLH: ByteHandler = Some(|lexer| lexer.read_slash());
+
+/// `#`
+const HSH: ByteHandler = Some(|lexer| lexer.read_token_number_sign());