Skip to content

Commit

Permalink
Add support for parsing MDX ESM, expressions
Browse files Browse the repository at this point in the history
This commit adds support for hooks that lets a user integrate another
parser with `micromark-rs`, to parse ESM and expressions according to
a particular grammar (such as a programming language, typically
JavaScript).

For an example integrating with SWC, see `tests/test_utils/mod.rs`.

The integration occurs with two functions passed in `options`:
`mdx_expression_parse` and `mdx_esm_parse`.
They can signal back to micromark when they are successful,
whether there is an error at the end (in which case micromark will
try to parse more), or whether there is a syntax error (in which case
micromark will crash).
  • Loading branch information
wooorm committed Sep 19, 2022
1 parent d4cc03c commit fe618ff
Show file tree
Hide file tree
Showing 22 changed files with 1,573 additions and 445 deletions.
5 changes: 4 additions & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,11 @@ unicode-id = { version = "0.3", features = ["no_std"] }

[dev-dependencies]
env_logger = "0.9"
criterion = "0.3"
criterion = "0.4"
pretty_assertions = "1"
swc_common = "0.28"
swc_ecma_parser = "0.119"
swc_ecma_ast = "0.91"

[build-dependencies]
regex = "1"
Expand Down
3 changes: 2 additions & 1 deletion src/compiler.rs
Original file line number Diff line number Diff line change
Expand Up @@ -364,6 +364,7 @@ fn enter(context: &mut CompileContext) {
| Name::HeadingAtxText
| Name::HeadingSetextText
| Name::Label
| Name::MdxEsm
| Name::MdxFlowExpression
| Name::MdxTextExpression
| Name::MdxJsxFlowTag
Expand Down Expand Up @@ -412,7 +413,7 @@ fn exit(context: &mut CompileContext) {
| Name::Resource => {
on_exit_drop(context);
}
Name::MdxFlowExpression | Name::MdxJsxFlowTag => on_exit_drop_slurp(context),
Name::MdxEsm | Name::MdxFlowExpression | Name::MdxJsxFlowTag => on_exit_drop_slurp(context),
Name::CharacterEscapeValue | Name::CodeTextData | Name::Data | Name::MathTextData => {
on_exit_data(context);
}
Expand Down
8 changes: 8 additions & 0 deletions src/construct/flow.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
//! * [Heading (atx)][crate::construct::heading_atx]
//! * [Heading (setext)][crate::construct::heading_setext]
//! * [HTML (flow)][crate::construct::html_flow]
//! * [MDX esm][crate::construct::mdx_esm]
//! * [MDX expression (flow)][crate::construct::mdx_expression_flow]
//! * [MDX JSX (flow)][crate::construct::mdx_jsx_flow]
//! * [Raw (flow)][crate::construct::raw_flow] (code (fenced), math (flow))
Expand Down Expand Up @@ -66,6 +67,13 @@ pub fn start(tokenizer: &mut Tokenizer) -> State {
);
State::Retry(StateName::HtmlFlowStart)
}
Some(b'e' | b'i') => {
tokenizer.attempt(
State::Next(StateName::FlowAfter),
State::Next(StateName::FlowBeforeContent),
);
State::Retry(StateName::MdxEsmStart)
}
Some(b'{') => {
tokenizer.attempt(
State::Next(StateName::FlowAfter),
Expand Down
224 changes: 224 additions & 0 deletions src/construct/mdx_esm.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,224 @@
//! MDX ESM occurs in the [flow][] content type.
//!
//! ## Grammar
//!
//! MDX ESM forms with the following BNF
//! (<small>see [construct][crate::construct] for character groups</small>):
//!
//! ```bnf
//! mdx_esm ::= word *line *(eol *line)
//!
//! word ::= 'e' 'x' 'p' 'o' 'r' 't' | 'i' 'm' 'p' 'o' 'r' 't'
//! ```
//!
//! This construct must be followed by a blank line or eof (end of file).
//! It can include blank lines if [`MdxEsmParse`][crate::MdxEsmParse] passed in
//! `options.mdx_esm_parse` allows it.
//!
//! ## Tokens
//!
//! * [`LineEnding`][Name::LineEnding]
//! * [`MdxEsm`][Name::MdxEsm]
//! * [`MdxEsmData`][Name::MdxEsmData]
//!
//! ## References
//!
//! * [`syntax.js` in `micromark-extension-mdxjs-esm`](https://github.com/micromark/micromark-extension-mdxjs-esm/blob/main/dev/lib/syntax.js)
//! * [`mdxjs.com`](https://mdxjs.com)
//!
//! [flow]: crate::construct::flow

use crate::event::Name;
use crate::state::{Name as StateName, State};
use crate::tokenizer::Tokenizer;
use crate::util::{
mdx_collect::{collect, place_to_point},
slice::Slice,
};
use crate::MdxSignal;
use alloc::format;

/// Start of MDX ESM.
///
/// The construct is only attempted when it is turned on, a parser was passed
/// in `options.mdx_esm_parse`, the tokenizer is not interrupting, and the
/// current byte is an `e` or `i` in the first column.
/// In every other case this yields `State::Nok`.
///
/// ```markdown
/// > | import a from 'b'
///     ^
/// ```
pub fn start(tokenizer: &mut Tokenizer) -> State {
    let options = &tokenizer.parse_state.options;
    // The construct is turned on and a parser that knows the grammar was given.
    let enabled = options.constructs.mdx_esm && options.mdx_esm_parse.is_some();
    // Not interrupting, and in the first column: so not after whitespace and
    // not in a container.
    let at_line_start = !tokenizer.interrupt && tokenizer.point.column == 1;
    // Both keywords (`export`, `import`) start with one of these bytes.
    let at_keyword_start = matches!(tokenizer.current, Some(b'e' | b'i'));

    if !(enabled && at_line_start && at_keyword_start) {
        return State::Nok;
    }

    // Remember the byte index where the keyword starts, `word` slices from it.
    tokenizer.tokenize_state.start = tokenizer.point.index;
    tokenizer.enter(Name::MdxEsm);
    tokenizer.enter(Name::MdxEsmData);
    tokenizer.consume();
    State::Next(StateName::MdxEsmWord)
}

/// In keyword.
///
/// Consumes lowercase ASCII letters.
/// At the first other byte, the bytes consumed since `start` must spell
/// `export` or `import` and that byte must be a space, otherwise this yields
/// `State::Nok`.
///
/// ```markdown
/// > | import a from 'b'
///     ^^^^^^
/// ```
pub fn word(tokenizer: &mut Tokenizer) -> State {
    // Still in the keyword.
    if let Some(b'a'..=b'z') = tokenizer.current {
        tokenizer.consume();
        return State::Next(StateName::MdxEsmWord);
    }

    // The keyword ended: look at the bytes between where it started and here.
    let keyword = Slice::from_indices(
        tokenizer.parse_state.bytes,
        tokenizer.tokenize_state.start,
        tokenizer.point.index,
    );
    let is_keyword = matches!(keyword.as_str(), "export" | "import");
    let before_space = tokenizer.current == Some(b' ');

    if !(is_keyword && before_space) {
        // Not ESM: reset the shared state again.
        tokenizer.tokenize_state.start = 0;
        return State::Nok;
    }

    tokenizer.concrete = true;
    // From here on `start` holds an event index (of the last event so far)
    // instead of a byte index; `parse_esm` collects events from it.
    tokenizer.tokenize_state.start = tokenizer.events.len() - 1;
    tokenizer.consume();
    State::Next(StateName::MdxEsmInside)
}

/// In data.
///
/// Consumes bytes until a line ending or eof, then closes the data and
/// continues at the start of a line.
///
/// ```markdown
/// > | import a from 'b'
///            ^
/// ```
pub fn inside(tokenizer: &mut Tokenizer) -> State {
    // A line ending or eof closes the current run of data.
    if matches!(tokenizer.current, None | Some(b'\n')) {
        tokenizer.exit(Name::MdxEsmData);
        return State::Retry(StateName::MdxEsmLineStart);
    }

    tokenizer.consume();
    State::Next(StateName::MdxEsmInside)
}

/// At start of line.
///
/// At eof, the ESM is parsed.
/// At a line ending, a check decides between parsing the ESM (a blank line
/// follows) and continuing on the next line.
/// Anything else opens a new run of data.
///
/// ```markdown
///   | import a from 'b'
/// > | export {a}
///     ^
/// ```
pub fn line_start(tokenizer: &mut Tokenizer) -> State {
    // Eof: try to parse what we have.
    if tokenizer.current.is_none() {
        return State::Retry(StateName::MdxEsmAtEnd);
    }

    // Line ending: parse if a blank line follows, otherwise continue.
    if tokenizer.current == Some(b'\n') {
        tokenizer.check(
            State::Next(StateName::MdxEsmAtEnd),
            State::Next(StateName::MdxEsmContinuationStart),
        );
        return State::Retry(StateName::MdxEsmBlankLineBefore);
    }

    // Any other byte: more data.
    tokenizer.enter(Name::MdxEsmData);
    tokenizer.consume();
    State::Next(StateName::MdxEsmInside)
}

/// At start of line that continues.
///
/// Consumes the current byte as a `LineEnding` and moves on to the start of
/// the next line.
///
/// ```markdown
///   | import a from 'b'
/// > | export {a}
///     ^
/// ```
pub fn continuation_start(tokenizer: &mut Tokenizer) -> State {
    // Where to go once the line ending is consumed.
    let next = State::Next(StateName::MdxEsmLineStart);

    tokenizer.enter(Name::LineEnding);
    tokenizer.consume();
    tokenizer.exit(Name::LineEnding);

    next
}

/// At start of a potentially blank line.
///
/// Consumes the current byte as a `LineEnding` and hands over to the blank
/// line construct, whose result decides the check set up in `line_start`.
///
/// ```markdown
/// > | import a from 'b'
///                      ^
///   |
///   | c
/// ```
pub fn blank_line_before(tokenizer: &mut Tokenizer) -> State {
    // Where to go once the line ending is consumed.
    let next = State::Next(StateName::BlankLineStart);

    tokenizer.enter(Name::LineEnding);
    tokenizer.consume();
    tokenizer.exit(Name::LineEnding);

    next
}

/// At end of line (blank or eof).
///
/// Parses the ESM collected so far.
/// When the parser accepts it, the construct is closed; any other state that
/// `parse_esm` yields is passed through unchanged.
///
/// ```markdown
/// > | import a from 'b'
///                      ^
/// ```
pub fn at_end(tokenizer: &mut Tokenizer) -> State {
    match parse_esm(tokenizer) {
        // Done!
        State::Ok => {
            tokenizer.concrete = false;
            tokenizer.exit(Name::MdxEsm);
            State::Ok
        }
        // An error, or a retry to take in more lines.
        other => other,
    }
}

/// Parse ESM with a given function.
///
/// Collects the `MdxEsmData` and `LineEnding` events starting at the event
/// index in `tokenize_state.start`, passes the collected value to the parser
/// from `options.mdx_esm_parse`, and maps what it signals back to a state:
///
/// * `MdxSignal::Ok`: `State::Ok`
/// * `MdxSignal::Error`: `State::Error`, positioned at the place the parser
///   reported
/// * `MdxSignal::Eof`: `State::Error` at the current point when at eof;
///   otherwise the message is stored in `mdx_last_parse_error` and the
///   construct continues on the next line
///
/// ## Panics
///
/// Panics when `options.mdx_esm_parse` is `None`.
/// That cannot happen through `start`, which rejects ESM without a parser.
fn parse_esm(tokenizer: &mut Tokenizer) -> State {
    // `start` yields `Nok` when no parser is configured, so it exists here.
    let parse = tokenizer
        .parse_state
        .options
        .mdx_esm_parse
        .as_ref()
        .expect("expected `mdx_esm_parse` to be set when parsing MDX ESM");

    // Collect the body of the ESM and positional info for each run of it.
    let result = collect(
        tokenizer,
        tokenizer.tokenize_state.start,
        &[Name::MdxEsmData, Name::LineEnding],
    );

    // Parse and handle what was signaled back.
    match parse(&result.value) {
        MdxSignal::Ok => State::Ok,
        MdxSignal::Error(message, place) => {
            // Map the place in the collected value back to the document.
            let point = place_to_point(&result, place);
            State::Error(format!("{}:{}: {}", point.line, point.column, message))
        }
        MdxSignal::Eof(message) => {
            if tokenizer.current.is_none() {
                // The parser wants more, but there is nothing left to give.
                State::Error(format!(
                    "{}:{}: {}",
                    tokenizer.point.line, tokenizer.point.column, message
                ))
            } else {
                // Keep the message around and try again with another line.
                tokenizer.tokenize_state.mdx_last_parse_error = Some(message);
                State::Retry(StateName::MdxEsmContinuationStart)
            }
        }
    }
}
2 changes: 2 additions & 0 deletions src/construct/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,7 @@
//! * [gfm task list item check][gfm_task_list_item_check]
//! * [mdx expression (flow)][mdx_expression_flow]
//! * [mdx expression (text)][mdx_expression_text]
//! * [mdx esm][mdx_esm]
//! * [mdx jsx (flow)][mdx_jsx_flow]
//! * [mdx jsx (text)][mdx_jsx_text]
//!
Expand Down Expand Up @@ -169,6 +170,7 @@ pub mod label_end;
pub mod label_start_image;
pub mod label_start_link;
pub mod list_item;
pub mod mdx_esm;
pub mod mdx_expression_flow;
pub mod mdx_expression_text;
pub mod mdx_jsx_flow;
Expand Down

0 comments on commit fe618ff

Please sign in to comment.