Fix to prefer flow over definitions, setext headings
An undocumented part of CommonMark is how to deal with things in definition
labels or definition titles (both of which can span multiple lines).
Can flow (or containers?) interrupt them?
They can according to the `cmark` reference parser, so that behavior is
implemented here.

This adds a new `Content` content type, which houses zero or more definitions,
followed by zero or one paragraph.
Content can be followed by a setext heading underline, which turns into a
setext heading when the content ends in a paragraph, into the start of the
following paragraph when it is followed by content that starts with a
paragraph, or into a stray paragraph otherwise.
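To illustrate the underline cases (examples mine, following the `cmark` behavior referenced above; they are not part of the original commit message):

```markdown
[a]: b
heading
=======

[c]: d
=
```

In the first document the content ends in a paragraph, so `=======` turns `heading` into a setext heading (`<h1>heading</h1>`). In the second, the content ends in a definition, so the lone `=` cannot be an underline and ends up as a stray paragraph (`<p>=</p>`).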
wooorm committed Sep 14, 2022
1 parent 65d4b46 commit 74d2688
Showing 26 changed files with 724 additions and 355 deletions.
2 changes: 1 addition & 1 deletion readme.md
````diff
@@ -362,7 +362,7 @@ The following scripts are useful when working on this project:
 ```
 - lint:
 ```sh
-cargo fmt --check && cargo clippy -- -D clippy::pedantic -D clippy::cargo -A clippy::doc_link_with_quotes
+cargo fmt --check && cargo clippy -- -D clippy::pedantic -D clippy::cargo -A clippy::doc_link_with_quotes -A clippy::unnecessary_wraps
 ```
 - test:
 ```sh
````
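The loosened lint appears to follow from the resolver changes below: resolvers now share the return type `Result<Option<Subresult>, String>`, so a resolver that can only ever return `Ok(None)` would otherwise trip `clippy::unnecessary_wraps`.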
6 changes: 3 additions & 3 deletions src/compiler.rs
```diff
@@ -463,7 +463,7 @@ fn exit(context: &mut CompileContext) {
         Name::HeadingAtxSequence => on_exit_heading_atx_sequence(context),
         Name::HeadingAtxText => on_exit_heading_atx_text(context),
         Name::HeadingSetextText => on_exit_heading_setext_text(context),
-        Name::HeadingSetextUnderline => on_exit_heading_setext_underline(context),
+        Name::HeadingSetextUnderlineSequence => on_exit_heading_setext_underline_sequence(context),
         Name::HtmlFlow | Name::HtmlText => on_exit_html(context),
         Name::HtmlFlowData | Name::HtmlTextData => on_exit_html_data(context),
         Name::Image | Name::Link => on_exit_media(context),
@@ -1440,8 +1440,8 @@ fn on_exit_heading_setext_text(context: &mut CompileContext) {
     context.slurp_one_line_ending = true;
 }
 
-/// Handle [`Exit`][Kind::Exit]:[`HeadingSetextUnderline`][Name::HeadingSetextUnderline].
-fn on_exit_heading_setext_underline(context: &mut CompileContext) {
+/// Handle [`Exit`][Kind::Exit]:[`HeadingSetextUnderlineSequence`][Name::HeadingSetextUnderlineSequence].
+fn on_exit_heading_setext_underline_sequence(context: &mut CompileContext) {
     let text = context
         .heading_setext_buffer
         .take()
```
6 changes: 5 additions & 1 deletion src/construct/attention.rs
```diff
@@ -79,6 +79,7 @@
 use crate::event::{Event, Kind, Name, Point};
 use crate::resolve::Name as ResolveName;
 use crate::state::{Name as StateName, State};
+use crate::subtokenize::Subresult;
 use crate::tokenizer::Tokenizer;
 use crate::util::{
     char::{
@@ -87,6 +88,7 @@ use crate::util::{
     },
     slice::Slice,
 };
+use alloc::string::String;
 use alloc::{vec, vec::Vec};
 
 /// Attentention sequence that we can take markers from.
@@ -150,7 +152,7 @@ pub fn inside(tokenizer: &mut Tokenizer) -> State {
 }
 
 /// Resolve sequences.
-pub fn resolve(tokenizer: &mut Tokenizer) {
+pub fn resolve(tokenizer: &mut Tokenizer) -> Result<Option<Subresult>, String> {
     // Find all sequences, gather info about them.
     let mut sequences = get_sequences(tokenizer);
 
@@ -221,6 +223,8 @@ pub fn resolve(tokenizer: &mut Tokenizer) {
     }
 
     tokenizer.map.consume(&mut tokenizer.events);
+
+    Ok(None)
 }
 
 /// Get sequences.
```
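The change to attention is mechanical: it adopts the return type all resolvers now share, so the new content resolver (below) can report subtokenization results and errors through the same channel. A minimal sketch of that contract as read from this diff (an illustration, not verbatim code from the repository):

```rust
use crate::subtokenize::Subresult;
use crate::tokenizer::Tokenizer;
use alloc::string::String;

/// Sketch of the resolver contract after this commit: a resolver edits
/// `tokenizer.events` in place, then returns `Ok(None)` when it has nothing
/// further to report, `Ok(Some(subresult))` when it re-tokenized content,
/// or `Err(message)` when parsing failed.
pub fn resolve(tokenizer: &mut Tokenizer) -> Result<Option<Subresult>, String> {
    // ...rewrite `tokenizer.events` here...
    tokenizer.map.consume(&mut tokenizer.events); // apply queued event edits
    Ok(None)
}
```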
188 changes: 188 additions & 0 deletions src/construct/content.rs
```rust
//! Content occurs in the [flow][] content type.
//!
//! Content contains zero or more [definition][definition]s, followed by zero
//! or one [paragraph][].
//!
//! The constructs found in content are:
//!
//! * [Definition][crate::construct::definition]
//! * [Paragraph][crate::construct::paragraph]
//!
//! ## Tokens
//!
//! * [`Content`][Name::Content]
//!
//! > 👉 **Note**: while parsing, [`Content`][Name::Content]
//! > is used, which is later compiled away.
//!
//! ## References
//!
//! * [`content.js` in `micromark`](https://github.com/micromark/micromark/blob/main/packages/micromark-core-commonmark/dev/lib/content.js)
//!
//! [flow]: crate::construct::flow
//! [definition]: crate::construct::definition
//! [paragraph]: crate::construct::paragraph

use crate::event::{Content, Kind, Link, Name};
use crate::resolve::Name as ResolveName;
use crate::state::{Name as StateName, State};
use crate::subtokenize::{subtokenize, Subresult};
use crate::tokenizer::Tokenizer;
use alloc::{string::String, vec};

/// Before a content chunk.
///
/// ```markdown
/// > | abc
///     ^
/// ```
pub fn chunk_start(tokenizer: &mut Tokenizer) -> State {
    match tokenizer.current {
        None | Some(b'\n') => unreachable!("unexpected eol/eof"),
        _ => {
            tokenizer.enter_link(
                Name::Content,
                Link {
                    previous: None,
                    next: None,
                    content: Content::Content,
                },
            );
            State::Retry(StateName::ContentChunkInside)
        }
    }
}

/// In a content chunk.
///
/// ```markdown
/// > | abc
///     ^^^
/// ```
pub fn chunk_inside(tokenizer: &mut Tokenizer) -> State {
    match tokenizer.current {
        None | Some(b'\n') => {
            tokenizer.exit(Name::Content);
            tokenizer.register_resolver_before(ResolveName::Content);
            // You’d be interrupting.
            tokenizer.interrupt = true;
            State::Ok
        }
        _ => {
            tokenizer.consume();
            State::Next(StateName::ContentChunkInside)
        }
    }
}

/// Before a definition.
///
/// ```markdown
/// > | [a]: b
///     ^
/// ```
pub fn definition_before(tokenizer: &mut Tokenizer) -> State {
    tokenizer.attempt(
        State::Next(StateName::ContentDefinitionAfter),
        State::Next(StateName::ParagraphStart),
    );
    State::Retry(StateName::DefinitionStart)
}

/// After a definition.
///
/// ```markdown
/// > | [a]: b
///           ^
///   | c
/// ```
pub fn definition_after(tokenizer: &mut Tokenizer) -> State {
    debug_assert!(matches!(tokenizer.current, None | Some(b'\n')));
    if tokenizer.current.is_none() {
        State::Ok
    } else {
        tokenizer.enter(Name::LineEnding);
        tokenizer.consume();
        tokenizer.exit(Name::LineEnding);
        State::Next(StateName::ContentDefinitionBefore)
    }
}

/// Merge `Content` chunks, which currently span a single line, into actual
/// `Content`s that span multiple lines.
pub fn resolve(tokenizer: &mut Tokenizer) -> Result<Option<Subresult>, String> {
    let mut index = 0;

    while index < tokenizer.events.len() {
        let event = &tokenizer.events[index];

        if event.kind == Kind::Enter && event.name == Name::Content {
            // Exit:Content
            let mut exit_index = index + 1;

            loop {
                let mut enter_index = exit_index + 1;

                if enter_index == tokenizer.events.len()
                    || tokenizer.events[enter_index].name != Name::LineEnding
                {
                    break;
                }

                // Skip past line ending.
                enter_index += 2;

                // Skip past prefix.
                while enter_index < tokenizer.events.len() {
                    let event = &tokenizer.events[enter_index];

                    if event.name != Name::SpaceOrTab
                        && event.name != Name::BlockQuotePrefix
                        && event.name != Name::BlockQuoteMarker
                    {
                        break;
                    }

                    enter_index += 1;
                }

                if enter_index == tokenizer.events.len()
                    || tokenizer.events[enter_index].name != Name::Content
                {
                    break;
                }

                // Set Exit:Content point to Exit:LineEnding.
                tokenizer.events[exit_index].point = tokenizer.events[exit_index + 2].point.clone();
                // Remove Enter:LineEnding, Exit:LineEnding.
                tokenizer.map.add(exit_index + 1, 2, vec![]);

                // Link Enter:Content to Enter:Content on this line and vice versa.
                tokenizer.events[exit_index - 1].link.as_mut().unwrap().next = Some(enter_index);
                tokenizer.events[enter_index]
                    .link
                    .as_mut()
                    .unwrap()
                    .previous = Some(exit_index - 1);

                // Potential next start.
                exit_index = enter_index + 1;
            }

            // Move to `Exit:Content`.
            index = exit_index;
        }

        index += 1;
    }

    tokenizer.map.consume(&mut tokenizer.events);

    let result = subtokenize(
        &mut tokenizer.events,
        tokenizer.parse_state,
        &Some(Content::Content),
    )?;

    Ok(Some(result))
}
```
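As a sketch of what `resolve` does here (my illustration, not from the commit): each line of content starts out as its own linked `Content` chunk; the resolver stretches the first chunk's exit point past the line ending, deletes the line-ending events via the edit map, and wires consecutive chunks together so `subtokenize` later re-parses them as one definition/paragraph stream. For two plain lines:

```text
events before resolving             events after resolving
------------------------------      --------------------------------------
Enter:Content  (chunk, line 1)      Enter:Content  link.next → line 2
Exit:Content                        Exit:Content   point moved past the eol
Enter:LineEnding                    (line-ending events removed via map)
Exit:LineEnding
Enter:Content  (chunk, line 2)      Enter:Content  link.previous → line 1
Exit:Content                        Exit:Content
```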
26 changes: 21 additions & 5 deletions src/construct/definition.rs
```diff
@@ -1,4 +1,4 @@
-//! Definition occurs in the [flow] content type.
+//! Definition occurs in the [content] content type.
 //!
 //! ## Grammar
 //!
@@ -12,8 +12,8 @@
 //! ; those parts.
 //! ```
 //!
-//! As this construct occurs in flow, like all flow constructs, it must be
-//! followed by an eol (line ending) or eof (end of file).
+//! This construct must be followed by an eol (line ending) or eof (end of
+//! file), like flow constructs.
 //!
 //! See [`destination`][destination], [`label`][label], and [`title`][title]
 //! for grammar, notes, and recommendations on each part.
@@ -88,7 +88,7 @@
 //! * [`definition.js` in `micromark`](https://github.com/micromark/micromark/blob/main/packages/micromark-core-commonmark/dev/lib/definition.js)
 //! * [*§ 4.7 Link reference definitions* in `CommonMark`](https://spec.commonmark.org/0.30/#link-reference-definitions)
 //!
-//! [flow]: crate::construct::flow
+//! [content]: crate::construct::content
 //! [string]: crate::construct::string
 //! [character_escape]: crate::construct::character_escape
 //! [character_reference]: crate::construct::character_reference
@@ -157,7 +157,10 @@ pub fn before(tokenizer: &mut Tokenizer) -> State {
             tokenizer.tokenize_state.token_1 = Name::DefinitionLabel;
             tokenizer.tokenize_state.token_2 = Name::DefinitionLabelMarker;
             tokenizer.tokenize_state.token_3 = Name::DefinitionLabelString;
-            tokenizer.attempt(State::Next(StateName::DefinitionLabelAfter), State::Nok);
+            tokenizer.attempt(
+                State::Next(StateName::DefinitionLabelAfter),
+                State::Next(StateName::DefinitionLabelNok),
+            );
             State::Retry(StateName::LabelStart)
         }
         _ => State::Nok,
@@ -192,6 +195,19 @@ pub fn label_after(tokenizer: &mut Tokenizer) -> State {
     }
 }
 
+/// At a non-label.
+///
+/// ```markdown
+/// > | []
+///     ^
+/// ```
+pub fn label_nok(tokenizer: &mut Tokenizer) -> State {
+    tokenizer.tokenize_state.token_1 = Name::Data;
+    tokenizer.tokenize_state.token_2 = Name::Data;
+    tokenizer.tokenize_state.token_3 = Name::Data;
+    State::Nok
+}
+
 /// After marker.
 ///
 /// ```markdown
```
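The new `Nok` branch matters because `before` repurposes the shared label state machine by setting `token_1`–`token_3` to definition-specific names; when the label fails to parse, `label_nok` resets them to `Data` so the failed attempt's events do not keep definition token names. For example (my illustration), per CommonMark:

```markdown
[]
```

`[]` is not a valid definition label, so the attempt fails and the line falls through to content, compiling as `<p>[]</p>`.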
5 changes: 3 additions & 2 deletions src/construct/document.rs
```diff
@@ -413,7 +413,7 @@ pub fn flow_end(tokenizer: &mut Tokenizer) -> State {
     while !document_lazy_continuation_current && stack_index > 0 {
         stack_index -= 1;
         let name = &child.stack[stack_index];
-        if name == &Name::Paragraph || name == &Name::Definition || name == &Name::GfmTableHead {
+        if name == &Name::Content || name == &Name::GfmTableHead {
             document_lazy_continuation_current = true;
         }
     }
@@ -423,7 +423,7 @@ pub fn flow_end(tokenizer: &mut Tokenizer) -> State {
     if !document_lazy_continuation_current && !child.events.is_empty() {
         let before = skip::opt_back(&child.events, child.events.len() - 1, &[Name::LineEnding]);
         let name = &child.events[before].name;
-        if name == &Name::Paragraph {
+        if name == &Name::Content {
             document_lazy_continuation_current = true;
         }
     }
@@ -582,6 +582,7 @@ fn resolve(tokenizer: &mut Tokenizer) {
         &tokenizer.events,
         flow_index,
         &mut child.events,
+        (0, 0),
     );
 
     // Replace the flow data with actual events.
```
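Both checks govern lazy continuation lines; since paragraphs and definitions now live inside `Content` chunks, the child stack and trailing events expose `Name::Content` rather than `Name::Paragraph`/`Name::Definition`. A classic lazy line (my example, standard CommonMark):

```markdown
> a
b
```

Here `b` lazily continues the content begun inside the block quote, yielding `<blockquote><p>a\nb</p></blockquote>`.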
