From 91e5cc8d7912a77233144aee8209891dc98cb3af Mon Sep 17 00:00:00 2001 From: Joshua Barretto Date: Thu, 4 Aug 2022 19:37:36 +0100 Subject: [PATCH] Updated Pythonic example to demonstrate spanning and flattening --- examples/pythonic.rs | 95 +++++++++++++++++++++++++++++++++++--------- src/text.rs | 20 +++++----- 2 files changed, 87 insertions(+), 28 deletions(-) diff --git a/examples/pythonic.rs b/examples/pythonic.rs index 17aeb2fa..7683fc31 100644 --- a/examples/pythonic.rs +++ b/examples/pythonic.rs @@ -1,42 +1,101 @@ -use chumsky::prelude::*; +use chumsky::{prelude::*, Flat, BoxStream}; +use std::ops::Range; -#[derive(Debug)] +// Represents the different kinds of delimiters we care about +#[derive(Copy, Clone, Debug)] enum Delim { Paren, Block, } -#[derive(Debug)] +// An 'atomic' token (i.e: it has no child tokens) +#[derive(Clone, Debug)] enum Token { Int(u64), Ident(String), Op(String), - Tree(Delim, Vec), + Open(Delim), + Close(Delim), } -// A parser that turns pythonic code with semantic whitespace into a token tree -fn lexer() -> impl Parser, Error = Simple> { - let int = text::int(10).from_str().unwrapped().map(Token::Int); +// The output of the lexer: a recursive tree of nested tokens +#[derive(Debug)] +enum TokenTree { + Token(Token), + Tree(Delim, Vec>), +} - let ident = text::ident().map(Token::Ident); +type Span = Range; - let op = one_of("=.:%,") - .repeated() - .at_least(1) - .collect() - .map(Token::Op); +type Spanned = (T, Span); +// A parser that turns pythonic code with semantic whitespace into a token tree +fn lexer() -> impl Parser>, Error = Simple> { let tt = recursive(|tt| { - let tt_list = tt.padded().repeated(); + // Define some atomic tokens + let int = text::int(10).from_str().unwrapped().map(Token::Int); + let ident = text::ident().map(Token::Ident); + let op = one_of("=.:%,") + .repeated() + .at_least(1) + .collect() + .map(Token::Op); + + let single_token = int + .or(op) + .or(ident) + .map(TokenTree::Token); - int.or(op).or(ident).or(tt_list + // Tokens surrounded by parentheses get turned into parenthesised token trees + let token_tree = tt + .padded() + .repeated() .delimited_by(just('('), just(')')) - .map(|tts| Token::Tree(Delim::Paren, tts))) + .map(|tts| TokenTree::Tree(Delim::Paren, tts)); + + single_token.or(token_tree) + .map_with_span(|tt, span| (tt, span)) }); - text::semantic_indentation(tt, |tts| Token::Tree(Delim::Block, tts)).then_ignore(end()) + // Whitespace indentation creates code block token trees + text::semantic_indentation(tt, |tts, span| (TokenTree::Tree(Delim::Block, tts), span)) + .then_ignore(end()) +} + +/// Flatten a series of token trees into a single token stream, ready for feeding into the main parser +fn tts_to_stream(eoi: Span, token_trees: Vec>) -> BoxStream<'static, Token, Span> { + use std::iter::once; + + BoxStream::from_nested(eoi, token_trees.into_iter(), |(tt, span)| match tt { + // Single tokens remain unchanged + TokenTree::Token(token) => Flat::Single((token, span)), + // Nested token trees get flattened into their inner contents, surrounded by `Open` and `Close` tokens + TokenTree::Tree(delim, tree) => Flat::Many( + once((TokenTree::Token(Token::Open(delim)), span.clone())) + .chain(tree.into_iter()) + .chain(once((TokenTree::Token(Token::Close(delim)), span))), + ), + }) } fn main() { - println!("{:#?}", lexer().parse(include_str!("sample.py"))); + let code = include_str!("sample.py"); + + // First, lex the code into some nested token trees + let tts = lexer().parse(code).unwrap(); + + println!("--- Token Trees ---\n{:#?}", tts); + + // Next, flatten + let eoi = 0..code.chars().count(); + let mut token_stream = tts_to_stream(eoi, tts); + + // At this point, we have a token stream that can be fed into the main parser! Because this is just an example, + // we're instead going to just collect the token stream into a vector and print it. + + let flattened_trees = token_stream + .fetch_tokens() + .collect::>(); + + println!("--- Flattened Token Trees ---\n{:?}", flattened_trees); } diff --git a/src/text.rs b/src/text.rs index 91a390e1..99e149f6 100644 --- a/src/text.rs +++ b/src/text.rs @@ -383,7 +383,7 @@ where C: Character + 'a, Tok: 'a, T: Parser + Clone + 'a, - F: Fn(Vec) -> Tok + Clone + 'a, + F: Fn(Vec, E::Span) -> Tok + Clone + 'a, { let line_ws = filter(|c: &C| c.is_inline_whitespace()); @@ -391,17 +391,17 @@ where let lines = line_ws .repeated() - .then(line) + .then(line.map_with_span(|line, span| (line, span))) .separated_by(newline()) .padded(); lines.map(move |lines| { - fn collapse(mut tree: Vec<(Vec, Vec)>, make_group: &F) -> Option + fn collapse(mut tree: Vec<(Vec, Vec, Option)>, make_group: &F) -> Option where - F: Fn(Vec) -> Tok, + F: Fn(Vec, S) -> Tok, { - while let Some((_, tts)) = tree.pop() { - let tt = make_group(tts); + while let Some((_, tts, line_span)) = tree.pop() { + let tt = make_group(tts, line_span?); if let Some(last) = tree.last_mut() { last.1.push(tt); } else { @@ -411,13 +411,13 @@ where None } - let mut nesting = vec![(Vec::new(), Vec::new())]; - for (indent, mut line) in lines { + let mut nesting = vec![(Vec::new(), Vec::new(), None)]; + for (indent, (mut line, line_span)) in lines { let mut indent = indent.as_slice(); let mut i = 0; while let Some(tail) = nesting .get(i) - .and_then(|(n, _)| indent.strip_prefix(n.as_slice())) + .and_then(|(n, _, _)| indent.strip_prefix(n.as_slice())) { indent = tail; i += 1; @@ -426,7 +426,7 @@ where nesting.last_mut().unwrap().1.push(tail); } if !indent.is_empty() { - nesting.push((indent.to_vec(), line)); + nesting.push((indent.to_vec(), line, Some(line_span))); } else { nesting.last_mut().unwrap().1.append(&mut line); }