Updated Pythonic example to demonstrate spanning and flattening

zesterer · Aug 4, 2022 · 91e5cc8 · 91e5cc8
1 parent f17f70e
commit 91e5cc8
Show file tree

Hide file tree

Showing 2 changed files with 87 additions and 28 deletions.
diff --git a/examples/pythonic.rs b/examples/pythonic.rs
@@ -1,42 +1,101 @@
-use chumsky::prelude::*;
+use chumsky::{prelude::*, Flat, BoxStream};
+use std::ops::Range;
 
-#[derive(Debug)]
+// Represents the different kinds of delimiters we care about: `Paren` for `(...)` groups, `Block` for indented blocks
+#[derive(Copy, Clone, Debug)]
 enum Delim {
     Paren,
     Block,
 }
 
-#[derive(Debug)]
+// An 'atomic' token (i.e., it has no child tokens)
+#[derive(Clone, Debug)]
 enum Token {
     Int(u64),
     Ident(String),
     Op(String),
-    Tree(Delim, Vec<Token>),
+    Open(Delim), // Emitted before the contents of a flattened token tree
+    Close(Delim), // Emitted after the contents of a flattened token tree
 }
 
-// A parser that turns pythonic code with semantic whitespace into a token tree
-fn lexer() -> impl Parser<char, Vec<Token>, Error = Simple<char>> {
-    let int = text::int(10).from_str().unwrapped().map(Token::Int);
+// The output of the lexer: a recursive tree of nested tokens
+#[derive(Debug)]
+enum TokenTree {
+    Token(Token), // A leaf: a single atomic token
+    Tree(Delim, Vec<Spanned<TokenTree>>), // A delimited group of child token trees, each paired with its own span
+}
 
-    let ident = text::ident().map(Token::Ident);
+type Span = Range<usize>; // A half-open range of positions in the source text
 
-    let op = one_of("=.:%,")
-        .repeated()
-        .at_least(1)
-        .collect()
-        .map(Token::Op);
+type Spanned<T> = (T, Span); // A value paired with a span
 
+// A parser that turns pythonic code with semantic whitespace into a list of spanned token trees
+fn lexer() -> impl Parser<char, Vec<Spanned<TokenTree>>, Error = Simple<char>> {
     let tt = recursive(|tt| {
-        let tt_list = tt.padded().repeated();
+        // Define some atomic tokens: integers, identifiers, and operators (runs of one or more of `=.:%,`)
+        let int = text::int(10).from_str().unwrapped().map(Token::Int);
+        let ident = text::ident().map(Token::Ident);
+        let op = one_of("=.:%,")
+            .repeated()
+            .at_least(1)
+            .collect()
+            .map(Token::Op);
+
+        let single_token = int
+            .or(op)
+            .or(ident)
+            .map(TokenTree::Token); // Wrap each atomic token as a leaf of the token tree
 
-        int.or(op).or(ident).or(tt_list
+        // Tokens surrounded by parentheses get turned into parenthesised token trees
+        let token_tree = tt
+            .padded()
+            .repeated()
             .delimited_by(just('('), just(')'))
-            .map(|tts| Token::Tree(Delim::Paren, tts)))
+            .map(|tts| TokenTree::Tree(Delim::Paren, tts));
+
+        single_token.or(token_tree)
+            .map_with_span(|tt, span| (tt, span)) // Pair every token tree with the span it was parsed from
     });
 
-    text::semantic_indentation(tt, |tts| Token::Tree(Delim::Block, tts)).then_ignore(end())
+    // Whitespace indentation creates code block token trees, each paired with the span handed to the callback
+    text::semantic_indentation(tt, |tts, span| (TokenTree::Tree(Delim::Block, tts), span))
+        .then_ignore(end()) // Nothing but the end of input may follow
+}
+
+/// Flatten a series of token trees into a single token stream, ready for feeding into the main parser
+fn tts_to_stream(eoi: Span, token_trees: Vec<Spanned<TokenTree>>) -> BoxStream<'static, Token, Span> {
+    use std::iter::once;
+
+    BoxStream::from_nested(eoi, token_trees.into_iter(), |(tt, span)| match tt { // `eoi` is forwarded unchanged; presumably the end-of-input span
+        // Single tokens remain unchanged and keep their original span
+        TokenTree::Token(token) => Flat::Single((token, span)),
+        // Nested token trees get flattened into their inner contents, surrounded by `Open` and `Close` tokens
+        TokenTree::Tree(delim, tree) => Flat::Many(
+            once((TokenTree::Token(Token::Open(delim)), span.clone())) // `Open` reuses the span of the whole tree
+                .chain(tree.into_iter())
+                .chain(once((TokenTree::Token(Token::Close(delim)), span))), // `Close` reuses the span of the whole tree too
+        ),
+    })
 }
 
 fn main() {
-    println!("{:#?}", lexer().parse(include_str!("sample.py")));
+    let code = include_str!("sample.py");
+
+    // First, lex the code into some nested token trees (`unwrap` panics if lexing fails)
+    let tts = lexer().parse(code).unwrap();
+
+    println!("--- Token Trees ---\n{:#?}", tts);
+
+    // Next, flatten the token trees into a single stream of tokens
+    let eoi = 0..code.chars().count(); // `eoi` spans the whole input, measured in `char`s
+    let mut token_stream = tts_to_stream(eoi, tts);
+
+    // At this point, we have a token stream that can be fed into the main parser! Because this is just an example,
+    // we're instead going to just collect the token stream into a vector and print it.
+
+    let flattened_trees = token_stream
+        .fetch_tokens()
+        .collect::<Vec<_>>();
+
+    println!("--- Flattened Token Trees ---\n{:?}", flattened_trees);
 }
diff --git a/src/text.rs b/src/text.rs
@@ -383,25 +383,25 @@ where
     C: Character + 'a,
     Tok: 'a,
     T: Parser<C, Tok, Error = E> + Clone + 'a,
-    F: Fn(Vec<Tok>) -> Tok + Clone + 'a,
+    F: Fn(Vec<Tok>, E::Span) -> Tok + Clone + 'a,
 {
     let line_ws = filter(|c: &C| c.is_inline_whitespace());
 
     let line = token.padded_by(line_ws.ignored().repeated()).repeated();
 
     let lines = line_ws
         .repeated()
-        .then(line)
+        .then(line.map_with_span(|line, span| (line, span)))
         .separated_by(newline())
         .padded();
 
     lines.map(move |lines| {
-        fn collapse<C, Tok, F>(mut tree: Vec<(Vec<C>, Vec<Tok>)>, make_group: &F) -> Option<Tok>
+        fn collapse<C, Tok, F, S>(mut tree: Vec<(Vec<C>, Vec<Tok>, Option<S>)>, make_group: &F) -> Option<Tok>
         where
-            F: Fn(Vec<Tok>) -> Tok,
+            F: Fn(Vec<Tok>, S) -> Tok,
         {
-            while let Some((_, tts)) = tree.pop() {
-                let tt = make_group(tts);
+            while let Some((_, tts, line_span)) = tree.pop() {
+                let tt = make_group(tts, line_span?);
                 if let Some(last) = tree.last_mut() {
                     last.1.push(tt);
                 } else {
@@ -411,13 +411,13 @@ where
             None
         }
 
-        let mut nesting = vec![(Vec::new(), Vec::new())];
-        for (indent, mut line) in lines {
+        let mut nesting = vec![(Vec::new(), Vec::new(), None)];
+        for (indent, (mut line, line_span)) in lines {
             let mut indent = indent.as_slice();
             let mut i = 0;
             while let Some(tail) = nesting
                 .get(i)
-                .and_then(|(n, _)| indent.strip_prefix(n.as_slice()))
+                .and_then(|(n, _, _)| indent.strip_prefix(n.as_slice()))
             {
                 indent = tail;
                 i += 1;
@@ -426,7 +426,7 @@ where
                 nesting.last_mut().unwrap().1.push(tail);
             }
             if !indent.is_empty() {
-                nesting.push((indent.to_vec(), line));
+                nesting.push((indent.to_vec(), line, Some(line_span)));
             } else {
                 nesting.last_mut().unwrap().1.append(&mut line);
             }