Updated Pythonic example to demonstrate spanning and flattening

zesterer committed Aug 4, 2022
1 parent f17f70e commit 91e5cc8
Showing 2 changed files with 87 additions and 28 deletions.
95 changes: 77 additions & 18 deletions examples/pythonic.rs
@@ -1,42 +1,101 @@
-use chumsky::prelude::*;
+use chumsky::{prelude::*, Flat, BoxStream};
+use std::ops::Range;
 
-#[derive(Debug)]
+// Represents the different kinds of delimiters we care about
+#[derive(Copy, Clone, Debug)]
 enum Delim {
     Paren,
     Block,
 }
 
-#[derive(Debug)]
+// An 'atomic' token (i.e: it has no child tokens)
+#[derive(Clone, Debug)]
 enum Token {
     Int(u64),
     Ident(String),
     Op(String),
-    Tree(Delim, Vec<Token>),
+    Open(Delim),
+    Close(Delim),
 }
 
-// A parser that turns pythonic code with semantic whitespace into a token tree
-fn lexer() -> impl Parser<char, Vec<Token>, Error = Simple<char>> {
-    let int = text::int(10).from_str().unwrapped().map(Token::Int);
+// The output of the lexer: a recursive tree of nested tokens
+#[derive(Debug)]
+enum TokenTree {
+    Token(Token),
+    Tree(Delim, Vec<Spanned<TokenTree>>),
+}
 
-    let ident = text::ident().map(Token::Ident);
+type Span = Range<usize>;
 
-    let op = one_of("=.:%,")
-        .repeated()
-        .at_least(1)
-        .collect()
-        .map(Token::Op);
+type Spanned<T> = (T, Span);
 
+// A parser that turns pythonic code with semantic whitespace into a token tree
+fn lexer() -> impl Parser<char, Vec<Spanned<TokenTree>>, Error = Simple<char>> {
     let tt = recursive(|tt| {
-        let tt_list = tt.padded().repeated();
+        // Define some atomic tokens
+        let int = text::int(10).from_str().unwrapped().map(Token::Int);
+        let ident = text::ident().map(Token::Ident);
+        let op = one_of("=.:%,")
+            .repeated()
+            .at_least(1)
+            .collect()
+            .map(Token::Op);
+
+        let single_token = int
+            .or(op)
+            .or(ident)
+            .map(TokenTree::Token);
 
-        int.or(op).or(ident).or(tt_list
+        // Tokens surrounded by parentheses get turned into parenthesised token trees
+        let token_tree = tt
+            .padded()
+            .repeated()
             .delimited_by(just('('), just(')'))
-            .map(|tts| Token::Tree(Delim::Paren, tts)))
+            .map(|tts| TokenTree::Tree(Delim::Paren, tts));
+
+        single_token.or(token_tree)
+            .map_with_span(|tt, span| (tt, span))
     });
 
-    text::semantic_indentation(tt, |tts| Token::Tree(Delim::Block, tts)).then_ignore(end())
+    // Whitespace indentation creates code block token trees
+    text::semantic_indentation(tt, |tts, span| (TokenTree::Tree(Delim::Block, tts), span))
+        .then_ignore(end())
 }
 
+/// Flatten a series of token trees into a single token stream, ready for feeding into the main parser
+fn tts_to_stream(eoi: Span, token_trees: Vec<Spanned<TokenTree>>) -> BoxStream<'static, Token, Span> {
+    use std::iter::once;
+
+    BoxStream::from_nested(eoi, token_trees.into_iter(), |(tt, span)| match tt {
+        // Single tokens remain unchanged
+        TokenTree::Token(token) => Flat::Single((token, span)),
+        // Nested token trees get flattened into their inner contents, surrounded by `Open` and `Close` tokens
+        TokenTree::Tree(delim, tree) => Flat::Many(
+            once((TokenTree::Token(Token::Open(delim)), span.clone()))
+                .chain(tree.into_iter())
+                .chain(once((TokenTree::Token(Token::Close(delim)), span))),
+        ),
+    })
+}
+
 fn main() {
-    println!("{:#?}", lexer().parse(include_str!("sample.py")));
+    let code = include_str!("sample.py");
+
+    // First, lex the code into some nested token trees
+    let tts = lexer().parse(code).unwrap();
+
+    println!("--- Token Trees ---\n{:#?}", tts);
+
+    // Next, flatten
+    let eoi = 0..code.chars().count();
+    let mut token_stream = tts_to_stream(eoi, tts);
+
+    // At this point, we have a token stream that can be fed into the main parser! Because this is just an example,
+    // we're instead going to just collect the token stream into a vector and print it.
+
+    let flattened_trees = token_stream
+        .fetch_tokens()
+        .collect::<Vec<_>>();
+
+    println!("--- Flattened Token Trees ---\n{:?}", flattened_trees);
 }
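The flattened BoxStream produced by tts_to_stream can then be handed straight to a token-level parser, since it converts into chumsky's Stream type just as a &str or an iterator of (Token, Span) pairs does. The sketch below shows that next step; it is not part of this commit: ident_list is a hypothetical parser, and it assumes Token additionally derives PartialEq, Eq, and Hash so that Simple<Token> can serve as the error type.

    use chumsky::prelude::*;

    // Hypothetical main parser: reads identifiers from the front of the
    // flattened token stream, rejecting any other token kind.
    // Assumes: #[derive(Clone, PartialEq, Eq, Hash, Debug)] on `Token`.
    fn ident_list() -> impl Parser<Token, Vec<String>, Error = Simple<Token>> {
        filter_map(|span, tok| match tok {
            Token::Ident(name) => Ok(name),
            _ => Err(Simple::custom(span, "expected identifier")),
        })
        .repeated()
    }

    // Usage with the stream built in `main` above:
    //     let idents = ident_list().parse(token_stream);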
20 changes: 10 additions & 10 deletions src/text.rs
@@ -383,25 +383,25 @@ where
     C: Character + 'a,
     Tok: 'a,
     T: Parser<C, Tok, Error = E> + Clone + 'a,
-    F: Fn(Vec<Tok>) -> Tok + Clone + 'a,
+    F: Fn(Vec<Tok>, E::Span) -> Tok + Clone + 'a,
 {
     let line_ws = filter(|c: &C| c.is_inline_whitespace());
 
     let line = token.padded_by(line_ws.ignored().repeated()).repeated();
 
     let lines = line_ws
         .repeated()
-        .then(line)
+        .then(line.map_with_span(|line, span| (line, span)))
         .separated_by(newline())
         .padded();
 
     lines.map(move |lines| {
-        fn collapse<C, Tok, F>(mut tree: Vec<(Vec<C>, Vec<Tok>)>, make_group: &F) -> Option<Tok>
+        fn collapse<C, Tok, F, S>(mut tree: Vec<(Vec<C>, Vec<Tok>, Option<S>)>, make_group: &F) -> Option<Tok>
         where
-            F: Fn(Vec<Tok>) -> Tok,
+            F: Fn(Vec<Tok>, S) -> Tok,
         {
-            while let Some((_, tts)) = tree.pop() {
-                let tt = make_group(tts);
+            while let Some((_, tts, line_span)) = tree.pop() {
+                let tt = make_group(tts, line_span?);
                 if let Some(last) = tree.last_mut() {
                     last.1.push(tt);
                 } else {
@@ -411,13 +411,13 @@ where
             None
         }
 
-        let mut nesting = vec![(Vec::new(), Vec::new())];
-        for (indent, mut line) in lines {
+        let mut nesting = vec![(Vec::new(), Vec::new(), None)];
+        for (indent, (mut line, line_span)) in lines {
            let mut indent = indent.as_slice();
            let mut i = 0;
            while let Some(tail) = nesting
                .get(i)
-                .and_then(|(n, _)| indent.strip_prefix(n.as_slice()))
+                .and_then(|(n, _, _)| indent.strip_prefix(n.as_slice()))
            {
                indent = tail;
                i += 1;
@@ -426,7 +426,7 @@
                nesting.last_mut().unwrap().1.push(tail);
            }
            if !indent.is_empty() {
-               nesting.push((indent.to_vec(), line));
+               nesting.push((indent.to_vec(), line, Some(line_span)));
            } else {
                nesting.last_mut().unwrap().1.append(&mut line);
            }
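For callers of text::semantic_indentation, the upshot of this change is that the make_group callback now takes a second argument: the span of the line that opened the indented block. A small self-contained sketch of the new signature follows; the Node type is invented for illustration and does not appear in the commit.

    use chumsky::prelude::*;
    use std::ops::Range;

    #[derive(Debug)]
    enum Node {
        Word(String),
        // An indented block of spanned child nodes
        Block(Vec<(Node, Range<usize>)>),
    }

    // Every token carries its span; each indented block is grouped into a
    // `Node::Block` tagged with the span handed to the grouping closure.
    fn blocks() -> impl Parser<char, Vec<(Node, Range<usize>)>, Error = Simple<char>> {
        let word = text::ident()
            .map(Node::Word)
            .map_with_span(|node, span| (node, span));
        text::semantic_indentation(word, |nodes, span| (Node::Block(nodes), span))
    }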
