Skip to content

Commit

Permalink
Add stringIn1 parser
Browse files Browse the repository at this point in the history
This parser builds a radix tree from non-empty string alternatives.
Radix trees are interesting as they allow for efficient parsing of
string alternatives (e.g. enum names) without backtracking and require
only 1 character of lookahead.

String alternatives are first sorted and then grouped by common
non-empty prefix. Grouped strings are then recursively grouped with the
same process. In the resulting tree, each group of substrings starts
with a different character, which can be used to decide which
alternative branch to take.

This operator is inspired by fastparse's `stringIn`, and requires all
alternatives to be non-empty.
  • Loading branch information
satabin committed Dec 11, 2020
1 parent bd8cef4 commit 04dd432
Show file tree
Hide file tree
Showing 3 changed files with 160 additions and 6 deletions.
59 changes: 53 additions & 6 deletions core/shared/src/main/scala/cats/parse/Parser.scala
Original file line number Diff line number Diff line change
Expand Up @@ -455,6 +455,7 @@ object Parser extends ParserInstances {

object Expectation {
case class Str(offset: Int, str: String) extends Expectation
case class OneStr(offset: Int, strs: List[String]) extends Expectation
// expected a character in a given range
case class InRange(offset: Int, lower: Char, upper: Char) extends Expectation
case class StartOfString(offset: Int) extends Expectation
Expand All @@ -476,22 +477,33 @@ object Parser extends ParserInstances {
(left, right) match {
case (Str(_, s1), Str(_, s2)) => s1.compare(s2)
case (Str(_, _), _) => -1
case (InRange(_, _, _), Str(_, _)) => 1
case (OneStr(_, _), Str(_, _)) => 1
case (OneStr(_, s1), OneStr(_, s2)) =>
val s = s1.lengthCompare(s2)
if (s == 0) s1.compare(s2)
else s
case (OneStr(_, _), _) => -1
case (InRange(_, _, _), Str(_, _) | OneStr(_, _)) => 1
case (InRange(_, l1, u1), InRange(_, l2, u2)) =>
val c1 = Character.compare(l1, l2)
if (c1 == 0) Character.compare(u1, u2)
else c1
case (InRange(_, _, _), _) => -1
case (StartOfString(_), Str(_, _) | InRange(_, _, _)) => 1
case (StartOfString(_), Str(_, _) | OneStr(_, _) | InRange(_, _, _)) => 1
case (StartOfString(_), _) =>
-1 // if they have the same offset, already handled above
case (EndOfString(_, _), Str(_, _) | InRange(_, _, _) | StartOfString(_)) => 1
case (
EndOfString(_, _),
Str(_, _) | OneStr(_, _) | InRange(_, _, _) | StartOfString(_)
) =>
1
case (EndOfString(_, l1), EndOfString(_, l2)) =>
Integer.compare(l1, l2)
case (EndOfString(_, _), _) => -1
case (
Length(_, _, _),
Str(_, _) | InRange(_, _, _) | StartOfString(_) | EndOfString(_, _)
Str(_, _) | OneStr(_, _) | InRange(_, _, _) | StartOfString(_) |
EndOfString(_, _)
) =>
1
case (Length(_, e1, a1), Length(_, e2, a2)) =>
Expand Down Expand Up @@ -796,6 +808,9 @@ object Parser extends ParserInstances {
}
}

/** Build a parser that accepts any one of the given string alternatives and
  * yields `Unit`.
  *
  * Construction is delegated to `Impl.StringIn1`, which rejects (via
  * `require`) lists with fewer than two entries and lists containing the
  * empty string.
  *
  * @param strings the alternatives to recognise
  * @return a parser over the given alternatives
  */
def stringIn1(strings: List[String]): Parser1[Unit] = {
  val alternatives: Parser1[Unit] = Impl.StringIn1(strings)
  alternatives
}

private[this] val emptyStringParser: Parser[String] =
pure("")

Expand Down Expand Up @@ -1538,8 +1553,9 @@ object Parser extends ParserInstances {
case Defer1(fn) =>
Defer1(() => unmap1(compute1(fn)))
case Rep1(p, m, _) => Rep1(unmap1(p), m, Accumulator.unitAccumulator)
case AnyChar | CharIn(_, _, _) | Str(_) | IgnoreCase(_) | Fail() | FailWith(_) | Length(_) |
TailRecM1(_, _) | FlatMap1(_, _) =>
case AnyChar | CharIn(_, _, _) | Str(_) | StringIn1(_) | IgnoreCase(_) | Fail() | FailWith(
_
) | Length(_) | TailRecM1(_, _) | FlatMap1(_, _) =>
// we can't transform this significantly
pa

Expand Down Expand Up @@ -1724,6 +1740,28 @@ object Parser extends ParserInstances {
null.asInstanceOf[A]
}

/** Match the longest alternative stored in `radix` against the input,
  * starting at `state.offset`.
  *
  * The tree is walked edge by edge: the next input character selects an
  * edge, and the edge is followed only if its whole label is present in the
  * input. Every time a node flagged as `word` is reached, the current offset
  * is remembered, so that a shorter alternative is still accepted when a
  * longer branch turns out not to match (e.g. alternatives "foo", "foobar",
  * "foobaz" on input "foobax" consume "foo").
  *
  * On success `state.offset` is moved just past the longest matched
  * alternative. On failure `state.offset` is left untouched and `state.error`
  * is set to a `OneStr` expectation at the starting offset.
  *
  * @param radix root of the radix tree built from the alternatives
  * @param all   the alternatives, reported in the error on failure
  * @param state mutable parser state (input, offset, error)
  */
final def stringIn1[A](radix: RadixNode, all: List[String], state: State): Unit = {
  val str = state.str
  val startOffset = state.offset
  var offset = startOffset
  var tree = radix
  // end offset of the longest complete alternative seen so far, -1 if none yet
  var lastMatch = if (radix.word) startOffset else -1
  var cont = true
  while (cont && offset < str.length) {
    tree.children.get(str.charAt(offset)) match {
      case Some((prefix, child)) if str.startsWith(prefix, offset) =>
        // accept the prefix for this character and descend
        offset += prefix.length
        tree = child
        if (child.word) lastMatch = offset
      case _ =>
        // no edge for this character, or the edge label is not fully present
        cont = false
    }
  }
  if (lastMatch < 0) {
    state.error = Chain.one(Expectation.OneStr(startOffset, all))
  } else {
    state.offset = lastMatch
  }
}

case class OneOf1[A](all: List[Parser1[A]]) extends Parser1[A] {
require(all.lengthCompare(2) >= 0, s"expected more than two items, found: ${all.size}")
private[this] val ary: Array[Parser[A]] = all.toArray
Expand All @@ -1738,6 +1776,15 @@ object Parser extends ParserInstances {
override def parseMut(state: State): A = oneOf(ary, state)
}

/** Parser that recognises one of several string alternatives through a radix
  * tree built once at construction time.
  *
  * Construction fails (via `require`) when fewer than two alternatives are
  * given or when one of them is the empty string.
  *
  * @param all the string alternatives
  */
case class StringIn1(all: List[String]) extends Parser1[Unit] {
  require(all.lengthCompare(2) >= 0, s"expected more than two items, found: ${all.size}")
  require(!all.exists(_ == ""), "empty string is not allowed in alternatives")

  // the radix tree builder expects its input to be sorted
  private[this] val ordered: List[String] = all.sorted
  private[this] val radix: RadixNode =
    RadixNode.fromSortedStrings(NonEmptyList.fromListUnsafe(ordered))

  override def parseMut(state: State): Unit =
    stringIn1(radix, ordered, state)
}

final def prod[A, B](pa: Parser[A], pb: Parser[B], state: State): (A, B) = {
val a = pa.parseMut(state)
if (state.error eq null) {
Expand Down
97 changes: 97 additions & 0 deletions core/shared/src/main/scala/cats/parse/RadixNode.scala
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
/*
* Copyright (c) 2020 Typelevel
*
* Permission is hereby granted, free of charge, to any person obtaining a copy of
* this software and associated documentation files (the "Software"), to deal in
* the Software without restriction, including without limitation the rights to
* use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
* the Software, and to permit persons to whom the Software is furnished to do so,
* subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
* FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
* COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
* IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/

package cats.parse

import cats.data.NonEmptyList

import scala.annotation.tailrec

/** A node of a radix tree over strings.
  *
  * @param children outgoing edges, keyed by the first character of the edge
  *                 label; each value pairs the full edge label with the node
  *                 the edge leads to
  * @param word     whether the path leading to this node spells a complete
  *                 stored string
  */
class RadixNode(val children: Map[Char, (String, RadixNode)], val word: Boolean) {
  override def toString(): String =
    "RadixNode(" + children + ", " + word + ")"
}

object RadixNode {

  /** Build a radix tree from a sorted, non-empty list of strings.
    *
    * Non-empty strings are grouped by their longest common non-empty prefix;
    * each group becomes an edge labelled with that prefix, and the remaining
    * suffixes of the group are turned into the child node recursively. The
    * presence of an empty string in `strings` marks the resulting node as a
    * complete word.
    *
    * @param strings the strings to store; expected to be sorted, so that
    *                strings sharing a prefix are adjacent
    * @return the root of the radix tree
    */
  def fromSortedStrings(strings: NonEmptyList[String]): RadixNode = {
    // Walks the remaining keys, keeping the common prefix of the group being
    // built (`prefix`) and its members in reverse order (`current`). When a
    // key shares nothing with `prefix`, the group is closed and stored in
    // `acc` as prefix -> suffixes, and a new group is started from that key.
    @tailrec
    def groupByNonEmptyPrefix(
        keys: List[String],
        prefix: String,
        current: NonEmptyList[String],
        acc: Map[String, NonEmptyList[String]]
    ): Map[String, NonEmptyList[String]] =
      keys match {
        case key :: rest =>
          val prefixSize = commonPrefix(prefix, key)
          if (prefixSize == 0) {
            // no common prefix: close the current group (suffixes restored to
            // their original order) and open a new one with this key
            groupByNonEmptyPrefix(
              rest,
              key,
              NonEmptyList.one(key),
              acc.updated(prefix, current.map(_.drop(prefix.length)).reverse)
            )
          } else {
            // clip the prefix to the shared length, and continue
            groupByNonEmptyPrefix(rest, prefix.take(prefixSize), key :: current, acc)
          }
        case Nil =>
          acc.updated(prefix, current.map(_.drop(prefix.length)).reverse)
      }

    NonEmptyList.fromList(strings.filter(_.nonEmpty)) match {
      case Some(nonEmpty) =>
        // The first group must be seeded with the first NON-EMPTY string.
        // Seeding it with `strings.head` would lose that string and insert a
        // bogus empty suffix whenever `strings` starts with "".
        val grouped =
          groupByNonEmptyPrefix(
            nonEmpty.tail,
            nonEmpty.head,
            NonEmptyList.one(nonEmpty.head),
            Map.empty
          )
        // groups are only split when they share no prefix at all, so each
        // group label starts with a different character
        val children = grouped.map { case (label, suffixes) =>
          (label(0), (label, fromSortedStrings(suffixes)))
        }
        // an empty string was filtered out: this node itself ends a word
        new RadixNode(children, nonEmpty.size < strings.size)
      case None =>
        // only empty strings left: nothing more to consume
        leaf
    }
  }

  // terminal node: no outgoing edges, ends a word
  private val leaf = new RadixNode(Map.empty, true)

  /** Length of the longest common prefix of `s1` and `s2`. */
  private def commonPrefix(s1: String, s2: String): Int = {
    val limit = math.min(s1.length, s2.length)
    @tailrec
    def loop(idx: Int): Int =
      if (idx < limit && s1.charAt(idx) == s2.charAt(idx)) loop(idx + 1)
      else idx
    loop(0)
  }
}
10 changes: 10 additions & 0 deletions core/shared/src/test/scala/cats/parse/ParserTest.scala
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,12 @@ object ParserGen {
Gen.const(GenT(Parser.anyChar))
)

// Generates a `stringIn1` parser from an arbitrary list of strings. Lists
// that `Parser.stringIn1` would reject (fewer than two entries, or containing
// an empty string) are mapped to the always-failing parser instead.
val stringIn1: Gen[GenT[Parser1]] =
  Arbitrary.arbitrary[List[String]].map { alternatives =>
    val usable = alternatives.lengthCompare(2) >= 0 && alternatives.forall(_.nonEmpty)
    if (usable) GenT(Parser.stringIn1(alternatives))
    else GenT(Parser.fail: Parser1[Unit])
  }

val expect1: Gen[GenT[Parser1]] =
Arbitrary.arbitrary[String].map { str =>
if (str.isEmpty) GenT(Parser.fail: Parser1[Unit])
Expand Down Expand Up @@ -473,6 +479,7 @@ object ParserGen {
(8, expect1),
(2, ignoreCase1),
(8, charIn1),
(8, stringIn1),
(1, Gen.choose(Char.MinValue, Char.MaxValue).map { c => GenT(Parser.char(c)) }),
(2, rec.map(void1(_))),
(2, rec.map(string1(_))),
Expand Down Expand Up @@ -554,6 +561,9 @@ class ParserTest extends munit.ScalaCheckSuite {

parseTest(Parser.oneOf1(fooP :: barP :: Nil), "bar", ())
parseTest(Parser.oneOf1(fooP :: barP :: Nil), "foo", ())
parseTest(Parser.stringIn1(List("foo", "bar", "foobar")), "foo", ())
parseTest(Parser.stringIn1(List("foo", "bar", "foobar")), "bar", ())
parseTest(Parser.stringIn1(List("foo", "bar", "foobar")), "foobar", ())
}

test("product tests") {
Expand Down

0 comments on commit 04dd432

Please sign in to comment.