Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add stringIn1 parser #110

Merged
merged 14 commits into from
Dec 22, 2020
55 changes: 55 additions & 0 deletions bench/src/main/scala/cats/parse/bench/StringInBench.scala
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
/*
* Copyright (c) 2020 Typelevel
*
* Permission is hereby granted, free of charge, to any person obtaining a copy of
* this software and associated documentation files (the "Software"), to deal in
* the Software without restriction, including without limitation the rights to
* use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
* the Software, and to permit persons to whom the Software is furnished to do so,
* subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
* FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
* COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
* IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/

package cats.parse.bench

import cats.parse.Parser
import java.util.concurrent.TimeUnit
import org.openjdk.jmh.annotations._

/** JMH benchmarks that run a `Parser.stringIn` parser and a `Parser.oneOf` of
  * `Parser.string` alternatives, both built from the same five words, over the
  * same list of inputs.
  *
  * Results are reported as average time in nanoseconds (see the annotations).
  */
@State(Scope.Benchmark)
@BenchmarkMode(Array(Mode.AverageTime))
@OutputTimeUnit(TimeUnit.NANOSECONDS)
class StringInBenchmarks {
  // "foobat" and "foot" are not among the words given to the parsers below
  val inputs =
    "foofoo" :: "bar" :: "foobat" :: "foot" :: "foobar" :: Nil

  // radix-tree based alternative
  val stringIn = Parser.stringIn(List("foo", "bar", "foobar", "foofoo", "foobaz"))

  // same words as `stringIn`, listed with the longer strings first
  val oneOf =
    Parser.oneOf(
      List("foobar", "foobaz", "foofoo", "foo", "bar").map(word => Parser.string(word))
    )

  /** Runs the `stringIn` parser over every input, discarding the results. */
  @Benchmark
  def stringInParse(): Unit =
    for (input <- inputs) stringIn.parseAll(input)

  /** Runs the `oneOf` parser over every input, discarding the results. */
  @Benchmark
  def oneOfParse(): Unit =
    for (input <- inputs) oneOf.parseAll(input)

}
87 changes: 76 additions & 11 deletions core/shared/src/main/scala/cats/parse/Parser.scala
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,9 @@ import cats.{Eval, FunctorFilter, Monad, Defer, Alternative, FlatMap, Now, Monoi
import cats.data.{AndThen, Chain, NonEmptyList}

import cats.implicits._
import scala.collection.immutable.SortedSet
import scala.collection.mutable.ListBuffer
import java.util.Arrays

/** Parser0[A] attempts to extract an `A` value from the given input,
* potentially moving its offset forward in the process.
Expand Down Expand Up @@ -520,7 +522,7 @@ object Parser {
}

object Expectation {
case class Str(offset: Int, str: String) extends Expectation
case class OneOfStr(offset: Int, strs: List[String]) extends Expectation
// expected a character in a given range
case class InRange(offset: Int, lower: Char, upper: Char) extends Expectation
case class StartOfString(offset: Int) extends Expectation
Expand All @@ -540,24 +542,28 @@ object Parser {
else {
// these are never equal
(left, right) match {
case (Str(_, s1), Str(_, s2)) => s1.compare(s2)
case (Str(_, _), _) => -1
case (InRange(_, _, _), Str(_, _)) => 1
case (OneOfStr(_, s1), OneOfStr(_, s2)) => s1.compare(s2)
case (OneOfStr(_, _), _) => -1
case (InRange(_, _, _), OneOfStr(_, _)) => 1
case (InRange(_, l1, u1), InRange(_, l2, u2)) =>
val c1 = Character.compare(l1, l2)
if (c1 == 0) Character.compare(u1, u2)
else c1
case (InRange(_, _, _), _) => -1
case (StartOfString(_), Str(_, _) | InRange(_, _, _)) => 1
case (StartOfString(_), OneOfStr(_, _) | InRange(_, _, _)) => 1
case (StartOfString(_), _) =>
-1 // if they have the same offset, already handled above
case (EndOfString(_, _), Str(_, _) | InRange(_, _, _) | StartOfString(_)) => 1
case (
EndOfString(_, _),
OneOfStr(_, _) | InRange(_, _, _) | StartOfString(_)
) =>
1
case (EndOfString(_, l1), EndOfString(_, l2)) =>
Integer.compare(l1, l2)
case (EndOfString(_, _), _) => -1
case (
Length(_, _, _),
Str(_, _) | InRange(_, _, _) | StartOfString(_) | EndOfString(_, _)
OneOfStr(_, _) | InRange(_, _, _) | StartOfString(_) | EndOfString(_, _)
) =>
1
case (Length(_, e1, a1), Length(_, e2, a2)) =>
Expand Down Expand Up @@ -862,6 +868,21 @@ object Parser {
}
}

/** Parse the longest string among the given alternatives that matches the input.
  * Neither the order of the alternatives nor duplicates among them matter.
  *
  * An empty collection yields `fail`, a single distinct string yields `string(s)`;
  * otherwise the alternatives are handed to `Impl.StringIn` as a sorted set.
  *
  * If no string matches, this parser results in an epsilon failure.
  */
def stringIn(strings: Iterable[String]): Parser[Unit] = {
  val alternatives = strings.toList.distinct
  alternatives match {
    case Nil => fail
    case single :: Nil => string(single)
    case several =>
      // sadly scala 2.12 doesn't have the `SortedSet.from` constructor function
      Impl.StringIn(SortedSet(several: _*))
  }
}

/** If the first parser fails to parse its input with an epsilon error,
* try the second parser instead.
*
Expand Down Expand Up @@ -1631,8 +1652,9 @@ object Parser {
case Defer(fn) =>
Defer(() => unmap(compute(fn)))
case Rep(p, m, _) => Rep(unmap(p), m, Accumulator0.unitAccumulator0)
case AnyChar | CharIn(_, _, _) | Str(_) | IgnoreCase(_) | Fail() | FailWith(_) | Length(_) |
TailRecM1(_, _) | FlatMap(_, _) =>
case AnyChar | CharIn(_, _, _) | Str(_) | StringIn(_) | IgnoreCase(_) | Fail() | FailWith(
_
) | Length(_) | TailRecM1(_, _) | FlatMap(_, _) =>
// we can't transform this significantly
pa

Expand Down Expand Up @@ -1754,7 +1776,7 @@ object Parser {
state.offset += message.length
()
} else {
state.error = Chain.one(Expectation.Str(offset, message))
state.error = Chain.one(Expectation.OneOfStr(offset, message :: Nil))
()
}
}
Expand All @@ -1770,7 +1792,7 @@ object Parser {
state.offset += message.length
()
} else {
state.error = Chain.one(Expectation.Str(offset, message))
state.error = Chain.one(Expectation.OneOfStr(offset, message :: Nil))
()
}
}
Expand Down Expand Up @@ -1817,6 +1839,40 @@ object Parser {
null.asInstanceOf[A]
}

/** Walks `radix` along the input starting at `state.offset` and commits the longest
  * alternative found.
  *
  * On success `state.offset` is moved just past the last position at which a node
  * flagged as `word` was reached. When no such node is reached, `state.error` is set
  * to a single `OneOfStr` expectation listing `all` at the starting offset, and the
  * offset is left at its starting value.
  *
  * NOTE: the type parameter `A` is not used by the body.
  */
final def stringIn[A](radix: RadixNode, all: SortedSet[String], state: State): Unit = {
  val input = state.str
  val end = input.length
  val start = state.offset
  var pos = start
  var node = radix
  // offset just past the longest alternative seen so far, -1 while there is none
  var longest = -1
  var searching = pos < end
  while (searching) {
    // `fsts` holds the first character of each outgoing edge of the current node
    val idx = Arrays.binarySearch(node.fsts, input.charAt(pos))
    if (idx < 0) {
      searching = false
    } else {
      val edge = node.prefixes(idx)
      // accept the prefix for this character only if the input carries all of it
      if (!input.startsWith(edge, pos)) {
        searching = false
      } else {
        node = node.children(idx)
        pos += edge.length
        if (node.word) longest = pos
        searching = pos < end
      }
    }
  }
  if (longest >= 0) {
    state.offset = longest
  } else {
    state.error = Chain.one(Expectation.OneOfStr(start, all.toList))
    state.offset = start
  }
}

case class OneOf[A](all: List[Parser[A]]) extends Parser[A] {
require(all.lengthCompare(2) >= 0, s"expected more than two items, found: ${all.size}")
private[this] val ary: Array[Parser0[A]] = all.toArray
Expand All @@ -1831,6 +1887,15 @@ object Parser {
override def parseMut(state: State): A = oneOf(ary, state)
}

/** Parser that accepts any one of the strings in `sorted`, delegating the matching
  * to `stringIn` over a radix tree built from them.
  *
  * @param sorted the alternatives; must contain at least two strings and must not
  *   contain the empty string, otherwise construction fails via `require`
  */
case class StringIn(sorted: SortedSet[String]) extends Parser[Unit] {
  // the check admits exactly two items, so the message says "at least two"
  require(sorted.size >= 2, s"expected at least two items, found: ${sorted.size}")
  require(!sorted.contains(""), "empty string is not allowed in alternatives")
  // built once per parser; `fromListUnsafe` cannot fail because the set has >= 2 elements
  private[this] val tree =
    RadixNode.fromSortedStrings(NonEmptyList.fromListUnsafe(sorted.toList))

  override def parseMut(state: State): Unit = stringIn(tree, sorted, state)
}

final def prod[A, B](pa: Parser0[A], pb: Parser0[B], state: State): (A, B) = {
val a = pa.parseMut(state)
if (state.error eq null) {
Expand Down
104 changes: 104 additions & 0 deletions core/shared/src/main/scala/cats/parse/RadixNode.scala
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
/*
* Copyright (c) 2020 Typelevel
*
* Permission is hereby granted, free of charge, to any person obtaining a copy of
* this software and associated documentation files (the "Software"), to deal in
* the Software without restriction, including without limitation the rights to
* use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
* the Software, and to permit persons to whom the Software is furnished to do so,
* subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
* FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
* COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
* IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/

package cats.parse

import cats.data.NonEmptyList

import scala.annotation.tailrec

class RadixNode(
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I see you are using an Array here to avoid the boxing of using a map. Another approach would be to use scala.collection.immutable.IntMap and lift the Char into an Int, and then IntMap won't box.

That will avoid the binary search, which might be a significant win.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Indeed, let's see if it improves. The Map method also has the overhead of creating an Option; I will compare it to the current approach.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Did you have a chance to try the IntMap approach? I would think for larger sets it would be a big win, but I could be wrong.

val fsts: Array[Char],
val prefixes: Array[String],
val children: Array[RadixNode],
val word: Boolean
) {
override def toString(): String =
s"RadixNode(${fsts.mkString("[", ", ", "]")}, ${children.mkString("[", ", ", "]")}, $word)"
}

object RadixNode {

  /** Builds a radix tree from `strings`, which the grouping below expects to be sorted.
    *
    * Consecutive strings sharing a non-empty common prefix are grouped under one edge
    * labelled with that prefix; the remaining suffixes of each group are turned into
    * the child node recursively. The resulting node is flagged as `word` when
    * `strings` contains at least one empty string.
    */
  def fromSortedStrings(strings: NonEmptyList[String]): RadixNode = {
    // Closes a group: first character of the shared prefix, the prefix itself, and the
    // members with that prefix stripped. Members were accumulated by prepending, so
    // they are reversed to restore the original order.
    def finish(
        shared: String,
        members: NonEmptyList[String]
    ): (Char, String, NonEmptyList[String]) =
      (shared(0), shared, members.map(_.drop(shared.size)).reverse)

    @tailrec
    def groupByNonEmptyPrefix(
        keys: List[String],
        prefix: String,
        current: NonEmptyList[String],
        acc: List[(Char, String, NonEmptyList[String])]
    ): List[(Char, String, NonEmptyList[String])] =
      keys match {
        case Nil =>
          finish(prefix, current) :: acc
        case key :: rest =>
          commonPrefix(prefix, key) match {
            case 0 =>
              // nothing in common with the running group: close it and start a new one
              groupByNonEmptyPrefix(rest, key, NonEmptyList.one(key), finish(prefix, current) :: acc)
            case sharedLength =>
              // clip the running prefix to what is still shared and keep accumulating
              groupByNonEmptyPrefix(rest, prefix.take(sharedLength), key :: current, acc)
          }
      }

    val nonEmptyStrings = strings.filter(_.nonEmpty)
    nonEmptyStrings match {
      case Nil =>
        leaf
      case first :: others =>
        // groups are accumulated in reverse, so flip them back
        val groups =
          groupByNonEmptyPrefix(others, first, NonEmptyList.one(first), Nil).reverse
        new RadixNode(
          groups.map(_._1).toArray,
          groups.map(_._2).toArray,
          groups.map(group => fromSortedStrings(group._3)).toArray,
          // an empty string was filtered out, i.e. a word ends exactly at this node
          nonEmptyStrings.size < strings.size
        )
    }
  }

  // node without outgoing edges that marks the end of a word
  private val leaf = new RadixNode(Array.empty, Array.empty, Array.empty, true)

  /** Length of the longest common prefix of `s1` and `s2`. */
  private def commonPrefix(s1: String, s2: String): Int = {
    val limit = math.min(s1.size, s2.size)
    var idx = 0
    while (idx < limit && s1(idx) == s2(idx)) {
      idx += 1
    }
    idx
  }
}
Loading