Skip to content

Commit

Permalink
strings: improve the performance of levenshtein_distance/2 (use early…
Browse files Browse the repository at this point in the history
… returns for the easy cases, idiomatic array initialisation, and @[direct_array_access])
  • Loading branch information
spytheman committed Jan 8, 2024
1 parent 51e19dd commit 5b9d0f2
Show file tree
Hide file tree
Showing 2 changed files with 79 additions and 3 deletions.
13 changes: 10 additions & 3 deletions vlib/strings/similarity.v
Expand Up @@ -2,11 +2,18 @@ module strings

// levenshtein_distance uses the Levenshtein Distance algorithm to calculate
// the distance between two strings `a` and `b` (lower is closer).
@[direct_array_access]
pub fn levenshtein_distance(a string, b string) int {
mut f := [0].repeat(b.len + 1)
for j in 0 .. f.len {
f[j] = j
if a.len == 0 {
return b.len
}
if b.len == 0 {
return a.len
}
if a == b {
return 0
}
mut f := []int{len: b.len + 1, init: index}
for ca in a {
mut j := 1
mut fj1 := f[0]
Expand Down
69 changes: 69 additions & 0 deletions vlib/v/tests/bench/bench_strings_similarity.v
@@ -0,0 +1,69 @@
import os
import strings
import benchmark

// max_iterations is how many times each benchmarked function is called in `main`.
// It is read from the `MAX_ITERATIONS` environment variable, with '100_000' used
// when `os.getenv_opt` yields no value; the resulting string is converted with `.int()`.
const max_iterations = os.getenv_opt('MAX_ITERATIONS') or { '100_000' }.int()

// imin returns the smaller of the two `u16` values `x` and `y`.
fn imin(x u16, y u16) u16 {
	if y < x {
		return y
	}
	return x
}

// zeozeozeo_levenshtein_distance returns the Levenshtein (edit) distance between
// the bytes of `a` and `b`, keeping only a single rolling row of `a.len + 1` cells.
// The cells are `u16`, so the result is only meaningful while every intermediate
// distance fits in 16 bits.
// From https://gist.github.com/zeozeozeo/f785910173f3115163bffd0c5240de07
@[direct_array_access]
pub fn zeozeozeo_levenshtein_distance(a string, b string) int {
	if a.len == 0 {
		return b.len
	}
	if b.len == 0 {
		return a.len
	}
	if a == b {
		return 0
	}

	// row[j] is the distance between the first j bytes of `a` and the prefix
	// of `b` processed so far; that prefix starts out empty, hence row[j] = j.
	mut row := []u16{len: a.len + 1}
	for i in 1 .. row.len {
		row[i] = i
	}

	// Both loops must include the string length itself (`<=`), otherwise the
	// last byte of `a` and of `b` is never compared and the result is too small.
	for i := 1; i <= b.len; i++ {
		// prev is the value for column j - 1 of the row being built.
		mut prev := u16(i)
		for j := 1; j <= a.len; j++ {
			mut current := row[j - 1] // match
			if b[i - 1] != a[j - 1] {
				// insertion, substitution, deletion
				current = imin(imin(row[j - 1] + 1, prev + 1), row[j] + 1)
			}
			// Column j - 1 of the previous row is no longer needed; overwrite it.
			row[j - 1] = prev
			prev = current
		}
		row[a.len] = prev
	}

	return row[a.len]
}

// main calls each similarity function `max_iterations` times on the same pair
// of short strings, and reports every step through `bench.measure`. The
// accumulated results are embedded in the labels, so the calls are actually used.
fn main() {
	first := 'abcdef'
	second := 'abdef'

	mut std_total := i64(0)
	mut timer := benchmark.start()

	// Step 1: the implementation from the `strings` module.
	for _ in 0 .. max_iterations {
		std_total += i64(strings.levenshtein_distance(first, second))
	}
	timer.measure('strings.levenshtein_distance: ${std_total}')

	// Step 2: the alternative implementation defined in this file.
	mut alt_total := i64(0)
	for _ in 0 .. max_iterations {
		alt_total += i64(zeozeozeo_levenshtein_distance(first, second))
	}
	timer.measure('zeozeozeo_levenshtein_distance: ${alt_total}')

	// Step 3: the dice coefficient, accumulated as a float.
	mut dice_total := f64(0)
	for _ in 0 .. max_iterations {
		dice_total += strings.dice_coefficient(first, second)
	}
	timer.measure('strings.dice_coefficient: ${dice_total}')
}

0 comments on commit 5b9d0f2

Please sign in to comment.