Skip to content

Commit 86470ab

Browse files
authored
strings: add hamming_distance/jaro_similarity/jaro_winkler_similarity functions (#22701)
1 parent c32c2d7 commit 86470ab

File tree

2 files changed

+191
-0
lines changed

2 files changed

+191
-0
lines changed

vlib/strings/similarity.v

Lines changed: 142 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,30 @@ fn min(a u16, b u16, c u16) u16 {
1212
return m
1313
}
1414

15+
// max2 returns the larger of the two integers `a` and `b`.
@[inline]
fn max2(a int, b int) int {
	return if a > b { a } else { b }
}
22+
23+
// min2 returns the smaller of the two integers `a` and `b`.
@[inline]
fn min2(a int, b int) int {
	return if b < a { b } else { a }
}
30+
31+
// abs2 returns the absolute difference between the integers `a` and `b`,
// always subtracting the smaller value from the larger one.
@[inline]
fn abs2(a int, b int) int {
	return if a < b { b - a } else { a - b }
}
38+
1539
// levenshtein_distance uses the Levenshtein Distance algorithm to calculate
1640
// the distance between two strings `a` and `b` (lower is closer).
1741
@[direct_array_access]
@@ -85,3 +109,121 @@ pub fn dice_coefficient(s1 string, s2 string) f32 {
85109
}
86110
return (2.0 * f32(intersection_size)) / (f32(a.len) + f32(b.len) - 2)
87111
}
112+
113+
// hamming_distance returns the Hamming distance between the strings `a` and `b`
// (lower is closer): the number of positions at which the two strings differ.
// The strings are compared index by index via `a[i]` / `b[i]`. When the lengths
// differ, every position of the longer string beyond the shared length is
// counted as one additional difference.
@[direct_array_access]
pub fn hamming_distance(a string, b string) int {
	shared_len := min2(a.len, b.len)
	// The unmatched tail of the longer string has no counterpart to compare
	// against, so the length difference is the starting distance.
	mut distance := abs2(a.len, b.len)
	for idx in 0 .. shared_len {
		if a[idx] != b[idx] {
			distance++
		}
	}
	return distance
}
129+
130+
// hamming_similarity turns the Hamming distance between the strings `a` and `b`
// into a similarity coefficient, by normalising the distance with the length of
// the longer string.
// It returns a coefficient between 0.0 (not similar) and 1.0 (exact match).
pub fn hamming_similarity(a string, b string) f32 {
	longest := max2(a.len, b.len)
	if longest == 0 {
		// Two empty strings are identical; this also avoids dividing by zero below.
		return 1.0
	}
	distance := hamming_distance(a, b)
	return 1.00 - f32(distance) / f32(longest)
}
142+
143+
// jaro_similarity uses the Jaro algorithm to calculate the similarity
// between two strings `a` and `b`.
// It returns a coefficient between 0.0 (not similar) and 1.0 (exact match).
// Two empty strings are treated as an exact match (1.0); if exactly one of the
// strings is empty, the result is 0.0.
@[direct_array_access]
pub fn jaro_similarity(a string, b string) f64 {
	a_len := a.len
	b_len := b.len
	if a_len == 0 && b_len == 0 {
		// Both are empty strings, should return 1.0
		return 1.0
	}
	if a_len == 0 || b_len == 0 {
		return 0
	}

	// Maximum distance up to which matching is allowed.
	// It is clamped to 0: when both strings have length 1, `1 / 2 - 1` is -1,
	// which would make the search window below empty, so that identical
	// one-character strings would be reported as completely dissimilar.
	match_distance := max2(0, max2(a_len, b_len) / 2 - 1)

	mut a_matches := []bool{len: a_len}
	mut b_matches := []bool{len: b_len}
	mut matches := 0
	mut transpositions := 0.0

	// Traverse through the first string, looking for an unused equal
	// character of `b` inside the allowed window around index `i`.
	for i in 0 .. a_len {
		start := max2(0, i - match_distance)
		end := min2(b_len, i + match_distance + 1)
		for k in start .. end {
			// Skip characters of `b` that are already paired
			if b_matches[k] {
				continue
			}
			if a[i] != b[k] {
				continue
			}
			a_matches[i] = true
			b_matches[k] = true
			matches++
			break
		}
	}
	// If there is no match
	if matches == 0 {
		return 0
	}
	mut k := 0
	// Walk the matched characters of both strings in order; every pair that
	// is matched but not equal at the same rank is half a transposition.
	for i in 0 .. a_len {
		if !a_matches[i] {
			continue
		}
		// Find the next matched character in second string.
		// This cannot run past the end of `b_matches`, since both arrays
		// contain exactly `matches` true entries.
		for !b_matches[k] {
			k++
		}
		if a[i] != b[k] {
			transpositions++
		}
		k++
	}
	transpositions /= 2
	return (matches / f64(a_len) + matches / f64(b_len) + (matches - transpositions) / matches) / 3
}
207+
208+
// jaro_winkler_similarity uses the Jaro-Winkler algorithm to calculate
// the similarity between two strings `a` and `b`.
// It returns a coefficient between 0.0 (not similar) and 1.0 (exact match).
// The scaling factor(`p=0.1`) in Jaro-Winkler gives higher weight to prefix
// similarities, making it especially effective for cases where slight misspellings
// or prefixes are common.
// Only the length of the common prefix (at most 4 characters) is rewarded.
@[direct_array_access]
pub fn jaro_winkler_similarity(a string, b string) f64 {
	// Maximum of 4 characters are allowed in prefix
	lmax := min2(4, min2(a.len, b.len))
	mut l := 0
	for i in 0 .. lmax {
		// The common prefix ends at the first mismatch; equal characters
		// after it must not contribute to the prefix bonus.
		if a[i] != b[i] {
			break
		}
		l++
	}
	js := jaro_similarity(a, b)
	// select a multiplier (Winkler suggested p=0.1) for the relative importance of the prefix for the word similarity
	p := 0.1
	ws := js + f64(l) * p * (1 - js)
	return ws
}

vlib/strings/similarity_test.v

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,3 +12,52 @@ fn test_levenshtein_distance() {
1212
assert strings.levenshtein_distance('flomax', 'volmax') == 3
1313
assert strings.levenshtein_distance('ab', 'cd') == 2
1414
}
15+
16+
// test_hamming_distance checks strings.hamming_distance on empty, identical,
// same-length and different-length inputs.
fn test_hamming_distance() {
17+
assert strings.hamming_distance('', '') == 0
18+
assert strings.hamming_distance('one', 'one') == 0
19+
assert strings.hamming_distance('', 'two') == 3 // only the length difference counts
20+
assert strings.hamming_distance('three', '') == 5
21+
assert strings.hamming_distance('bananna', '') == 7
22+
assert strings.hamming_distance('cats', 'hats') == 1
23+
assert strings.hamming_distance('hugs', 'shrugs') == 6 // 4 mismatches in the shared length + 2 extra characters
24+
assert strings.hamming_distance('broom', 'shroom') == 5 // 4 mismatches in the shared length + 1 extra character
25+
assert strings.hamming_distance('flomax', 'volmax') == 3
26+
assert strings.hamming_distance('ab', 'cd') == 2
27+
}
28+
29+
// test_hamming_similarity checks strings.hamming_similarity, which returns an f32
// coefficient; the expected values are compared exactly.
fn test_hamming_similarity() {
30+
assert strings.hamming_similarity('', '') == 1.0
31+
assert strings.hamming_similarity('one', 'one') == 1.0
32+
assert strings.hamming_similarity('', 'two') == 0
33+
assert strings.hamming_similarity('three', '') == 0
34+
assert strings.hamming_similarity('bananna', '') == 0
35+
assert strings.hamming_similarity('cats', 'hats') == 0.75
36+
assert strings.hamming_similarity('hugs', 'shrugs') == 0 // distance 6 over longest length 6
37+
assert strings.hamming_similarity('broom', 'shroom') == 0.1666666865348816 // 1 - 5/6 evaluated in f32
38+
assert strings.hamming_similarity('flomax', 'volmax') == 0.5
39+
assert strings.hamming_similarity('ab', 'cd') == 0
40+
}
41+
42+
// test_jaro_similarity checks strings.jaro_similarity on empty and identical inputs,
// and on three word pairs with exact expected f64 results.
fn test_jaro_similarity() {
43+
assert strings.jaro_similarity('', '') == 1
44+
assert strings.jaro_similarity('one', 'one') == 1
45+
assert strings.jaro_similarity('', 'two') == 0
46+
assert strings.jaro_similarity('three', '') == 0
47+
assert strings.jaro_similarity('bananna', '') == 0
48+
assert strings.jaro_similarity('MARTHA', 'MARHTA') == 0.9444444444444445
49+
assert strings.jaro_similarity('DIXON', 'DICKSONX') == 0.7666666666666666
50+
assert strings.jaro_similarity('JELLYFISH', 'SMELLYFISH') == 0.8962962962962964
51+
}
52+
53+
// test_jaro_winkler_similarity checks strings.jaro_winkler_similarity on empty and
// identical inputs, and on word pairs with and without a shared prefix.
fn test_jaro_winkler_similarity() {
54+
assert strings.jaro_winkler_similarity('', '') == 1
55+
assert strings.jaro_winkler_similarity('one', 'one') == 1
56+
assert strings.jaro_winkler_similarity('', 'two') == 0
57+
assert strings.jaro_winkler_similarity('three', '') == 0
58+
assert strings.jaro_winkler_similarity('bananna', '') == 0
59+
assert strings.jaro_winkler_similarity('accomodate', 'accommodate') == 0.9818181818181818
60+
assert strings.jaro_winkler_similarity('accomodate', 'accompanist') == 0.8672727272727273
61+
assert strings.jaro_winkler_similarity('untill', 'huntsville') == 0.8666666666666667 // first characters differ
62+
assert strings.jaro_winkler_similarity('wich', 'wichita') == 0.9142857142857143
63+
}

0 commit comments

Comments
 (0)