@@ -12,6 +12,30 @@ fn min(a u16, b u16, c u16) u16 {
1212 return m
1313}
1414
// max2 returns the larger of the two integers `a` and `b`.
// When both are equal, that common value is returned.
@[inline]
fn max2(a int, b int) int {
	return if a < b { b } else { a }
}
22+
// min2 returns the smaller of the two integers `a` and `b`.
// When both are equal, that common value is returned.
@[inline]
fn min2(a int, b int) int {
	return if a < b { a } else { b }
}
30+
// abs2 returns the absolute difference between the integers `a` and `b`,
// i.e. the larger one minus the smaller one.
@[inline]
fn abs2(a int, b int) int {
	return if a < b { b - a } else { a - b }
}
38+
1539// levenshtein_distance uses the Levenshtein Distance algorithm to calculate
// the distance between two strings `a` and `b` (lower is closer).
1741@[direct_array_access]
@@ -85,3 +109,121 @@ pub fn dice_coefficient(s1 string, s2 string) f32 {
85109 }
86110 return (2.0 * f32 (intersection_size)) / (f32 (a.len) + f32 (b.len) - 2 )
87111}
112+
// hamming_distance returns the Hamming distance between the strings `a` and `b`
// (lower is closer). The strings are compared byte by byte over the length
// they have in common; every byte by which one string is longer than the
// other counts as one additional difference.
@[direct_array_access]
pub fn hamming_distance(a string, b string) int {
	shared_len := min2(a.len, b.len)
	// Start from the length difference: the surplus bytes have no counterpart.
	mut distance := abs2(a.len, b.len)
	for pos in 0 .. shared_len {
		if a[pos] != b[pos] {
			distance++
		}
	}
	return distance
}
129+
// hamming_similarity converts the Hamming distance between the strings `a`
// and `b` into a similarity coefficient between 0.0 (not similar) and
// 1.0 (exact match), by relating the distance to the length of the longer string.
pub fn hamming_similarity(a string, b string) f32 {
	longest := max2(a.len, b.len)
	if longest != 0 {
		mismatches := hamming_distance(a, b)
		return 1.00 - f32(mismatches) / f32(longest)
	}
	// Both strings are empty, which counts as an exact match.
	return 1.0
}
142+
// jaro_similarity uses the Jaro algorithm to calculate the similarity
// of two strings `a` and `b`, comparing them byte by byte.
// It returns a coefficient between 0.0 (not similar) and 1.0 (exact match).
// Two empty strings are treated as an exact match (1.0); if only one of
// them is empty, the result is 0.0.
@[direct_array_access]
pub fn jaro_similarity(a string, b string) f64 {
	a_len := a.len
	b_len := b.len
	if a_len == 0 && b_len == 0 {
		// Both are empty strings, should return 1.0
		return 1.0
	}
	if a_len == 0 || b_len == 0 {
		return 0
	}

	// Maximum distance up to which matching is allowed.
	// It is clamped at 0: for two 1-byte strings `1 / 2 - 1` is -1, which
	// would make the search window below empty, so identical 1-byte strings
	// would never match and would score 0.0 instead of 1.0.
	match_distance := max2(0, max2(a_len, b_len) / 2 - 1)

	// Flags marking which bytes of `a` and `b` have already been paired up.
	mut a_matches := []bool{len: a_len}
	mut b_matches := []bool{len: b_len}
	mut matches := 0
	mut transpositions := 0.0

	// Traverse through the first string
	for i in 0 .. a_len {
		// Window of candidate positions in `b` for the byte a[i].
		start := max2(0, i - match_distance)
		end := min2(b_len, i + match_distance + 1)
		for k in start .. end {
			// Skip positions of `b` that are already paired
			if b_matches[k] {
				continue
			}
			if a[i] != b[k] {
				continue
			}
			// Pair a[i] with the first free, equal byte inside the window
			a_matches[i] = true
			b_matches[k] = true
			matches++
			break
		}
	}
	// If there is no match
	if matches == 0 {
		return 0
	}
	mut k := 0
	// Walk the matched bytes of both strings in order and count the
	// positions where the i-th matched byte of `a` differs from the
	// i-th matched byte of `b`.
	for i in 0 .. a_len {
		if !a_matches[i] {
			continue
		}
		// Find the next matched character in second string.
		// Both strings have the same number of matched bytes, so `k`
		// stays inside `b` here.
		for !b_matches[k] {
			k++
		}
		if a[i] != b[k] {
			transpositions++
		}
		k++
	}
	// Every transposition was counted twice above, once per swapped byte.
	transpositions /= 2
	m := f64(matches)
	return (m / f64(a_len) + m / f64(b_len) + (m - transpositions) / m) / 3
}
207+
// jaro_winkler_similarity uses the Jaro-Winkler algorithm to calculate
// the similarity of two strings `a` and `b`.
// It returns a coefficient between 0.0 (not similar) and 1.0 (exact match).
// The scaling factor (`p=0.1`) in Jaro-Winkler gives higher weight to prefix
// similarities, making it especially effective for cases where slight misspellings
// or prefixes are common.
@[direct_array_access]
pub fn jaro_winkler_similarity(a string, b string) f64 {
	// Maximum of 4 characters are allowed in prefix
	lmax := min2(4, min2(a.len, b.len))
	// `l` is the length of the common prefix of `a` and `b`, capped at `lmax`.
	mut l := 0
	for i in 0 .. lmax {
		if a[i] != b[i] {
			// The common prefix ends at the first mismatch; equal bytes
			// after it must not be counted towards the prefix bonus.
			break
		}
		l++
	}
	js := jaro_similarity(a, b)
	// select a multiplier (Winkler suggested p=0.1) for the relative importance of the prefix for the word similarity
	p := 0.1
	// Move the Jaro score towards 1.0 proportionally to the prefix length.
	return js + f64(l) * p * (1 - js)
}
0 commit comments