/
jaro.c
95 lines (80 loc) · 3.28 KB
/
jaro.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
#include "jaro.h"
#include "code.h"
#include "adj_matrix.h"
#include <string.h>
#include <stdlib.h>
#include <ctype.h>
#define DEFAULT_WEIGHT 0.1
#define DEFAULT_THRESHOLD 0.7
#define SWAP(x, y) do{ __typeof__(x) SWAP = x; x = y; y = SWAP; }while(0)
const LibJaroOption DEFAULT_OPT = {.weight = DEFAULT_WEIGHT, .threshold = DEFAULT_THRESHOLD, .ignore_case = 0, .adj_table = 0};
double jaro_distance_from_codes(uint32_t* short_codes, size_t short_codes_len, uint32_t* long_codes, size_t long_codes_len, LibJaroOption *opt){
if(!short_codes_len || !long_codes_len) return 0.0;
if(short_codes_len > long_codes_len){
SWAP(short_codes, long_codes);
SWAP(short_codes_len, long_codes_len);
}
if(opt->ignore_case){
for(size_t i = 0; i < short_codes_len; i++) short_codes[i] = tolower(short_codes[i]);
for(size_t i = 0; i < long_codes_len; i++) long_codes[i] = tolower(long_codes[i]);
}
int32_t window_size = long_codes_len/2 - 1;
if(window_size < 0) window_size = 0;
char short_codes_flag[short_codes_len];
char long_codes_flag[long_codes_len];
memset(short_codes_flag, 0, short_codes_len);
memset(long_codes_flag, 0, long_codes_len);
// count number of matching characters
size_t match_count = 0;
for(size_t i = 0; i < short_codes_len; i++){
size_t left = (i >= window_size) ? i - window_size : 0;
size_t right = (i + window_size <= long_codes_len - 1) ? (i + window_size) : (long_codes_len - 1);
if(right > long_codes_len - 1) right = long_codes_len - 1;
for(size_t j = left; j <= right; j++){
if(!long_codes_flag[j] && short_codes[i] == long_codes[j]){
short_codes_flag[i] = long_codes_flag[j] = 1;
match_count++;
break;
}
}
}
if(!match_count) return 0.0;
// count number of transpositions
size_t transposition_count = 0, j = 0, k = 0;
for(size_t i = 0; i < short_codes_len; i++){
if(short_codes_flag[i]){
for(j = k; j < long_codes_len; j++){
if(long_codes_flag[j]){
k = j + 1;
break;
}
}
if(short_codes[i] != long_codes[j]) transposition_count++;
}
}
// count similarities in nonmatched characters
size_t similar_count = 0;
if(opt->adj_table && short_codes_len > match_count)
for(size_t i = 0; i < short_codes_len; i++)
if(!short_codes_flag[i])
for(size_t j = 0; j < long_codes_len; j++)
if(!long_codes_flag[j])
if(adj_matrix_find(adj_matrix_default(), short_codes[i], long_codes[j])){
similar_count += 3;
break;
}
double m = (double)match_count;
double t = (double)(transposition_count/2);
if(opt->adj_table) m = similar_count/10.0 + m;
return (m/short_codes_len + m/long_codes_len + (m-t)/m) / 3;
}
double jaro_winkler_distance_from_codes(uint32_t* short_codes, size_t short_codes_len, uint32_t* long_codes, size_t long_codes_len, LibJaroOption *opt){
double jaro_distance = jaro_distance_from_codes(short_codes, short_codes_len, long_codes, long_codes_len, opt);
if(jaro_distance < opt->threshold) return jaro_distance;
else{
size_t prefix = 0;
size_t max_4 = short_codes_len > 4 ? 4 : short_codes_len;
for(prefix = 0; prefix < max_4 && short_codes[prefix] == long_codes[prefix]; prefix++);
return jaro_distance + prefix*opt->weight*(1-jaro_distance);
}
}