From fe72ab406d9832793887766dd8d61b64303f25ca Mon Sep 17 00:00:00 2001 From: Jian Weihang Date: Sat, 30 Sep 2017 12:10:53 +0800 Subject: [PATCH] feat: support encodings other than utf-8 Do not treat every string as UTF-8 encoded; instead, use Ruby's encoding API to get Unicode codepoints. close #7 --- ext/jaro_winkler/adj_matrix.c | 4 +-- ext/jaro_winkler/code.c | 51 +++++++++++++++++++-------------- ext/jaro_winkler/code.h | 11 +++++-- ext/jaro_winkler/jaro.c | 37 +++--------------------- ext/jaro_winkler/jaro.h | 10 +++---- ext/jaro_winkler/jaro_winkler.c | 16 +++++++---- test/test_jaro_winkler.rb | 12 ++++++++ 7 files changed, 71 insertions(+), 70 deletions(-) diff --git a/ext/jaro_winkler/adj_matrix.c b/ext/jaro_winkler/adj_matrix.c index 0daec6c..59b8902 100644 --- a/ext/jaro_winkler/adj_matrix.c +++ b/ext/jaro_winkler/adj_matrix.c @@ -77,8 +77,8 @@ AdjMatrix* adj_matrix_default(){ for(size_t i = 0; i < length; i += 2){ uint64_t code_1, code_2; size_t dummy_length; - utf_char_to_code((char*)DEFAULT_ADJ_TABLE[i], &code_1, &dummy_length); - utf_char_to_code((char*)DEFAULT_ADJ_TABLE[i+1], &code_2, &dummy_length); + code_1 = *DEFAULT_ADJ_TABLE[i] & 0xff; + code_2 = *DEFAULT_ADJ_TABLE[i+1] & 0xff; adj_matrix_add(ret_matrix, code_1, code_2); } first_time = 0; diff --git a/ext/jaro_winkler/code.c b/ext/jaro_winkler/code.c index 4600ec9..2461cec 100644 --- a/ext/jaro_winkler/code.c +++ b/ext/jaro_winkler/code.c @@ -1,30 +1,37 @@ #include #include #include +#include "ruby.h" +#include "ruby/encoding.h" +#include "code.h" -void utf_char_to_code(char *str, uint64_t *ret_code, size_t *ret_byte_length){ - unsigned char first_char = str[0]; - if(first_char >= 252) *ret_byte_length = 6; // 1111110x - else if(first_char >= 248) *ret_byte_length = 5; // 111110xx - else if(first_char >= 240) *ret_byte_length = 4; // 11110xxx - else if(first_char >= 224) *ret_byte_length = 3; // 1110xxxx - else if(first_char >= 192) *ret_byte_length = 2; // 110xxxxx - else *ret_byte_length = 1; - 
*ret_code = 0; - memcpy(ret_code, str, *ret_byte_length); -} - -void string_to_codes(char *str, size_t length, uint64_t **ret_codes, size_t *ret_length){ - uint32_t code; - char byte_length; +void codepoints_init(CodePoints *codepoints, VALUE str){ + int32_t n; + uint32_t c; + const char *ptr, *end; + rb_encoding *enc; - *ret_codes = calloc(length, sizeof(long long)); - *ret_length = 0; + codepoints->length = 0; + codepoints->size = 32; + codepoints->data = malloc(codepoints->size * sizeof(*codepoints->data)); + str = rb_str_new_frozen(str); + ptr = RSTRING_PTR(str); + end = RSTRING_END(str); + enc = rb_enc_get(str); - for(size_t i = 0; i < length;){ - size_t byte_length; - utf_char_to_code(&str[i], &(*ret_codes)[*ret_length], &byte_length); - *ret_length += 1; - i += byte_length; + while (ptr < end) { + c = rb_enc_codepoint_len(ptr, end, &n, enc); + if(codepoints->length == codepoints->size) { + codepoints->size *= 2; + codepoints->data = realloc(codepoints->data, sizeof(*codepoints->data) * codepoints->size); + } + codepoints->data[codepoints->length++] = c; + ptr += n; } + RB_GC_GUARD(str); +} + + +void codepoints_free(CodePoints *codepoints) { + free(codepoints->data); } diff --git a/ext/jaro_winkler/code.h b/ext/jaro_winkler/code.h index fbb7f50..76bbe3e 100644 --- a/ext/jaro_winkler/code.h +++ b/ext/jaro_winkler/code.h @@ -1,6 +1,13 @@ #pragma once #include #include +#include "ruby.h" -void utf_char_to_code(char *str, uint64_t *ret_code, size_t *ret_byte_length); -void string_to_codes(char *str, size_t length, uint64_t **ret_codes, size_t *ret_length); +typedef struct { + uint32_t *data; + size_t length; + size_t size; +} CodePoints; + +void codepoints_init(CodePoints*, VALUE str); +void codepoints_free(CodePoints*); diff --git a/ext/jaro_winkler/jaro.c b/ext/jaro_winkler/jaro.c index 25884e6..6788236 100644 --- a/ext/jaro_winkler/jaro.c +++ b/ext/jaro_winkler/jaro.c @@ -6,42 +6,13 @@ #include #include +#define DEFAULT_WEIGHT 0.1 +#define DEFAULT_THRESHOLD 
0.7 #define SWAP(x, y) do{ __typeof__(x) SWAP = x; x = y; y = SWAP; }while(0) const LibJaroOption DEFAULT_OPT = {.weight = DEFAULT_WEIGHT, .threshold = DEFAULT_THRESHOLD, .ignore_case = 0, .adj_table = 0}; -double jaro_distance_from_codes(uint64_t *codes1, size_t len1, uint64_t *codes2, size_t len2, LibJaroOption *opt); -double jaro_winkler_distance_from_codes(uint64_t *codes1, size_t len1, uint64_t *codes2, size_t len2, LibJaroOption *opt); - -double jaro_distance(char* short_str, size_t short_str_len, char* long_str, size_t long_str_len, LibJaroOption *opt){ - if(!short_str_len || !long_str_len) return 0.0; - - uint64_t *short_codes, *long_codes; - size_t short_codes_len, long_codes_len; - string_to_codes(short_str, short_str_len, &short_codes, &short_codes_len); - string_to_codes(long_str, long_str_len, &long_codes, &long_codes_len); - - double ret = jaro_distance_from_codes(short_codes, short_codes_len, long_codes, long_codes_len, opt); - - free(short_codes); free(long_codes); - return ret; -} - -double jaro_winkler_distance(char* short_str, size_t short_str_len, char* long_str, size_t long_str_len, LibJaroOption *opt){ - if(!short_str_len || !long_str_len) return 0.0; - - uint64_t *short_codes, *long_codes; - size_t short_codes_len, long_codes_len; - string_to_codes(short_str, short_str_len, &short_codes, &short_codes_len); - string_to_codes(long_str, long_str_len, &long_codes, &long_codes_len); - - double ret = jaro_winkler_distance_from_codes(short_codes, short_codes_len, long_codes, long_codes_len, opt); - - free(short_codes); free(long_codes); - return ret; -} - -double jaro_distance_from_codes(uint64_t* short_codes, size_t short_codes_len, uint64_t* long_codes, size_t long_codes_len, LibJaroOption *opt){ +double jaro_distance_from_codes(uint32_t* short_codes, size_t short_codes_len, uint32_t* long_codes, size_t long_codes_len, LibJaroOption *opt){ if(!short_codes_len || !long_codes_len) return 0.0; if(short_codes_len > long_codes_len){ @@ -111,7 +82,7 @@ 
double jaro_distance_from_codes(uint64_t* short_codes, size_t short_codes_len, u return (m/short_codes_len + m/long_codes_len + (m-t)/m) / 3; } -double jaro_winkler_distance_from_codes(uint64_t* short_codes, size_t short_codes_len, uint64_t* long_codes, size_t long_codes_len, LibJaroOption *opt){ +double jaro_winkler_distance_from_codes(uint32_t* short_codes, size_t short_codes_len, uint32_t* long_codes, size_t long_codes_len, LibJaroOption *opt){ double jaro_distance = jaro_distance_from_codes(short_codes, short_codes_len, long_codes, long_codes_len, opt); if(jaro_distance < opt->threshold) return jaro_distance; diff --git a/ext/jaro_winkler/jaro.h b/ext/jaro_winkler/jaro.h index 44f15a3..04ae079 100644 --- a/ext/jaro_winkler/jaro.h +++ b/ext/jaro_winkler/jaro.h @@ -1,16 +1,14 @@ #pragma once #include - -#define DEFAULT_WEIGHT 0.1 -#define DEFAULT_THRESHOLD 0.7 +#include typedef struct LibJaroOption{ double weight, threshold; char ignore_case, adj_table; } LibJaroOption; - extern const LibJaroOption DEFAULT_OPT; -double jaro_distance(char *str1, size_t len1, char *str2, size_t len2, LibJaroOption *opt); -double jaro_winkler_distance(char *str1, size_t len1, char *str2, size_t len2, LibJaroOption *opt); + +double jaro_distance_from_codes(uint32_t* short_codes, size_t short_codes_len, uint32_t* long_codes, size_t long_codes_len, LibJaroOption *opt); +double jaro_winkler_distance_from_codes(uint32_t* short_codes, size_t short_codes_len, uint32_t* long_codes, size_t long_codes_len, LibJaroOption *opt); diff --git a/ext/jaro_winkler/jaro_winkler.c b/ext/jaro_winkler/jaro_winkler.c index 465e60a..4819917 100644 --- a/ext/jaro_winkler/jaro_winkler.c +++ b/ext/jaro_winkler/jaro_winkler.c @@ -1,5 +1,6 @@ #include "ruby.h" #include "jaro.h" +#include "code.h" VALUE rb_mJaroWinkler, rb_eError, @@ -7,7 +8,7 @@ VALUE rb_mJaroWinkler, VALUE rb_jaro_winkler_distance(size_t argc, VALUE *argv, VALUE self); VALUE rb_jaro_distance(size_t argc, VALUE *argv, VALUE self); -VALUE 
distance(size_t argc, VALUE *argv, VALUE self, double (*distance_fn)(char *str1, size_t len1, char *str2, size_t len2, LibJaroOption *opt)); +VALUE distance(size_t argc, VALUE *argv, VALUE self, double (*distance_fn)(uint32_t* short_codes, size_t short_codes_len, uint32_t* long_codes, size_t long_codes_len, LibJaroOption *opt)); void Init_jaro_winkler_ext(void){ rb_mJaroWinkler = rb_define_module("JaroWinkler"); @@ -18,9 +19,14 @@ void Init_jaro_winkler_ext(void){ } -VALUE distance(size_t argc, VALUE *argv, VALUE self, double (*distance_fn)(char *str1, size_t len1, char *str2, size_t len2, LibJaroOption *opt)){ +VALUE distance(size_t argc, VALUE *argv, VALUE self, double (*distance_fn)(uint32_t* short_codes, size_t short_codes_len, uint32_t* long_codes, size_t long_codes_len, LibJaroOption *opt)){ VALUE s1, s2, opt; + CodePoints cp1, cp2; + rb_scan_args(argc, argv, "2:", &s1, &s2, &opt); + codepoints_init(&cp1, s1); + codepoints_init(&cp2, s2); + LibJaroOption c_opt = DEFAULT_OPT; if(TYPE(opt) == T_HASH){ VALUE weight = rb_hash_aref(opt, ID2SYM(rb_intern("weight"))), @@ -33,13 +39,13 @@ VALUE distance(size_t argc, VALUE *argv, VALUE self, double (*distance_fn)(char if(!NIL_P(ignore_case)) c_opt.ignore_case = (TYPE(ignore_case) == T_FALSE || NIL_P(ignore_case)) ? 0 : 1; if(!NIL_P(adj_table)) c_opt.adj_table = (TYPE(adj_table) == T_FALSE || NIL_P(adj_table)) ? 
0 : 1; } - return rb_float_new((*distance_fn)(StringValuePtr(s1), RSTRING_LEN(s1), StringValuePtr(s2), RSTRING_LEN(s2), &c_opt)); + return rb_float_new((*distance_fn)(cp1.data, cp1.length, cp2.data, cp2.length, &c_opt)); } VALUE rb_jaro_distance(size_t argc, VALUE *argv, VALUE self){ - return distance(argc, argv, self, jaro_distance); + return distance(argc, argv, self, jaro_distance_from_codes); } VALUE rb_jaro_winkler_distance(size_t argc, VALUE *argv, VALUE self){ - return distance(argc, argv, self, jaro_winkler_distance); + return distance(argc, argv, self, jaro_winkler_distance_from_codes); } diff --git a/test/test_jaro_winkler.rb b/test/test_jaro_winkler.rb index d23b895..2b4c223 100644 --- a/test/test_jaro_winkler.rb +++ b/test/test_jaro_winkler.rb @@ -93,12 +93,24 @@ def test_long_string JaroWinkler.distance 'haisai' * 20, 'haisai' * 20 end + def test_encoding + assert_encoding '焦玟綾', '焦紋綾', Encoding::Big5 + assert_encoding '簡煒航', '簡偉航', Encoding::Big5_HKSCS + assert_encoding '西島之', '西鳥志', Encoding::EUCJP + assert_encoding '松本行弘', '枩本行弘', Encoding::Shift_JIS + assert_distance 1.0, "\xe8".force_encoding('iso8859-1'), 'è' + end + private def assert_distance score, str1, str2, options={} assert_in_delta score, JaroWinkler.distance(str1, str2, options) end + def assert_encoding str1, str2, encoding, options={} + assert_distance JaroWinkler.distance(str1, str2), str1.encode(encoding), str2.encode(encoding) + end + def assert_jaro_distance score, str1, str2, options={} assert_in_delta score, JaroWinkler.jaro_distance(str1, str2, options) end