Skip to content

Commit

Permalink
feat: support encodings other than utf-8
Browse files Browse the repository at this point in the history
Do not treat every string as UTF-8-encoded; instead, use Ruby's API to get Unicode codepoints.

close #7
  • Loading branch information
tonytonyjan committed Sep 30, 2017
1 parent 1a37c7e commit fe72ab4
Show file tree
Hide file tree
Showing 7 changed files with 71 additions and 70 deletions.
4 changes: 2 additions & 2 deletions ext/jaro_winkler/adj_matrix.c
Expand Up @@ -77,8 +77,8 @@ AdjMatrix* adj_matrix_default(){
for(size_t i = 0; i < length; i += 2){
uint64_t code_1, code_2;
size_t dummy_length;
utf_char_to_code((char*)DEFAULT_ADJ_TABLE[i], &code_1, &dummy_length);
utf_char_to_code((char*)DEFAULT_ADJ_TABLE[i+1], &code_2, &dummy_length);
code_1 = *DEFAULT_ADJ_TABLE[i] & 0xff;
code_2 = *DEFAULT_ADJ_TABLE[i+1] & 0xff;
adj_matrix_add(ret_matrix, code_1, code_2);
}
first_time = 0;
Expand Down
51 changes: 29 additions & 22 deletions ext/jaro_winkler/code.c
@@ -1,30 +1,37 @@
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include "ruby.h"
#include "ruby/encoding.h"
#include "code.h"

/*
 * Read the UTF-8 sequence that starts at `str`.
 *
 * str             - pointer to the first byte of a NUL-terminated buffer.
 * ret_code        - receives the raw bytes of the sequence, copied into the
 *                   low-address bytes of a zeroed uint64_t. The numeric value
 *                   is therefore byte-order dependent and is not the Unicode
 *                   scalar value; it is only suitable for equality tests.
 * ret_byte_length - receives the number of bytes consumed (always >= 1).
 *
 * The expected length comes from the lead byte, but only continuation bytes
 * (10xxxxxx) that are actually present are consumed. A truncated or malformed
 * sequence can therefore never make the copy run past the terminating NUL.
 */
void utf_char_to_code(char *str, uint64_t *ret_code, size_t *ret_byte_length){
  const unsigned char *bytes = (const unsigned char *)str;
  unsigned char first_char = bytes[0];
  size_t expected_length;
  size_t actual_length = 1;

  if(first_char >= 252) expected_length = 6;      // 1111110x
  else if(first_char >= 248) expected_length = 5; // 111110xx
  else if(first_char >= 240) expected_length = 4; // 11110xxx
  else if(first_char >= 224) expected_length = 3; // 1110xxxx
  else if(first_char >= 192) expected_length = 2; // 110xxxxx
  else expected_length = 1;

  /* A NUL byte is not a continuation byte, so this stops at end of string. */
  while(actual_length < expected_length && (bytes[actual_length] & 0xC0) == 0x80)
    actual_length++;

  *ret_code = 0;
  memcpy(ret_code, bytes, actual_length);
  *ret_byte_length = actual_length;
}

void string_to_codes(char *str, size_t length, uint64_t **ret_codes, size_t *ret_length){
uint32_t code;
char byte_length;
void codepoints_init(CodePoints *codepoints, VALUE str){
int32_t n;
uint32_t c;
const char *ptr, *end;
rb_encoding *enc;

*ret_codes = calloc(length, sizeof(long long));
*ret_length = 0;
codepoints->length = 0;
codepoints->size = 32;
codepoints->data = malloc(codepoints->size * sizeof(*codepoints->data));
str = rb_str_new_frozen(str);
ptr = RSTRING_PTR(str);
end = RSTRING_END(str);
enc = rb_enc_get(str);

for(size_t i = 0; i < length;){
size_t byte_length;
utf_char_to_code(&str[i], &(*ret_codes)[*ret_length], &byte_length);
*ret_length += 1;
i += byte_length;
while (ptr < end) {
c = rb_enc_codepoint_len(ptr, end, &n, enc);
if(codepoints->length == codepoints->size) {
codepoints->size *= 2;
codepoints->data = realloc(codepoints->data, sizeof(*codepoints->data) * codepoints->size);
}
codepoints->data[codepoints->length++] = c;
ptr += n;
}
RB_GC_GUARD(str);
}


/*
 * Release the buffer that codepoints_init allocated for `codepoints`.
 *
 * The struct itself is not freed; only its `data` buffer is. Afterwards the
 * struct is reset to an empty state, so a repeated call is a harmless no-op
 * rather than a double free, and no dangling pointer is left behind.
 * Passing NULL is allowed and does nothing.
 */
void codepoints_free(CodePoints *codepoints) {
  if(codepoints == NULL) return;

  free(codepoints->data); /* free(NULL) is defined to do nothing */
  codepoints->data = NULL;
  codepoints->length = 0;
  codepoints->size = 0;
}
11 changes: 9 additions & 2 deletions ext/jaro_winkler/code.h
@@ -1,6 +1,13 @@
#pragma once
#include <stdint.h>
#include <stddef.h>
#include "ruby.h"

void utf_char_to_code(char *str, uint64_t *ret_code, size_t *ret_byte_length);
void string_to_codes(char *str, size_t length, uint64_t **ret_codes, size_t *ret_length);
/*
 * Growable array of code points, filled by codepoints_init and released by
 * codepoints_free.
 */
typedef struct {
/* Heap buffer holding the code points (allocated in codepoints_init). */
uint32_t *data;
/* Number of code points currently stored in `data`. */
size_t length;
/* Capacity of `data` in elements; codepoints_init doubles it when full. */
size_t size;
} CodePoints;

void codepoints_init(CodePoints*, VALUE str);
void codepoints_free(CodePoints*);
37 changes: 4 additions & 33 deletions ext/jaro_winkler/jaro.c
Expand Up @@ -6,42 +6,13 @@
#include <stdlib.h>
#include <ctype.h>

/* Values used for the `weight` and `threshold` fields of DEFAULT_OPT. */
#define DEFAULT_WEIGHT 0.1
#define DEFAULT_THRESHOLD 0.7
/*
 * Exchange the values of two lvalues of the same type. Relies on the
 * `__typeof__` compiler extension; the do{...}while(0) wrapper lets the macro
 * be used as a single statement.
 */
#define SWAP(x, y) do{ __typeof__(x) SWAP = x; x = y; y = SWAP; }while(0)

/* Option set with the default weight/threshold and both flags turned off. */
const LibJaroOption DEFAULT_OPT = {.weight = DEFAULT_WEIGHT, .threshold = DEFAULT_THRESHOLD, .ignore_case = 0, .adj_table = 0};

double jaro_distance_from_codes(uint64_t *codes1, size_t len1, uint64_t *codes2, size_t len2, LibJaroOption *opt);
double jaro_winkler_distance_from_codes(uint64_t *codes1, size_t len1, uint64_t *codes2, size_t len2, LibJaroOption *opt);

/*
 * Jaro distance between two byte strings.
 *
 * Both inputs are converted to code arrays with string_to_codes, scored with
 * jaro_distance_from_codes, and the temporary arrays are freed before
 * returning. Returns 0.0 when either input has zero length.
 */
double jaro_distance(char* short_str, size_t short_str_len, char* long_str, size_t long_str_len, LibJaroOption *opt){
  uint64_t *first_codes, *second_codes;
  size_t first_count, second_count;
  double score;

  /* Nothing to compare: skip the conversion entirely. */
  if(short_str_len == 0 || long_str_len == 0){
    return 0.0;
  }

  string_to_codes(short_str, short_str_len, &first_codes, &first_count);
  string_to_codes(long_str, long_str_len, &second_codes, &second_count);

  score = jaro_distance_from_codes(first_codes, first_count, second_codes, second_count, opt);

  free(first_codes);
  free(second_codes);
  return score;
}

/*
 * Jaro-Winkler distance between two byte strings.
 *
 * Both inputs are converted to code arrays with string_to_codes, scored with
 * jaro_winkler_distance_from_codes, and the temporary arrays are freed before
 * returning. Returns 0.0 when either input has zero length.
 */
double jaro_winkler_distance(char* short_str, size_t short_str_len, char* long_str, size_t long_str_len, LibJaroOption *opt){
  uint64_t *first_codes, *second_codes;
  size_t first_count, second_count;
  double score;

  /* Nothing to compare: skip the conversion entirely. */
  if(short_str_len == 0 || long_str_len == 0){
    return 0.0;
  }

  string_to_codes(short_str, short_str_len, &first_codes, &first_count);
  string_to_codes(long_str, long_str_len, &second_codes, &second_count);

  score = jaro_winkler_distance_from_codes(first_codes, first_count, second_codes, second_count, opt);

  free(first_codes);
  free(second_codes);
  return score;
}

double jaro_distance_from_codes(uint64_t* short_codes, size_t short_codes_len, uint64_t* long_codes, size_t long_codes_len, LibJaroOption *opt){
double jaro_distance_from_codes(uint32_t* short_codes, size_t short_codes_len, uint32_t* long_codes, size_t long_codes_len, LibJaroOption *opt){
if(!short_codes_len || !long_codes_len) return 0.0;

if(short_codes_len > long_codes_len){
Expand Down Expand Up @@ -111,7 +82,7 @@ double jaro_distance_from_codes(uint64_t* short_codes, size_t short_codes_len, u
return (m/short_codes_len + m/long_codes_len + (m-t)/m) / 3;
}

double jaro_winkler_distance_from_codes(uint64_t* short_codes, size_t short_codes_len, uint64_t* long_codes, size_t long_codes_len, LibJaroOption *opt){
double jaro_winkler_distance_from_codes(uint32_t* short_codes, size_t short_codes_len, uint32_t* long_codes, size_t long_codes_len, LibJaroOption *opt){
double jaro_distance = jaro_distance_from_codes(short_codes, short_codes_len, long_codes, long_codes_len, opt);

if(jaro_distance < opt->threshold) return jaro_distance;
Expand Down
10 changes: 4 additions & 6 deletions ext/jaro_winkler/jaro.h
@@ -1,16 +1,14 @@
#pragma once

#include <stddef.h>

#define DEFAULT_WEIGHT 0.1
#define DEFAULT_THRESHOLD 0.7
#include <stdint.h>

/* Tuning options passed to the distance functions. */
typedef struct LibJaroOption{
/*
 * threshold: jaro_winkler_distance_from_codes returns the plain Jaro distance
 * unchanged when that distance is below this value.
 * weight: presumably the scaling factor of the Winkler adjustment; its use is
 * not visible in this view (TODO confirm).
 */
double weight, threshold;
/*
 * Flags stored as 0 or 1. NOTE(review): judging by the names they enable
 * case-insensitive comparison and adjacency-table matching; confirm against
 * the scoring code.
 */
char ignore_case, adj_table;
} LibJaroOption;


extern const LibJaroOption DEFAULT_OPT;
double jaro_distance(char *str1, size_t len1, char *str2, size_t len2, LibJaroOption *opt);
double jaro_winkler_distance(char *str1, size_t len1, char *str2, size_t len2, LibJaroOption *opt);

double jaro_distance_from_codes(uint32_t* short_codes, size_t short_codes_len, uint32_t* long_codes, size_t long_codes_len, LibJaroOption *opt);
double jaro_winkler_distance_from_codes(uint32_t* short_codes, size_t short_codes_len, uint32_t* long_codes, size_t long_codes_len, LibJaroOption *opt);
16 changes: 11 additions & 5 deletions ext/jaro_winkler/jaro_winkler.c
@@ -1,13 +1,14 @@
#include "ruby.h"
#include "jaro.h"
#include "code.h"

VALUE rb_mJaroWinkler,
rb_eError,
rb_eInvalidWeightError;

VALUE rb_jaro_winkler_distance(size_t argc, VALUE *argv, VALUE self);
VALUE rb_jaro_distance(size_t argc, VALUE *argv, VALUE self);
VALUE distance(size_t argc, VALUE *argv, VALUE self, double (*distance_fn)(char *str1, size_t len1, char *str2, size_t len2, LibJaroOption *opt));
VALUE distance(size_t argc, VALUE *argv, VALUE self, double (*distance_fn)(uint32_t* short_codes, size_t short_codes_len, uint32_t* long_codes, size_t long_codes_len, LibJaroOption *opt));

void Init_jaro_winkler_ext(void){
rb_mJaroWinkler = rb_define_module("JaroWinkler");
Expand All @@ -18,9 +19,14 @@ void Init_jaro_winkler_ext(void){
}


VALUE distance(size_t argc, VALUE *argv, VALUE self, double (*distance_fn)(char *str1, size_t len1, char *str2, size_t len2, LibJaroOption *opt)){
VALUE distance(size_t argc, VALUE *argv, VALUE self, double (*distance_fn)(uint32_t* short_codes, size_t short_codes_len, uint32_t* long_codes, size_t long_codes_len, LibJaroOption *opt)){
VALUE s1, s2, opt;
CodePoints cp1, cp2;

rb_scan_args(argc, argv, "2:", &s1, &s2, &opt);
codepoints_init(&cp1, s1);
codepoints_init(&cp2, s2);

LibJaroOption c_opt = DEFAULT_OPT;
if(TYPE(opt) == T_HASH){
VALUE weight = rb_hash_aref(opt, ID2SYM(rb_intern("weight"))),
Expand All @@ -33,13 +39,13 @@ VALUE distance(size_t argc, VALUE *argv, VALUE self, double (*distance_fn)(char
if(!NIL_P(ignore_case)) c_opt.ignore_case = (TYPE(ignore_case) == T_FALSE || NIL_P(ignore_case)) ? 0 : 1;
if(!NIL_P(adj_table)) c_opt.adj_table = (TYPE(adj_table) == T_FALSE || NIL_P(adj_table)) ? 0 : 1;
}
return rb_float_new((*distance_fn)(StringValuePtr(s1), RSTRING_LEN(s1), StringValuePtr(s2), RSTRING_LEN(s2), &c_opt));
return rb_float_new((*distance_fn)(cp1.data, cp1.length, cp2.data, cp2.length, &c_opt));
}

VALUE rb_jaro_distance(size_t argc, VALUE *argv, VALUE self){
return distance(argc, argv, self, jaro_distance);
return distance(argc, argv, self, jaro_distance_from_codes);
}

VALUE rb_jaro_winkler_distance(size_t argc, VALUE *argv, VALUE self){
return distance(argc, argv, self, jaro_winkler_distance);
return distance(argc, argv, self, jaro_winkler_distance_from_codes);
}
12 changes: 12 additions & 0 deletions test/test_jaro_winkler.rb
Expand Up @@ -93,12 +93,24 @@ def test_long_string
JaroWinkler.distance 'haisai' * 20, 'haisai' * 20
end

# Distances must not depend on how the input strings are encoded: each pair is
# checked in a non-UTF-8 encoding, and a Latin-1 "è" must equal a UTF-8 "è".
def test_encoding
  [
    ['焦玟綾', '焦紋綾', Encoding::Big5],
    ['簡煒航', '簡偉航', Encoding::Big5_HKSCS],
    ['西島之', '西鳥志', Encoding::EUCJP],
    ['松本行弘', '枩本行弘', Encoding::Shift_JIS]
  ].each do |left, right, encoding|
    assert_encoding left, right, encoding
  end

  latin1_e_grave = "\xe8".force_encoding('iso8859-1')
  assert_distance 1.0, latin1_e_grave, 'è'
end

private

# Asserts that JaroWinkler.distance(str1, str2, options) is within
# assert_in_delta's default delta of +score+.
def assert_distance(score, str1, str2, options = {})
  actual = JaroWinkler.distance(str1, str2, options)
  assert_in_delta(score, actual)
end

# Asserts that the distance between +str1+ and +str2+ is unchanged after both
# are transcoded to +encoding+.
#
# +options+ is forwarded to both distance computations (the reference value
# and the one made by assert_distance), so the helper can also be used with
# non-default options. Omitting it compares with the default options.
def assert_encoding(str1, str2, encoding, options = {})
  expected = JaroWinkler.distance(str1, str2, options)
  assert_distance expected, str1.encode(encoding), str2.encode(encoding), options
end

# Asserts that JaroWinkler.jaro_distance(str1, str2, options) is within
# assert_in_delta's default delta of +score+.
def assert_jaro_distance(score, str1, str2, options = {})
  actual = JaroWinkler.jaro_distance(str1, str2, options)
  assert_in_delta(score, actual)
end
Expand Down

0 comments on commit fe72ab4

Please sign in to comment.