From d18e70021c2faefe2600564dfcb44cec5f7ceff7 Mon Sep 17 00:00:00 2001
From: Aaron Patterson
Date: Fri, 31 Jan 2014 11:06:47 -0800
Subject: [PATCH 1/2] wrap up the text hashing API

---
 ext/phashion_ext/phashion_ext.c | 129 +++++++++++++++++++++++++++++++++
 lib/phashion.rb                 |   3 +
 test/test_phashion.rb           |  21 +++++
 3 files changed, 153 insertions(+)

diff --git a/ext/phashion_ext/phashion_ext.c b/ext/phashion_ext/phashion_ext.c
index 6652438..c037b95 100644
--- a/ext/phashion_ext/phashion_ext.c
+++ b/ext/phashion_ext/phashion_ext.c
@@ -54,6 +54,133 @@ static VALUE hamming_distance(VALUE self, VALUE a, VALUE b) {
     return INT2NUM(result);
 }
 
+/*
+ * Phashion.texthash_for(file) -> Array of Phashion::TextHashPoint
+ *
+ * Runs pHash's ph_texthash() on the file at path +file+ and wraps every
+ * returned point in a TextHashPoint(hash, index) struct.  Raises
+ * RuntimeError when ph_texthash() returns NULL.
+ */
+static VALUE texthash_for(VALUE self, VALUE file) {
+    int nbpoints = 0; /* do not rely on pHash to set this on failure */
+    int i;
+    VALUE list;
+    VALUE point_class;
+    TxtHashPoint *points;
+
+    /* Look the class up first: if this raises, nothing is allocated yet. */
+    point_class = rb_const_get(self, rb_intern("TextHashPoint"));
+
+    /* StringValueCStr rejects embedded NULs and guarantees termination. */
+    points = ph_texthash(StringValueCStr(file), &nbpoints);
+    if (points == NULL) {
+        rb_raise(rb_eRuntimeError, "pHash could not compute a text hash");
+    }
+
+    list = rb_ary_new2((long)nbpoints);
+
+    for(i = 0; i < nbpoints; i++) {
+        VALUE point;
+        VALUE args[2];
+
+        args[0] = ULL2NUM(points[i].hash);
+        /* index is a file offset (off_t in pHash.h), so convert it as one. */
+        args[1] = OFFT2NUM(points[i].index);
+
+        point = rb_class_new_instance(2, args, point_class);
+        rb_ary_push(list, point);
+    }
+
+    free(points);
+
+    return list;
+}
+
+/*
+ * Fills txt_list[0..len-1] from the Ruby array +list+ by calling #hash and
+ * #index on every element.  May raise if an element lacks those methods or
+ * returns a non-integer; the caller owns txt_list, so nothing leaks here.
+ */
+static void rb2phash_points(VALUE list, TxtHashPoint *txt_list, long len) {
+    long i;
+
+    for(i = 0; i < len; i++) {
+        VALUE elem = rb_ary_entry(list, i);
+        txt_list[i].hash = NUM2ULL(rb_funcall(elem, rb_intern("hash"), 0));
+        txt_list[i].index = NUM2OFFT(rb_funcall(elem, rb_intern("index"), 0));
+    }
+}
+
+/*
+ * Phashion.textmatches_for(list1, list2) -> Array of Phashion::TextMatch
+ *
+ * Converts two arrays of TextHashPoint-like objects to pHash structs, runs
+ * ph_compare_text_hashes() on them and wraps every match in a
+ * TextMatch(first_index, second_index, length) struct.
+ */
+static VALUE textmatches_for(VALUE self, VALUE list1, VALUE list2) {
+    int nbmatches = 0; /* do not rely on pHash to set this on failure */
+    int i, len1, len2;
+    VALUE list;
+    VALUE match_class;
+    VALUE tmp1, tmp2;
+    TxtHashPoint *txt_list1;
+    TxtHashPoint *txt_list2;
+    TxtMatch *matches;
+
+    /* RARRAY_LEN is only valid on arrays, so reject other types first. */
+    Check_Type(list1, T_ARRAY);
+    Check_Type(list2, T_ARRAY);
+
+    /* pHash takes int counts; rb_long2int raises RangeError on overflow. */
+    len1 = rb_long2int(RARRAY_LEN(list1));
+    len2 = rb_long2int(RARRAY_LEN(list2));
+
+    match_class = rb_const_get(self, rb_intern("TextMatch"));
+
+    /* An empty list cannot match anything; skip zero-sized allocations. */
+    if (len1 == 0 || len2 == 0) {
+        return rb_ary_new();
+    }
+
+    /*
+     * ALLOCV_N buffers are reclaimed even if an element conversion below
+     * raises, so no manual cleanup path is needed.
+     */
+    txt_list1 = ALLOCV_N(TxtHashPoint, tmp1, len1);
+    txt_list2 = ALLOCV_N(TxtHashPoint, tmp2, len2);
+    rb2phash_points(list1, txt_list1, len1);
+    rb2phash_points(list2, txt_list2, len2);
+
+    matches = ph_compare_text_hashes(txt_list1, len1, txt_list2, len2,
+                                     &nbmatches);
+
+    ALLOCV_END(tmp1);
+    ALLOCV_END(tmp2);
+
+    if (matches == NULL) {
+        rb_raise(rb_eRuntimeError, "pHash could not compare the text hashes");
+    }
+
+    list = rb_ary_new2((long)nbmatches);
+
+    for(i = 0; i < nbmatches; i++) {
+        VALUE match;
+        VALUE args[3];
+
+        args[0] = OFFT2NUM(matches[i].first_index);
+        args[1] = OFFT2NUM(matches[i].second_index);
+        args[2] = UINT2NUM(matches[i].length);
+
+        match = rb_class_new_instance(3, args, match_class);
+        rb_ary_push(list, match);
+    }
+
+    free(matches);
+
+    return list;
+}
+
 #ifdef __cplusplus
 extern "C" {
 #endif
@@ -63,6 +190,8 @@ extern "C" {
 
     rb_define_singleton_method(c, "hamming_distance", (VALUE(*)(ANYARGS))hamming_distance, 2);
     rb_define_singleton_method(c, "image_hash_for", (VALUE(*)(ANYARGS))image_hash_for, 1);
+    rb_define_singleton_method(c, "texthash_for", (VALUE(*)(ANYARGS))texthash_for, 1);
+    rb_define_singleton_method(c, "textmatches_for", (VALUE(*)(ANYARGS))textmatches_for, 2);
 }
 
 #ifdef HAVE_SQLITE3EXT_H
diff --git a/lib/phashion.rb b/lib/phashion.rb
index b8166c3..1744aac 100644
--- a/lib/phashion.rb
+++ b/lib/phashion.rb
@@ -11,6 +11,9 @@ module Phashion
 
   VERSION = '1.0.8'
 
+  TextHashPoint = Struct.new :hash, :index
+  TextMatch = Struct.new :first_index, :second_index, :length
+
   class Image
     DEFAULT_DUPE_THRESHOLD = 15
 
diff --git a/test/test_phashion.rb b/test/test_phashion.rb
index 25edc59..f72b79d 100644
--- a/test/test_phashion.rb
+++ b/test/test_phashion.rb
@@ -1,8 +1,29 @@
 require 'helper'
 require 'sqlite3'
+require 'tempfile'
 
 class TestPhashion < Test::Unit::TestCase
 
+  def test_text_hash
+    matches = Tempfile.open('foo') do |f|
+      100.times { |i|
+        f.write "hello world #{i}"
+      }
+      f.close
+      a = Phashion.texthash_for f.path
+      b = Phashion.texthash_for f.path
+
+      assert_operator a.length, :>, 0
+      assert_operator b.length, :>, 0
+      a.each { |hash| assert_instance_of Phashion::TextHashPoint, hash }
+      b.each { |hash| assert_instance_of Phashion::TextHashPoint, hash }
+
+      Phashion.textmatches_for(a, b)
+    end
+    assert_operator matches.length, :>, 0
+    matches.each { |match| assert_instance_of Phashion::TextMatch, match }
+  end
+
   def split(hash)
     r = hash & 0xFFFFFFFF
     l = (hash >> 32) & 0xFFFFFFFF

From 3bd059ddc112f1f7224b8b7fca0d80465623ab3b Mon Sep 17 00:00:00 2001
From: Weston Platter
Date: Sat, 1 Mar 2014 10:32:28 -0700
Subject: [PATCH 2/2] remove wp as author

---
 phashion.gemspec | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/phashion.gemspec b/phashion.gemspec
index 3b4f934..4f029b6 100644
--- a/phashion.gemspec
+++ b/phashion.gemspec
@@ -3,8 +3,8 @@
 Gem::Specification.new do |s|
   s.name = %q{phashion}
   s.version = "1.0.8"
-  s.authors = ["Mike Perham", "Weston Platter"]
-  s.email = ["mperham@gmail.com", "westonplatter@gmail.com"]
+  s.authors = ["Mike Perham"]
+  s.email = ["mperham@gmail.com"]
   s.description = %q{Simple wrapper around the pHash library}
   s.homepage = %q{http://github.com/westonplatter/phashion}
 