From 14f7d76d6c69ec3854f0e23261b8651a9f820663 Mon Sep 17 00:00:00 2001 From: Matt Amos Date: Tue, 24 Apr 2012 18:26:24 +0100 Subject: [PATCH] Changes to tag handling for wider odbl=clean catching and whitespace handling. --- tags.rb | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/tags.rb b/tags.rb index 08f9e81..b33df99 100644 --- a/tags.rb +++ b/tags.rb @@ -23,10 +23,13 @@ def self.odbl_clean?(tags) # special case for this one misspelling, as it's fairly # common to find "obdl" and there's no chance that we're # confusing "obdl" with anything else. - if k.downcase == "odbl" or k.downcase == "obdl" + if (k.downcase == "odbl" or + k.downcase == "obdl" or + k.downcase == "oodbl") val = tags[k].downcase # tag synonyms for "clean" in this context (val == "clean" || + val == "clear" || val == "true" || val == "yes" || val == "1") @@ -286,9 +289,13 @@ def self.significant_tag?(old_v, new_v) # now check for homophones (TODO: is this really appropriate?) return false if Text::Metaphone.metaphone(old) == Text::Metaphone.metaphone(new) - # finally, look for changes in abbreviation. + # look for changes in abbreviation. return false if Abbrev.equal_expansions(old, new) + # check if the strings are the same except for whitespace + # presence. this would be considered insignificant. + return false if old.gsub(/ /,"") == new.gsub(/ /,"") + # otherwise, just look at the strings... old != new end