
Commit 09c9b2a

update unicode representation, fix test cases for similarity algorithms
GreatYYX committed Mar 22, 2018
1 parent 525769c commit 09c9b2a
Showing 11 changed files with 131 additions and 109 deletions.
rltk/similarity/cosine.py (4 changes: 2 additions & 2 deletions)
@@ -31,7 +31,7 @@ def string_cosine_similarity(bag1, bag2):

intersection = set(d1.keys()) & set(d2.keys())
v_x_y = sum([d1[x] * d2[x] for x in intersection])
- v_x_2 = sum([v * v for k, v in d1.iteritems()])
- v_y_2 = sum([v * v for k, v in d2.iteritems()])
+ v_x_2 = sum([v * v for k, v in d1.items()])
+ v_y_2 = sum([v * v for k, v in d2.items()])

return 0.0 if v_x_y == 0 else float(v_x_y) / (math.sqrt(v_x_2) * math.sqrt(v_y_2))
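
For reference, the hunk above is the whole of the cosine computation: term-count vectors, a dot product over the shared keys, then division by the two norms. A minimal standalone sketch of the same formula (the function name is illustrative, not rltk's API):

import collections
import math

def cosine_over_token_bags(bag1, bag2):
    # Build term-count vectors from the two token lists and take their cosine.
    d1, d2 = collections.Counter(bag1), collections.Counter(bag2)
    shared = set(d1.keys()) & set(d2.keys())
    v_x_y = sum(d1[x] * d2[x] for x in shared)   # dot product
    v_x_2 = sum(v * v for v in d1.values())      # squared norm of bag1
    v_y_2 = sum(v * v for v in d2.values())      # squared norm of bag2
    return 0.0 if v_x_y == 0 else float(v_x_y) / (math.sqrt(v_x_2) * math.sqrt(v_y_2))

# cosine_over_token_bags(['a', 'b', 'c'], ['a', 'c', 'd']) -> 2/3
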
rltk/similarity/hamming.py (3 changes: 1 addition & 2 deletions)
@@ -4,10 +4,9 @@
def hamming_distance(s1, s2):

utils.check_for_none(s1, s2)
- utils.check_for_type(str, s1, s2)
+ # utils.check_for_type(str, s1, s2)

if len(s1) != len(s2):
- print('s1s2', s1, s2)
raise ValueError('Unequal length')

return sum(c1 != c2 for c1, c2 in zip(s1, s2))
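
A quick usage sketch of hamming_distance as it now stands (illustrative calls, not taken from the repository's test suite):

# Positional comparison of two equal-length strings; unequal lengths raise ValueError.
assert hamming_distance('karolin', 'kathrin') == 3
assert hamming_distance('10110', '10011') == 2
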
rltk/similarity/jaro.py (3 changes: 3 additions & 0 deletions)
@@ -79,6 +79,9 @@ def _jaro_distance(s1, s2):
utils.check_for_none(s1, s2)
utils.check_for_type(str, s1, s2)

+ s1 = utils.unicode_normalize(s1)
+ s2 = utils.unicode_normalize(s2)

shorter, longer = s1.lower(), s2.lower()

if len(s1) > len(s2):
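
utils.unicode_normalize itself is not part of this diff; the sketch below is only an assumption about what such a helper typically does, namely mapping composed and decomposed Unicode forms to one canonical representation so that visually identical strings compare equal. rltk's actual implementation may differ:

import unicodedata

def unicode_normalize_sketch(s):
    # Hypothetical stand-in: canonicalize code points so 'caf\u00e9' and
    # 'cafe\u0301' compare equal before character-level matching.
    return unicodedata.normalize('NFKD', s)

# unicode_normalize_sketch('caf\u00e9') == unicode_normalize_sketch('cafe\u0301')  -> True
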
rltk/similarity/levenshtein.py (6 changes: 6 additions & 0 deletions)
@@ -32,6 +32,9 @@ def levenshtein_distance(s1, s2, insert={}, delete={}, substitute={},
utils.check_for_none(s1, s2)
utils.check_for_type(str, s1, s2)

+ s1 = utils.unicode_normalize(s1)
+ s2 = utils.unicode_normalize(s2)

n1, n2 = len(s1), len(s2)
if n1 == 0 and n2 == 0:
return 0
@@ -124,6 +127,9 @@ def damerau_levenshtein_distance(s1, s2):
utils.check_for_none(s1, s2)
utils.check_for_type(str, s1, s2)

+ s1 = utils.unicode_normalize(s1)
+ s2 = utils.unicode_normalize(s2)

n1, n2 = len(s1), len(s2)
infinite = n1 + n2

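
For context, the unweighted recurrence these functions build on; the rltk versions above additionally accept per-character insert/delete/substitute cost tables, so this is a simplified sketch rather than the library's implementation:

def plain_levenshtein(s1, s2):
    # d[i][j] = minimum edits turning s1[:i] into s2[:j].
    n1, n2 = len(s1), len(s2)
    d = [[0] * (n2 + 1) for _ in range(n1 + 1)]
    for i in range(n1 + 1):
        d[i][0] = i
    for j in range(n2 + 1):
        d[0][j] = j
    for i in range(1, n1 + 1):
        for j in range(1, n2 + 1):
            cost = 0 if s1[i - 1] == s2[j - 1] else 1
            d[i][j] = min(d[i - 1][j] + 1,         # deletion
                          d[i][j - 1] + 1,         # insertion
                          d[i - 1][j - 1] + cost)  # substitution
    return d[n1][n2]

# plain_levenshtein('kitten', 'sitting') == 3
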
rltk/similarity/metaphone.py (14 changes: 8 additions & 6 deletions)
@@ -25,6 +25,8 @@ def _metaphone(s):
utils.check_for_none(s)
utils.check_for_type(str, s)

+ s = utils.unicode_normalize(s)

if len(s) == 0:
raise ValueError('Empty string')

@@ -39,19 +41,19 @@

while i < len(s):
c = s[i]
- next = s[i + 1] if i < len(s) - 1 else '*****'
- nextnext = s[i + 2] if i < len(s) - 2 else '*****'
+ next = s[i+1] if i < len(s)-1 else '*****'
+ nextnext = s[i+2] if i < len(s)-2 else '*****'

# skip doubles except for cc
if c == next and c != 'c':
i += 1
continue

if c in 'aeiou':
- if i == 0 or s[i - 1] == ' ':
+ if i == 0 or s[i-1] == ' ':
result.append(c)
elif c == 'b':
- if (not (i != 0 and s[i - 1] == 'm')) or next:
+ if (not (i != 0 and s[i-1] == 'm')) or next:
result.append('b')
elif c == 'c':
if next == 'i' and nextnext == 'a' or next == 'h':
@@ -78,10 +80,10 @@
elif next == 'h' and nextnext and nextnext not in 'aeiou':
i += 1
elif c == 'h':
- if i == 0 or next in 'aeiou' or s[i - 1] not in 'aeiou':
+ if i == 0 or next in 'aeiou' or s[i-1] not in 'aeiou':
result.append('h')
elif c == 'k':
- if i == 0 or s[i - 1] != 'c':
+ if i == 0 or s[i-1] != 'c':
result.append('k')
elif c == 'p':
if next == 'h':
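
Phonetic codes like the one assembled above are normally compared for exact equality, i.e. two strings count as similar when they encode to the same Metaphone key. An illustrative wrapper (not necessarily how rltk exposes it):

def metaphone_match(s1, s2):
    # 1 if the two strings share a Metaphone code, else 0.
    return 1 if _metaphone(s1) == _metaphone(s2) else 0
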
rltk/similarity/needleman.py (3 changes: 3 additions & 0 deletions)
@@ -15,6 +15,9 @@ def needleman_wunsch_score(s1, s2, match=2, mismatch=-1, gap=-0.5, score_table={
utils.check_for_none(s1, s2)
utils.check_for_type(str, s1, s2)

+ s1 = utils.unicode_normalize(s1)
+ s2 = utils.unicode_normalize(s2)

n1, n2 = len(s1), len(s2)
if n1 == 0 and n2 == 0:
return 0
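
A rough sketch of the global-alignment recurrence behind needleman_wunsch_score, using the default weights from the signature above and ignoring the optional score_table; it is an illustration, not rltk's implementation:

def nw_score_sketch(s1, s2, match=2, mismatch=-1, gap=-0.5):
    # dp[i][j] = best score aligning s1[:i] against s2[:j].
    n1, n2 = len(s1), len(s2)
    dp = [[0.0] * (n2 + 1) for _ in range(n1 + 1)]
    for i in range(1, n1 + 1):
        dp[i][0] = dp[i - 1][0] + gap
    for j in range(1, n2 + 1):
        dp[0][j] = dp[0][j - 1] + gap
    for i in range(1, n1 + 1):
        for j in range(1, n2 + 1):
            diag = dp[i - 1][j - 1] + (match if s1[i - 1] == s2[j - 1] else mismatch)
            dp[i][j] = max(diag, dp[i - 1][j] + gap, dp[i][j - 1] + gap)
    return dp[n1][n2]
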
rltk/similarity/nysiis.py (2 changes: 2 additions & 0 deletions)
@@ -26,6 +26,8 @@ def _nysiis(s):
utils.check_for_none(s)
utils.check_for_type(str, s)

+ s = utils.unicode_normalize(s)

if len(s) == 0:
raise ValueError('Empty string')

rltk/similarity/soundex.py (2 changes: 2 additions & 0 deletions)
@@ -21,6 +21,8 @@ def _soundex(s):
utils.check_for_none(s)
utils.check_for_type(str, s)

+ s = utils.unicode_normalize(s)

if len(s) == 0:
raise ValueError('Empty string')

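
The nysiis and soundex hunks add the same preamble that now runs ahead of every algorithm touched by this commit. A sketch of how that guard sequence behaves; the bodies of the utils helpers are inferred, not copied from rltk, so exception types may differ:

import unicodedata

def guarded_input_sketch(s):
    if s is None:                             # utils.check_for_none
        raise ValueError('input is None')
    if not isinstance(s, str):                # utils.check_for_type(str, s)
        raise TypeError('expected str')
    s = unicodedata.normalize('NFKD', s)      # stand-in for utils.unicode_normalize
    if len(s) == 0:
        raise ValueError('Empty string')
    return s
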
rltk/similarity/tf_idf.py (6 changes: 3 additions & 3 deletions)
@@ -33,8 +33,8 @@ def tf_idf_similarity(bag1, bag2, df_corpus, doc_size, math_log=False):

# term frequency for input strings
t_x, t_y = collections.Counter(bag1), collections.Counter(bag2)
- tf_x = {k: float(v) / len(bag1) for k, v in t_x.iteritems()}
- tf_y = {k: float(v) / len(bag2) for k, v in t_y.iteritems()}
+ tf_x = {k: float(v) / len(bag1) for k, v in t_x.items()}
+ tf_y = {k: float(v) / len(bag2) for k, v in t_y.items()}

# unique element
total_unique_elements = set()
@@ -66,7 +66,7 @@ def compute_tf(t, bag_len):
Args:
t (dict): {term: count,...}
"""
- return {k: float(v) / bag_len for k, v in t.iteritems()}
+ return {k: float(v) / bag_len for k, v in t.items()}


# # plus 1
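
Only the term-frequency step is visible in this hunk; the sketch below fills in the rest of a standard TF-IDF cosine (inverse document frequency from df_corpus and doc_size, then a cosine over the weighted vectors). The elided middle of rltk's function may weight terms differently, so treat this as an approximation:

import collections
import math

def tf_idf_similarity_sketch(bag1, bag2, df_corpus, doc_size, math_log=False):
    # Term frequency for each bag, as in the hunk above.
    t_x, t_y = collections.Counter(bag1), collections.Counter(bag2)
    tf_x = {k: float(v) / len(bag1) for k, v in t_x.items()}
    tf_y = {k: float(v) / len(bag2) for k, v in t_y.items()}

    # Inverse document frequency from corpus document counts (assumed layout).
    terms = set(bag1) | set(bag2)
    idf = {t: float(doc_size) / df_corpus.get(t, 1) for t in terms}
    if math_log:
        idf = {t: math.log(v) for t, v in idf.items()}

    # Cosine of the two tf-idf vectors.
    v_x_y = sum((idf[t] ** 2) * tf_x.get(t, 0) * tf_y.get(t, 0) for t in terms)
    v_x_2 = sum((idf[t] * tf_x.get(t, 0)) ** 2 for t in terms)
    v_y_2 = sum((idf[t] * tf_y.get(t, 0)) ** 2 for t in terms)
    return 0.0 if v_x_y == 0 else v_x_y / (math.sqrt(v_x_2) * math.sqrt(v_y_2))
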

0 comments on commit 09c9b2a
