
Commit 09c9b2a

update unicode representation, fix test cases for similarity algorithms
GreatYYX committed Mar 22, 2018
1 parent 525769c commit 09c9b2a
Showing 11 changed files with 131 additions and 109 deletions.
rltk/similarity/cosine.py (4 changes: 2 additions & 2 deletions)
@@ -31,7 +31,7 @@ def string_cosine_similarity(bag1, bag2):

intersection = set(d1.keys()) & set(d2.keys())
v_x_y = sum([d1[x] * d2[x] for x in intersection])
- v_x_2 = sum([v * v for k, v in d1.iteritems()])
- v_y_2 = sum([v * v for k, v in d2.iteritems()])
+ v_x_2 = sum([v * v for k, v in d1.items()])
+ v_y_2 = sum([v * v for k, v in d2.items()])

return 0.0 if v_x_y == 0 else float(v_x_y) / (math.sqrt(v_x_2) * math.sqrt(v_y_2))
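
For reference, the hunk above is the whole of the cosine computation: term-count vectors, a dot product over the shared keys, then division by the two norms. A minimal standalone sketch of the same formula (the function name is illustrative, not rltk's API):

import collections
import math

def cosine_over_token_bags(bag1, bag2):
    # Build term-count vectors from the two token lists and take their cosine.
    d1, d2 = collections.Counter(bag1), collections.Counter(bag2)
    shared = set(d1.keys()) & set(d2.keys())
    v_x_y = sum(d1[x] * d2[x] for x in shared)   # dot product
    v_x_2 = sum(v * v for v in d1.values())      # squared norm of bag1
    v_y_2 = sum(v * v for v in d2.values())      # squared norm of bag2
    return 0.0 if v_x_y == 0 else float(v_x_y) / (math.sqrt(v_x_2) * math.sqrt(v_y_2))

# cosine_over_token_bags(['a', 'b', 'c'], ['a', 'c', 'd']) -> 2/3
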
rltk/similarity/hamming.py (3 changes: 1 addition & 2 deletions)
@@ -4,10 +4,9 @@
def hamming_distance(s1, s2):

utils.check_for_none(s1, s2)
- utils.check_for_type(str, s1, s2)
+ # utils.check_for_type(str, s1, s2)

if len(s1) != len(s2):
- print('s1s2', s1, s2)
raise ValueError('Unequal length')

return sum(c1 != c2 for c1, c2 in zip(s1, s2))
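
A quick usage sketch of hamming_distance as it now stands (illustrative calls, not taken from the repository's test suite):

# Positional comparison of two equal-length strings; unequal lengths raise ValueError.
assert hamming_distance('karolin', 'kathrin') == 3
assert hamming_distance('10110', '10011') == 2
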
rltk/similarity/jaro.py (3 changes: 3 additions & 0 deletions)
@@ -79,6 +79,9 @@ def _jaro_distance(s1, s2):
utils.check_for_none(s1, s2)
utils.check_for_type(str, s1, s2)

+ s1 = utils.unicode_normalize(s1)
+ s2 = utils.unicode_normalize(s2)

shorter, longer = s1.lower(), s2.lower()

if len(s1) > len(s2):
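
utils.unicode_normalize itself is not part of this diff; the sketch below is only an assumption about what such a helper typically does, namely mapping composed and decomposed Unicode forms to one canonical representation so that visually identical strings compare equal. rltk's actual implementation may differ:

import unicodedata

def unicode_normalize_sketch(s):
    # Hypothetical stand-in: canonicalize code points so 'caf\u00e9' and
    # 'cafe\u0301' compare equal before character-level matching.
    return unicodedata.normalize('NFKD', s)

# unicode_normalize_sketch('caf\u00e9') == unicode_normalize_sketch('cafe\u0301')  -> True
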
rltk/similarity/levenshtein.py (6 changes: 6 additions & 0 deletions)
@@ -32,6 +32,9 @@ def levenshtein_distance(s1, s2, insert={}, delete={}, substitute={},
utils.check_for_none(s1, s2)
utils.check_for_type(str, s1, s2)

+ s1 = utils.unicode_normalize(s1)
+ s2 = utils.unicode_normalize(s2)

n1, n2 = len(s1), len(s2)
if n1 == 0 and n2 == 0:
return 0
@@ -124,6 +127,9 @@ def damerau_levenshtein_distance(s1, s2):
utils.check_for_none(s1, s2)
utils.check_for_type(str, s1, s2)

+ s1 = utils.unicode_normalize(s1)
+ s2 = utils.unicode_normalize(s2)

n1, n2 = len(s1), len(s2)
infinite = n1 + n2

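
For context, the unweighted recurrence these functions build on; the rltk versions above additionally accept per-character insert/delete/substitute cost tables, so this is a simplified sketch rather than the library's implementation:

def plain_levenshtein(s1, s2):
    # d[i][j] = minimum edits turning s1[:i] into s2[:j].
    n1, n2 = len(s1), len(s2)
    d = [[0] * (n2 + 1) for _ in range(n1 + 1)]
    for i in range(n1 + 1):
        d[i][0] = i
    for j in range(n2 + 1):
        d[0][j] = j
    for i in range(1, n1 + 1):
        for j in range(1, n2 + 1):
            cost = 0 if s1[i - 1] == s2[j - 1] else 1
            d[i][j] = min(d[i - 1][j] + 1,         # deletion
                          d[i][j - 1] + 1,         # insertion
                          d[i - 1][j - 1] + cost)  # substitution
    return d[n1][n2]

# plain_levenshtein('kitten', 'sitting') == 3
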
rltk/similarity/metaphone.py (14 changes: 8 additions & 6 deletions)
@@ -25,6 +25,8 @@ def _metaphone(s):
utils.check_for_none(s)
utils.check_for_type(str, s)

+ s = utils.unicode_normalize(s)

if len(s) == 0:
raise ValueError('Empty string')

@@ -39,19 +41,19 @@

while i < len(s):
c = s[i]
- next = s[i + 1] if i < len(s) - 1 else '*****'
- nextnext = s[i + 2] if i < len(s) - 2 else '*****'
+ next = s[i+1] if i < len(s)-1 else '*****'
+ nextnext = s[i+2] if i < len(s)-2 else '*****'

# skip doubles except for cc
if c == next and c != 'c':
i += 1
continue

if c in 'aeiou':
- if i == 0 or s[i - 1] == ' ':
+ if i == 0 or s[i-1] == ' ':
result.append(c)
elif c == 'b':
- if (not (i != 0 and s[i - 1] == 'm')) or next:
+ if (not (i != 0 and s[i-1] == 'm')) or next:
result.append('b')
elif c == 'c':
if next == 'i' and nextnext == 'a' or next == 'h':
@@ -78,10 +80,10 @@
elif next == 'h' and nextnext and nextnext not in 'aeiou':
i += 1
elif c == 'h':
- if i == 0 or next in 'aeiou' or s[i - 1] not in 'aeiou':
+ if i == 0 or next in 'aeiou' or s[i-1] not in 'aeiou':
result.append('h')
elif c == 'k':
- if i == 0 or s[i - 1] != 'c':
+ if i == 0 or s[i-1] != 'c':
result.append('k')
elif c == 'p':
if next == 'h':
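
Phonetic codes like the one assembled above are normally compared for exact equality, i.e. two strings count as similar when they encode to the same Metaphone key. An illustrative wrapper (not necessarily how rltk exposes it):

def metaphone_match(s1, s2):
    # 1 if the two strings share a Metaphone code, else 0.
    return 1 if _metaphone(s1) == _metaphone(s2) else 0
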
rltk/similarity/needleman.py (3 changes: 3 additions & 0 deletions)
@@ -15,6 +15,9 @@ def needleman_wunsch_score(s1, s2, match=2, mismatch=-1, gap=-0.5, score_table={
utils.check_for_none(s1, s2)
utils.check_for_type(str, s1, s2)

+ s1 = utils.unicode_normalize(s1)
+ s2 = utils.unicode_normalize(s2)

n1, n2 = len(s1), len(s2)
if n1 == 0 and n2 == 0:
return 0
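
A rough sketch of the global-alignment recurrence behind needleman_wunsch_score, using the default weights from the signature above and ignoring the optional score_table; it is an illustration, not rltk's implementation:

def nw_score_sketch(s1, s2, match=2, mismatch=-1, gap=-0.5):
    # dp[i][j] = best score aligning s1[:i] against s2[:j].
    n1, n2 = len(s1), len(s2)
    dp = [[0.0] * (n2 + 1) for _ in range(n1 + 1)]
    for i in range(1, n1 + 1):
        dp[i][0] = dp[i - 1][0] + gap
    for j in range(1, n2 + 1):
        dp[0][j] = dp[0][j - 1] + gap
    for i in range(1, n1 + 1):
        for j in range(1, n2 + 1):
            diag = dp[i - 1][j - 1] + (match if s1[i - 1] == s2[j - 1] else mismatch)
            dp[i][j] = max(diag, dp[i - 1][j] + gap, dp[i][j - 1] + gap)
    return dp[n1][n2]
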
rltk/similarity/nysiis.py (2 changes: 2 additions & 0 deletions)
@@ -26,6 +26,8 @@ def _nysiis(s):
utils.check_for_none(s)
utils.check_for_type(str, s)

+ s = utils.unicode_normalize(s)

if len(s) == 0:
raise ValueError('Empty string')

rltk/similarity/soundex.py (2 changes: 2 additions & 0 deletions)
@@ -21,6 +21,8 @@ def _soundex(s):
utils.check_for_none(s)
utils.check_for_type(str, s)

+ s = utils.unicode_normalize(s)

if len(s) == 0:
raise ValueError('Empty string')

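
The nysiis and soundex hunks add the same preamble that now runs ahead of every algorithm touched by this commit. A sketch of how that guard sequence behaves; the bodies of the utils helpers are inferred, not copied from rltk, so exception types may differ:

import unicodedata

def guarded_input_sketch(s):
    if s is None:                             # utils.check_for_none
        raise ValueError('input is None')
    if not isinstance(s, str):                # utils.check_for_type(str, s)
        raise TypeError('expected str')
    s = unicodedata.normalize('NFKD', s)      # stand-in for utils.unicode_normalize
    if len(s) == 0:
        raise ValueError('Empty string')
    return s
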
rltk/similarity/tf_idf.py (6 changes: 3 additions & 3 deletions)
@@ -33,8 +33,8 @@ def tf_idf_similarity(bag1, bag2, df_corpus, doc_size, math_log=False):

# term frequency for input strings
t_x, t_y = collections.Counter(bag1), collections.Counter(bag2)
- tf_x = {k: float(v) / len(bag1) for k, v in t_x.iteritems()}
- tf_y = {k: float(v) / len(bag2) for k, v in t_y.iteritems()}
+ tf_x = {k: float(v) / len(bag1) for k, v in t_x.items()}
+ tf_y = {k: float(v) / len(bag2) for k, v in t_y.items()}

# unique element
total_unique_elements = set()
@@ -66,7 +66,7 @@ def compute_tf(t, bag_len):
Args:
t (dict): {term: count,...}
"""
- return {k: float(v) / bag_len for k, v in t.iteritems()}
+ return {k: float(v) / bag_len for k, v in t.items()}


# # plus 1
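
Only the term-frequency step is visible in this hunk; the sketch below fills in the rest of a standard TF-IDF cosine (inverse document frequency from df_corpus and doc_size, then a cosine over the weighted vectors). The elided middle of rltk's function may weight terms differently, so treat this as an approximation:

import collections
import math

def tf_idf_similarity_sketch(bag1, bag2, df_corpus, doc_size, math_log=False):
    # Term frequency for each bag, as in the hunk above.
    t_x, t_y = collections.Counter(bag1), collections.Counter(bag2)
    tf_x = {k: float(v) / len(bag1) for k, v in t_x.items()}
    tf_y = {k: float(v) / len(bag2) for k, v in t_y.items()}

    # Inverse document frequency from corpus document counts (assumed layout).
    terms = set(bag1) | set(bag2)
    idf = {t: float(doc_size) / df_corpus.get(t, 1) for t in terms}
    if math_log:
        idf = {t: math.log(v) for t, v in idf.items()}

    # Cosine of the two tf-idf vectors.
    v_x_y = sum((idf[t] ** 2) * tf_x.get(t, 0) * tf_y.get(t, 0) for t in terms)
    v_x_2 = sum((idf[t] * tf_x.get(t, 0)) ** 2 for t in terms)
    v_y_2 = sum((idf[t] * tf_y.get(t, 0)) ** 2 for t in terms)
    return 0.0 if v_x_y == 0 else v_x_y / (math.sqrt(v_x_2) * math.sqrt(v_y_2))
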

0 comments on commit 09c9b2a
