Skip to content

Commit

Permalink
fix levenshtein normalization, add damerau and optimal string alignmen…
Browse files Browse the repository at this point in the history
…t similarity
  • Loading branch information
GreatYYX committed Jun 3, 2020
1 parent d68993c commit 1885a45
Show file tree
Hide file tree
Showing 2 changed files with 45 additions and 24 deletions.
3 changes: 2 additions & 1 deletion rltk/similarity/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,8 @@
from rltk.similarity.hamming import hamming_distance, hamming_similarity, normalized_hamming_distance
from rltk.similarity.dice import dice_similarity
from rltk.similarity.levenshtein import levenshtein_distance, levenshtein_similarity, \
normalized_levenshtein_distance, damerau_levenshtein_distance, optimal_string_alignment_distance
damerau_levenshtein_distance, damerau_levenshtein_similarity, \
optimal_string_alignment_distance, optimal_string_alignment_similarity
from rltk.similarity.needleman import needleman_wunsch_score, needleman_wunsch_similarity
from rltk.similarity.jaro import jaro_winkler_distance, jaro_winkler_similarity, jaro_distance
from rltk.similarity.jaccard import jaccard_index_similarity, jaccard_index_distance
Expand Down
66 changes: 43 additions & 23 deletions rltk/similarity/levenshtein.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,8 @@
def levenshtein_distance(s1, s2, insert=None, delete=None, substitute=None,
insert_default=1, delete_default=1, substitute_default=1):
"""
The Levenshtein distance between two words is the minimum number of single-character edits (insertions, deletions or substitutions) required to change one word into the other.
The Levenshtein distance between two words is the minimum number of single-character edits (insertions, \
deletions or substitutions) required to change one word into the other.
Args:
s1 (str): Sequence 1.
Expand Down Expand Up @@ -76,47 +77,42 @@ def levenshtein_distance(s1, s2, insert=None, delete=None, substitute=None,


def levenshtein_similarity(s1, s2, insert=None, delete=None, substitute=None,
                           insert_default=1, delete_default=1, substitute_default=1):
    """
    Levenshtein similarity of two sequences.

    Computed as 1 - normalized_levenshtein_distance. Every argument is forwarded
    unchanged, in the same order, to normalized_levenshtein_distance.

    Args:
        s1: Sequence 1.
        s2: Sequence 2.
        insert: Optional insert-cost table.
        delete: Optional delete-cost table.
        substitute: Optional substitute-cost table.
        insert_default: Insert cost used when no table entry applies.
        delete_default: Delete cost used when no table entry applies.
        substitute_default: Substitute cost used when no table entry applies.

    Returns:
        float: 1.0 minus the normalized Levenshtein distance.
    """
    normalized = normalized_levenshtein_distance(
        s1, s2, insert, delete, substitute,
        insert_default, delete_default, substitute_default)
    return 1.0 - normalized


def normalized_levenshtein_distance(s1, s2, insert=None, delete=None, substitute=None,
                                    insert_default=1, delete_default=1, substitute_default=1):
    """
    Computed as levenshtein_distance / max-cost(s1, s2).

    The max-cost of a sequence is the sum, over its characters, of the most
    expensive of that character's insert, delete and substitute costs. The
    normalizer is the larger of the two sequences' max-costs.

    Args:
        s1: Sequence 1.
        s2: Sequence 2.
        insert (dict, optional): Per-character insert cost. Any non-dict value
            is treated as an empty table.
        delete (dict, optional): Per-character delete cost. Any non-dict value
            is treated as an empty table.
        substitute (dict, optional): Per-character substitute cost. Any
            non-dict value is treated as an empty table.
        insert_default: Insert cost for characters missing from `insert`.
        delete_default: Delete cost for characters missing from `delete`.
        substitute_default: Substitute cost for characters missing from
            `substitute`.

    Returns:
        float: The distance divided by the normalizer, or 0.0 when the
        normalizer is 0.

    Raises:
        ValueError: If the computed distance is larger than the normalizer.
    """
    insert = insert if isinstance(insert, dict) else {}
    delete = delete if isinstance(delete, dict) else {}
    substitute = substitute if isinstance(substitute, dict) else {}

    def compute_max_cost(s):
        # Upper bound on the cost any single edit can charge for each character.
        # NOTE(review): this assumes substitute[c] is directly comparable with
        # the numeric insert/delete costs -- confirm against the table layout
        # that levenshtein_distance expects for `substitute`.
        return sum(
            max(
                insert.get(c, insert_default),
                delete.get(c, delete_default),
                substitute.get(c, substitute_default),
            )
            for c in s
        )

    lev = levenshtein_distance(s1, s2, insert, delete, substitute,
                               insert_default, delete_default, substitute_default)

    max_cost = max(compute_max_cost(s1), compute_max_cost(s2))

    # A distance above the normalizer would push the result outside [0, 1].
    if max_cost < lev:
        raise ValueError('Illegal value of operation cost')

    # Nothing can be charged at all (e.g. both sequences empty): zero distance.
    if max_cost == 0:
        return 0.0

    return float(lev) / max_cost


def damerau_levenshtein_distance(s1, s2):
"""
Similar to Levenshtein, Damerau-Levenshtein distance is the minimum number of operations needed to transform one string into the other, where an operation is defined as an insertion, deletion, or substitution of a single character, or a transposition of two adjacent characters.
Similar to Levenshtein, Damerau-Levenshtein distance is the minimum number of operations needed to transform\
one string into the other, where an operation is defined as an insertion, deletion, or substitution of \
a single character, or a transposition of two adjacent characters.
Args:
s1 (str): Sequence 1.
Expand Down Expand Up @@ -171,6 +167,18 @@ def damerau_levenshtein_distance(s1, s2):
return dp[n1 + 1][n2 + 1]


def damerau_levenshtein_similarity(s1, s2):
    """
    Damerau-Levenshtein similarity of two sequences.

    Computed as 1 - damerau_levenshtein_distance / max(len(s1), len(s2)).
    Two empty sequences are reported as fully similar (1.0) without
    computing a distance.

    Args:
        s1: Sequence 1.
        s2: Sequence 2.

    Returns:
        float: The similarity score.
    """
    longest = max(len(s1), len(s2))

    # Guard the division: both inputs are empty.
    if not longest:
        return 1.0

    distance = damerau_levenshtein_distance(s1, s2)
    return 1.0 - float(distance) / longest


def optimal_string_alignment_distance(s1, s2):
"""
This is a variation of the Damerau-Levenshtein distance that returns the strings' edit distance
Expand Down Expand Up @@ -214,7 +222,19 @@ def optimal_string_alignment_distance(s1, s2):
dp[i - 1][j] + 1,
dp[i - 1][j - 1] + cost)

if (i > 1 and j > 1 and s1[i - 1] == s2[j - 2] and s1[i - 2] == s2[j - 1]):
if i > 1 and j > 1 and s1[i - 1] == s2[j - 2] and s1[i - 2] == s2[j - 1]:
dp[i][j] = min(dp[i][j], dp[i - 2][j - 2] + cost)

return dp[n1][n2]
return dp[n1][n2]


def optimal_string_alignment_similarity(s1, s2):
    """
    Optimal-string-alignment similarity of two sequences.

    Computed as 1 - optimal_string_alignment_distance / max(len(s1), len(s2)).
    When both sequences are empty the result is 1.0 and no distance is
    computed.

    Args:
        s1: Sequence 1.
        s2: Sequence 2.

    Returns:
        float: The similarity score.
    """
    normalizer = max(len(s1), len(s2))

    if normalizer != 0:
        edits = optimal_string_alignment_distance(s1, s2)
        return 1.0 - float(edits) / normalizer

    # Both inputs are empty; avoid dividing by zero.
    return 1.0

0 comments on commit 1885a45

Please sign in to comment.