Skip to content

Commit

Permalink
add lcs similarity metric
Browse files Browse the repository at this point in the history
  • Loading branch information
GreatYYX committed Apr 20, 2018
1 parent 97f0e28 commit 6d4c387
Show file tree
Hide file tree
Showing 3 changed files with 113 additions and 0 deletions.
1 change: 1 addition & 0 deletions rltk/similarity/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from rltk.similarity.jaccard import jaccard_index_similarity, jaccard_index_distance
from rltk.similarity.cosine import cosine_similarity, string_cosine_similarity
from rltk.similarity.tf_idf import tf_idf_similarity, compute_idf, compute_tf, tf_idf_similarity_by_dict
from rltk.similarity.lcs import longest_common_subsequence_distance, metric_longest_common_subsequence

# # hybrid
from rltk.similarity.hybrid import hybrid_jaccard_similarity, monge_elkan_similarity, symmetric_monge_elkan_similarity
Expand Down
77 changes: 77 additions & 0 deletions rltk/similarity/lcs.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
import rltk.utils as utils


def _lcs(s1, s2):
    """
    Compute the length of the longest common subsequence of two sequences.

    A subsequence keeps the original order of elements but does not have to
    be contiguous.

    Args:
        s1 (str): Sequence 1.
        s2 (str): Sequence 2.

    Returns:
        int: Length of the longest common subsequence (0 if either input is empty).
    """
    # The LCS length is symmetric in its arguments, so put the shorter
    # sequence on the inner axis to keep the rolling rows as small as possible.
    if len(s1) < len(s2):
        s1, s2 = s2, s1

    # Each DP row depends only on the row directly above it, so two rows are
    # enough; the full (m + 1) x (n + 1) table is never needed.
    prev = [0] * (len(s2) + 1)
    for c1 in s1:
        curr = [0]  # column 0: LCS against an empty prefix is always 0
        for j, c2 in enumerate(s2, 1):
            if c1 == c2:
                curr.append(prev[j - 1] + 1)
            else:
                curr.append(max(prev[j], curr[j - 1]))
        prev = curr

    return prev[-1]


def longest_common_subsequence_distance(s1, s2):
    """
    The LCS distance between strings X (of length n) and Y (of length m) is
    n + m - 2 |LCS(X, Y)|, with min = 0 and max = n + m.

    Args:
        s1 (str): Sequence 1.
        s2 (str): Sequence 2.

    Returns:
        int: Longest Common Subsequence Distance.

    Examples:
        >>> rltk.longest_common_subsequence_distance('abcd', 'acbd')
        2
        >>> rltk.longest_common_subsequence_distance('abcdefg', 'acef')
        3
    """
    # Input validation is delegated to rltk.utils; the accompanying tests
    # expect a ValueError for None arguments (confirm against utils).
    utils.check_for_none(s1, s2)
    utils.check_for_type(str, s1, s2)

    # Characters outside the common subsequence are counted once per string.
    return len(s1) + len(s2) - 2 * _lcs(s1, s2)


def metric_longest_common_subsequence(s1, s2):
    """
    The Metric LCS distance between 2 strings is based on the length of their
    longest common subsequence and is computed as
    1 - |LCS(s1, s2)| / max(|s1|, |s2|).

    Note:
        When both strings are empty the denominator is clamped to 1, so the
        result is 1.0 (not 0.0).

    Args:
        s1 (str): Sequence 1.
        s2 (str): Sequence 2.

    Returns:
        float: Metric Longest Common Subsequence Distance.

    Examples:
        >>> rltk.metric_longest_common_subsequence('ABCDEFG', 'ABCDEFHJKL')
        0.4
        # LCS: ABCDEF => length = 6
        # longest = s2 => length = 10
        # => 1 - 6 / 10 = 0.4
        >>> rltk.metric_longest_common_subsequence('ABDEF', 'ABDIF')
        0.19999999999999996
        # LCS: ABDF => length = 4
        # longest = ABDEF => length = 5
        # => 1 - 4 / 5 = 0.2 (up to floating-point rounding)
    """
    # Input validation is delegated to rltk.utils; the accompanying tests
    # expect a ValueError for None arguments (confirm against utils).
    utils.check_for_none(s1, s2)
    utils.check_for_type(str, s1, s2)

    lcs = _lcs(s1, s2)
    # float() keeps true division on Python 2; the trailing 1 in max() avoids
    # a division by zero when both strings are empty.
    return 1 - float(lcs) / max(len(s1), len(s2), 1)
35 changes: 35 additions & 0 deletions rltk/tests/test_similarity.py
Original file line number Diff line number Diff line change
Expand Up @@ -169,6 +169,41 @@ def test_jaro_winkler(s1, s2, similarity):
assert pytest.approx(jaro_winkler_similarity(s1, s2), 0.001) == similarity


@pytest.mark.parametrize('s1, s2, distance', [
    ('', '', 0),
    ('abc', '', 3),
    ('bc', 'abc', 1),
    ('fuor', 'four', 2),
    ('abcd', 'acb', 3),
    ('jellyifhs', 'jellyfish', 4),
    ('ifhs', 'fish', 4),
    # None inputs exercise the error branch below, which was otherwise unreachable.
    (None, 'abc', None),
    ('abc', None, None),
])
def test_longest_common_subsequence_distance(s1, s2, distance):
    """LCS distance equals len(s1) + len(s2) - 2 * |LCS|; None inputs are rejected."""
    if s1 is None or s2 is None:
        with pytest.raises(ValueError):
            longest_common_subsequence_distance(s1, s2)
    else:
        assert longest_common_subsequence_distance(s1, s2) == distance


@pytest.mark.parametrize('s1, s2, distance', [
    # Two empty strings yield 1.0 because the denominator is clamped to 1.
    ('', '', 1.0),
    ("ABCDEFG", "ABCDEFHJKL", 0.4),
    ('bc', 'abc', 0.33333333333333337),
    ('fuor', 'four', 0.25),
    ('abcd', 'acb', 0.5),
    ('jellyifhs', 'jellyfish', 0.2222222222222222),
    ('ifhs', 'fish', 0.5),
    # The second string has TWO spaces after the comma (14 chars vs. 13):
    # LCS = 13 => 1 - 13 / 14. Identical strings would give 0.0 instead.
    ('Hello, world!', u'Hello,  world!', 0.0714285714285714),
    # None inputs exercise the error branch below, which was otherwise unreachable.
    (None, 'abc', None),
    ('abc', None, None),
])
def test_metric_longest_common_subsequence(s1, s2, distance):
    """Metric LCS equals 1 - |LCS| / max(len(s1), len(s2)); None inputs are rejected."""
    if s1 is None or s2 is None:
        with pytest.raises(ValueError):
            metric_longest_common_subsequence(s1, s2)
    else:
        # Compare with a tolerance: the expected values are rounded float literals.
        assert metric_longest_common_subsequence(s1, s2) == pytest.approx(distance)


@pytest.mark.parametrize('vec1, vec2, similarity', [
([1, 2, 1, 3], [2, 5, 2, 3], 0.916),
([1, 2], [2, 3], 0.992)
Expand Down

0 comments on commit 6d4c387

Please sign in to comment.