Skip to content

Commit

Permalink
Merge pull request #16 from Dhvanan/master
Browse files Browse the repository at this point in the history
new-metrics
  • Loading branch information
GreatYYX committed Apr 20, 2018
2 parents b1de8bf + 2b5adf4 commit 8a61535
Show file tree
Hide file tree
Showing 7 changed files with 607 additions and 4 deletions.
132 changes: 132 additions & 0 deletions rltk/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -830,6 +830,25 @@ def damerau_levenshtein_distance(self, s1, s2):
"""
return damerau_levenshtein_distance(s1, s2)

def optimal_string_alignment_distance(self, s1, s2):
    """
    Compute the Optimal String Alignment (OSA) distance between two strings.

    This is a variation of the Damerau-Levenshtein distance that returns the strings' edit distance
    taking into account deletion, insertion, substitution, and transposition, under the condition
    that no substring is edited more than once.

    This method is a thin wrapper: both arguments are passed unchanged to the module-level
    ``optimal_string_alignment_distance`` function.

    Args:
        s1 (str): Sequence 1.
        s2 (str): Sequence 2.

    Returns:
        int: Optimal String Alignment Distance (a count of edit operations).

    Examples:
        >>> rltk.optimal_string_alignment_distance('abcd', 'acbd')
        1
        >>> rltk.optimal_string_alignment_distance('ca', 'abc')
        3
    """
    return optimal_string_alignment_distance(s1, s2)

def needleman_wunsch_similarity(self, s1, s2, name=None, match=2, mismatch=-1, gap=-0.5):
"""
This Needleman Wunsch Similarity is computed as needlman_wunsch_score over maximum score of s1 and s2.
Expand Down Expand Up @@ -1163,6 +1182,119 @@ def nysiis_similarity(self, s1, s2):
"""
return nysiis_similarity(s1, s2)

def longest_common_subsequence_distance(self, s1, s2):
    """
    Compute the Longest Common Subsequence (LCS) distance between two strings.

    The LCS distance between strings X (of length n) and Y (of length m) is
    n + m - 2 * |LCS(X, Y)|. Its minimum is 0 and its maximum is n + m.

    This method is a thin wrapper: both arguments are passed unchanged to the module-level
    ``longest_common_subsequence_distance`` function.

    Args:
        s1 (str): Sequence 1.
        s2 (str): Sequence 2.

    Returns:
        int: Longest Common Subsequence Distance.

    Examples:
        >>> rltk.longest_common_subsequence_distance('abcd', 'acbd')
        2
        >>> rltk.longest_common_subsequence_distance('abcdefg', 'acef')
        3
    """
    return longest_common_subsequence_distance(s1, s2)

def metric_longest_common_subsequence(self, s1, s2):
    """
    Compute the Metric Longest Common Subsequence distance between two strings.

    The Metric LCS distance is based on the LCS of the two strings and is computed as
    1 - |LCS(s1, s2)| / max(|s1|, |s2|).

    This method is a thin wrapper: both arguments are passed unchanged to the module-level
    ``metric_longest_common_subsequence`` function.

    Args:
        s1 (str): Sequence 1.
        s2 (str): Sequence 2.

    Returns:
        float: Metric Longest Common Subsequence Distance.

    Examples:
        >>> rltk.metric_longest_common_subsequence('ABCDEFG', 'ABCDEFHJKL')
        0.4
        # LCS: ABCDEF => length = 6
        # longest = s2 => length = 10
        # => 1 - 6/10 = 0.4
        >>> rltk.metric_longest_common_subsequence('ABDEF', 'ABDIF')
        0.19999999999999996
        # LCS: ABDF => length = 4
        # longest = ABDEF => length = 5
        # => 1 - 4/5 = 0.2 (up to floating-point rounding)
    """
    return metric_longest_common_subsequence(s1, s2)

def ngram_distance(self, s1, s2, n=2):
    """
    N-Gram Distance as defined by Kondrak, "N-Gram Similarity and Distance" String Processing and
    Information Retrieval, Lecture Notes in Computer Science Volume 3772, 2005, pp 115-126.

    This method is a thin wrapper: all arguments are passed unchanged to the module-level
    ``ngram_distance`` function.

    Args:
        s1 (str): Sequence 1.
        s2 (str): Sequence 2.
        n (int, optional): Forwarded as the third argument; presumably the n-gram size
            (confirm against the ``ngram`` module). Defaults to 2.

    Returns:
        float: NGram Distance.

    Examples:
        >>> rltk.ngram_distance('ABCD', 'ABTUIO')
        0.5833
    """
    return ngram_distance(s1, s2, n)

def ngram_similarity(self, s1, s2, n=2):
    """
    N-Gram Similarity as defined by Kondrak, "N-Gram Similarity and Distance" String Processing and
    Information Retrieval, Lecture Notes in Computer Science Volume 3772, 2005, pp 115-126.

    This method is a thin wrapper: all arguments are passed unchanged to the module-level
    ``ngram_similarity`` function.

    Args:
        s1 (str): Sequence 1.
        s2 (str): Sequence 2.
        n (int, optional): Forwarded as the third argument; presumably the n-gram size
            (confirm against the ``ngram`` module). Defaults to 2.

    Returns:
        float: NGram Similarity.

    Examples:
        >>> rltk.ngram_similarity('ABCD', 'ABTUIO')
        0.4166666666666667
    """
    return ngram_similarity(s1, s2, n)

def qgram_distance(self, s1, s2, n=2):
    """
    QGram Distance is the number of distinct q-grams (n-grams) between 2 strings.

    This method is a thin wrapper: all arguments are passed unchanged to the module-level
    ``qgram_distance`` function.

    Args:
        s1 (str): Sequence 1.
        s2 (str): Sequence 2.
        n (int, optional): Forwarded as the third argument; presumably the q-gram size
            (confirm against the ``qgram`` module). Defaults to 2.

    Returns:
        float: QGram Distance.

    Examples:
        >>> rltk.qgram_distance('abcde','abdcde')
        3
    """
    return qgram_distance(s1, s2, n)

def qgram_similarity(self, s1, s2, n=2):
    """
    QGram Similarity is the number of common q-grams (n-grams) between 2 strings.

    This method is a thin wrapper: all arguments are passed unchanged to the module-level
    ``qgram_similarity`` function.

    Args:
        s1 (str): Sequence 1.
        s2 (str): Sequence 2.
        n (int, optional): Forwarded as the third argument; presumably the q-gram size
            (confirm against the ``qgram`` module). Defaults to 2.

    Returns:
        float: QGram Similarity.

    Examples:
        >>> rltk.qgram_similarity('abcde','abdcde')
        3
    """
    return qgram_similarity(s1, s2, n)

def q_gram_blocking(self, output_file_path, **kwargs):
"""
Q-Gram.
Expand Down
7 changes: 4 additions & 3 deletions rltk/similarity/__init__.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,16 @@
from hamming import hamming_distance, hamming_similarity, normalized_hamming_distance
from dice import dice_similarity
from levenshtein import levenshtein_distance, levenshtein_similarity, \
normalized_levenshtein_distance, damerau_levenshtein_distance
normalized_levenshtein_distance, damerau_levenshtein_distance, optimal_string_alignment_distance
from needleman import needleman_wunsch_score, needleman_wunsch_similarity
from jaro import jaro_winkler_distance, jaro_winkler_similarity, jaro_distance
from jaccard import jaccard_index_similarity, jaccard_index_distance
from cosine import cosine_similarity, string_cosine_similarity
from tf_idf import tf_idf_similarity, compute_idf, compute_tf, tf_idf_similarity_by_dict

from lcs import longest_common_subsequence_distance, metric_longest_common_subsequence
from hybrid import hybrid_jaccard_similarity, monge_elkan_similarity, symmetric_monge_elkan_similarity

from ngram import ngram_distance, ngram_similarity
from qgram import qgram_distance, qgram_similarity
from soundex import soundex_similarity
from metaphone import metaphone_similarity
from nysiis import nysiis_similarity
76 changes: 76 additions & 0 deletions rltk/similarity/lcs.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
from collections import defaultdict

import rltk.utils as utils

def _lcs(s1, s2):
    """
    Return the length of the longest common subsequence of two sequences.

    Uses the classic dynamic-programming recurrence but keeps only the
    previous row of the table, since each cell depends solely on the cell to
    its left, the cell above, and the cell diagonally above-left.

    Args:
        s1 (str): Sequence 1.
        s2 (str): Sequence 2.

    Returns:
        int: Length of the longest common subsequence (0 if either is empty).
    """
    # previous[j] holds LCS(s1[:i-1], s2[:j]); row 0 is all zeros.
    previous = [0] * (len(s2) + 1)

    for ch1 in s1:
        # Column 0 (empty prefix of s2) is always 0.
        current = [0]
        for j, ch2 in enumerate(s2, 1):
            if ch1 == ch2:
                # Extend the common subsequence found without these two chars.
                current.append(previous[j - 1] + 1)
            else:
                # Drop one character from either side and keep the better result.
                current.append(max(previous[j], current[j - 1]))
        previous = current

    return previous[-1]

def longest_common_subsequence_distance(s1, s2):
    """
    Compute the Longest Common Subsequence (LCS) distance between two strings.

    The LCS distance between strings X (of length n) and Y (of length m) is
    n + m - 2 * |LCS(X, Y)|. Its minimum is 0 (identical strings) and its
    maximum is n + m (no character in common).

    Both inputs are first validated with ``utils.check_for_none`` and
    ``utils.check_for_type``; how those helpers report invalid input is
    defined in ``rltk.utils``.

    Args:
        s1 (str): Sequence 1.
        s2 (str): Sequence 2.

    Returns:
        int: Longest Common Subsequence Distance.

    Examples:
        >>> rltk.longest_common_subsequence_distance('abcd', 'acbd')
        2
        >>> rltk.longest_common_subsequence_distance('abcdefg', 'acef')
        3
    """
    utils.check_for_none(s1, s2)
    utils.check_for_type(basestring, s1, s2)

    # Every character outside the common subsequence costs one unit.
    return len(s1) + len(s2) - 2 * _lcs(s1, s2)

def metric_longest_common_subsequence(s1, s2):
    """
    Compute the Metric Longest Common Subsequence distance between two strings.

    The Metric LCS distance is based on the LCS of the two strings and is
    computed as 1 - |LCS(s1, s2)| / max(|s1|, |s2|). The result lies in
    [0, 1]: 0 for identical strings, 1 when no character is shared.

    Two empty strings are identical, so their distance is defined as 0.0.

    Both inputs are first validated with ``utils.check_for_none`` and
    ``utils.check_for_type``; how those helpers report invalid input is
    defined in ``rltk.utils``.

    Args:
        s1 (str): Sequence 1.
        s2 (str): Sequence 2.

    Returns:
        float: Metric Longest Common Subsequence Distance.

    Examples:
        >>> rltk.metric_longest_common_subsequence('ABCDEFG', 'ABCDEFHJKL')
        0.4
        # LCS: ABCDEF => length = 6
        # longest = s2 => length = 10
        # => 1 - 6/10 = 0.4
        >>> rltk.metric_longest_common_subsequence('ABDEF', 'ABDIF')
        0.19999999999999996
        # LCS: ABDF => length = 4
        # longest = ABDEF => length = 5
        # => 1 - 4/5 = 0.2 (up to floating-point rounding)
    """
    utils.check_for_none(s1, s2)
    utils.check_for_type(basestring, s1, s2)

    longest = max(len(s1), len(s2))
    if longest == 0:
        # Both strings are empty, hence identical: distance is zero and the
        # division below is avoided.
        return 0.0

    return 1 - float(_lcs(s1, s2)) / longest
48 changes: 48 additions & 0 deletions rltk/similarity/levenshtein.py
Original file line number Diff line number Diff line change
Expand Up @@ -159,3 +159,51 @@ def damerau_levenshtein_distance(s1, s2):
char_arr[s1[i - 1]] = i

return dp[n1 + 1][n2 + 1]

def optimal_string_alignment_distance(s1, s2):
    """
    Compute the Optimal String Alignment (OSA) distance between two strings.

    OSA is a restricted form of the Damerau-Levenshtein distance: it counts
    deletions, insertions, substitutions and transpositions of adjacent
    characters, with the restriction that no substring is edited more than
    once.

    Both inputs are validated through ``utils.check_for_none`` and
    ``utils.check_for_type`` and then passed through
    ``utils.unicode_normalize`` before being compared.

    Args:
        s1 (str): Sequence 1.
        s2 (str): Sequence 2.

    Returns:
        int: Optimal String Alignment Distance.

    Examples:
        >>> rltk.optimal_string_alignment_distance('abcd', 'acbd')
        1
        >>> rltk.optimal_string_alignment_distance('ca', 'abc')
        3
    """
    utils.check_for_none(s1, s2)
    utils.check_for_type(basestring, s1, s2)

    s1 = utils.unicode_normalize(s1)
    s2 = utils.unicode_normalize(s2)

    rows = len(s1) + 1
    cols = len(s2) + 1

    # table[i][j] is the distance between the prefixes s1[:i] and s2[:j].
    table = [[0] * cols for _ in xrange(rows)]

    # Turning a prefix into the empty string (or back) costs its length.
    for i in xrange(rows):
        table[i][0] = i
    for j in xrange(cols):
        table[0][j] = j

    for i in xrange(1, rows):
        left_char = s1[i - 1]
        for j in xrange(1, cols):
            right_char = s2[j - 1]
            substitution = 0 if left_char == right_char else 1

            best = min(
                table[i][j - 1] + 1,                 # insertion
                table[i - 1][j] + 1,                 # deletion
                table[i - 1][j - 1] + substitution,  # match / substitution
            )

            # Adjacent transposition: the last two characters are swapped.
            swapped = (
                i > 1
                and j > 1
                and left_char == s2[j - 2]
                and s1[i - 2] == right_char
            )
            if swapped:
                best = min(best, table[i - 2][j - 2] + substitution)

            table[i][j] = best

    return table[rows - 1][cols - 1]

0 comments on commit 8a61535

Please sign in to comment.