-
Notifications
You must be signed in to change notification settings - Fork 23
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #16 from Dhvanan/master
new-metrics
- Loading branch information
Showing
7 changed files
with
607 additions
and
4 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,15 +1,16 @@ | ||
from hamming import hamming_distance, hamming_similarity, normalized_hamming_distance | ||
from dice import dice_similarity | ||
from levenshtein import levenshtein_distance, levenshtein_similarity, \ | ||
normalized_levenshtein_distance, damerau_levenshtein_distance | ||
normalized_levenshtein_distance, damerau_levenshtein_distance, optimal_string_alignment_distance | ||
from needleman import needleman_wunsch_score, needleman_wunsch_similarity | ||
from jaro import jaro_winkler_distance, jaro_winkler_similarity, jaro_distance | ||
from jaccard import jaccard_index_similarity, jaccard_index_distance | ||
from cosine import cosine_similarity, string_cosine_similarity | ||
from tf_idf import tf_idf_similarity, compute_idf, compute_tf, tf_idf_similarity_by_dict | ||
|
||
from lcs import longest_common_subsequence_distance, metric_longest_common_subsequence | ||
from hybrid import hybrid_jaccard_similarity, monge_elkan_similarity, symmetric_monge_elkan_similarity | ||
|
||
from ngram import ngram_distance, ngram_similarity | ||
from qgram import qgram_distance, qgram_similarity | ||
from soundex import soundex_similarity | ||
from metaphone import metaphone_similarity | ||
from nysiis import nysiis_similarity |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,76 @@ | ||
from collections import defaultdict | ||
|
||
import rltk.utils as utils | ||
|
||
def _lcs(s1, s2): | ||
m, n = len(s1), len(s2) | ||
|
||
dp = [[None]*(n+1) for i in xrange(m+1)] | ||
|
||
for i in range(m+1): | ||
for j in range(n+1): | ||
if i == 0 or j == 0 : | ||
dp[i][j] = 0 | ||
elif s1[i-1] == s2[j-1]: | ||
dp[i][j] = dp[i-1][j-1]+1 | ||
else: | ||
dp[i][j] = max(dp[i-1][j] , dp[i][j-1]) | ||
|
||
return dp[m][n] | ||
|
||
def longest_common_subsequence_distance(s1, s2):
    """
    Compute the LCS distance between two strings.

    The LCS distance between strings X (of length n) and Y (of length m) is
    n + m - 2 * |LCS(X, Y)|. Its minimum is 0 and its maximum is n + m.

    Input validation is delegated to utils.check_for_none and
    utils.check_for_type.

    Args:
        s1 (str): Sequence 1.
        s2 (str): Sequence 2.

    Returns:
        int: Longest Common Subsequence Distance.

    Examples:
        >>> rltk.longest_common_subsequence_distance('abcd', 'acbd')
        2
        >>> rltk.longest_common_subsequence_distance('abcdefg', 'acef')
        3
    """
    utils.check_for_none(s1, s2)
    # NOTE(review): `basestring` is a Python 2 builtin; it does not exist
    # on Python 3.
    utils.check_for_type(basestring, s1, s2)

    # The DP table is built inside _lcs; only the two lengths are needed here.
    return len(s1) + len(s2) - 2 * _lcs(s1, s2)
|
||
def metric_longest_common_subsequence(s1, s2):
    """
    Compute the Metric LCS distance between two strings.

    The Metric Longest Common Subsequence distance is computed as
    1 - |LCS(s1, s2)| / max(|s1|, |s2|). Two empty strings are treated as
    identical and yield 0.0.

    Input validation is delegated to utils.check_for_none and
    utils.check_for_type.

    Args:
        s1 (str): Sequence 1.
        s2 (str): Sequence 2.

    Returns:
        float: Metric Longest Common Subsequence Distance, in [0, 1].

    Examples:
        >>> rltk.metric_longest_common_subsequence('ABCDEFG', 'ABCDEFHJKL')
        0.4
        # LCS: ABCDEF => length = 6
        # longest = s2 => length = 10
        # => 1 - 6 / 10 = 0.4
        >>> rltk.metric_longest_common_subsequence('ABDEF', 'ABDIF')
        0.19999999999999996
        # LCS: ABDF => length = 4
        # longest = ABDEF => length = 5
        # => 1 - 4 / 5 = 0.2 (up to floating-point rounding)
    """
    utils.check_for_none(s1, s2)
    # NOTE(review): `basestring` is a Python 2 builtin; it does not exist
    # on Python 3.
    utils.check_for_type(basestring, s1, s2)

    longest = max(len(s1), len(s2))
    if longest == 0:
        # Both strings are empty, hence identical: the distance is 0 and
        # the division below must be skipped.
        return 0.0

    # float() forces true division under Python 2 integer semantics.
    return 1 - float(_lcs(s1, s2)) / longest
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.