Skip to content

Commit

Permalink
remove opt cost for normalized levenshtein & update doc
Browse files Browse the repository at this point in the history
  • Loading branch information
GreatYYX committed Mar 1, 2017
1 parent c271d69 commit 86b7a17
Show file tree
Hide file tree
Showing 6 changed files with 27 additions and 68 deletions.
8 changes: 6 additions & 2 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,12 @@ RLTK

Record Linkage ToolKit (RLTK)

.. image:: https://readthedocs.org/projects/rltk/badge/?version=latest
:target: http://rltk.readthedocs.io/en/latest/
:alt: Documentation Status

Documentation
=============

* `Tutorials <#>`_
* `API Reference <#>`_
* `Tutorials <http://rltk.readthedocs.io>`_
* `API Reference <http://rltk.readthedocs.io>`_
8 changes: 7 additions & 1 deletion docs/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,13 @@
# further. For a list of options available for each theme, see the
# documentation.
#
# html_theme_options = {}
html_theme_options = {
'page_width': '1600px',
'sidebar_width': '300px',
'github_user': 'usc-isi-i2',
'github_repo': 'rltk',
'github_banner': 'true',
}

# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
Expand Down
5 changes: 0 additions & 5 deletions docs/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -26,18 +26,13 @@ In RLTK, you can simply load the resource you need and give it a name, then re
Example 2::

>>> import rltk
>>>
>>> edit_distance_cost = {'insert': {'c':50}, 'insert_default':100, 'delete_default':100, 'substitute_default':100}
>>>
>>> tk = rltk.init()
>>> tk.load_edit_distance_table('A1', edit_distance_cost) # load resource
>>> tk.levenshtein_distance('a', 'abc')
>>> 2
>>> tk.levenshtein_distance('a', 'abc', name='A1')
>>> 150
>>> tk.normalized_levenshtein_distance('a', 'abc', name='A1')
>>> 50.0


API Reference
--------------
Expand Down
1 change: 0 additions & 1 deletion examples/ex1.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,5 +10,4 @@

print tk.levenshtein_distance('a', 'abc')
print tk.levenshtein_distance('a', 'abc', name='A1')
print tk.normalized_levenshtein_distance('a', 'abc', name='A1')
print tk.tf_idf(['a', 'b', 'a'], ['a', 'c','d','f'], name='B1')
36 changes: 6 additions & 30 deletions rltk/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,7 +110,7 @@ def hamming_distance(self, s1, s2):
"""
return hamming_distance(s1, s2)

def levenshtein_similarity(self, s1, s2, name=None):
def levenshtein_similarity(self, s1, s2):
"""
The Levenshtein similarity is computed as 1 - normalized_levenshtein_distance.
Expand All @@ -122,19 +122,7 @@ def levenshtein_similarity(self, s1, s2, name=None):
Returns:
float: Levenshtein Similarity between [0.0, 1.0].
"""
if name is None:
return levenshtein_similarity(s1, s2)
else:
self._has_resource(name, 'edit_distance_table')

insert = self._rs_dict[name]['data']['insert']
delete = self._rs_dict[name]['data']['delete']
substitute = self._rs_dict[name]['data']['substitute']
insert_default = self._rs_dict[name]['data']['insert_default']
delete_default = self._rs_dict[name]['data']['delete_default']
substitute_default = self._rs_dict[name]['data']['substitute_default']
return levenshtein_similarity(s1, s2, insert, delete, substitute,
insert_default, delete_default, substitute_default)
return levenshtein_similarity(s1, s2)

def levenshtein_distance(self, s1, s2, name=None):
"""
Expand Down Expand Up @@ -169,31 +157,19 @@ def levenshtein_distance(self, s1, s2, name=None):
return levenshtein_distance(s1, s2, insert, delete, substitute,
insert_default, delete_default, substitute_default)

def normalized_levenshtein_distance(self, s1, s2, name=None):
def normalized_levenshtein_distance(self, s1, s2):
"""
This distance is computed as levenshtein distance divided by the length of the longest string.
This distance is computed as the Levenshtein distance divided by the length of the longer string. This method
does not support customization of operation costs.
Args:
s1 (str): Sequence 1.
s2 (str): Sequence 2.
name (str): Name of resource (edit distance table).
Returns:
float: Normalized Levenshtein Distance between [0.0, 1.0].
"""
if name is None:
return normalized_levenshtein_distance(s1, s2)
else:
self._has_resource(name, 'edit_distance_table')

insert = self._rs_dict[name]['data']['insert']
delete = self._rs_dict[name]['data']['delete']
substitute = self._rs_dict[name]['data']['substitute']
insert_default = self._rs_dict[name]['data']['insert_default']
delete_default = self._rs_dict[name]['data']['delete_default']
substitute_default = self._rs_dict[name]['data']['substitute_default']
return normalized_levenshtein_distance(s1, s2, insert, delete, substitute,
insert_default, delete_default, substitute_default)
return normalized_levenshtein_distance(s1, s2)

def damerau_levenshtein_distance(self, s1, s2):
"""
Expand Down
37 changes: 8 additions & 29 deletions rltk/similarity/levenshtein.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@

import utils

def _levenshtein(s1, s2, insert, delete, substitute,
insert_default, delete_default, substitute_default):
def _levenshtein(s1, s2, insert={}, delete={}, substitute={},
insert_default=1, delete_default=1, substitute_default=1):
utils.check_for_none(s1, s2)
utils.check_for_type(str, s1, s2)

Expand Down Expand Up @@ -45,27 +45,19 @@ def _levenshtein(s1, s2, insert, delete, substitute,
dp[i-1][j-1] + substitute_cost)
return dp[n1][n2]

def levenshtein_similarity(s1, s2, insert={}, delete={}, substitute={},
insert_default=1, delete_default=1, substitute_default=1):
def levenshtein_similarity(s1, s2):
"""
The Levenshtein similarity is computed as 1 - normalized_levenshtein_distance.
Args:
s1 (str): Sequence 1.
s2 (str): Sequence 2.
insert (dict(str, int), optional): Insert cost of characters. Defaults to empty dict.
delete (dict(str, int), optional): Delete cost of characters. Defaults to empty dict.
substitute (dict(str, dict(str, int)), optional): Substitute cost of characters. Defaults to empty dict.
insert_default (int, optional): Default value of insert cost. Defaults to 1.
delete_default (int, optional): Default value of delete cost. Defaults to 1.
substitute_default (int, optional): Default value of substitute cost. Defaults to 1.
Returns:
float: Levenshtein Similarity between [0.0, 1.0].
"""

return 1 - _normalized_levenshtein(s1, s2, insert, delete, substitute,
insert_default, delete_default, substitute_default)
return 1 - _normalized_levenshtein(s1, s2)

def levenshtein_distance(s1, s2, insert={}, delete={}, substitute={},
insert_default=1, delete_default=1, substitute_default=1):
Expand Down Expand Up @@ -97,44 +89,31 @@ def levenshtein_distance(s1, s2, insert={}, delete={}, substitute={},
insert_default, delete_default, substitute_default)


def _normalized_levenshtein(s1, s2, insert, delete, substitute,
insert_default, delete_default, substitute_default):
lev = _levenshtein(s1, s2, insert, delete, substitute,
insert_default, delete_default, substitute_default)
def _normalized_levenshtein(s1, s2):
lev = _levenshtein(s1, s2)

max_len = max(len(s1), len(s2))
if max_len == 0:
return 0

return float(lev) / max_len

def normalized_levenshtein_distance(s1, s2, insert={}, delete={}, substitute={},
insert_default=1, delete_default=1, substitute_default=1):
def normalized_levenshtein_distance(s1, s2):
"""
This distance is computed as the Levenshtein distance divided by the length of the longer string.
Args:
s1 (str): Sequence 1.
s2 (str): Sequence 2.
insert (dict(str, int), optional): Insert cost of characters. Defaults to empty dict.
delete (dict(str, int), optional): Delete cost of characters. Defaults to empty dict.
substitute (dict(str, dict(str, int)), optional): Substitute cost of characters. Defaults to empty dict.
insert_default (int, optional): Default value of insert cost. Defaults to 1.
delete_default (int, optional): Default value of delete cost. Defaults to 1.
substitute_default (int, optional): Default value of substitute cost. Defaults to 1.
Returns:
float: Normalized Levenshtein Distance between [0.0, 1.0].
Examples:
>>> rltk.normalized_levenshtein_distance('ab', 'abc')
0.333333333333
>>> rltk.normalized_levenshtein_distance('a', 'abc', insert = {'c':50},
... insert_default=100, delete_default=100, substitute_default=100)
50.0
"""
return _normalized_levenshtein(s1, s2, insert, delete, substitute,
insert_default, delete_default, substitute_default)
return _normalized_levenshtein(s1, s2)

def damerau_levenshtein_distance(s1, s2):
"""
Expand Down

0 comments on commit 86b7a17

Please sign in to comment.