# Similarity

RLTK supports many different similarity measures. For detailed usage and optional settings, please refer to the API reference.

In [40]:
# These two lines are only needed so that Python can locate the package:
# they add the parent directory to the module search path.
import sys
sys.path.append('..')

import rltk
# Toolkit object on which every example below calls its similarity functions.
tk = rltk.init()
# NOTE(review): presumably relative resource names used later (e.g. the
# corpus files) are resolved against this root -- confirm in the API reference.
tk.set_root_path('../examples')

## Sequence based

In [41]:
# Levenshtein measures on one pair of names: raw distance, similarity,
# normalized distance, and the Damerau-Levenshtein distance.
# print(...) with a single argument behaves the same on Python 2 and also
# parses on Python 3.
print(tk.levenshtein_distance('John Singer Sargent', 'Jane Klinger Sargent'))
print(tk.levenshtein_similarity('John Singer Sargent', 'Jane Klinger Sargent'))
print(tk.normalized_levenshtein_distance('John Singer Sargent', 'Jane Klinger Sargent'))
print(tk.damerau_levenshtein_distance('John Singer Sargent', 'Jane Klinger Sargent'))

5
0.75
0.25
5


In [42]:
# Weighted Levenshtein: register a custom edit-cost table under the name
# 'A1', then select it through the `name` argument of the distance call.
# The table sets a specific insert cost for 'c' and default costs for
# insert, delete and substitute.
edit_distance_cost = {
    'insert': {'c': 50},
    'insert_default': 100,
    'delete_default': 100,
    'substitute_default': 100,
}
tk.load_edit_distance_table('A1', edit_distance_cost)
print(tk.levenshtein_distance('John Singer Sargent', 'Jane Klinger Sargent', name='A1'))

500


In [43]:
# Hamming measures; the examples pass both strings and lists.
# NOTE(review): the recorded output of hamming_similarity below is 3, the
# same value as the distance -- confirm this is the intended result.
print(tk.hamming_distance('abc', 'cde'))
print(tk.hamming_similarity([1, 2, 3], [3, 4, 5]))

3
3


In [44]:
# Needleman-Wunsch similarity between two names.
print(tk.needleman_wunsch_similarity('John Singer Sargent', 'Jane Klinger Sargent'))

0.7125


In [45]:
# Jaro and Jaro-Winkler measures on a full name versus its abbreviated form.
# In the recorded output, the Jaro-Winkler distance and similarity sum to 1.
# NOTE(review): the recorded jaro_distance value (0.82) looks like a
# similarity score rather than a distance -- confirm in the API reference.
print(tk.jaro_distance('John Singer Sargent', 'John S. Sargent'))
print(tk.jaro_winkler_distance('John Singer Sargent', 'John S. Sargent'))
print(tk.jaro_winkler_similarity('John Singer Sargent', 'John S. Sargent'))

0.818629908104
0.108822055138
0.891177944862


## Set / Vector based

In [46]:
# Two small integer sets sharing exactly one element (3); they are reused by
# the dice and jaccard examples.
set1 = {1, 2, 3}
set2 = {3, 4, 5}

In [47]:
# Dice similarity of the two example sets defined in the previous cell.
print(tk.dice_similarity(set1, set2))

0.333333333333


In [48]:
# Jaccard index of the two example sets, as a similarity and as a distance
# (the two recorded outputs below sum to 1).
print(tk.jaccard_index_similarity(set1, set2))
print(tk.jaccard_index_distance(set1, set2))

0.2
0.8


In [49]:
# Cosine similarity between two numeric vectors given as lists.
print(tk.cosine_similarity([1, 2, 3], [2, 3, 4]))

0.992583333971


## Corpus based

In [50]:
# TF/IDF similarity. Two corpora are loaded and registered under the names
# 'B1' (from a plain-text file) and 'B2' (from a JSON-lines file, with
# values selected by the JSON path 'desc[*]'); each similarity call picks
# its corpus through the `name` argument.
# NOTE(review): "df" presumably stands for document frequency -- confirm.
tk.load_df_corpus('B1', 'df_corpus_1.txt', file_type='text', mode='append')
tk.load_df_corpus('B2', 'jl_file_1.jsonl', file_type='json_lines', json_path='desc[*]', mode='append')
print(tk.tf_idf_similarity(['a', 'b', 'a'], ['a', 'c', 'd', 'f'], name='B1'))
print(tk.tf_idf_similarity(['abc'], ['abc', 'def'], name='B2'))

0.175411603861
0.894427191


## Phonetic

In [51]:
# Soundex similarity of two words that sound alike.
print(tk.soundex_similarity('too', 'to'))

1


In [52]:
# Metaphone similarity of two words that sound alike.
print(tk.metaphone_similarity('reel', 'real'))

1


In [53]:
# NYSIIS similarity of two words that sound alike.
print(tk.nysiis_similarity('sale', 'sail'))

1


## Hybrid

In [54]:
# Hybrid Jaccard similarity between two sets of tokens, called without a
# custom `function` argument.
print(tk.hybrid_jaccard_similarity(set(['a', 'b', 'c']), set(['a', 'd', 'f'])))

0.333333333333


In [55]:
# Monge-Elkan similarity between two token lists.
print(tk.monge_elkan_similarity(['a', 'b', 'c'], ['a', 'd', 'f']))

0.333333333333


In [56]:
# user-defined similarity function
def hybrid_test_similarity(m, n):
    """Return a hand-picked similarity score for the pair (m, n).

    Scores exist only for m in 'a', 'b', 'c' combined with n in 'p', 'q';
    every other pair yields None.
    """
    known_scores = (
        ('a', 'p', 0.7),
        ('a', 'q', 0.8),
        ('b', 'p', 0.5),
        ('b', 'q', 0.9),
        ('c', 'p', 0.2),
        ('c', 'q', 0.1),
    )
    # Rows are checked in the listed order; the first matching pair wins.
    for left, right, score in known_scores:
        if m == left and n == right:
            return score
    return None

# Hybrid Jaccard with the user-defined pairwise scorer supplied through the
# `function` argument.
print(tk.hybrid_jaccard_similarity(set(['a', 'b', 'c']), set(['p', 'q']), function=hybrid_test_similarity))

0.533333333333
