# Similarity

RLTK supports many different similarity measures. For detailed usage and optional settings, please refer to the API reference.

In [123]:
# These two lines are only for locating the package: they add the parent
# directory to sys.path so that `import rltk` below can find it.
import sys
sys.path.append('..')

import rltk
# Toolkit object; every similarity call in this notebook goes through it.
tk = rltk.init()
# presumably the base directory for relative file names loaded later
# (e.g. the corpus files) -- confirm against the API reference
tk.set_root_path('../examples')

## Sequence based

In [124]:
# Levenshtein measures, all applied to the same pair of names.
# The single-argument print(...) form runs unchanged on Python 2 and Python 3.
name_a, name_b = 'John Singer Sargent', 'Jane Klinger Sargent'
print(tk.levenshtein_distance(name_a, name_b))
print(tk.levenshtein_similarity(name_a, name_b))
print(tk.normalized_levenshtein_distance(name_a, name_b))
# Damerau-Levenshtein variant of the distance
print(tk.damerau_levenshtein_distance(name_a, name_b))

5
0.75
0.25
5


In [125]:
# Weighted Levenshtein measure.
# Per-character entries override the *_default costs: inserting 'l' costs 50,
# substituting 'o' with 'a' costs 30, and every other edit costs 100.
# (This reading matches the recorded result of 380 = 30 + 3 * 100 + 50.)
edit_distance_cost = {
    'insert': {'l': 50},
    'substitute': {'o': {'a': 30}},
    'insert_default': 100,
    'delete_default': 100,
    'substitute_default': 100,
}
# Register the table under the name 'A1', then select it with name='A1'.
tk.load_edit_distance_table('A1', edit_distance_cost)
print(tk.levenshtein_distance('John Singer Sargent', 'Jane Klinger Sargent', name='A1'))

380


In [126]:
# Hamming measures, shown once with a pair of strings and once with a pair of lists.
print(tk.hamming_distance('abc', 'cde'))
# NOTE(review): the recorded output for the next call is 3 although the two
# lists differ at every position -- confirm the scale hamming_similarity uses.
print(tk.hamming_similarity([1, 2, 3], [3, 4, 5]))

3
3


In [127]:
# Needleman-Wunsch similarity of the two names
# (print(...) call form so the cell also parses on Python 3)
print(tk.needleman_wunsch_similarity('John Singer Sargent', 'Jane Klinger Sargent'))

0.7125


In [128]:
# Jaro and Jaro-Winkler measures on a full name versus its abbreviated form.
full_name, short_name = 'John Singer Sargent', 'John S. Sargent'
print(tk.jaro_distance(full_name, short_name))
# Per the recorded outputs, these two results sum to 1
# (distance = 1 - similarity).
print(tk.jaro_winkler_distance(full_name, short_name))
print(tk.jaro_winkler_similarity(full_name, short_name))

0.818629908104
0.108822055138
0.891177944862


## Set / Vector based

In [129]:
# Two small token sets that share exactly one element ('76');
# they are the inputs for the set-based measures.
set1 = set(['shell', 'chevron', '76'])
set2 = set(['mobil', '76', 'love'])

In [130]:
# Dice similarity of the two token sets
# (print(...) call form so the cell also parses on Python 3)
print(tk.dice_similarity(set1, set2))

0.333333333333


In [131]:
# Jaccard index of the two token sets, as a similarity and as a distance
# (per the recorded outputs the two values sum to 1).
print(tk.jaccard_index_similarity(set1, set2))
print(tk.jaccard_index_distance(set1, set2))

0.2
0.8


In [132]:
# Cosine similarity of two numeric vectors given as plain lists
# (print(...) call form so the cell also parses on Python 3)
print(tk.cosine_similarity([1, 2, 3], [2, 3, 4]))

0.992583333971


## Corpus Based

In [133]:
# TF/IDF similarity.
# Load two named document-frequency corpora: 'B1' from a plain-text file and
# 'B2' from a JSON-lines file (json_path='desc[*]' presumably selects the
# field(s) to read -- confirm against the API reference).
# presumably the file names resolve against the root path set on tk -- confirm
# NOTE(review): mode='append' -- confirm that re-running this cell does not
# add the same documents to a corpus a second time.
tk.load_df_corpus('B1', 'df_corpus_1.txt', file_type='text', mode='append')
tk.load_df_corpus('B2', 'jl_file_1.jsonl', file_type='json_lines', json_path='desc[*]', mode='append')
# name= selects which loaded corpus is used for the comparison.
print(tk.tf_idf_similarity(['a', 'b', 'a'], ['a', 'c', 'd', 'f'], name='B1'))
print(tk.tf_idf_similarity(['abc'], ['abc', 'def'], name='B2'))

0.175411603861
0.894427191


## Phonetic

In [134]:
# Soundex comparison of two words that sound alike
# (print(...) call form so the cell also parses on Python 3)
print(tk.soundex_similarity('too', 'to'))

1


In [135]:
# Metaphone comparison of two words that sound alike
# (print(...) call form so the cell also parses on Python 3)
print(tk.metaphone_similarity('reel', 'real'))

1


In [136]:
# NYSIIS comparison of two words that sound alike
# (print(...) call form so the cell also parses on Python 3)
print(tk.nysiis_similarity('sale', 'sail'))

1


## Hybrid

In [137]:
# Hybrid Jaccard on two token sets, using the default element-level
# similarity function.
print(tk.hybrid_jaccard_similarity(
    set(['john', 'harry', 'potter']),
    set(['jane', 'carry', 'potter'])))

0.844444444444


In [138]:
# Monge-Elkan on two token lists, using the default element-level
# similarity function.
print(tk.monge_elkan_similarity(
    ['john', 'harry', 'potter'],
    ['jack', 'carry', 'potter']))

0.788888888889


Every hybrid measure has a default similarity metric function; you can replace it with another function provided by RLTK or with your own function.

In [139]:
# Use Levenshtein to replace Jaro-Winkler: the function= keyword swaps the
# element-level similarity for another RLTK measure.
print(tk.hybrid_jaccard_similarity(
    set(['john', 'harry', 'potter']),
    set(['jane', 'carry', 'potter']),
    function=tk.levenshtein_similarity))

# User-defined similarity function
def user_similarity(m, n):
    """Return True when ``m`` and ``n`` start with the same two characters.

    Both arguments are sliced, so inputs shorter than two characters are
    compared in full. The result is a bool (True / False).
    """
    prefix_len = 2
    return m[:prefix_len] == n[:prefix_len]
# Same comparison, this time scored with the user-defined function.
print(tk.hybrid_jaccard_similarity(
    set(['john', 'harry', 'potter']),
    set(['jane', 'carry', 'potter']),
    function=user_similarity))

0.6
0.333333333333
