# Edit Distance

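**Edit Distance** (a.k.a. Levenshtein Distance) is a measure of how dissimilar two strings are, referred to as the source string and the target string: it is the minimum number of single-character insertions, deletions, or substitutions needed to turn the source into the target. The smaller the distance, the more similar the strings.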

## Example #1: Edit Distance between two words

In [16]:
import nltk

# Source and target strings: 'mappings' is 'mapping' with one extra trailing 's'.
w1 = 'mapping'
w2 = 'mappings'

# Left as a bare final expression so the notebook displays the returned distance.
nltk.edit_distance(w1, w2)

1

## Example #2: Basic Spelling Checker

In [17]:
import nltk

# The misspelled input we want to find corrections for.
mistake = "ligting"

# A small hand-picked vocabulary to compare against.
words = [
    'apple', 'bag', 'drawing', 'listing', 'linking',
    'living', 'lighting', 'orange', 'walking', 'zoo',
]

# Print every candidate next to its edit distance from the misspelling;
# the candidates with the smallest distance are the likeliest corrections.
for candidate in words:
    print(candidate, nltk.edit_distance(mistake, candidate))

apple 7
bag 6
drawing 4
listing 1
linking 2
living 2
lighting 1
orange 6
walking 4
zoo 7


In [18]:
# The same spelling-checker idea, but against the NLTK "words" corpus
# instead of a hand-written list.

import nltk
# If the corpus is not installed yet, run once: nltk.download('words')

mistake = "ligting"

words = nltk.corpus.words.words()

# One (distance, word) pair per corpus entry. The distance comes first so
# that sorting the pairs ranks them by distance, then alphabetically.
eds = [(nltk.edit_distance(mistake, entry), entry) for entry in words]

In [19]:
# Show the 10 pairs with the smallest edit distance, one per line.

nearest = sorted(eds)[:10]
print("\n".join(str(pair) for pair in nearest))

(1, 'lifting')
(1, 'lighting')
(1, 'listing')
(2, 'Lagting')
(2, 'biting')
(2, 'blighting')
(2, 'clinting')
(2, 'digging')
(2, 'figging')
(2, 'fighting')


## Example #3: Sentence-level Edit Distance

In [20]:
# Comparing whole sentences or paragraphs character by character
# (useful for e.g. plagiarism detection or translation memory).

import nltk

sent1 = "It might help to re-install Python if possible."
sent2 = "It can help to install Python again if possible."
sent3 = "It can be so helpful to reinstall C++ if possible."
sent4 = "help It possible Python to re-install if might." # The same words as sent1 with a different order.
sent5 = "I love Python programming."

# Distance from sent1 to each of the other sentences, in order.
ed_sent_1_2, ed_sent_1_3, ed_sent_1_4, ed_sent_1_5 = (
    nltk.edit_distance(sent1, other)
    for other in (sent2, sent3, sent4, sent5)
)

# Pair every distance with its label and print them in one pass.
report = (
    (ed_sent_1_2, 'Edit Distance between sent1 and sent2'),
    (ed_sent_1_3, 'Edit Distance between sent1 and sent3'),
    (ed_sent_1_4, 'Edit Distance between sent1 and sent4'),
    (ed_sent_1_5, 'Edit Distance between sent1 and sent5'),
)
for distance, label in report:
    print(distance, label)

14 Edit Distance between sent1 and sent2
19 Edit Distance between sent1 and sent3
32 Edit Distance between sent1 and sent4
33 Edit Distance between sent1 and sent5


## Example #4: Character-level n-gram Edit Distance

In [21]:
import nltk

sent1 = "It might help to re-install Python if possible."
sent2 = "It can help to install Python again if possible."
sent3 = "It can be so helpful to reinstall C++ if possible."
sent4 = "help It possible Python to re-install if might." # The same words as sent1 with a different order.
sent5 = "I love Python programming."

# Character trigrams: the sentences are passed in as plain strings, so each
# n-gram is built from 3 consecutive characters.
ng1_chars, ng2_chars, ng3_chars, ng4_chars, ng5_chars = (
    list(nltk.ngrams(sentence, n=3))
    for sentence in (sent1, sent2, sent3, sent4, sent5)
)

# Edit distance over the trigram sequences: one edit is one whole trigram.
ed_sent_1_2, ed_sent_1_3, ed_sent_1_4, ed_sent_1_5 = (
    nltk.edit_distance(ng1_chars, other)
    for other in (ng2_chars, ng3_chars, ng4_chars, ng5_chars)
)

# Pair every distance with its label and print them in one pass.
report = (
    (ed_sent_1_2, "Edit Distance between sent1 and sent2 with ngram 3"),
    (ed_sent_1_3, "Edit Distance between sent1 and sent3 with ngram 3"),
    (ed_sent_1_4, "Edit Distance between sent1 and sent4 with ngram 3"),
    (ed_sent_1_5, "Edit Distance between sent1 and sent5 with ngram 3"),
)
for distance, label in report:
    print(distance, label)

18 Edit Distance between sent1 and sent2 with ngram 3
27 Edit Distance between sent1 and sent3 with ngram 3
39 Edit Distance between sent1 and sent4 with ngram 3
39 Edit Distance between sent1 and sent5 with ngram 3


## Example #5: Token-level n-gram Edit Distance

In [22]:
import nltk

sent1 = "It might help to re-install Python if possible."
sent2 = "It can help to install Python again if possible."
sent3 = "It can be so helpful to reinstall C++ if possible."
sent4 = "help It possible Python to re-install if might." # The same words as sent1 with a different order.
sent5 = "I love Python programming."

# Split each sentence into tokens first.
# NOTE(review): word_tokenize presumably needs NLTK's tokenizer data to be
# downloaded beforehand - confirm for the NLTK version in use.
tokens1, tokens2, tokens3, tokens4, tokens5 = (
    nltk.word_tokenize(sentence)
    for sentence in (sent1, sent2, sent3, sent4, sent5)
)

# Token trigrams: each n-gram is built from 3 consecutive tokens.
ng1_tokens, ng2_tokens, ng3_tokens, ng4_tokens, ng5_tokens = (
    list(nltk.ngrams(tokens, n=3))
    for tokens in (tokens1, tokens2, tokens3, tokens4, tokens5)
)

# Edit distance over the trigram sequences: one edit is one whole trigram.
ed_sent_1_2, ed_sent_1_3, ed_sent_1_4, ed_sent_1_5 = (
    nltk.edit_distance(ng1_tokens, other)
    for other in (ng2_tokens, ng3_tokens, ng4_tokens, ng5_tokens)
)

# Pair every distance with its label and print them in one pass.
report = (
    (ed_sent_1_2, "Edit Distance between tokens1 and tokens2 with ngram 3"),
    (ed_sent_1_3, "Edit Distance between tokens1 and tokens3 with ngram 3"),
    (ed_sent_1_4, "Edit Distance between tokens1 and tokens4 with ngram 3"),
    (ed_sent_1_5, "Edit Distance between tokens1 and tokens5 with ngram 3"),
)
for distance, label in report:
    print(distance, label)

7 Edit Distance between tokens1 and tokens2 with ngram 3
8 Edit Distance between tokens1 and tokens3 with ngram 3
7 Edit Distance between tokens1 and tokens4 with ngram 3
7 Edit Distance between tokens1 and tokens5 with ngram 3


Read the full Edit Distance tutorial at: https://python.gotrained.com/nltk-edit-distance-jaccard-distance/