In [15]:
! pip install pytrec_eval
import pytrec_eval
import json
import numpy as np
import time
import pickle
from nltk.corpus import wordnet
import nltk
# Fetch the WordNet corpus; the dictionary of candidate words is built from it.
nltk.download('wordnet')
# The returned iterator is discarded here. Presumably this call only forces
# NLTK's lazy corpus loader to initialise WordNet up front -- TODO confirm
# whether it is actually required.
wordnet.all_synsets()

# This function creates the input for pytrec_eval
def DictonaryGeneration(topCorrections, Misspelleds, Corrects):
    """Build the two nested dictionaries that are handed to pytrec_eval.

    Args:
        topCorrections: sequence where ``topCorrections[i]`` holds the
            candidate corrections proposed for ``Misspelleds[i]``, best
            candidate first.
        Misspelleds: sequence of misspelled words (the "queries").
        Corrects: sequence where ``Corrects[i]`` is the expected correction
            of ``Misspelleds[i]``.

    Returns:
        A ``(Correction, Truth)`` tuple.  ``Correction`` maps each misspelling
        to ``{candidate: 1}`` with candidates inserted in the order given, so
        rank is only visible through insertion order, not through the value.
        ``Truth`` maps each misspelling to ``{expected_word: 1}``.

    Raises:
        IndexError: if ``topCorrections`` is shorter than ``Misspelleds`` or
            ``Misspelleds`` is shorter than ``Corrects``.
    """
    Correction = {}
    for position, misspelled in enumerate(Misspelleds):
        Correction[misspelled] = {
            candidate: 1 for candidate in topCorrections[position]
        }

    Truth = {}
    for position, expected in enumerate(Corrects):
        # The same misspelling may legitimately belong to several correct
        # words; accumulate them instead of letting the last one overwrite
        # the earlier ones.
        Truth.setdefault(Misspelleds[position], {})[expected] = 1
    return Correction, Truth


def MED_DP(incorrectWord, correctWord,):
    """Return the minimum edit distance between the two words.

    Insertions and deletions cost 1 each and a substitution costs 2; the
    value is computed bottom-up over a full (len+1) x (len+1) table.
    """
    rows = len(incorrectWord) + 1
    cols = len(correctWord) + 1
    table = [[0] * cols for _ in range(rows)]

    # Borders: turning a prefix into the empty string (or back) costs its length.
    for col in range(cols):
        table[0][col] = col
    for row in range(1, rows):
        table[row][0] = row

    for row in range(1, rows):
        source_char = incorrectWord[row - 1]
        for col in range(1, cols):
            if source_char == correctWord[col - 1]:
                # Matching characters extend the diagonal at no cost.
                table[row][col] = table[row - 1][col - 1]
                continue
            via_insert = table[row][col - 1] + 1
            via_delete = table[row - 1][col] + 1
            via_substitute = table[row - 1][col - 1] + 2
            table[row][col] = min(via_insert, via_delete, via_substitute)

    return table[rows - 1][cols - 1]

## Dataset Preparation
# Each non-blank line of missp.dat is either a '$'-prefixed correct word or a
# misspelling that belongs to the most recent '$' word.
with open("missp.dat") as dataset_file:  # context manager closes the handle
    dataset = [line.strip() for line in dataset_file]
correct = []
incorrect = []
previous_word = None
for word in dataset:
    if not word:
        # Blank lines carry no data; indexing word[0] on them would raise.
        continue
    if word[0] == '$':
        previous_word = word[1:]
    else:
        if previous_word is None:
            raise ValueError(
                f"misspelling {word!r} appears before any '$' line in missp.dat"
            )
        correct.append(previous_word.lower())
        incorrect.append(word.lower())
limit = 50  # Cap on how many (misspelling, correction) pairs are kept for evaluation.
correct = correct[:limit]
incorrect = incorrect[:limit]
print(f'Dataset loaded with {len(dataset)} entries and {len(correct)} incorrect words.')
# Candidate vocabulary: the text before the first '.' of every WordNet synset
# name, first as a raw array and then reduced to its distinct values.
synset_heads = [synset.name().split('.')[0] for synset in wordnet.all_synsets()]
Dictionary = np.asarray(synset_heads)
print("len dictionary:", len(Dictionary))
Dictionary = np.unique(Dictionary)
print("len dictionary after removing duplicates:", len(Dictionary))

K = 10  # number of similar words to display
TOPS = []  # TOPS[i] holds the K dictionary words closest to incorrect[i], nearest first
for mcidx, ms in enumerate(incorrect):
    # Brute force: edit distance from this misspelling to every dictionary word.
    distances = np.asarray([MED_DP(ms, c) for c in Dictionary])
    # Positions of the K smallest distances, in ascending distance order.
    idx = distances.argsort()[:K]
    kTops = Dictionary[idx]
    TOPS.append(kTops)
    print(f'Most similar words to {ms}: {kTops}')
    print(f'Most similar edits: {distances[idx]}')
    print(f'Ground Truth: {correct[mcidx]}')
    print(f'{mcidx}/{len(incorrect)}')
    print('-------------------------------------------')


# PytrecEval section
# pytrec_eval takes the ground truth (qrel) in the RelevanceEvaluator
# constructor and the system ranking (run) in evaluate(); success_k is only
# meaningful when they are wired this way round.
candidates, qrel = DictonaryGeneration(TOPS, incorrect[:limit], correct[:limit])
# The candidate dicts keep the nearest-first insertion order but give every
# word the same value, so derive a strictly descending float score from the
# position; otherwise the ranking inside the run would be decided by ties.
run = {
    ms: {word: float(len(words) - rank) for rank, word in enumerate(words)}
    for ms, words in candidates.items()
}
evaluator = pytrec_eval.RelevanceEvaluator(qrel, {'success_1', 'success_5', 'success_10'})
print(json.dumps(evaluator.evaluate(run), indent=1))


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
Dataset loaded with 42269 entries and 50 incorrect words.
len dictionary: 117659
len dictionary after removing duplicates: 86555
Most similar words to ab: ['ab' 'aba' 'jab' 'a' 'cab' 'dab' 'lab' 'alb' 'b' 'tab']
Most similar edits: [0 1 1 1 1 1 1 1 1 1]
Ground Truth: albert
0/50
-------------------------------------------
Most similar words to ameraca: ['america' 'american' 'camera' 'amerce' 'maraca' 'arca' 'arauca' 'areca'
 'tamer' 'perca']
Most similar edits: [2 3 3 3 3 3 3 4 4 4]
Ground Truth: america
1/50
-------------------------------------------
Most similar words to amercia: ['armeria' 'america' 'ametria' 'merida' 'amerciable' 'american' 'aerial'
 'arca' 'aria' 'amelia']
Most similar edits: [2 2 2 3 3 3 3 3 3 3]
Ground Truth: america
2/50
-------------------------------------------
Most similar words to ameracan: ['american' 'americana' 'cameraman' 'america' 'marac