In [1]:
import numpy as np

In [4]:
# Dictionary : Turn it into a set
vocab = set([line.rstrip() for line in open("document/vocab.txt")])

In [5]:
def generate_candidates(word):
    # generate 1 word
    # 1.insert 2.delete 3.replace
    letters = 'abcdefghijklmnopqrstuvwxyz'
    splits = [(word[:i],word[i:]) for i in range(len(word)+1)]
    # 1.insert operation
    inserts = [L+c+R for L,R in splits for c in letters]
    # 2.delete operation
    deletes = [L+R[1:] for L,R in splits if R]
    # 3.replace operation
    replaces = [L+c+R[1:] for L,R in splits if R for c in letters]

    candidates = set(inserts+deletes+replaces)

    return [word for word in candidates if word in vocab]

In [6]:
import nltk
nltk.download('punkt')
from nltk.corpus import reuters
# read the corpus
categories = reuters.categories()
corpus = reuters.sents(categories=categories)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\28224\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


In [7]:
# construct the language model : using bi-gram
term_count = {}
bigram_count = {}
for doc in corpus:
    # preprocess the first one
    doc = ['<s>'] + doc
    for i in range(0, len(doc)-1):
        # bigram:[i,i+1]
        term = doc[i]
        bigram = doc[i:i+2]

        if term in term_count:
            term_count[term] += 1
        else:
            term_count[term] = 1

        bigram = ' '.join(bigram)
        if bigram in bigram_count:
            bigram_count[bigram] += 1
        else:
            bigram_count[bigram] = 1


In [8]:
# channel probability
channel_prob = {}
for line in open('document/spell-errors.txt'):
    items = line.split(":")
    correct = items[0].strip()
    mistakes = [item.strip() for item in items[1].strip().split(",")]
    channel_prob[correct] = {}
    for mis in mistakes:
        channel_prob[correct][mis] = 1.0/len(mistakes)



In [11]:
V = len(term_count.keys())

file = open('document/testdata.txt','r')
for line in file:
    items = line.rstrip().split('\t')
    line = items[2].split()
    for word in line:
        candidates = generate_candidates(word)
        probs = []
        # return the most possible one
        for candi in candidates:
            prob = 0
            # a.channel
            if candi in channel_prob and word in channel_prob[candi]:
                prob += np.log(channel_prob[candi][word])
            else:
                prob += np.log(0.0001)

            # calculate the possibility of language model:
            idx = items[2].index(word) + 1
            if items[2][idx - 1] in bigram_count and candi in bigram_count[items[2][idx-1]]:
                prob += np.log((bigram_count[items[2][idx-1]][candi] + 1.0) / (term_count[bigram_count[items[2][idx-1]]] + V))
            else:
                prob += np.log(1.0 / V)

        probs.append(prob)
    max_idx = probs.index(max(probs))
    print(word)

products.
gain.
cost.
tases.
ltMC.T.
Co.
worried.
U.S.
named.
U.S.
largest.
U.S.
Group.
Japan.
1985.
U.S.
view.
imports.
Industry.
sources.
said.
rxpoets.
Friday.
said.
country.
continue.
economy.
year.
program.
dispute.
vegetables.
methids.
additives.
details.
said.
demand.
said.
gas.
noted.
billion.
products.
prices.
billion.
pct.
pct.
said.
share.
Ramadan.
markats.
figures.
said.
dotay.
said.
laws.
off.
said.
movements.
considered.
introduced.
made.
week.
said.
exchange.
1986.
forward.
said.
first.
said.
this.
end-users.
participation.
disappointing.
said.
rupiah.
traders.
delivery.
unit.
Mt.
mid-1988.
tonnes.
October.
position.
inserview.
weak.
off.
debts.
scratch.
aggressive.
moves.
plae.
Smithson.
other.
analyst.
time.
things.
said.
home.
banking.
opportunities.
said.
related.
want.
said.
Gottardo.
pct.
subsidiary.
wasev.
said.
adned.
Sogo.
Smithson.
aound.
wrong.
ITA.
said.
unnecessahy.
extension.
6.
allocation.
system.
marks.
liquidity.
rates.
weeks.
said.
prices.
1985.
1986.
l