In [1]:
"""Spelling Corrector in Python 3; see http://norvig.com/spell-correct.html

Copyright (c) 2007-2016 Peter Norvig
MIT license: www.opensource.org/licenses/mit-license.php
"""

'Spelling Corrector in Python 3; see http://norvig.com/spell-correct.html\n\nCopyright (c) 2007-2016 Peter Norvig\nMIT license: www.opensource.org/licenses/mit-license.php\n'

In [2]:
import re
from collections import Counter

In [3]:
def words(text):
    """Tokenize `text` into lower-cased runs of word characters.

    The text is lower-cased first, then every maximal match of the regex
    ``\\w+`` is collected in order of appearance.

    Returns:
        list[str]: the matched tokens (empty list when nothing matches).
    """
    lowered = text.lower()
    return [match.group(0) for match in re.finditer(r"\w+", lowered)]

In [4]:
# Word-frequency table: maps each token produced by words() to the number of
# times it occurs in the corpus file. The file is opened in a `with` block so
# the handle is closed as soon as the text has been read.
# NOTE(review): the file is read with the platform default encoding, as before;
# confirm whether an explicit encoding should be passed.
with open("../Corpus/processed_uncased_blanklines/wiki.txt") as corpus_file:
    WORDS = Counter(words(corpus_file.read()))

In [5]:
def P(word, N=sum(WORDS.values())):
    """Relative frequency of `word` in WORDS.

    Args:
        word: the token to look up in WORDS.
        N: the denominator. Its default, the sum of all counts in WORDS, is
            evaluated once, when this `def` statement runs; it does not
            follow later changes to WORDS.

    Returns:
        float: the count stored for `word` divided by `N`.
    """
    occurrences = WORDS[word]
    return occurrences / N

In [6]:
def known(words):
    """Filter `words` down to those present as keys in WORDS.

    Args:
        words: any iterable of candidate strings.

    Returns:
        set[str]: the candidates found in WORDS (empty set when none are).
    """
    return {candidate for candidate in words if candidate in WORDS}

In [7]:
def edits1(word):
    """Return the set of strings exactly one simple edit away from `word`.

    For every split point of `word` the following edits are generated:
    inserting a letter, deleting the next character, replacing the next
    character with a letter, and swapping the next two characters. Letters are
    drawn from lowercase a-z only. The result may contain `word` itself,
    because a character can be replaced by the same letter.

    Returns:
        set[str]: all distinct edited strings.
    """
    alphabet = "abcdefghijklmnopqrstuvwxyz"
    results = set()
    for cut in range(len(word) + 1):
        head, tail = word[:cut], word[cut:]
        # Insertion is possible at every split point, including the end.
        for letter in alphabet:
            results.add(head + letter + tail)
        if not tail:
            continue
        rest = tail[1:]
        # Deletion of the character right after the split point.
        results.add(head + rest)
        # Replacement of that character by each letter.
        for letter in alphabet:
            results.add(head + letter + rest)
        # Transposition needs at least two characters after the split point.
        if len(tail) > 1:
            results.add(head + tail[1] + tail[0] + tail[2:])
    return results

In [8]:
def edits2(word):
    """Lazily produce every string reachable from `word` by two edits1 steps.

    The first-level edit set is computed immediately; the second-level edits
    are generated on demand while the result is iterated. Duplicates are not
    removed.

    Returns:
        generator of str.
    """
    first_pass = edits1(word)
    return (twice for once in first_pass for twice in edits1(once))

In [9]:
def candidates(word):
    """Collect possible spelling corrections for `word`.

    Tiers are tried in order and the first non-empty one is returned:
    `word` itself if known, known strings one edit away, known strings two
    edits away, and finally `word` unchanged. Later tiers are not computed once
    an earlier tier yields a result.

    Returns:
        set[str] for the first three tiers, or the list ``[word]`` as fallback.
    """
    exact = known([word])
    if exact:
        return exact
    one_away = known(edits1(word))
    if one_away:
        return one_away
    two_away = known(edits2(word))
    if two_away:
        return two_away
    return [word]

In [10]:
def correction(word):
    """Return the most probable spelling correction for `word`.

    The candidate with the highest P() value wins. When several candidates
    share the highest value, the alphabetically smallest one is returned.

    Args:
        word: the (possibly misspelled) token to correct.

    Returns:
        str: the chosen candidate; `word` itself when nothing better is found.
    """
    # candidates() may return a set, and the iteration order of a set of
    # strings can differ between interpreter runs. max() keeps the first
    # maximal item it meets, so sorting first makes tie-breaking reproducible.
    return max(sorted(candidates(word)), key=P)

In [11]:
# Smoke test: run the corrector on a single sample typo and print both forms.

kata = "mkan"  # sample input word, labelled as a typo in the output below
print("kata typo : ", kata)
print("koreksi : ", correction(kata))

kata typo :  mkan
koreksi :  akan


In [None]:
# Word spelling correction

"""Spelling Corrector in Python 3; see http://norvig.com/spell-correct.html

Copyright (c) 2007-2016 Peter Norvig
MIT license: www.opensource.org/licenses/mit-license.php
"""

def words(text):
    """Tokenize `text` into lower-cased runs of word characters.

    The text is lower-cased first, then every maximal match of the regex
    ``\\w+`` is collected in order of appearance.

    Returns:
        list[str]: the matched tokens (empty list when nothing matches).
    """
    lowered = text.lower()
    return [match.group(0) for match in re.finditer(r"\w+", lowered)]

# Word-frequency table: maps each token produced by words() to the number of
# times it occurs in the corpus file. The file is opened in a `with` block so
# the handle is closed as soon as the text has been read.
# NOTE(review): the file is read with the platform default encoding, as before;
# confirm whether an explicit encoding should be passed.
with open("../Corpus/kata-dasar.txt") as corpus_file:
    WORDS = Counter(words(corpus_file.read()))

def P(word, N=sum(WORDS.values())):
    """Relative frequency of `word` in WORDS.

    Args:
        word: the token to look up in WORDS.
        N: the denominator. Its default, the sum of all counts in WORDS, is
            evaluated once, when this `def` statement runs; it does not
            follow later changes to WORDS.

    Returns:
        float: the count stored for `word` divided by `N`.
    """
    occurrences = WORDS[word]
    return occurrences / N

def known(words):
    """Filter `words` down to those present as keys in WORDS.

    Args:
        words: any iterable of candidate strings.

    Returns:
        set[str]: the candidates found in WORDS (empty set when none are).
    """
    return {candidate for candidate in words if candidate in WORDS}

def edits1(word):
    """Return the set of strings exactly one simple edit away from `word`.

    For every split point of `word` the following edits are generated:
    inserting a letter, deleting the next character, replacing the next
    character with a letter, and swapping the next two characters. Letters are
    drawn from lowercase a-z only. The result may contain `word` itself,
    because a character can be replaced by the same letter.

    Returns:
        set[str]: all distinct edited strings.
    """
    alphabet = "abcdefghijklmnopqrstuvwxyz"
    results = set()
    for cut in range(len(word) + 1):
        head, tail = word[:cut], word[cut:]
        # Insertion is possible at every split point, including the end.
        for letter in alphabet:
            results.add(head + letter + tail)
        if not tail:
            continue
        rest = tail[1:]
        # Deletion of the character right after the split point.
        results.add(head + rest)
        # Replacement of that character by each letter.
        for letter in alphabet:
            results.add(head + letter + rest)
        # Transposition needs at least two characters after the split point.
        if len(tail) > 1:
            results.add(head + tail[1] + tail[0] + tail[2:])
    return results

def edits2(word):
    """Lazily produce every string reachable from `word` by two edits1 steps.

    The first-level edit set is computed immediately; the second-level edits
    are generated on demand while the result is iterated. Duplicates are not
    removed.

    Returns:
        generator of str.
    """
    first_pass = edits1(word)
    return (twice for once in first_pass for twice in edits1(once))

def candidates(word):
    """Collect possible spelling corrections for `word`.

    Tiers are tried in order and the first non-empty one is returned:
    `word` itself if known, known strings one edit away, known strings two
    edits away, and finally `word` unchanged. Later tiers are not computed once
    an earlier tier yields a result.

    Returns:
        set[str] for the first three tiers, or the list ``[word]`` as fallback.
    """
    exact = known([word])
    if exact:
        return exact
    one_away = known(edits1(word))
    if one_away:
        return one_away
    two_away = known(edits2(word))
    if two_away:
        return two_away
    return [word]

def correction(word):
    """Return the most probable spelling correction for `word`.

    The candidate with the highest P() value wins. When several candidates
    share the highest value, the alphabetically smallest one is returned.

    Args:
        word: the (possibly misspelled) token to correct.

    Returns:
        str: the chosen candidate; `word` itself when nothing better is found.
    """
    # candidates() may return a set, and the iteration order of a set of
    # strings can differ between interpreter runs. max() keeps the first
    # maximal item it meets, so sorting first makes tie-breaking reproducible.
    return max(sorted(candidates(word)), key=P)

In [None]:
# Apply the spelling correction to the dataset

def apply_correction(text):
    """Run correction() on every item of `text` and return the results.

    Args:
        text: an iterable whose items are passed one by one to correction().
            NOTE(review): presumably a list of word tokens; a plain string
            would be iterated character by character — confirm against the
            upstream preprocessing.

    Returns:
        list: the corrected items, in the original order.
    """
    return list(map(correction, text))

# Overwrite the 'review' column with the result of apply_correction for each entry.
# NOTE(review): `df` is not defined in the visible cells; assumes each entry is
# an iterable of word strings — confirm against the upstream steps.
df['review'] = df['review'].apply(apply_correction)
df  # bare expression: displayed as the cell output