In [6]:
import re
import math
import string
from collections import Counter

## 1. Build a probability distribution over all the words

In [1]:
# Read the whole corpus into memory. The context manager closes the file
# handle as soon as the read completes instead of leaving it open.
with open('big.txt') as corpus_file:
    doc = corpus_file.read()

In [15]:
# Split text into lowercase tokens, dropping punctuation and whitespace.
def tokenize(text):
    """Return the list of lowercase `\\w+` runs (letters, digits, underscore) found in `text`."""
    lowered = text.lower()
    word_pattern = re.compile(r'\w+')
    return word_pattern.findall(lowered)


In [16]:
# Small sample headline used to try out tokenize() before applying it to the full document.
sent = 'Bali in March, Egypt in November: 12 Months of Travel Deals'
tokenized_sent = tokenize(sent)

#print(tokenized_sent)

In [20]:
# Tokenize the entire document; the last expression displays the total number of tokens.
tokenized_doc = tokenize(doc)
len(tokenized_doc)

1115585

In [18]:
# Counter maps each distinct token to the number of times it occurs;
# shown here on the short sample sentence.
Counter(tokenized_sent)

Counter({'bali': 1,
         'in': 2,
         'march': 1,
         'egypt': 1,
         'november': 1,
         '12': 1,
         'months': 1,
         'of': 1,
         'travel': 1,
         'deals': 1})

In [36]:
# Token -> occurrence count for the whole document. known() and correction()
# read this module-level Counter. The last expression shows the ten most frequent tokens.
VOCAB_COUNT = Counter(tokenized_doc)
VOCAB_COUNT.most_common(10)

[('the', 79809),
 ('of', 40024),
 ('and', 38312),
 ('to', 28765),
 ('in', 22023),
 ('a', 21124),
 ('that', 12512),
 ('he', 12401),
 ('was', 11410),
 ('it', 10681)]

In [37]:
# Look up how many times a single word occurs in the document.
VOCAB_COUNT.get('the')

79809

## 2. Generate possible candidates of correction for a misspelled word
1. Edit distance 1:
    1. **delete a letter**; e.g. coup -> cou/cup/oup/cop
    2. **insert a letter**; e.g. coup -> acoup/bcoup/..../caoup/cboup/..../coaup/cobup/....
    3. **replace a letter**; e.g. coup -> corp/...
    4. **swap 2 adjacent letters**; e.g. coup -> ocup/cuop/....

2. Edit distance 2: every string that is one edit away from any string that is itself one edit away from the original word

    --> A lot of words! But not every one of them makes sense <br>
    --> Filter out those that do not appear in the document

In [24]:
def edits2(word):
    """Return a generator over every string that is two single edits away from `word`.

    The first-level edits are computed immediately; the second-level edits are
    produced lazily as the generator is consumed. Duplicates are not removed.
    """
    one_away = edits1(word)
    return (twice for once in one_away for twice in edits1(once))

To generate the set of candidate words that are one edit away, one approach is to *split* the original word at every possible position. Each split yields a *pair* of strings, `(a, b)`, before and after the split point; at each split point we can then delete, transpose, replace, or insert a letter:

<table>
  <tr><td> pairs: <td><tt> Ø+wird <td><tt> w+ird <td><tt> wi+rd <td><tt>wir+d<td><tt>wird+Ø<td><i>Notes:</i><tt> (<i>a</i>, <i>b</i>)</tt> pair
  <tr><td> deletions: <td><tt>Ø+ird<td><tt> w+rd<td><tt> wi+d<td><tt> wir+Ø<td><td><i>Delete first char of b</i>
  <tr><td> transpositions: <td><tt>Ø+iwrd<td><tt> w+rid<td><tt> wi+dr</tt><td><td><td><i>Swap first two chars of b
  <tr><td> replacements: <td><tt>Ø+?ird<td><tt> w+?rd<td><tt> wi+?d<td><tt> wir+?</tt><td><td><i>Replace char at start of b
  <tr><td> insertions: <td><tt>Ø+?+wird<td><tt> w+?+ird<td><tt> wi+?+rd<td><tt> wir+?+d<td><tt> wird+?+Ø</tt><td><i>Insert char between a and b
</table>

In [25]:
def splits(word):
    """Return every (prefix, suffix) pair obtained by cutting `word` at each position.

    Both ends are included, so a word of length n yields n + 1 pairs.
    """
    pairs = []
    for cut in range(len(word) + 1):
        pairs.append((word[:cut], word[cut:]))
    return pairs

splits('wird')

[('', 'wird'), ('w', 'ird'), ('wi', 'rd'), ('wir', 'd'), ('wird', '')]

In [26]:
def edits1(word):
    """Return the set of strings produced from `word` by exactly one edit operation.

    The operations are: delete one character, swap two adjacent characters,
    replace one character with a lowercase letter, or insert a lowercase letter.
    """
    alphabet = 'abcdefghijklmnopqrstuvwxyz'
    results = set()
    for head, tail in splits(word):
        # An insertion is possible at every split point, including the very end.
        for ch in alphabet:
            results.add(head + ch + tail)
        if not tail:
            # Nothing after the split point to delete, replace or swap.
            continue
        # Deletion: drop the first character of the tail.
        results.add(head + tail[1:])
        # Replacement: substitute the first character of the tail.
        for ch in alphabet:
            results.add(head + ch + tail[1:])
        # Transposition: needs at least two characters in the tail.
        if len(tail) > 1:
            results.add(head + tail[1] + tail[0] + tail[2:])
    return results



In [30]:
# print(edits1('class'))

In [34]:
def known(words):
    """Return the set of entries of `words` that are keys of VOCAB_COUNT."""
    return {candidate for candidate in words if candidate in VOCAB_COUNT}

In [50]:
#known(edits1('class'))

In [39]:
def candidates(word):
    """Return the possible spelling corrections for `word`.

    Tries, in order: the word itself if known, known words one edit away,
    known words two edits away. The first non-empty set wins; if all are
    empty, the word is returned unchanged inside a one-element list.
    """
    exact = known([word])
    if exact:
        return exact
    one_edit = known(edits1(word))
    if one_edit:
        return one_edit
    two_edits = known(edits2(word))
    if two_edits:
        return two_edits
    return [word]

## 3. Correct misspelled words

In [47]:
def correction(word):
    """Return the candidate correction for `word` with the highest count in VOCAB_COUNT."""
    options = candidates(word)
    return max(options, key=lambda option: VOCAB_COUNT.get(option))

In [49]:
# Correct every token of a deliberately misspelled sentence.
[correction(token)
 for token in tokenize('Speling errurs in somethink. Whutever; unusuel misteakes everyware?')]

['spelling',
 'errors',
 'in',
 'something',
 'whatever',
 'unusual',
 'mistakes',
 'everywhere']

## Text Preprocessing with NLTK

In [4]:
import nltk

# One-time setup: download the NLTK data packages into the local nltk_data
# directory. Once they are installed, this cell does not need to be run again.
# NOTE(review): only 'stopwords' is used by the cells visible in this notebook;
# confirm whether 'punkt' and 'wordnet' are still needed.
nltk.download('punkt') 
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\leno13win10\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\leno13win10\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\leno13win10\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\wordnet.zip.


True

In [5]:
# Convert the text to lower case so that later steps treat 'Bali' and 'bali' alike.
# NOTE(review): `sent` is reassigned in place by each of the following
# preprocessing cells, so those cells must be run in order, once each.

sent = 'Bali in M;arch, Egypt in November: 12 Months of Travel Deals'
sent = sent.lower()

In [6]:
# Remove numbers and punctuation (punctuation in this cell, digits in the next).
import re

## Remove punctuation: delete every character that is neither a word
## character (\w: letters, digits, underscore) nor whitespace (\s).
sent = re.sub(r'[^\w\s]','',sent)
# print(sent)

In [117]:
## Remove digits: delete every run of one or more digits (e.g. '12').
sent = re.sub(r'\d+','', sent)
# print(sent)

In [7]:
# Tokenize: split on whitespace. From here on `sent` is a list of tokens
# rather than a string, so re-running this cell on its own result will fail.
sent = sent.split()

In [118]:
# Remove stop words.
from nltk.corpus import stopwords

# Load the English stop-word list once and keep it as a set, so each
# membership test is a constant-time lookup instead of calling
# stopwords.words('english') again for every token.
english_stopwords = set(stopwords.words('english'))
sent = [s for s in sent if s not in english_stopwords]

# print(sent)

In [119]:
# Stemming: set up sample inflected forms of one verb and an English
# Snowball stemmer to apply to them.
strings = ['likes', 'liking', 'liked']

from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer("english")

# Uncomment to print the stem of each sample word:
# for s in strings:
#     print(stemmer.stem(s))

## Reference and Resources
1. [Regex Cheat Sheet 1](https://regexr.com/), [Regex Cheat Sheet 2](https://www.rexegg.com/regex-quickstart.html)
2. [More on spell check](https://nbviewer.jupyter.org/url/norvig.com/ipython/How%20to%20Do%20Things%20with%20Words.ipynb)
3. [Data Source](http://norvig.com/big.txt)
4. https://norvig.com/spell-correct.html
5. https://www.kaggle.com/shashanksai/text-preprocessing-using-python
