<a href="https://colab.research.google.com/github/swapnil6969/ml-road/blob/master/spell_check.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import re
import math
import string
from collections import Counter
import pandas as pd

In [None]:
def tokenise(text):
  """Lower-case `text` and split it into a list of word tokens.

  A token is a run of word characters that may contain internal `.`, `'`
  or `-` separators (so "u.s.a", "han'le" and "hyphenated-words" each stay
  one token). A trailing separator is not part of the token.
  """
  lowered = text.lower()
  token_pattern = r"[\w]+(?:[\.'-][\w]+)*"
  return re.findall(token_pattern, lowered)

def remove_punctuation(text):
  """Return `text` with every character found in `string.punctuation` dropped."""
  kept = []
  for ch in text:
    if ch in string.punctuation:
      continue
    kept.append(ch)
  return ''.join(kept)

def ngram(tokens, n):
  """Count every run of `n` consecutive tokens in `tokens`.

  Args:
    tokens: sequence of string tokens.
    n: number of tokens per n-gram.

  Returns:
    A Counter keyed by the n tokens joined with single spaces. It is empty
    when `tokens` holds fewer than `n` items.
  """
  # A sequence of length L has L - n + 1 windows of size n; the `+ 1` keeps
  # the final window, which would otherwise be silently dropped.
  window_count = len(tokens) - n + 1
  return Counter(' '.join(tokens[i:i + n]) for i in range(window_count))

# Sanity-check the tokeniser on abbreviations, hyphenated words and apostrophes.
print(tokenise("This is sentence with abbreviations like U.S.A. and hyphenated-words. How will the regex han'le the text?"))

['this', 'is', 'sentence', 'with', 'abbreviations', 'like', 'u.s.a', 'and', 'hyphenated-words', 'how', 'will', 'the', 'regex', "han'le", 'the', 'text']


In [None]:
# Download the big.txt corpus and save it as sample_data/big.txt.
!wget -O sample_data/big.txt https://raw.githubusercontent.com/swapnil6969/spell_checker/main/data/big.txt

--2024-07-05 15:52:03--  https://raw.githubusercontent.com/swapnil6969/spell_checker/main/data/big.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.108.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 6488395 (6.2M) [text/plain]
Saving to: ‘sample_data/big.txt’


2024-07-05 15:52:03 (109 MB/s) - ‘sample_data/big.txt’ saved [6488395/6488395]



In [None]:
# Read the whole corpus into memory. The context manager closes the file
# even when read() raises, which the manual open/close pair did not.
with open("sample_data/big.txt", "r") as file:
  text = file.read()

In [None]:
# Tokenise the corpus once; every count below is derived from this list.
tokens = tokenise(text)

# Unigram counts: number of occurrences of each token.
WORDS = Counter(tokens)

# Trigram counts, keyed by three consecutive tokens joined with spaces.
NGRAM_3 = ngram(tokens, 3)

# Bigram counts, keyed by two consecutive tokens joined with spaces.
NGRAM_2 = ngram(tokens, 2)

In [None]:
# Spot-check the unigram count of a single word.
WORDS["gig"]

3

In [None]:
# The ten most frequent bigrams in the corpus.
NGRAM_2.most_common(10)

[('of the', 12527),
 ('in the', 6445),
 ('to the', 4464),
 ('and the', 3202),
 ('on the', 2525),
 ('at the', 2103),
 ('by the', 1937),
 ('from the', 1865),
 ('with the', 1735),
 ('of a', 1709)]

In [None]:
def prob(w, dictionary):
  """Relative frequency of `w` among all counts in `dictionary`.

  Args:
    w: the word to look up.
    dictionary: mapping of word -> count (a Counter or a plain dict).

  Returns:
    count(w) / sum of all counts, or 0.0 when `w` is absent or the
    mapping holds no counts at all.
  """
  # sum(values()) instead of Counter.total() so this also works on plain
  # dicts and on Python versions older than 3.10.
  total = sum(dictionary.values())
  if total == 0:
    # An empty mapping would otherwise raise ZeroDivisionError.
    return 0.0
  return dictionary.get(w, 0) / total

In [None]:
def edits_one(word):
  """Return the set of strings exactly one simple edit away from `word`.

  An edit is a single-character deletion, an insertion or replacement of a
  lowercase ASCII letter, or a swap of two adjacent characters.
  """
  letters = string.ascii_lowercase
  candidates = set()
  for cut in range(len(word) + 1):
    head, tail = word[:cut], word[cut:]
    # Insertion is possible at every cut point, including the very end.
    for ch in letters:
      candidates.add(head + ch + tail)
    if not tail:
      continue
    # Deletion of the character right after the cut.
    candidates.add(head + tail[1:])
    # Replacement of the character right after the cut.
    for ch in letters:
      candidates.add(head + ch + tail[1:])
    # Transposition needs two characters after the cut.
    if len(tail) > 1:
      candidates.add(head + tail[1] + tail[0] + tail[2:])
  return candidates

def edits_two(word):
  """Return the set of strings reachable by applying `edits_one` twice to `word`."""
  twice_edited = set()
  for once_edited in edits_one(word):
    twice_edited.update(edits_one(once_edited))
  return twice_edited

In [None]:
def valid_words(words, dictionary):
  """Return the set of items from `words` that are present in `dictionary`."""
  return {candidate for candidate in words if candidate in dictionary}

In [None]:
def correction(word, known, one_edit_weight=10):
  """Return the most plausible spelling of `word` given the counts in `known`.

  Args:
    word: the possibly misspelled word.
    known: mapping of word -> corpus count (e.g. the WORDS Counter).
    one_edit_weight: factor by which candidates one edit away are
      favoured over candidates two edits away.

  Returns:
    `word` itself when it is already known or when no known word lies
    within two edits; otherwise the highest-scoring candidate.
  """
  if word in known:
    return word

  # Score by raw count. Dividing every score by the same corpus total would
  # not change the ranking, so the per-candidate prob() call is unnecessary.
  scores = {w: known[w] for w in valid_words(edits_two(word), known)}
  for w in valid_words(edits_one(word), known):
    # A word reachable both ways keeps its better score.
    scores[w] = max(scores.get(w, 0), one_edit_weight * known[w])

  if not scores:
    # Nothing known within two edits: give the input back instead of
    # letting max() raise on an empty collection.
    return word

  # Compare candidates by score. The word is a secondary key so ties are
  # resolved the same way on every run despite set ordering.
  return max(scores, key=lambda w: (scores[w], w))

# Try the corrector on a misspelled word, using the unigram counts as vocabulary.
print(correction("hossel", WORDS))

house


In [None]:
def context_aware_correction(sentence, verbose=True):
  """Score every single-word substitution of `sentence` with a bigram model.

  The sentence is split on whitespace. For each position, the word is
  replaced in turn by every known word one edit away (and by itself), and
  the candidate sentence is scored as the product of its smoothed bigram
  probabilities P(next | previous), taken from the module-level NGRAM_2
  and WORDS counts.

  Args:
    sentence: whitespace-separated text to examine.
    verbose: when True (the default) print the baseline bigram
      probabilities and a progress line for every candidate tried.

  Returns:
    A list of (score, candidate_sentence) tuples, one per candidate.
    Candidate sentences are joined with single spaces. The list is empty
    for an empty sentence.
  """
  # Additive smoothing keeps unseen words and bigrams from producing a zero
  # score or a 0/0 division.
  smoothing = 1e-4

  def bigram_prob(first, second):
    "Smoothed estimate of P(second | first) from the corpus counts."
    return (NGRAM_2[first + ' ' + second] + smoothing) / (WORDS[first] + smoothing)

  words = sentence.split()

  # Baseline probability of each adjacent word pair in the input.
  init_prob = [bigram_prob(words[j], words[j + 1]) for j in range(len(words) - 1)]
  if verbose:
    print(init_prob)

  probs = []
  last = len(words) - 1
  # Change one word at a time.
  for i, word in enumerate(words):
    alternatives = valid_words(edits_one(word), WORDS)
    alternatives.add(word)
    if verbose:
      print("replacing word:", word, end=' ')
      print("alternatives:", *alternatives)

    # Bigram j spans words j and j + 1, so only bigrams i - 1 and i involve
    # the word being replaced. The product of all the others is the same
    # for every alternative and is computed once per position.
    unaffected = 1
    for j, pair_prob in enumerate(init_prob):
      if j != i - 1 and j != i:
        unaffected *= pair_prob

    for alternative in alternatives:
      if verbose:
        print("trying word", alternative)
      recomputed_prob = unaffected
      if i > 0:
        recomputed_prob *= bigram_prob(words[i - 1], alternative)
      if i < last:
        # The right-hand context is the FOLLOWING word, not the word being
        # replaced.
        recomputed_prob *= bigram_prob(alternative, words[i + 1])

      # Rebuild from the word list so exactly position i changes. A
      # str.replace would hit the first substring match instead (e.g. the
      # "is" inside "this", or an earlier repeat of the same word).
      candidate = ' '.join(words[:i] + [alternative] + words[i + 1:])
      probs.append((recomputed_prob, candidate))
      if verbose:
        print(candidate + ':', recomputed_prob*1e9)

  return probs

In [None]:
# Score all single-word substitutions for a sentence in which "summit" is
# presumably a typo for "submit".
p = context_aware_correction("i summit that is what is happening in this case")

[1.3772207495218187e-08, 2.4999375015624612e-05, 0.030970921499623747, 0.006448981405988522, 0.08682332016263805, 0.0005118333451547411, 0.1702145314584437, 0.014180197819714581, 0.004430248475750714]
replacing word: i alternatives: e ti xi w ix d if hi mi g y vi s si il h r id q o b a iv j x m p l z ni u is in it ii c f n v i k t li
trying word e
e summit that is what is happening in this case: 8.787817000138546e-15
trying word ti
ti summit that is what is happening in this case: 2.372482129641443e-13
trying word xi
xi summit that is what is happening in this case: 8.473967513881763e-15
trying word w
w summit that is what is happening in this case: 1.3181701089740656e-14
trying word ix
ix summit that is what is happening in this case: 9.49083954805944e-15
trying word d
d summit that is what is happening in this case: 1.0785039058251585e-14
trying word if
if summit that is what is happening in this case: 1.2514352193645e-10
trying word hi
hi summit that is what is happening in this cas

In [None]:
# Highest-scoring (score, sentence) pair, followed by the first pair generated.
print(max(p, key=lambda x: x[0]), p[0])

(2.1572137069592035e-19, 'u summit that is what is happening in this case') (8.787817000138546e-24, 'e summit that is what is happening in this case')


In [None]:
# Look up the corpus count of one specific bigram.
NGRAM_2["submit that"]

0