In [7]:
import re
import random
from collections import Counter, defaultdict
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
from nltk import tokenize
import nltk.data
import nltk
from nltk import trigrams
from nltk.util import pad_sequence
import os
from os import listdir
from os.path import isfile, join

In [4]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [6]:
!unzip "/content/books.zip"

Archive:  /content/books.zip
   creating: books/
  inflating: books/Alices_Adventures_in_Wonderland_by_Lewis_Carroll.rtf  
  inflating: books/Anna_Karenina_by_Leo_Tolstoy.rtf  
  inflating: books/David_Copperfield_by_Charles_Dickens.rtf  
  inflating: books/Don_Quixote_by_Miguel_de_Cervantes.rtf  
  inflating: books/Dracula_by_Bram_Stoker.rtf  
  inflating: books/Emma_by_Jane_Austen.rtf  
  inflating: books/Frankenstein_by_Mary_Shelley.rtf  
  inflating: books/Great_Expectations_by_Charles_Dickens.rtf  
  inflating: books/Grimms_Fairy_Tales_by_The_Brothers_Grimm.rtf  
  inflating: books/Metamorphosis_by_Franz_Kafka.rtf  
  inflating: books/Oliver_Twist_by_Charles_Dickens.rtf  
  inflating: books/Pride_and_Prejudice_by_Jane_Austen.rtf  
  inflating: books/The_Adventures_of_Sherlock_Holmes_by_Arthur_Conan_Doyle.rtf  
  inflating: books/The_Adventures_of_Tom_Sawyer_by_Mark_Twain.rtf  
  inflating: books/The_Count_of_Monte_Cristo_by_Alexandre_Dumas.rtf  
  inflating: books/The_Picture_of_D

In [9]:
# Method to Load the book from his files
def load_book(path):    
    input_file = os.path.join(path)
    with open(input_file) as f:
        book = f.read()
    return book

In [8]:
path = './books/'
book_files = [f for f in listdir(path) if isfile(join(path, f))]
book_files = book_files[1:]

In [10]:
# Load the books using the file names
books = []
for book in book_files:
    books.append(load_book(path+book))

In [11]:
# Getting number of words in each book 
for i in range(len(books)):
    print("The number of {} words in book of named {}.".format(len(books[i].split()), book_files[i]))

The number of 165188 words in book of named Oliver_Twist_by_Charles_Dickens.rtf.
The number of 96185 words in book of named The_Adventures_of_Tom_Sawyer_by_Mark_Twain.rtf.
The number of 83657 words in book of named The_Picture_of_Dorian_Gray_by_Oscar_Wilde.rtf.
The number of 361612 words in book of named Anna_Karenina_by_Leo_Tolstoy.rtf.
The number of 126999 words in book of named Pride_and_Prejudice_by_Jane_Austen.rtf.
The number of 191598 words in book of named Great_Expectations_by_Charles_Dickens.rtf.
The number of 480495 words in book of named The_Count_of_Monte_Cristo_by_Alexandre_Dumas.rtf.
The number of 53211 words in book of named The_Prince_by_Nicolo_Machiavelli.rtf.
The number of 113452 words in book of named David_Copperfield_by_Charles_Dickens.rtf.
The number of 105428 words in book of named Grimms_Fairy_Tales_by_The_Brothers_Grimm.rtf.
The number of 25395 words in book of named Metamorphosis_by_Franz_Kafka.rtf.
The number of 433993 words in book of named Don_Quixote_by_Mi

In [23]:
# Method to clean the data
def clean_tweet(tweet):
    if type(tweet) == np.float:
        return "" 
    temp = tweet.lower()
    temp = re.sub("'", "", temp) # to avoid removing contractions in english
    temp = re.sub("@[A-Za-z0-9_]+","", temp)
    temp = re.sub("#[A-Za-z0-9_]+","", temp)
    temp = re.sub(r'http\S+', '', temp)
    temp = re.sub('[()!?]', ' ', temp)
    temp = re.sub('\[.*?\]',' ', temp)
    temp = re.sub("[^a-z0-9']"," ", temp)
    temp = re.sub(r"^rt", "", temp)
    return temp.split()

In [32]:
# Clean the text of the books
clean_books = []
for book in books:
    clean_books.append(clean_tweet(book))

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  This is separate from the ipykernel package so we can avoid doing imports until


In [35]:
# Check to ensure the text has been cleaned properly
clean_books[0][:500]

['rtf1',
 'ansi',
 'ansicpg1252',
 'cocoartf1404',
 'cocoasubrtf470',
 'fonttbl',
 'f0',
 'fmodern',
 'fcharset0',
 'courier',
 'colortbl',
 'red255',
 'green255',
 'blue255',
 'red0',
 'green0',
 'blue0',
 'margl1440',
 'margr1440',
 'vieww10800',
 'viewh8400',
 'viewkind0',
 'deftab720',
 'pard',
 'pardeftab720',
 'sl280',
 'partightenfactor0',
 'f0',
 'fs24',
 'cf2',
 'expnd0',
 'expndtw0',
 'kerning0',
 'outl0',
 'strokewidth0',
 'strokec2',
 'the',
 'project',
 'gutenberg',
 'ebook',
 'of',
 'oliver',
 'twist',
 'by',
 'charles',
 'dickens',
 'this',
 'ebook',
 'is',
 'for',
 'the',
 'use',
 'of',
 'anyone',
 'anywhere',
 'at',
 'no',
 'cost',
 'and',
 'with',
 'almost',
 'no',
 'restrictions',
 'whatsoever',
 'you',
 'may',
 'copy',
 'it',
 'give',
 'it',
 'away',
 'or',
 're',
 'use',
 'it',
 'under',
 'the',
 'terms',
 'of',
 'the',
 'project',
 'gutenberg',
 'license',
 'included',
 'with',
 'this',
 'ebook',
 'or',
 'online',
 'at',
 'www',
 'gutenberg',
 'net',
 'title',
 'o

In [40]:
print(list(trigrams(clean_books[2], pad_right=True, pad_left=True)))



In [41]:
model = defaultdict(lambda: defaultdict(lambda: 0))

# Count frequency of co-occurance  
for sentence in clean_books:
    for w1, w2, w3 in trigrams(sentence, pad_right=True, pad_left=True):
        model[(w1, w2)][w3] += 1
# Let's transform the counts to probabilities
for w1_w2 in model:
    total_count = float(sum(model[w1_w2].values()))
    for w3 in model[w1_w2]:
        model[w1_w2][w3] /= total_count

In [42]:
d = model["this", "is"]
sorted(d.items(), key=lambda x: x[1], reverse=True)

[('the', 0.18490566037735848),
 ('a', 0.1339622641509434),
 ('not', 0.07735849056603773),
 ('what', 0.05471698113207547),
 ('my', 0.03773584905660377),
 ('all', 0.02830188679245283),
 ('no', 0.020754716981132074),
 ('your', 0.018867924528301886),
 ('an', 0.018867924528301886),
 ('only', 0.011320754716981131),
 ('awful', 0.011320754716981131),
 ('one', 0.011320754716981131),
 ('that', 0.011320754716981131),
 ('quite', 0.011320754716981131),
 ('too', 0.011320754716981131),
 ('him', 0.009433962264150943),
 ('mr', 0.009433962264150943),
 ('to', 0.009433962264150943),
 ('very', 0.009433962264150943),
 ('it', 0.007547169811320755),
 ('going', 0.005660377358490566),
 ('how', 0.005660377358490566),
 ('something', 0.005660377358490566),
 ('done', 0.005660377358490566),
 ('as', 0.005660377358490566),
 ('written', 0.005660377358490566),
 ('just', 0.005660377358490566),
 ('indeed', 0.005660377358490566),
 ('by', 0.0037735849056603774),
 ('great', 0.0037735849056603774),
 ('merely', 0.0037735849056

In [43]:
def split(word):
    return [(word[:i], word[i:]) for i in range(len(word) + 1)]

In [44]:
def delete(word):
    return [l + r[1:] for l,r in split(word) if r]

In [45]:
def swap(word):
    return [l + r[1] + r[0] + r[2:] for l, r in split(word) if len(r)>1]

In [46]:
import string
string.ascii_lowercase

'abcdefghijklmnopqrstuvwxyz'

In [47]:
def replace(word):
    letters = string.ascii_lowercase
    return [l + c + r[1:] for l, r in split(word) if r for c in letters]

In [48]:
def insert(word):
    letters = string.ascii_lowercase
    return [l + c + r for l, r in split(word) for c in letters]

In [49]:
def edit1(word):
    return set(delete(word) + swap(word) + replace(word) + insert(word))

In [50]:
def edit2(word):
    return set(e2 for e1 in edit1(word) for e2 in edit1(e1))

In [51]:
def edit3(word):
    return set(e3 for e2 in edit2(word) for e3 in edit2(e2))

In [52]:
def read_corpus(filename):
    with open(filename, "r", encoding='utf-8') as file:
        lines = file.readlines()
        words = []
        for line in lines:
            words += re.findall(r'\w+', line.lower())
    return words

In [53]:
words = []
for book in clean_books:
    for word in book:
      words.append(word)


In [55]:
print(f"There are {len(words)} total words in the corpus")

There are 2852954 total words in the corpus


In [56]:
vocabs = set(words)
print(f"There are {len(vocabs)} unique words in the vocabulary")

There are 40357 unique words in the vocabulary


In [57]:
word_counts = Counter(words)
print(word_counts["love"])

1639


In [58]:
total_word_count = float(sum(word_counts.values()))
word_probas = {word: word_counts[word] / total_word_count for word in word_counts.keys()}

In [59]:
print(word_probas["love"])

0.0005744922631069411


In [60]:
def correct_spelling(word, vocabulary, word_probabilities):
    if word in vocabulary:
        return 

    suggestions = edit1(word) or edit2(word) or edit3(word) or [word]
    best_guesses = [w for w in suggestions if w in vocabulary]
    return [(w, word_probabilities[w]) for w in best_guesses]

In [61]:
word = "wron"
corrections = correct_spelling(word, vocabs, word_probas)

if corrections:
    print(corrections)
    probs = np.array([c[1] for c in corrections])
    best_ix = np.argmax(probs)
    correct = corrections[best_ix][0]
    print(f"{correct} is suggested for {word}")

[('worn', 4.661834715876947e-05), ('iron', 7.886562489265512e-05), ('won', 0.00014476223591407362), ('wren', 3.154624995706205e-06), ('wrong', 0.00014511274980248543)]
wrong is suggested for wron


In [62]:
def checkSpelling(word, vocabulary):
    if word in vocabulary:
        return 

    suggestions = edit1(word) or edit2(word) or edit3(word) or [word]
    best_guesses = [w for w in suggestions if w in vocabulary]
    return [(w) for w in best_guesses]

In [None]:
def corrS(sentence):
    print(sentence)
    sentence = sentence.lower()
    words=[]
    words.append("None")
    words.append("None")
    for i in sentence.split():
        words.append(i)
    words.append("None")
    words.append("None")
    for j,i in enumerate(words): 
        corrections = checkSpelling(str(i), vocabs)
        if corrections and i!="None":
            d = model[words[j-2],words[j-1]]
            guses=sorted(d.items(), key=lambda x: x[1], reverse=True)
            maxV=0
            best_guess=""
            for c in corrections:
                for g in guses:
                    if c==g[0]:
                        if g[1]>maxV:
                            maxV=g[1]
                            best_guess=g[0]
            if best_guess != "":
                words[j]= best_guess
            else :
                pcorrections = correct_spelling(str(i), vocabs, word_probas)
                if pcorrections:
                    probs = np.array([c[1] for c in pcorrections])
                    best_ix = np.argmax(probs)
                    correct = pcorrections[best_ix][0]
                    words[j]= correct
    print('\nthe correct sentence :\n')            
    cs = ""
    for i in words:
        if i != 'None':
            cs += i
            cs += ' '
    print(cs)                

In [68]:
corrS("accodring to a resarch at cmabrdge unversity")

accodring to a resarch at cmabrdge unversity

the correct sentence :

according to a research at cmabrdge university 


In [67]:
corrS("the olny important tihng is taht")

the olny important tihng is taht

the correct sentence :

the only important thing is that 


In [74]:
corrS("it does not mtter in waht oredr the ltters")

it does not mtter in waht oredr the ltters

the correct sentence :

it does not matter in what order the letters 
