In [1]:
import re
import string
from time import time 

import torch
import torch.nn as nn
from torchsummary import summary

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer


In [2]:
nltk.download('stopwords')
punct = string.punctuation
stopwords_english = stopwords.words('english')
stemmer = PorterStemmer()
path = "C:\\Users\\Ayush\\Desktop\\Let_us_start_once_again\\NLP\\2_WordEmbeddings\\shakespeare.txt"

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Ayush\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
def preprocessing(string, stopwords, stemmer):
    '''We can do all the preprocessing in just one step by creating a pipeline
    First, we have to make all the words in lowercase,
    then we have to tokenize the string,
    then we have to remove stopwords and 
    finally we have to stem all the words.
    This is how it will be ready to be analyzed further'''
    string = re.sub(r'\n', ' ', string)
    string  = string.lower()
    tokens = re.split('\s|(?<!\d)[,.](?!\d)', string)
    '''
    clean_tokens = []
    for word in tokens:
        if word not in stopwords:
            clean_tokens.append(word)
    
    stemmed_words = []
    for word in tokens:
        stemmed_words.append(stemmer.stem(word))
    '''
    preprocessed_array = []
    for word in tokens:
        if word!='':
            preprocessed_array.append(word)
            
    return preprocessed_array
#########################################
#dataset['preprocessed'] = dataset['text'].apply(lambda x:preprocessing(x, stopwords_english, stemmer))    
#dataset.head(10)
with open(path) as f:
    data = f.read()
data = preprocessing(data, stopwords_english, stemmer)
print(len(data))

52202


In [4]:
def build_frequency(data):
    vocab = {}
    for word in data:
        if word in vocab.keys():
            vocab[word]+=1
        else:
            vocab[word] = 1
    return vocab
#####################################
vocab = build_frequency(data)
print(len(vocab))

7751


In [5]:
def build_prob(vocab):
    total = 0
    for word in vocab.keys():
        total+= vocab[word]
    probs = {}
    for word in vocab.keys():
        probs[word] = vocab[word]/total
    return probs

probs = build_prob(vocab)
print(len(probs))

7751


### ***Now we have to find the edit words at one distance and two distance away***

In [6]:
def delete(word):
    possible_words = []
    for i in range(len(word)):
        possible_words.append(word[:i] + word[i+1:])
    return possible_words

In [7]:
deletes = delete('at')
print(len(deletes))
print(deletes)

2
['t', 'a']


In [8]:
def insert(word):
    letters = 'abcdefghijklmnopqrstuvwxyz'
    possible_words = []
    for i in range(1+len(word)):
        for l in letters:
            possible_words.append(word[:i] + l + word[i:])
    return possible_words

In [9]:
inserts = insert('at')
print(len(inserts))
print(inserts)

78
['aat', 'bat', 'cat', 'dat', 'eat', 'fat', 'gat', 'hat', 'iat', 'jat', 'kat', 'lat', 'mat', 'nat', 'oat', 'pat', 'qat', 'rat', 'sat', 'tat', 'uat', 'vat', 'wat', 'xat', 'yat', 'zat', 'aat', 'abt', 'act', 'adt', 'aet', 'aft', 'agt', 'aht', 'ait', 'ajt', 'akt', 'alt', 'amt', 'ant', 'aot', 'apt', 'aqt', 'art', 'ast', 'att', 'aut', 'avt', 'awt', 'axt', 'ayt', 'azt', 'ata', 'atb', 'atc', 'atd', 'ate', 'atf', 'atg', 'ath', 'ati', 'atj', 'atk', 'atl', 'atm', 'atn', 'ato', 'atp', 'atq', 'atr', 'ats', 'att', 'atu', 'atv', 'atw', 'atx', 'aty', 'atz']


In [10]:
def replace(word):
    letters = 'abcdefghijklmnopqrstuvwxyz'
    possible_words = []
    for i in range(len(word)):
        for l in letters:
            possible_words.append(word[:i] + l + word[i+1:])
    return possible_words

In [11]:
replaces = replace('at')
print(len(replaces))
print(replaces)

52
['at', 'bt', 'ct', 'dt', 'et', 'ft', 'gt', 'ht', 'it', 'jt', 'kt', 'lt', 'mt', 'nt', 'ot', 'pt', 'qt', 'rt', 'st', 'tt', 'ut', 'vt', 'wt', 'xt', 'yt', 'zt', 'aa', 'ab', 'ac', 'ad', 'ae', 'af', 'ag', 'ah', 'ai', 'aj', 'ak', 'al', 'am', 'an', 'ao', 'ap', 'aq', 'ar', 'as', 'at', 'au', 'av', 'aw', 'ax', 'ay', 'az']


In [12]:
def switch(word):
    possible_words = []
    for i in range(len(word)-1):
        possible_words.append(word[:i] + word[i+1] + word[i] + word[i+2:])
    return possible_words

In [13]:
switches = switch('at')
print(len(switches))
print(switches)

1
['ta']


In [14]:
def one_edit_dist(word):
    # we are only looking for delete, insert, switch (adjacent letters) and replace
    return list(set(delete(word) + insert(word) + replace(word) + switch(word)))

In [15]:
one_edits = one_edit_dist('at')
print(len(one_edits))
print(one_edits)

130
['adt', 'ot', 'ax', 'ait', 'bat', 'abt', 'aut', 'dt', 'tat', 'az', 'aqt', 'ah', 'lt', 'ak', 'ayt', 'cat', 'ab', 'fat', 'av', 'aj', 'atj', 'aft', 'yat', 'atq', 'yt', 'ath', 'atv', 'ats', 'aw', 'xt', 'aat', 'qat', 'ft', 'iat', 'azt', 'vt', 'ajt', 'nat', 'atn', 'nt', 'atg', 'ag', 'kat', 'dat', 'atw', 'am', 'atu', 't', 'aot', 'jt', 'an', 'jat', 'atl', 'ar', 'aq', 'mt', 'ac', 'awt', 'rat', 'ata', 'wat', 'at', 'al', 'uat', 'wt', 'ht', 'bt', 'tt', 'a', 'ad', 'zt', 'sat', 'pat', 'amt', 'ct', 'aa', 'apt', 'st', 'ast', 'att', 'ati', 'atx', 'et', 'aet', 'atz', 'hat', 'ut', 'atb', 'alt', 'as', 'pt', 'ta', 'atd', 'aht', 'atp', 'ant', 'ap', 'axt', 'qt', 'ato', 'eat', 'ao', 'mat', 'atc', 'atm', 'oat', 'zat', 'au', 'agt', 'ate', 'lat', 'it', 'art', 'kt', 'rt', 'ae', 'vat', 'atf', 'aty', 'xat', 'akt', 'atk', 'ay', 'af', 'gat', 'ai', 'gt', 'atr', 'act', 'avt']


In [16]:
def two_edit_dist(word):
    possible_word = []
    for w in one_edit_dist(word):
        possible_word+= one_edit_dist(w)
    return list(set(possible_word))

In [17]:
two_edits = two_edit_dist('at')
print(len(two_edits))
print(two_edits)

7154
['atfs', '', 'uo', 'vay', 'sbt', 'cact', 'ttv', 'atan', 'aqs', 'utx', 'aiht', 'ajf', 'atme', 'lan', 'ajwt', 'vaqt', 'ynt', 'htr', 'xeat', 'vwat', 'ycat', 'zq', 'qtz', 'ajty', 'pq', 'atqj', 'otn', 'aelt', 'ppt', 'atdq', 'daot', 'oaht', 'xaot', 'pjat', 'awit', 'xty', 'ibt', 'fp', 'aua', 'iut', 'ltl', 'nag', 'kmt', 'zaz', 'agit', 'hz', 'ltf', 'iatf', 'atje', 'tap', 'oant', 'katr', 'arit', 'pag', 'azft', 'awtp', 'iab', 'atsg', 'dap', 'hagt', 'athz', 'zs', 'abct', 'ftv', 'wtu', 'wst', 'zatv', 'asy', 'ret', 'ztd', 'atkp', 'amy', 'atwi', 'fap', 'ftf', 'kant', 'aftf', 'cta', 'acct', 'cah', 'uatz', 'oatg', 'atmm', 'bkat', 'ratk', 'dtz', 'fatp', 'brt', 'avtr', 'im', 'mf', 'fai', 'tuat', 'yaq', 'afmt', 'xa', 'ali', 'azi', 'fas', 'aatm', 'aqet', 'aur', 'atwn', 'satc', 'ctr', 'ja', 'atfn', 'dht', 'ha', 'aimt', 'so', 'ayvt', 'atrv', 'mjt', 'catb', 'fft', 'wr', 'kaf', 'azte', 'vata', 'savt', 'tyt', 'htq', 'eata', 'autp', 'smat', 'cqt', 'eatw', 'kaut', 'pak', 'btk', 'axtv', 'dgt', 'aqto', 'li', '

### ***Now we can simply predict the most probable correct word***
- Strategy is that the word should be one edit distance away, if there is no such word then, we will try for two edit distance

In [18]:
def most_probable_words(word, vocab=vocab, probs=probs, n=5):
    # n means no. of possible words we have to predict 
    possible_words = []
    if word in vocab.keys():
        return word
    else:
        # first trying for one_edit_distance
        for w in one_edit_dist(word):
            if w in vocab.keys():
                possible_words.append(w)
        # then trying out for two_edit_distance
        if len(possible_words)==0:
            for w in two_edit_dist(word):
                if w in vocab.keys():
                    possible_words.append(w)
    
    
    if len(possible_words)==0:
        return word # this is the most probable word
    else:
        prob = [probs[w] for w in possible_words]
        sort = np.argsort(prob)
        lst = []
        sort = sort[::-1]
        for i in range(min(n, len(sort))):
            lst.append(possible_words[sort[i]])
        return lst

In [19]:
word = 'dys'
corrected_words = most_probable_words(word)
for w in corrected_words:
    print(f"Predicted word is {w} with probability {probs[w]:.6f}")

Predicted word is days with probability 0.000383
Predicted word is dye with probability 0.000019


In [20]:
word = 'kig'
corrected_words = most_probable_words(word)
for w in corrected_words:
    print(f"Predicted word is {w} with probability {probs[w]:.6f}")

Predicted word is king with probability 0.002893
Predicted word is big with probability 0.000038
Predicted word is kin with probability 0.000019


In [21]:
word = 'een'
corrected_words = most_probable_words(word)
for w in corrected_words:
    print(f"Predicted word is {w} with probability {probs[w]:.6f}")

Predicted word is men with probability 0.000670
Predicted word is even with probability 0.000613
Predicted word is been with probability 0.000594
Predicted word is seen with probability 0.000345
Predicted word is ten with probability 0.000230
