In [1]:
import numpy as np
import pandas as pd
import textdistance 
from collections import Counter
import re

# File Opening And Cleaning (change formate to utf-8

In [2]:
words = []
with open('autocorrect book.txt','r',encoding='utf-8') as f:
    data = f.read()
    data = data.lower()
    word = re.findall('\w+', data)
    words +=word

  word = re.findall('\w+', data)


In [3]:
print(words[0:10])

['the', 'project', 'gutenberg', 'ebook', 'of', 'moby', 'dick', 'or', 'the', 'whale']


# make vocabulary

In [4]:
len(words)

222670

In [5]:
V = set(words)

# build the frequency of those words

In [6]:
word_freq_dict = Counter(words)

In [7]:
word_freq_dict.most_common(10)

[('the', 14703),
 ('of', 6742),
 ('and', 6517),
 ('a', 4799),
 ('to', 4707),
 ('in', 4240),
 ('that', 3081),
 ('it', 2534),
 ('his', 2530),
 ('i', 2120)]

In [8]:


def most_rare(word_freq_dict, n=10):
 
    df = pd.DataFrame(list(word_freq_dict.items()), columns=['Word', 'Frequency'])

    df_sorted = df.sort_values(by='Frequency')

    return df_sorted.head(n)



print(most_rare(word_freq_dict, 5))


              Word  Frequency
8823   courteously          1
11525        await          1
11527    fatalists          1
11528      congeal          1
11529    eyelashes          1


# Relative Frequency of words
Now we want to get the probability of occurrence of each word, this equals the relative frequencies of the words:

The formula used to calculate the probability of a word in the provided code is:

Probability(word) = Frequency(word) / Total count of all words

In [9]:
Total_words_freq = sum(word_freq_dict.values())


probs = {}
for k in word_freq_dict.keys():
    probs[k] = word_freq_dict[k] / Total_words_freq

In [10]:
probs

{'the': 0.06603044864597836,
 'project': 0.0004086765168186105,
 'gutenberg': 0.00042214936902142185,
 'ebook': 4.490950734270445e-05,
 'of': 0.03027798985045134,
 'moby': 0.00040418556608434004,
 'dick': 0.00040418556608434004,
 'or': 0.0035792877352135446,
 'whale': 0.005523869403152647,
 'by': 0.005487941797278484,
 'herman': 1.796380293708178e-05,
 'melville': 1.796380293708178e-05,
 'this': 0.0064624781066151705,
 'is': 0.00786365473570755,
 'for': 0.007383123007140612,
 'use': 0.0002200565859792518,
 'anyone': 2.694570440562267e-05,
 'anywhere': 7.185521174832712e-05,
 'at': 0.005995419230251044,
 'no': 0.0026676247361566443,
 'cost': 1.796380293708178e-05,
 'and': 0.029267525935240492,
 'with': 0.007944491848924417,
 'almost': 0.0008847172946512777,
 'restrictions': 8.98190146854089e-06,
 'whatsoever': 3.1436655139893116e-05,
 'you': 0.0043023308034310865,
 'may': 0.0011451924372389635,
 'copy': 8.532806395113846e-05,
 'it': 0.011380069160641307,
 'give': 0.00040418556608434004,

# Finding Similar Words¶
Now we will sort similar words according to the Jaccard distance by calculating the 2 grams Q of the words. Next, we will return the 5 most similar words ordered by similarity and probability:

The Jaccard distance measures the dissimilarity between two sets by comparing their intersection and union

In [14]:
def autocorrect(word):
    word = word.lower()
    
    if word in word_freq_dict:
        print('The word is already correct:', word)
    else:
        
        jaccard = textdistance.Jaccard(qval=2)
        
        
        similarities = [1 - jaccard.distance(w, word) for w in word_freq_dict.keys()]
        
        
        df = pd.DataFrame({
            'Word': word_freq_dict.keys(),
            'Prob': [word_freq_dict[w] for w in word_freq_dict.keys()],
            'Similarity': similarities
        })
        
        
        output = df.sort_values(['Similarity', 'Prob'], ascending=False).head(10)
        return output

print(autocorrect('th'))

      Word   Prob  Similarity
0      the  14703    0.500000
4742   thy    112    0.500000
1153   tho      2    0.500000
9734   4th      1    0.500000
9752   5th      1    0.500000
504   that   3081    0.333333
22    with   1769    0.333333
12    this   1439    0.333333
860   they    668    0.333333
260   then    630    0.333333
