Step 1: Import Required Libraries

In [1]:
import re
import pandas as pd
import numpy as np
import textdistance
from collections import Counter


Step 2: Load and Preprocess the Text Data

We use a text file as our corpus. Here, the text is converted to lowercase, and words are extracted using regular expressions.

In [8]:
# Load the text data with UTF-8 encoding
with open('book.txt', 'r', encoding='utf-8') as f:
    text_data = f.read().lower()

# Extract words using regular expressions
words = re.findall(r'\w+', text_data)

# Create a set of unique words (vocabulary)
vocabulary = set(words)
print(f"Total unique words in the corpus: {len(vocabulary)}")


Total unique words in the corpus: 17647


Step 3: Build Word Frequency Distribution

To calculate the likelihood of words, we first need to build a frequency distribution of words in our corpus.

In [9]:
# Calculate word frequency using Counter
word_freq = Counter(words)

# Display the most common words
print("Most common words:", word_freq.most_common(10))


Most common words: [('the', 14703), ('of', 6742), ('and', 6517), ('a', 4799), ('to', 4707), ('in', 4238), ('that', 3081), ('it', 2534), ('his', 2530), ('i', 2120)]


Step 4: Calculate Word Probabilities

Next, we compute the relative probabilities of each word based on its frequency.

In [10]:
# Calculate total number of words
total_words = sum(word_freq.values())

# Calculate probability of each word
word_probs = {word: freq / total_words for word, freq in word_freq.items()}


Step 5: Define the Autocorrect Function

This function takes an input word and attempts to find the closest match from the vocabulary using the Jaccard distance, which measures similarity between two sets of characters.

In [11]:
def autocorrect(input_word):
    """
    Function to autocorrect the input word based on the closest match from the vocabulary.
    
    Parameters:
    input_word (str): The word to be corrected.
    
    Returns:
    pd.DataFrame: A DataFrame containing the most similar words and their probabilities.
    """
    input_word = input_word.lower()
    
    # Check if the word is already correct
    if input_word in vocabulary:
        return f"'{input_word}' is already correct."
    
    # Calculate similarity between input word and words in the vocabulary
    similarities = [1 - textdistance.Jaccard(qval=2).distance(word, input_word) for word in word_freq.keys()]
    
    # Create a DataFrame with words, probabilities, and similarities
    df = pd.DataFrame({'Word': list(word_freq.keys()), 
                       'Probability': list(word_probs.values()), 
                       'Similarity': similarities})
    
    # Sort the DataFrame by similarity and probability
    df = df.sort_values(by=['Similarity', 'Probability'], ascending=False)
    
    # Return the top 5 suggestions
    return df.head(5)


Step 6: Test the Autocorrect Function

Finally, let's test the autocorrect function with some sample input.

In [12]:
# Test the autocorrect function
suggestions = autocorrect('neverteless')
print(suggestions)


               Word  Probability  Similarity
2571   nevertheless     0.000225    0.750000
13657      boneless     0.000013    0.416667
12684      elevates     0.000004    0.416667
1105          never     0.000925    0.400000
7136          level     0.000108    0.400000


In [13]:
import re
from collections import Counter
import textdistance
import pandas as pd

# Function to load text data and create vocabulary
def load_text_data(file_path):
    """
    Load text data from a file, clean it, and create a set of unique words (vocabulary).
    
    Parameters:
    - file_path (str): The path to the text file.
    
    Returns:
    - set: A set containing all unique words in the text.
    - Counter: A Counter object containing word frequencies.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        # Read and clean the text data
        text_data = file.read().lower()
        
    # Extract words using regular expressions
    words = re.findall(r'\w+', text_data)
    
    # Create a set of unique words (vocabulary) and word frequencies
    word_freq = Counter(words)
    vocabulary = set(word_freq.keys())
    
    return vocabulary, word_freq

# Function to calculate word probabilities
def calculate_probabilities(word_freq):
    """
    Calculate the probabilities of words based on their frequencies.
    
    Parameters:
    - word_freq (Counter): A Counter object containing word frequencies.
    
    Returns:
    - dict: A dictionary containing word probabilities.
    """
    total_words = sum(word_freq.values())
    return {word: freq / total_words for word, freq in word_freq.items()}

# Function to generate similar words using edit distance
def generate_candidates(word, max_edits=2):
    """
    Generate candidate words that are within a certain edit distance of the input word.
    
    Parameters:
    - word (str): The input word to generate candidates for.
    - max_edits (int): The maximum number of edits allowed. Default is 2.
    
    Returns:
    - set: A set of candidate words.
    """
    letters = 'abcdefghijklmnopqrstuvwxyz'
    
    # Splits, deletions, transpositions, and replacements
    splits = [(word[:i], word[i:]) for i in range(len(word) + 1)]
    deletes = [L + R[1:] for L, R in splits if R]
    transposes = [L + R[1] + R[0] + R[2:] for L, R in splits if len(R) > 1]
    replaces = [L + c + R[1:] for L, R in splits if R for c in letters]
    inserts = [L + c + R for L, R in splits for c in letters]
    
    return set(deletes + transposes + replaces + inserts)

# Function to get the best correction for a misspelled word
def autocorrect(word, vocabulary, word_freq, word_probs):
    """
    Get the most likely correction for a misspelled word based on edit distance and word probability.
    
    Parameters:
    - word (str): The input word to autocorrect.
    - vocabulary (set): The set of unique words in the corpus.
    - word_freq (Counter): A Counter object containing word frequencies.
    - word_probs (dict): A dictionary containing word probabilities.
    
    Returns:
    - pd.DataFrame: A DataFrame containing the most similar words with their probabilities.
    """
    # If the word is in the vocabulary, it's already correct
    if word in vocabulary:
        return f"'{word}' is spelled correctly."
    
    # Generate candidate words within a certain edit distance
    candidates = (generate_candidates(word) & vocabulary) or [word]
    
    # Calculate similarities and probabilities for candidates
    similarities = [(candidate, 1 - textdistance.Jaccard(qval=2).distance(word, candidate))
                    for candidate in candidates]
    
    # Create a DataFrame with candidates, similarities, and probabilities
    df = pd.DataFrame(similarities, columns=['Word', 'Similarity'])
    df['Probability'] = df['Word'].apply(lambda w: word_probs.get(w, 0))
    
    # Sort by similarity and probability
    df = df.sort_values(by=['Similarity', 'Probability'], ascending=False).head(5)
    
    return df

# Main function to demonstrate the autocorrect functionality
if __name__ == "__main__":
    # Load text data and create vocabulary
    vocabulary, word_freq = load_text_data('book.txt')
    
    # Calculate word probabilities
    word_probs = calculate_probabilities(word_freq)
    
    # Test the autocorrect function
    word_to_correct = 'neverteless'
    corrections = autocorrect(word_to_correct, vocabulary, word_freq, word_probs)
    print(f"Top corrections for '{word_to_correct}':\n{corrections}")



Top corrections for 'neverteless':
           Word  Similarity  Probability
0  nevertheless        0.75     0.000225


In [15]:
import re
from collections import Counter
import textdistance

# Function to load text data and create vocabulary
def load_text_data(file_path):
    """
    Load text data from a file, clean it, and create a set of unique words (vocabulary).
    
    Parameters:
    - file_path (str): The path to the text file.
    
    Returns:
    - set: A set containing all unique words in the text.
    - Counter: A Counter object containing word frequencies.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        # Read and clean the text data
        text_data = file.read().lower()
        
    # Extract words using regular expressions
    words = re.findall(r'\w+', text_data)
    
    # Create a set of unique words (vocabulary) and word frequencies
    word_freq = Counter(words)
    vocabulary = set(word_freq.keys())
    
    # Release memory used by the raw text
    del text_data, words
    
    return vocabulary, word_freq

# Function to calculate word probabilities
def calculate_probabilities(word_freq):
    """
    Calculate the probabilities of words based on their frequencies.
    
    Parameters:
    - word_freq (Counter): A Counter object containing word frequencies.
    
    Returns:
    - dict: A dictionary containing word probabilities.
    """
    total_words = sum(word_freq.values())
    return {word: freq / total_words for word, freq in word_freq.items()}

# Function to generate similar words using edit distance
def generate_candidates(word, max_edits=2):
    """
    Generate candidate words that are within a certain edit distance of the input word.
    
    Parameters:
    - word (str): The input word to generate candidates for.
    - max_edits (int): The maximum number of edits allowed. Default is 2.
    
    Returns:
    - set: A set of candidate words.
    """
    letters = 'abcdefghijklmnopqrstuvwxyz'
    
    # Splits, deletions, transpositions, and replacements
    splits = [(word[:i], word[i:]) for i in range(len(word) + 1)]
    deletes = [L + R[1:] for L, R in splits if R]
    transposes = [L + R[1] + R[0] + R[2:] for L, R in splits if len(R) > 1]
    replaces = [L + c + R[1:] for L, R in splits if R for c in letters]
    inserts = [L + c + R for L, R in splits for c in letters]
    
    return set(deletes + transposes + replaces + inserts)

# Function to get the best correction for a misspelled word
def autocorrect(word, vocabulary, word_freq, word_probs):
    """
    Get the most likely correction for a misspelled word based on edit distance and word probability.
    
    Parameters:
    - word (str): The input word to autocorrect.
    - vocabulary (set): The set of unique words in the corpus.
    - word_freq (Counter): A Counter object containing word frequencies.
    - word_probs (dict): A dictionary containing word probabilities.
    
    Returns:
    - list: A list of tuples containing the most similar words with their probabilities.
    """
    # If the word is in the vocabulary, it's already correct
    if word in vocabulary:
        return [(word, word_probs[word], 1.0)]
    
    # Generate candidate words within a certain edit distance
    candidates = (generate_candidates(word) & vocabulary) or [word]
    
    # Calculate similarities and probabilities for candidates
    results = [
        (candidate, word_probs.get(candidate, 0), 1 - textdistance.Jaccard(qval=2).distance(word, candidate))
        for candidate in candidates
    ]
    
    # Sort by similarity and probability, and return top results
    results.sort(key=lambda x: (-x[2], -x[1]))
    
    return results[:5]

# Main function to demonstrate the autocorrect functionality
if __name__ == "__main__":
    # Load text data and create vocabulary
    vocabulary, word_freq = load_text_data('book.txt')
    
    # Calculate word probabilities
    word_probs = calculate_probabilities(word_freq)
    
    # Free up memory used by word_freq after calculating probabilities
    del word_freq
    
    # Test the autocorrect function
    word_to_correct = 'neverteless'
    corrections = autocorrect(word_to_correct, vocabulary, word_probs)
    print(f"Top corrections for '{word_to_correct}':\n{corrections}")


TypeError: autocorrect() missing 1 required positional argument: 'word_probs'