Performing cleaning corpus using Spacy,
create a pretrained model to avoid main function to recreate/re-clean the data everytime running

In [7]:
import re
import pickle
from nltk import edit_distance, bigrams
from nltk.probability import FreqDist, ConditionalFreqDist
from spacy.tokenizer import Tokenizer
from spacy.lang.en import English
nlp = English()

# Open and clean corpus
def train_save_model(file_path, model_file):
    with open(file_path, 'r', encoding="utf-8") as f:
        text = f.read().lower()
    
    # Separate punctuation, numbers and spaces/tabs from words and clean it
    text = re.sub(r"([.,!?'])", r" \1 ", text)  # Add spaces around punctuation
    text = re.sub(r"[^a-zA-Z'.,!? ]", " ", text)  # Remove numbers/symbols
    text = re.sub(r"\s+", " ", text)  # Remove multiple spaces / tabs into single space

    tokenizer = Tokenizer(nlp.vocab)
    tokens = tokenizer(text)
    token_text = [token.text for token in tokens]

    unigram_fd = FreqDist(token_text)
    bigram_cfd = ConditionalFreqDist()
    for prev, curr in bigrams(token_text):
        bigram_cfd[prev][curr] += 1

    with open(model_file, 'wb') as f:
         pickle.dump((unigram_fd, bigram_cfd), f)
    
    print("model file is saved at ")
    print(model_file)


corpus_file = 'output-COVID19.txt'  # Update with the corpus path
model_file = 'pretrained_model.pkl' # Update with the model path
train_save_model(corpus_file, model_file)

model file is saved at 
pretrained_model.pkl


Main function

In [9]:
import re
import pickle
from nltk import edit_distance
from spacy.lang.en import English
nlp = English()

# Generate suggestions with prioritization
def load_pretrained_model(model_file):
    with open(model_file, 'rb') as f:
         unigram_fd, bigram_cfd = pickle.load(f)

    return unigram_fd, bigram_cfd

def generate_suggestions(word, correct_word, max_distance=2):
    suggestions = []
    for suggestion in correct_word:
        distance = edit_distance(word, suggestion, transpositions=True)
        if distance <= max_distance:
            suggestions.append((suggestion, distance))
    # Sort by edit distance first
    return [c[0] for c in sorted(suggestions, key=lambda x: x[1])][:100]

# Calculate the probability
def calc_prob(word, prev_word, unigram_fd, bigram_cfd, vocab_size):
    # Check if word exists in corpus file
    if word not in unigram_fd:
        return 0.0  # treat out-of-vocabulary(OOV) words as invalid words
    
    # Unigram probability
    unigram_prob = unigram_fd[word] / unigram_fd.N()
    
    # Bigram probability if context exists
    if prev_word and prev_word in bigram_cfd:
        bigram_prob = (bigram_cfd[prev_word][word] + 1) / (unigram_fd[prev_word] + vocab_size)
    else:
        bigram_prob = unigram_prob
    
        #choosing  0.1 for unigram prob and 0.9 bigram prob as bigram can provide more information to predict next word.
    return 0.1 * unigram_prob + 0.9 * bigram_prob  #The final probability

# Improve correction logic
def correct_spell(statement, unigram_fd, bigram_cfd, vocab_size, correct_word):
    words = statement.lower().split()
    results = []
    
    # Use top 20% most frequent words as threshold
    common_words = {word for word, _ in unigram_fd.most_common(int(len(unigram_fd)*0.2))} # 20% top freq word

    for i, word in enumerate(words):
        # Skip punctuation-only tokens
        if re.fullmatch(r"[.,!?']+", word):
            continue
            
        # Get previous word context
        prev_word = words[i-1] if i > 0 and words[i-1] in unigram_fd else None
        
        # Generate suggestions if:
        # 1. Word is OOV or
        # 2. Word is not in top 20% frequent words
        if word not in unigram_fd or word not in common_words:
            suggestions = generate_suggestions(word, correct_word)
            if suggestions:
                scored = []
                for suggestion in suggestions:
                    # Skip suggestion if same as original
                    if suggestion == word:
                        continue
                        
                    edit_dist = edit_distance(word, suggestion, transpositions=True)
                    suggestion_prob = calc_prob(suggestion, prev_word, unigram_fd, bigram_cfd, vocab_size)
                    error_prob = 0.7 ** edit_dist  # Less aggressive error model
                    score = suggestion_prob * error_prob
                    scored.append((suggestion, suggestion_prob, edit_dist, score))
                
                if scored:
                    # Sorting by using score and distance
                    top_suggestions = sorted(scored, key=lambda x: (-x[3], x[2]))[:5]
                    results.append((word, top_suggestions))
    
    return results

# Format and display results
def display_results(statement, correction_results):
    words = statement.split()
    output = []
    corrected_statements = []
    
    # Create mapping of incorrect words to their corrections
    corrections_map = {res[0]: res[1][0][0] for res in correction_results}

    for word in words:
        if word.lower() in corrections_map:
            output.append(f"*{word}*")  # Highlight incorrect word to easy view
            corrected_statements.append(corrections_map[word.lower()]) # To add corrected word
        else:
            output.append(word)
            corrected_statements.append(word)
    
    print("\nOriginal statement with Errors Highlighted:")
    print(" ".join(output))
    
    print("\nCorrection Suggestions:")
    for incorrect_word, suggestions in correction_results:
        print(f"\nIncorrect word: {incorrect_word}")
        print("Top 5 suggestions:")
        for i, (word, prob, distance, score) in enumerate(suggestions, 1):
            print(f"{i}. {word} (Prob: {prob:.6f}, Distance: {distance}, Score: {score:.6f})")

    print("\nCorrected statements: ")
    print(" ".join(corrected_statements))
    print("\n")

# Main function
def main():
    # Load corpus and build models
    model_file = 'pretrained_model.pkl' # Update with the model path
    unigram_fd, bigram_cfd = load_pretrained_model(model_file)
    vocab_size = len(unigram_fd)
    correct_word = list(unigram_fd.keys())
    
    # Example statement
    #statement = "In oder to demonstrate that Mat0-RNA3 can be used as an effective tol in recombination studies, we apply i to examine the recombination activity of too specific sequences derived frm the HCV genome."
    statement = input("Enter your statement below to have spelling check: \n")

    # Get the correction results
    results = correct_spell(statement, unigram_fd, bigram_cfd, vocab_size, correct_word)
    
    # Display the results
    display_results(statement, results)

if __name__ == "__main__":
    main()


Original statement with Errors Highlighted:
In *oder* to demonstrate that Mat0-RNA3 can be used as an effective *tol* in recombination *studies,* we *apply* i to examine the recombination activity of too specific sequences derived *frm* the HCV *genome.*

Correction Suggestions:

Incorrect word: oder
Top 5 suggestions:
1. order (Prob: 0.001597, Distance: 1, Score: 0.001118)
2. our (Prob: 0.001653, Distance: 2, Score: 0.000810)
3. one (Prob: 0.001597, Distance: 2, Score: 0.000783)
4. other (Prob: 0.001322, Distance: 2, Score: 0.000648)
5. or (Prob: 0.000567, Distance: 2, Score: 0.000278)

Incorrect word: tol
Top 5 suggestions:
1. the (Prob: 0.005731, Distance: 2, Score: 0.002808)
2. of (Prob: 0.003705, Distance: 2, Score: 0.001815)
3. to (Prob: 0.002087, Distance: 1, Score: 0.001461)
4. for (Prob: 0.001230, Distance: 2, Score: 0.000603)
5. or (Prob: 0.000590, Distance: 2, Score: 0.000289)

Incorrect word: studies,
Top 5 suggestions:
1. studies (Prob: 0.000323, Distance: 1, Score: 0.000