# In [None]:
from collections import Counter, defaultdict
from itertools import product

import nltk
import spacy
from nltk.metrics import edit_distance
from nltk.util import bigrams

# Load the spaCy "en_core_web_sm" pipeline once at import time; clean_corpus()
# calls it to tokenize and lemmatize the corpus text.
nlp = spacy.load("en_core_web_sm")

def openfile(path):
    """Read the corpus file and return its whole content as one string.

    Args:
        path: Location of the text file; it is decoded as UTF-8.

    Returns:
        The complete file content.
    """
    with open(path, mode="r", encoding="utf-8") as handle:
        return handle.read()

def clean_corpus(txtfile, outputpath):
    """Tokenize and lemmatize the corpus, then dump the tokens for inspection.

    The text is run through the module-level spaCy pipeline ``nlp``. Tokens
    flagged as punctuation are dropped; every remaining token contributes its
    lower-cased lemma.

    Args:
        txtfile: Raw corpus text.
        outputpath: File that receives the lemmas, one per line (UTF-8).

    Returns:
        The list of lower-cased lemmas in document order (repeats included).
    """
    parsed = nlp(txtfile)

    lemmas = []
    for token in parsed:
        if token.is_punct:
            continue
        lemmas.append(token.lemma_.lower())

    with open(outputpath, "w", encoding="utf-8") as out:
        out.write("\n".join(lemmas))

    print(f"Data is cleaned and save to {outputpath}")
    print(f"Sample of correct words: {lemmas[:10]}")  # Debug print

    return lemmas

def bigrams_model(correct_word):
    """Estimate bigram probabilities from the corpus token sequence.

    Each bigram ``(w1, w2)`` is assigned ``count(w1, w2) / count(w1)``, i.e.
    the relative frequency of ``w2`` following ``w1``.

    Args:
        correct_word: Corpus tokens in document order.

    Returns:
        A ``defaultdict(float)`` mapping each observed bigram tuple to its
        probability.

    Note:
        The token list and the full bigram list are printed to stdout as
        debugging output.
    """
    pairs = list(bigrams(correct_word))  # adjacent token pairs
    pair_freq = Counter(pairs)
    word_freq = Counter(correct_word)

    print("bigram count is is here")
    print(correct_word)
    print("Bigram list as below:")
    print(pairs)
    print("bigram count as below:")

    bigram_probability = defaultdict(float)
    bigram_probability.update(
        (pair, freq / word_freq[pair[0]]) for pair, freq in pair_freq.items()
    )
    return bigram_probability

def probability_statements(statements, bigram_probability):
    """Rank candidate statements from most to least probable.

    A statement's score is the product of the probabilities of its adjacent
    word pairs (words are obtained by whitespace splitting). Pairs missing
    from ``bigram_probability`` contribute a small fallback of ``1e-6``.

    Args:
        statements: Iterable of candidate sentences (strings).
        bigram_probability: Mapping from ``(word, next_word)`` to probability.

    Returns:
        The statements ordered by descending score; equal scores keep their
        input order.
    """
    scored = []
    for statement in statements:
        words = statement.split()
        score = 1.00
        # Walk over every (current word, next word) pair.
        for left, right in zip(words, words[1:]):
            score *= bigram_probability.get((left, right), 1e-6)
        scored.append((statement, score))

    scored.sort(key=lambda pair: pair[1], reverse=True)
    return [statement for statement, _ in scored]

def word_suggestion(wrong_word, correct_word, substitution_cost=2, topcount=5):
    """Suggest vocabulary words that are close to a misspelled word.

    Args:
        wrong_word: The token to find corrections for.
        correct_word: Iterable of known-good words. Repeats are allowed (for
            example the raw corpus token list); each distinct word is
            considered only once.
        substitution_cost: Despite its name, this is the maximum edit distance
            a word may have from ``wrong_word`` to be suggested.
        topcount: Maximum number of suggestions to return.

    Returns:
        Up to ``topcount`` ``(word, distance)`` tuples ordered by ascending
        distance; words at the same distance keep their first-seen order.
    """
    suggest = []
    wrong_len = len(wrong_word)

    # dict.fromkeys de-duplicates while preserving first-seen order, so a
    # frequent corpus word can no longer occupy several suggestion slots and
    # the edit distance is computed once per distinct word.
    for word in dict.fromkeys(correct_word):
        # The edit distance can never be smaller than the length difference,
        # so such words are rejected without running the full computation.
        if abs(len(word) - wrong_len) > substitution_cost:
            continue
        distance = edit_distance(wrong_word, word)
        if distance <= substitution_cost:
            suggest.append((word, distance))

    suggest.sort(key=lambda x: x[1])
    return suggest[:topcount]
        
def statement_suggestion(input_statement, suggestion, count=0, statements=None):
    """Build every candidate sentence by substituting suggested corrections.

    Each token from position ``count`` onward is either kept as is or, when it
    appears in ``suggestion``, replaced in turn by each suggested word. All
    combinations are produced, with the choices for earlier tokens varying
    slowest.

    Args:
        input_statement: Sequence of tokens making up the sentence.
        suggestion: Mapping from a misspelled token to a list of
            ``(word, distance)`` tuples; only the word is used.
        count: Index of the first token to process.
        statements: Words already chosen for the tokens before ``count``;
            ``None`` means no prefix.

    Returns:
        A list of space-joined candidate sentences. A token mapped to an empty
        suggestion list yields no candidates at all. A ``count`` past the end
        of ``input_statement`` yields just the joined prefix.
    """
    # A None default avoids sharing one list object between calls.
    prefix = [] if statements is None else list(statements)

    # One list of alternatives per remaining token.
    options = [
        [candidate[0] for candidate in suggestion[token]]
        if token in suggestion
        else [token]
        for token in input_statement[count:]
    ]

    # product() iterates the last position fastest, which matches the order
    # the recursive expansion used to generate.
    return [" ".join(prefix + list(choice)) for choice in product(*options)]

def main():
    """Run the spell-check demo on a hard-coded sample statement.

    Steps: read the corpus, clean it into lower-cased lemmas, build the bigram
    model, find tokens of the sample statement that are not in the corpus
    vocabulary, collect edit-distance suggestions for them, and print up to
    five candidate corrected statements ranked by bigram probability.

    The corpus and output paths are hard-coded below.
    """
    path = 'D:/MASTER/NLP/Assignment/output-COVID19.txt'
    outputpath = 'D:/MASTER/NLP/Assignment/output-cleaned-no_stop.txt'
    txtfile = openfile(path)

    correct_word = clean_corpus(txtfile, outputpath)
    print(f"corrrect wordssss are {correct_word}")

    bigram_prob = bigrams_model(correct_word)
    print (f"output is {bigram_prob}")

    statement1 = "Finnaly commbination is there "
    #statement1 = input("Enter your statement below to have spelling check: ")

    # The corpus vocabulary is lower-cased, so the input is lower-cased too;
    # otherwise a capital letter counts as an extra edit and as an unknown
    # word in the bigram lookup.
    words = statement1.lower().split()

    # Distinct corpus words in first-seen order (for suggestions) and as a
    # set (for constant-time membership tests).
    vocabulary = list(dict.fromkeys(correct_word))
    known_words = set(vocabulary)

    need_correction = {}
    for token in words:
        if token in known_words:
            continue

        suggest = word_suggestion(token, vocabulary)

        if suggest:
            need_correction[token] = suggest
            print(f"Wrong words: '{token}' ")
            print(f"Suggestion is : {[j[0] for j in suggest]}")
        else:
            print(f"No suggestions found for '{token}'")  # Debug print

    if need_correction:
        new_statements = statement_suggestion(words, need_correction)
        # Keep only the five most probable candidates.
        chk_rank_statement = probability_statements(new_statements, bigram_prob)[:5]

        print("Potential correct statement: ")
        for i, statement2 in enumerate(chk_rank_statement, 1):
            print(f"{i}. {statement2}")

    else:
        print("No errors found.")

#    for word, distance in suggest:
 #       print(suggest,distance)


# Run the demo only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()