Clean and tokenize the corpus (using spaCy), then save a pretrained model
so that the main function does not have to re-clean the data and rebuild the model every time it runs.

In [2]:
import re
import pickle
from nltk import edit_distance, bigrams
from nltk.probability import FreqDist, ConditionalFreqDist
from spacy.tokenizer import Tokenizer
from spacy.lang.en import English
nlp = English()

# Open and clean corpus
def train_save_model(corpus_file, model_file):
    """Build unigram and bigram frequency models from a corpus and pickle them.

    The corpus is read as UTF-8 and lower-cased. The punctuation marks
    ``. , ! ? '`` are split off into tokens of their own, every other
    non-letter character is replaced by a space, and whitespace runs are
    collapsed. The cleaned text is tokenized with a blank spaCy ``Tokenizer``
    (vocab only, no prefix/suffix/infix rules) and counted into an NLTK
    ``FreqDist`` (unigrams) and ``ConditionalFreqDist`` (counts of the
    current token conditioned on the previous one).

    Args:
        corpus_file: Path of the plain-text corpus to read.
        model_file: Path the pickled ``(unigram_fd, bigram_cfd)`` tuple is
            written to. The file is opened in ``'wb'`` mode, so existing
            content is overwritten.

    Returns:
        None. The model is written to ``model_file`` and its path is printed.
    """
    with open(corpus_file, 'r', encoding="utf-8") as f:
        text = f.read().lower()

    # Separate punctuation, numbers and spaces/tabs from words and clean it
    text = re.sub(r"([.,!?'])", r" \1 ", text)  # Add spaces around punctuation
    text = re.sub(r"[^a-zA-Z'.,!? ]", " ", text)  # Remove numbers/symbols/newlines
    # Collapse whitespace runs, then trim the ends. Without the strip, a
    # corpus that begins with a BOM, digit, symbol or punctuation mark starts
    # with a space, which the tokenizer would emit as a spurious " " token
    # that ends up in both frequency tables.
    text = re.sub(r"\s+", " ", text).strip()

    tokenizer = Tokenizer(nlp.vocab)
    token_text = [token.text for token in tokenizer(text)]

    unigram_fd = FreqDist(token_text)
    # ConditionalFreqDist counts (condition, sample) pairs directly, which is
    # exactly what bigrams() yields: (previous token, current token).
    bigram_cfd = ConditionalFreqDist(bigrams(token_text))

    with open(model_file, 'wb') as f:
        pickle.dump((unigram_fd, bigram_cfd), f)

    print("model file is saved at ")
    print(model_file)


# Input corpus and output model locations -- edit both paths to match your setup.
corpus_file, model_file = 'output-corpus.txt', 'pretrained_model.pkl'
train_save_model(corpus_file=corpus_file, model_file=model_file)

model file is saved at 
pretrained_model.pkl
