In [1]:
# take text as input
# tokenize into sentences
# find errors in each sentence
# show with replaced text
# show info about types of mistakes, vocabulary level

In [1]:
import pandas as pd
import re
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import WordNetLemmatizer
from spellchecker import SpellChecker

In [2]:
import spacy
# Load spaCy's "en_core_web_sm" pipeline once; it is reused later in the
# notebook to tokenize the corrected text and drop stop words.
nlp = spacy.load("en_core_web_sm")

In [3]:
from IPython.display import display, Markdown, Latex, HTML
from tabulate import tabulate

In [4]:
# CEFR vocabulary table: one row per headword with its CEFR level.
cefr_vocab = pd.read_csv('./cefr-vocab-cefrj-octanove.csv')
# headword -> CEFR level lookup (a later duplicate headword overrides an earlier one).
cefr_dict = dict(zip(cefr_vocab['headword'], cefr_vocab['CEFR']))
# Set of all known headwords, used for membership tests.
word_set = set(cefr_vocab['headword'])

In [1]:
pip install -U git+https://github.com/PrithivirajDamodaran/Gramformer.git

Collecting git+https://github.com/PrithivirajDamodaran/Gramformer.git
  Cloning https://github.com/PrithivirajDamodaran/Gramformer.git to c:\users\vaibhav\appdata\local\temp\pip-req-build-_ywjkkt9
  Resolved https://github.com/PrithivirajDamodaran/Gramformer.git to commit 23425cd2e98a919384cab6156af8adf1c9d0639a
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Note: you may need to restart the kernel to use updated packages.


  Running command git clone --filter=blob:none --quiet https://github.com/PrithivirajDamodaran/Gramformer.git 'C:\Users\Vaibhav\AppData\Local\Temp\pip-req-build-_ywjkkt9'


In [5]:
# https://github.com/PrithivirajDamodaran/Gramformer
# pip install -U git+https://github.com/PrithivirajDamodaran/Gramformer.git
# python -m spacy download en_core_web_sm 
from gramformer import Gramformer
import torch

def set_seed(seed):
    """Seed PyTorch's random number generators.

    The CPU generator is always seeded; when CUDA is available, every CUDA
    device is seeded with the same value as well.
    """
    torch.manual_seed(seed)
    if not torch.cuda.is_available():
        return
    torch.cuda.manual_seed_all(seed)


set_seed(1212)

In [6]:
# Shared Gramformer instance running on CPU; its correct() and get_edits()
# methods are used by text_grammar_correction below.
gf = Gramformer(models = 1, use_gpu=False) # 1=corrector, 2=detector

  from .autonotebook import tqdm as notebook_tqdm


[Gramformer] Grammar error correct/highlight model loaded..


In [7]:
# Maps the edit type codes reported by gf.get_edits (ERRANT-style codes) to the
# human-readable category names shown in the mistake statistics table.
grammar_fullforms = {
    'ADV': 'Adverb',
    'PREP': 'Prepositions',
    'PRON': 'Pronoun',
    'WO': 'Wrong Order',
    'VERB': 'Verbs',
    'VERB:SVA': 'Singular-Plural',
    'VERB:TENSE': 'Verb Tenses',
    'VERB:FORM': 'Verb Forms',
    'VERB:INFL': 'Verbs',
    'SPELL': 'Spelling',
    'OTHER': 'Other',
    'NOUN': 'Other',
    'NOUN:NUM': 'Singular-Plural',
    'DET': 'Articles',
    'MORPH': 'Other',
    'ADJ': 'Adjectives',
    'PART': 'Other',
    'ORTH': 'Other',
    # CONJ is the conjunction error type ("and", "but", ...), not verb conjugation.
    'CONJ': 'Conjunctions',
    'PUNCT': 'Punctuation',
    # Remaining ERRANT codes, added so that looking them up cannot raise KeyError.
    'ADJ:FORM': 'Adjectives',
    'CONTR': 'Contractions',
    'NOUN:INFL': 'Other',
    'NOUN:POSS': 'Possessives',
    'UNK': 'Other',
}

In [8]:
def strikethrough(text):
    """Return ``text`` with U+0336 (combining long stroke overlay) after every character.

    The combining mark makes each character render as struck through; an
    empty string is returned unchanged.
    """
    return ''.join(char + '\u0336' for char in text)

In [9]:
# Shared NLP helpers, created once and reused by the functions below:
# a WordNet lemmatizer (used before spell checks and CEFR lookups) ...
lemmatizer = WordNetLemmatizer()
# ... and a spell checker (used to flag and correct unknown words).
spell = SpellChecker()

In [10]:
def misspelled_words(input_text):
    """Return the words of ``input_text`` that the spell checker does not know.

    The text has every run of non-word characters replaced by a space, is
    tokenized, lower-cased and lemmatized; the lemmas are then passed to
    ``spell.unknown`` and its result is returned as-is.
    """
    # Raw string: '\W' in a plain literal is an invalid escape sequence and
    # triggers a warning on current Python versions.
    word_list = [a.lower() for a in word_tokenize(re.sub(r'\W+', ' ', input_text))]
    lemma_words = [lemmatizer.lemmatize(w) for w in word_list]
    misspelled = spell.unknown(lemma_words)
    return misspelled

In [11]:
def sentence_spelling_correction(input_sentence):
    """Return ``input_sentence`` with unknown words replaced by the spell checker's suggestion.

    The sentence is stripped of non-word characters, tokenized, lower-cased
    and lemmatized. Each lemma the spell checker does not know is replaced by
    ``spell.correction``; the words are joined by single spaces and a final
    "." is appended.
    """
    # Use the parameter, not the notebook-global ``input_text`` the original body read.
    # Raw string avoids the invalid '\W' escape sequence warning.
    word_list = [a.lower() for a in word_tokenize(re.sub(r'\W+', ' ', input_sentence))]
    lemma_words = [lemmatizer.lemmatize(w) for w in word_list]
    corrected_words = []
    for word in lemma_words:
        if spell.unknown([word]):
            # NOTE(review): pyspellchecker can return None when it has no candidate;
            # keep the original word in that case so the join below cannot fail.
            corrected_words.append(spell.correction(word) or word)
        else:
            corrected_words.append(word)
    corrected_sentence = " ".join(corrected_words) + "."
    return corrected_sentence

In [12]:
def text_grammar_correction(input_text):
    """Grammar-correct ``input_text`` sentence by sentence and summarise the edits.

    Each sentence from ``sent_tokenize`` is passed to ``gf.correct`` and the
    differences are obtained from ``gf.get_edits``. From every edit this
    function reads: ``edit[0]`` (type code), ``edit[1]`` (original fragment),
    ``edit[2]``/``edit[3]`` (start/end token offsets in the original sentence)
    and ``edit[4]`` (replacement fragment).

    Returns:
        tuple: ``(corrected_text, color_corrected_text, edits, mistake_stats)``
            - corrected_text: corrected sentences joined by single spaces.
            - color_corrected_text: the same text as HTML, with removed
              fragments struck through in red and inserted fragments in green.
            - edits: list of the edit type codes found, in order.
            - mistake_stats: pandas Series counting edits per readable
              category (codes missing from ``grammar_fullforms`` count as
              'Other').
    """
    edits = []
    corrected_parts = []
    highlighted_parts = []
    for sentence in sent_tokenize(input_text):
        # If the model yields no candidate, fall back to the sentence itself so
        # it is passed through unchanged instead of vanishing from the output.
        candidates = gf.correct(sentence, max_candidates=1) or [sentence]
        for corrected_sentence in candidates:
            all_edits = gf.get_edits(sentence, corrected_sentence)
            if not all_edits:
                corrected_parts.append(sentence)
                highlighted_parts.append(sentence)
                continue

            edits.extend(edit[0] for edit in all_edits)
            # NOTE(review): edit offsets are presumed to index whitespace-delimited
            # tokens; str.split() (unlike splitting on a single ' ') yields no empty
            # tokens for repeated spaces, tabs or newlines, so offsets stay aligned.
            orig = sentence.split()
            amend_plus = []
            start = 0
            for edit in all_edits:
                # Copy the untouched tokens between the previous edit and this one.
                amend_plus.extend(orig[start:edit[2]])
                if edit[1]:
                    amend_plus.append('<span style="background-color:#ffffff;color:#ff3f33">' + strikethrough(edit[1]) + '</span>')
                if edit[4]:
                    amend_plus.append('<span style="color:#07b81a">' + edit[4] + '</span>')
                start = edit[3]
            # Tokens after the last edit.
            amend_plus.extend(orig[start:])

            corrected_parts.append(corrected_sentence)
            highlighted_parts.append(' '.join(amend_plus))

    # .get(): an edit type that is not in the mapping must not abort the whole run.
    # dtype is given explicitly so an edit-free text yields a well-defined empty Series.
    mistake_stats = pd.Series(
        [grammar_fullforms.get(a, 'Other') for a in edits], dtype='object'
    ).value_counts()
    return ' '.join(corrected_parts), ' '.join(highlighted_parts), edits, mistake_stats

In [13]:
def cefr_ratings(input_text):
    """Map each lemmatized word of ``input_text`` to its CEFR level.

    The text is lower-cased, stripped of punctuation and digits, tokenized and
    lemmatized. A lemma found in ``word_set`` is looked up in ``cefr_dict``
    directly. Otherwise the lemmatizer is retried with the part-of-speech tags
    'v', 'a', 'n', 'r', 's' in that order, stopping at the first tag that
    changes the word, and the resulting form is looked up instead.

    Returns:
        dict: word form -> CEFR level, or 'uncategorized' when the form is
        not in the vocabulary. Keys appear in order of first occurrence.
    """
    cleaned = re.sub(r'[0-9]', '', re.sub(r'[^\w\s]', '', input_text.lower()))

    cefr_mapping = {}
    for token in word_tokenize(cleaned):
        lemma = lemmatizer.lemmatize(token.lower())
        if lemma not in word_set:
            # Retry with explicit POS tags; keep the first form that differs
            # (or the last attempt when none differs).
            for pos_tag in ('v', 'a', 'n', 'r', 's'):
                candidate = lemmatizer.lemmatize(lemma, pos = pos_tag)
                if candidate != lemma:
                    break
            lemma = candidate
        cefr_mapping[lemma] = cefr_dict[lemma] if lemma in word_set else 'uncategorized'
    return cefr_mapping

In [14]:
# Read the text to analyse interactively from the user (no prompt is shown).
input_text = input()

In [15]:
# Run grammar correction on the whole input: plain corrected text, HTML-highlighted
# text, the list of edit type codes, and the per-category mistake counts.
corrected_text, color_corrected_text, edits, mistake_stats = text_grammar_correction(input_text)

In [16]:
# Render the highlighted text; the embedded <span> tags are interpreted by the Markdown display.
display(Markdown(color_corrected_text))

 Climate change is <span style="background-color:#ffffff;color:#ff3f33">r̶e̶a̶l̶y̶</span> <span style="color:#07b81a">really</span> a big problem that we all need to pay attention to. The earth is getting warmer and warmer each year. This is <span style="background-color:#ffffff;color:#ff3f33">b̶e̶c̶a̶u̶s̶</span> <span style="color:#07b81a">because</span> of <span style="background-color:#ffffff;color:#ff3f33">a̶l̶o̶t̶</span> <span style="color:#07b81a">a lot</span> of reasons but the biggest reason is human <span style="background-color:#ffffff;color:#ff3f33">a̶c̶t̶i̶v̶i̶t̶e̶s̶.̶</span> <span style="color:#07b81a">activities.</span> We burn to much fossil fuels, like coal and oil, which releases carbon dioxide into the atmosphere. This gas traps heat from the sun and makes the planet hotter. This <span style="background-color:#ffffff;color:#ff3f33">p̶h̶e̶n̶o̶m̶i̶n̶o̶n̶</span> <span style="color:#07b81a">phenomenon</span> is called the greenhouse effect. Another cause of climate change is <span style="background-color:#ffffff;color:#ff3f33">d̶e̶f̶o̶r̶e̶s̶t̶i̶o̶n̶.̶</span> <span style="color:#07b81a">deforestation.</span> Trees absorb carbon dioxide, so when we cut them down, there are fewer trees to soak up this harmful gas. This leads to more carbon dioxide in the atmosphere and more warming. Also, many forests are being destroyed to make broom for agriculture. This is not only bad for the climate but also for the animals that live in these forests. Climate change has many bad <span style="background-color:#ffffff;color:#ff3f33">e̶f̶e̶c̶t̶s̶</span> <span style="color:#07b81a">effects</span> on our planet. For example, it is causing ice caps to melt. This results in rising sea levels, which can lead to flooding in coastal areas. Moreover, the weather is becoming more unpredictable. We are seeing more frequent and severe storms, droughts, and heatwaves. 
These extreme weather events can cause <span style="background-color:#ffffff;color:#ff3f33">a̶l̶o̶t̶</span> <span style="color:#07b81a">a lot</span> of damage to homes and communities. To fight climate change, we need to make some changes in our lives. We should use less energy and try to use more renewable sources of energy like solar and wind power. We should also try to reduce waste and recycle more. Planting trees can also help a great deal. Everyone can play a part in helping to protect our planet for future generations. It's a big challenge, but it's one we can overcome if we work together.

In [17]:
# Show how many corrections fall into each mistake category as a text table.
print(tabulate(pd.DataFrame(mistake_stats), headers = 'keys', tablefmt = 'fancy_grid'))

╒══════════╤═════════╕
│          │   count │
╞══════════╪═════════╡
│ Spelling │       4 │
├──────────┼─────────┤
│ Other    │       4 │
╘══════════╧═════════╛


In [19]:
# Run the corrected text through the spaCy pipeline
doc = nlp(corrected_text)

# Keep only the tokens spaCy does not flag as stop words
filtered_words = [token.text for token in doc if not token.is_stop]

# Re-join the remaining tokens into one space-separated string
clean_text = ' '.join(filtered_words)

# Look up a CEFR level for each remaining word, then count how many
# distinct words fall into each level and print the counts as a table.
cefr_mapping = cefr_ratings(clean_text)
cefr_df = pd.DataFrame(pd.Series(cefr_mapping.values()).value_counts())
print(tabulate(cefr_df, headers = 'keys', tablefmt = 'fancy_grid'))

╒═══════════════╤═════════╕
│               │   count │
╞═══════════════╪═════════╡
│ B1            │      34 │
├───────────────┼─────────┤
│ A1            │      33 │
├───────────────┼─────────┤
│ A2            │      20 │
├───────────────┼─────────┤
│ B2            │      10 │
├───────────────┼─────────┤
│ uncategorized │       2 │
├───────────────┼─────────┤
│ C1            │       1 │
╘═══════════════╧═════════╛


In [20]:
# Words for which no CEFR level was found in the vocabulary.
[word for word, level in cefr_mapping.items() if level == 'uncategorized']

['heatwaves', 'renewable']

In [3]:
import pandas as pd
import re
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import WordNetLemmatizer
from spellchecker import SpellChecker
import spacy
from IPython.display import display, Markdown, Latex, HTML
from tabulate import tabulate
from gramformer import Gramformer
import torch

# Load spaCy's "en_core_web_sm" pipeline once; process_text uses it to
# tokenize the corrected text and drop stop words.
nlp = spacy.load("en_core_web_sm")

# Load the CEFR vocabulary table (one row per headword with its CEFR level).
cefr_vocab = pd.read_csv('./cefr-vocab-cefrj-octanove.csv')
# headword -> CEFR level lookup (a later duplicate headword overrides an earlier one).
cefr_dict = dict(zip(cefr_vocab['headword'], cefr_vocab['CEFR']))
# Set of all known headwords, used for membership tests.
word_set = set(cefr_vocab['headword'])

# Maps the edit type codes reported by gf.get_edits (ERRANT-style codes) to the
# human-readable category names shown in the mistake statistics table.
grammar_fullforms = {
    'ADV': 'Adverb',
    'PREP': 'Prepositions',
    'PRON': 'Pronoun',
    'WO': 'Wrong Order',
    'VERB': 'Verbs',
    'VERB:SVA': 'Singular-Plural',
    'VERB:TENSE': 'Verb Tenses',
    'VERB:FORM': 'Verb Forms',
    'VERB:INFL': 'Verbs',
    'SPELL': 'Spelling',
    'OTHER': 'Other',
    'NOUN': 'Other',
    'NOUN:NUM': 'Singular-Plural',
    'DET': 'Articles',
    'MORPH': 'Other',
    'ADJ': 'Adjectives',
    'PART': 'Other',
    'ORTH': 'Other',
    # CONJ is the conjunction error type ("and", "but", ...), not verb conjugation.
    'CONJ': 'Conjunctions',
    'PUNCT': 'Punctuation',
    # Remaining ERRANT codes, added so that looking them up cannot raise KeyError.
    'ADJ:FORM': 'Adjectives',
    'CONTR': 'Contractions',
    'NOUN:INFL': 'Other',
    'NOUN:POSS': 'Possessives',
    'UNK': 'Other',
}

# Shared NLP helpers, created once and reused by the functions below:
# a WordNet lemmatizer (used before spell checks and CEFR lookups) ...
lemmatizer = WordNetLemmatizer()
# ... and a spell checker (used to flag and correct unknown words).
spell = SpellChecker()

# Initialize Gramformer
def set_seed(seed):
    """Seed PyTorch's CPU generator and, if CUDA is available, all CUDA devices."""
    torch.manual_seed(seed)
    cuda_ready = torch.cuda.is_available()
    if cuda_ready:
        torch.cuda.manual_seed_all(seed)

# Fix the PyTorch seed before the model is created.
set_seed(1212)
# Shared Gramformer instance running on CPU; text_grammar_correction calls
# its correct() and get_edits() methods.
gf = Gramformer(models=1, use_gpu=False)  # 1=corrector, 2=detector

def strikethrough(text):
    """Return ``text`` with U+0336 (combining long stroke overlay) appended to each character."""
    pieces = []
    for char in text:
        pieces.append(char)
        pieces.append('\u0336')
    return ''.join(pieces)

def misspelled_words(input_text):
    """Return the words of ``input_text`` that the spell checker does not know.

    Runs of non-word characters are replaced by a space, the text is
    tokenized, lower-cased and lemmatized, and the lemmas are passed to
    ``spell.unknown``, whose result is returned as-is.
    """
    # Raw string: '\W' in a plain literal is an invalid escape sequence and
    # triggers a warning on current Python versions.
    word_list = [a.lower() for a in word_tokenize(re.sub(r'\W+', ' ', input_text))]
    lemma_words = [lemmatizer.lemmatize(w) for w in word_list]
    return spell.unknown(lemma_words)

def sentence_spelling_correction(input_sentence):
    """Return ``input_sentence`` with unknown words replaced by the spell checker's suggestion.

    The sentence is stripped of non-word characters, tokenized, lower-cased
    and lemmatized. Each lemma the spell checker does not know is replaced by
    ``spell.correction``; the words are joined by single spaces and a final
    "." is appended.
    """
    # Raw string avoids the invalid '\W' escape sequence warning.
    word_list = [a.lower() for a in word_tokenize(re.sub(r'\W+', ' ', input_sentence))]
    lemma_words = [lemmatizer.lemmatize(w) for w in word_list]
    # NOTE(review): pyspellchecker can return None when it has no candidate;
    # `or word` keeps the original word so the join below cannot fail.
    corrected_words = [
        (spell.correction(word) or word) if spell.unknown([word]) else word
        for word in lemma_words
    ]
    return " ".join(corrected_words) + "."

def text_grammar_correction(input_text):
    """Grammar-correct ``input_text`` sentence by sentence and summarise the edits.

    Each sentence from ``sent_tokenize`` is passed to ``gf.correct`` and the
    differences are obtained from ``gf.get_edits``. From every edit this
    function reads: ``edit[0]`` (type code), ``edit[1]`` (original fragment),
    ``edit[2]``/``edit[3]`` (start/end token offsets in the original sentence)
    and ``edit[4]`` (replacement fragment).

    Returns:
        tuple: ``(corrected_text, color_corrected_text, edits, mistake_stats)``
            - corrected_text: corrected sentences joined by single spaces,
              with surrounding whitespace stripped.
            - color_corrected_text: the same text as HTML, with removed
              fragments struck through in red and inserted fragments in green.
            - edits: list of the edit type codes found, in order.
            - mistake_stats: pandas Series counting edits per readable
              category (codes missing from ``grammar_fullforms`` count as
              'Other').
    """
    edits = []
    corrected_parts = []
    highlighted_parts = []
    for sentence in sent_tokenize(input_text):
        # If the model yields no candidate, fall back to the sentence itself so
        # it is passed through unchanged instead of vanishing from the output.
        candidates = gf.correct(sentence, max_candidates=1) or [sentence]
        for corrected_sentence in candidates:
            all_edits = gf.get_edits(sentence, corrected_sentence)
            if not all_edits:
                corrected_parts.append(sentence)
                highlighted_parts.append(sentence)
                continue

            edits.extend(edit[0] for edit in all_edits)
            # NOTE(review): edit offsets are presumed to index whitespace-delimited
            # tokens; str.split() (unlike splitting on a single ' ') yields no empty
            # tokens for repeated spaces, tabs or newlines, so offsets stay aligned.
            orig = sentence.split()
            amend_plus = []
            start = 0
            for edit in all_edits:
                # Copy the untouched tokens between the previous edit and this one.
                amend_plus.extend(orig[start:edit[2]])
                if edit[1]:
                    amend_plus.append('<span style="background-color:#ffffff;color:#ff3f33">' + strikethrough(edit[1]) + '</span>')
                if edit[4]:
                    amend_plus.append('<span style="color:#07b81a">' + edit[4] + '</span>')
                start = edit[3]
            # Tokens after the last edit.
            amend_plus.extend(orig[start:])

            corrected_parts.append(corrected_sentence)
            highlighted_parts.append(' '.join(amend_plus))

    # .get(): an edit type that is not in the mapping must not abort the whole run.
    # dtype is given explicitly so an edit-free text yields a well-defined empty Series.
    mistake_stats = pd.Series(
        [grammar_fullforms.get(a, 'Other') for a in edits], dtype='object'
    ).value_counts()
    return ' '.join(corrected_parts).strip(), ' '.join(highlighted_parts).strip(), edits, mistake_stats

def cefr_ratings(input_text):
    """Map each lemmatized word of ``input_text`` to its CEFR level.

    The text is lower-cased, stripped of punctuation and digits, tokenized and
    lemmatized. Lemmas present in ``word_set`` are looked up in ``cefr_dict``
    directly; other lemmas are re-lemmatized with the part-of-speech tags
    'v', 'a', 'n', 'r', 's' (in that order, stopping at the first tag that
    changes the word) and the resulting form is looked up instead.

    Returns:
        dict: word form -> CEFR level, or 'uncategorized' when the form is
        not in the vocabulary. Keys appear in order of first occurrence.
    """
    def _vocab_form(lemma):
        # Known lemmas are used unchanged; unknown ones get one POS-specific retry.
        if lemma in word_set:
            return lemma
        alternative = lemma
        for pos_tag in ['v', 'a', 'n', 'r', 's']:
            alternative = lemmatizer.lemmatize(lemma, pos=pos_tag)
            if alternative != lemma:
                break
        return alternative

    stripped = re.sub(r'[^\w\s]', '', input_text.lower())
    stripped = re.sub(r'[0-9]', '', stripped)

    cefr_mapping = {}
    for raw_word in word_tokenize(stripped):
        key = _vocab_form(lemmatizer.lemmatize(raw_word.lower()))
        if key in word_set:
            cefr_mapping[key] = cefr_dict[key]
        else:
            cefr_mapping[key] = 'uncategorized'
    return cefr_mapping

def process_text(input_text):
    """Run the full pipeline on ``input_text`` and display the results.

    Displays the grammar-highlighted text, prints a table of mistake counts
    per category, then prints a table of how many distinct non-stop-words of
    the corrected text fall into each CEFR level.

    Returns:
        list: the word forms for which no CEFR level was found.
    """
    corrected, highlighted, _edit_codes, mistake_counts = text_grammar_correction(input_text)
    display(Markdown(highlighted))
    print(tabulate(pd.DataFrame(mistake_counts), headers='keys', tablefmt='fancy_grid'))

    # Drop stop words with spaCy before rating the vocabulary.
    content_tokens = []
    for token in nlp(corrected):
        if not token.is_stop:
            content_tokens.append(token.text)
    levels_by_word = cefr_ratings(' '.join(content_tokens))

    level_counts = pd.Series(levels_by_word.values()).value_counts()
    print(tabulate(pd.DataFrame(level_counts), headers='keys', tablefmt='fancy_grid'))
    return [word for word, level in levels_by_word.items() if level == 'uncategorized']

# Entry point: prompt for a text, run the full pipeline on it, and list the
# words that could not be assigned a CEFR level.
if __name__ == "__main__":
    input_text = input("Enter the text to be processed: ")
    uncategorized_words = process_text(input_text)
    print("Uncategorized words:", uncategorized_words)

[Gramformer] Grammar error correct/highlight model loaded..
Enter the text to be processed: Climate change is realy a big problem that we all need to pay attention to. The earth is getting warmer and warmer each year. This is becaus of alot of reasons but the biggest reason is human activites. We burn to much fossil fuels, like coal and oil, which releases carbon dioxide into the atmosphere. This gas traps heat from the sun and makes the planet hotter. This phenominon is called the greenhouse effect.  Another cause of climate change is deforestion. Trees absorb carbon dioxide, so when we cut them down, there are fewer trees to soak up this harmful gas. This leads to more carbon dioxide in the atmosphere and more warming. Also, many forests are being destroyed to make room for agriculture. This is not only bad for the climate but also for the animals that live in these forests.  Climate change has many bad efects on our planet. For example, it is causing ice caps to melt. This results i

Climate change is <span style="background-color:#ffffff;color:#ff3f33">r̶e̶a̶l̶y̶</span> <span style="color:#07b81a">really</span> a big problem that we all need to pay attention to. The earth is getting warmer and warmer each year. This is <span style="background-color:#ffffff;color:#ff3f33">b̶e̶c̶a̶u̶s̶</span> <span style="color:#07b81a">because</span> of <span style="background-color:#ffffff;color:#ff3f33">a̶l̶o̶t̶</span> <span style="color:#07b81a">a lot</span> of reasons but the biggest reason is human <span style="background-color:#ffffff;color:#ff3f33">a̶c̶t̶i̶v̶i̶t̶e̶s̶.̶</span> <span style="color:#07b81a">activities.</span> We burn to much fossil fuels, like coal and oil, which releases carbon dioxide into the atmosphere. This gas traps heat from the sun and makes the planet hotter. This <span style="background-color:#ffffff;color:#ff3f33">p̶h̶e̶n̶o̶m̶i̶n̶o̶n̶</span> <span style="color:#07b81a">phenomenon</span> is called the greenhouse effect. Another cause of climate change is <span style="background-color:#ffffff;color:#ff3f33">d̶e̶f̶o̶r̶e̶s̶t̶i̶o̶n̶.̶</span> <span style="color:#07b81a">deforestation.</span> Trees absorb carbon dioxide, so when we cut them down, there are fewer trees to soak up this harmful gas. This leads to more carbon dioxide in the atmosphere and more warming. Also, many forests are being destroyed to make room for agriculture. This is not only bad for the climate but also for the animals that live in these forests. Climate change has many bad <span style="background-color:#ffffff;color:#ff3f33">e̶f̶e̶c̶t̶s̶</span> <span style="color:#07b81a">effects</span> on our planet. For example, it <span style="background-color:#ffffff;color:#ff3f33">i̶s̶ ̶c̶a̶u̶s̶i̶n̶g̶</span> <span style="color:#07b81a">causes</span> ice caps to melt. This results in rising sea levels, which can lead to flooding in coastal areas. Moreover, the weather is becoming more unpredictable. We are seeing more frequent and severe storms, droughts, and heatwaves. 
These extreme weather events can cause <span style="background-color:#ffffff;color:#ff3f33">a̶l̶o̶t̶</span> <span style="color:#07b81a">a lot</span> of damage to homes and communities. To fight climate change, we need to make some changes in our lives. We should use less energy and try to use more renewable sources of energy like solar and wind power. We should also try to reduce waste and recycle more. Planting trees can also help a great deal. Everyone can play a part in helping to protect our planet for future generations. It's a big challenge, but it's one we can overcome if we work together.

╒═════════════╤═════════╕
│             │   count │
╞═════════════╪═════════╡
│ Spelling    │       4 │
├─────────────┼─────────┤
│ Other       │       4 │
├─────────────┼─────────┤
│ Verb Tenses │       1 │
╘═════════════╧═════════╛
╒═══════════════╤═════════╕
│               │   count │
╞═══════════════╪═════════╡
│ B1            │      34 │
├───────────────┼─────────┤
│ A1            │      33 │
├───────────────┼─────────┤
│ A2            │      20 │
├───────────────┼─────────┤
│ B2            │      10 │
├───────────────┼─────────┤
│ uncategorized │       2 │
├───────────────┼─────────┤
│ C1            │       1 │
╘═══════════════╧═════════╛
Uncategorized words: ['heatwaves', 'renewable']
