# Notebook that does grammar correction.

In [1]:
### Imports
import libraries.utils as u
from libraries.utils import lemmatize_with_postag
import importlib
importlib.reload(u)

<module 'libraries.utils' from '/Users/konst/Documents/GitHub/Master_DS/data-wild-west/code/libraries/utils.py'>

### Grammar Correction

In [2]:
### test data

test = u.pd.read_csv('../test.csv')
test_raw = u.pd.read_csv('../test_raw.csv')
test

Unnamed: 0,ID,text,Not Determined,Staff,Equipment,Hygiene,Location
0,4,have train for many year here and [ ] have jus...,Positive,Positive,Positive,,
1,8,i train 4-6 time a week for several year a the...,Positive,Positive,Positive,,Positive
2,12,we train in purely at [ ' ] it be a nice centr...,Positive,Positive,,Positive,Positive
3,13,new dumbbell be nice the center always have a ...,Neutral,Positive,Positive,Negative,
4,15,great to get go sometimes it can be difficult ...,Positive,Positive,,,
...,...,...,...,...,...,...,...
603,542,very expensive place consider that the sauna a...,Negative,,,Negative,
604,546,i really enjoy fitness world on lens [ ] very ...,Neutral,Positive,,Negative,Positive
605,569,a good part of the fitness world chain of gym ...,Positive,Positive,Positive,,
606,571,the staff be super nice that be why i give thi...,Neutral,Positive,,,


In [3]:
df = u.pd.read_csv('../data/processed_data/google_reviews.csv')
text_rev = df['text'].tolist()

corrected_text = []

In [4]:
### function that does grammar correction

sym_spell = u.SymSpell(max_dictionary_edit_distance=2, prefix_length=7)
dictionary_path = u.pkg_resources.resource_filename("symspellpy", "frequency_dictionary_en_82_765.txt")
sym_spell.load_dictionary(dictionary_path, term_index=0, count_index=1)

def grammar_corrector(text:str) -> str:
    """
    Corrects spelling and grammar in the given text.

    Args:
        text (str or list): The input text to be corrected. It can be a single string or a list of strings.

    Returns:
        str or list: The corrected text, with spelling and grammar issues fixed.
    """
    cleaned_text = []

    if isinstance(text, str):
        text = [text]  # Convert a single string to a list of strings for consistency.

    for line in text:
        temp_line = []
        words = line.split()
        for _, word in enumerate(words):
            # Check if the word contains a numeric character
            has_numeric = any(char.isdigit() for char in word)
            
            if has_numeric:
                # If the word contains a numeric character, keep the original word
                corrected_word = word
            else:
                # If the word does not contain a numeric character, perform correction
                corrected_word = sym_spell.lookup(word.lower(), u.Verbosity.CLOSEST, max_edit_distance=2)
                corrected_word = corrected_word[0].term if corrected_word else corrected_word

            # Append the punctuation back to the corrected word if the original word had it
            if word[-1] in ['!', '?', '.']:
                corrected_word += word[-1]

            temp_line.append(corrected_word)

            # Add space between words, except for the last word
            if _ < len(words) - 1:
                temp_line.append(' ')

        cleaned_text.append(''.join(map(str, temp_line)))
       

    if isinstance(text, str):
        return cleaned_text[0]  # Return the corrected string.
    else:
        return cleaned_text
    



In [5]:
sym_spell = u.SymSpell(max_dictionary_edit_distance=2, prefix_length=7)
dictionary_path = u.pkg_resources.resource_filename("symspellpy", "frequency_dictionary_en_82_765.txt")
sym_spell.load_dictionary(dictionary_path, term_index=0, count_index=1)

def grammar_corrector_v1(text:str) -> str:
    """
    Corrects spelling and grammar in the given text.

    Args:
        text (str or list): The input text to be corrected. It can be a single string or a list of strings.

    Returns:
        str or list: The corrected text, with spelling and grammar issues fixed.
    """
    cleaned_text = []

    if isinstance(text, str):
        text = [text]  # Convert a single string to a list of strings for consistency.

    for line in text:
        temp_line = []
        words = line.split()
        for _, word in enumerate(words):
            # Check if the word contains a numeric character
            has_numeric = any(char.isdigit() for char in word)

            if has_numeric:
                # If the word contains a numeric character, keep the original word
                corrected_word = word
            else:
                # If the word does not contain a numeric character, perform correction
                corrected_word = sym_spell.lookup(word.lower(), u.Verbosity.CLOSEST, max_edit_distance=2)
                corrected_word = corrected_word[0].term if corrected_word and corrected_word[0] else word

            # Append the punctuation back to the corrected word if the original word had it
            if word[-1] in ['!', '?', '.']:
                corrected_word += word[-1]

            temp_line.append(corrected_word)

            # Add space between words, except for the last word
            if _ < len(words) - 1:
                temp_line.append(' ')

        cleaned_text.append(''.join(map(str, temp_line)))

    if isinstance(text, str):
        return cleaned_text[0]  # Return the corrected string.
    else:
        return cleaned_text


In [6]:
corrected = []
for _ in test_raw['text']:
    correct = grammar_corrector_v1(_)
    corrected.append(correct)
test_raw['corrected_text'] = corrected
test_raw

Unnamed: 0,ID,text,Not Determined,Staff,Equipment,Hygiene,Location,corrected_text
0,4,Has trained for many years here and Bornholmsv...,1.0,1.0,1.0,,,[has trained for many years here and Bornholms...
1,8,"I trained 4-6 times a week for several years, ...",1.0,1.0,1.0,,1.0,[i trained 4-6 times a week for several years ...
2,12,We train in PureGym at Christianshavn. It is a...,1.0,1.0,,1.0,1.0,[we train in purely at Christianshavn.. it is ...
3,13,New dumbbells are nice. The center always has ...,0.0,1.0,1.0,-1.0,,[new dumbbells are nice. the center always has...
4,15,Great to get going! Sometimes it can be diffic...,1.0,1.0,,,,[great to get going! sometimes it can be diffi...
...,...,...,...,...,...,...,...,...
603,542,Very expensive place considering that the saun...,-1.0,,,-1.0,,[very expensive place considering that the sau...
604,546,I really enjoy Fitness World on Jens Baggesens...,0.0,1.0,,-1.0,1.0,[i really enjoy fitness world on lens Baggesen...
605,569,A good part of the Fitness World chain of gyms...,1.0,1.0,1.0,,,[a good part of the fitness world chain of gym...
606,571,The staffs are super nice that is why I give t...,0.0,1.0,,,,[the staffs are super nice that is why i give ...


In [7]:
test_raw['correct_review'] = test_raw['text'].apply(lambda x: grammar_corrector_v1(x))
df['corrected_review'] = df['text'].apply(lambda x: grammar_corrector_v1(x))
test_raw

Unnamed: 0,ID,text,Not Determined,Staff,Equipment,Hygiene,Location,corrected_text,correct_review
0,4,Has trained for many years here and Bornholmsv...,1.0,1.0,1.0,,,[has trained for many years here and Bornholms...,[has trained for many years here and Bornholms...
1,8,"I trained 4-6 times a week for several years, ...",1.0,1.0,1.0,,1.0,[i trained 4-6 times a week for several years ...,[i trained 4-6 times a week for several years ...
2,12,We train in PureGym at Christianshavn. It is a...,1.0,1.0,,1.0,1.0,[we train in purely at Christianshavn.. it is ...,[we train in purely at Christianshavn.. it is ...
3,13,New dumbbells are nice. The center always has ...,0.0,1.0,1.0,-1.0,,[new dumbbells are nice. the center always has...,[new dumbbells are nice. the center always has...
4,15,Great to get going! Sometimes it can be diffic...,1.0,1.0,,,,[great to get going! sometimes it can be diffi...,[great to get going! sometimes it can be diffi...
...,...,...,...,...,...,...,...,...,...
603,542,Very expensive place considering that the saun...,-1.0,,,-1.0,,[very expensive place considering that the sau...,[very expensive place considering that the sau...
604,546,I really enjoy Fitness World on Jens Baggesens...,0.0,1.0,,-1.0,1.0,[i really enjoy fitness world on lens Baggesen...,[i really enjoy fitness world on lens Baggesen...
605,569,A good part of the Fitness World chain of gyms...,1.0,1.0,1.0,,,[a good part of the fitness world chain of gym...,[a good part of the fitness world chain of gym...
606,571,The staffs are super nice that is why I give t...,0.0,1.0,,,,[the staffs are super nice that is why i give ...,[the staffs are super nice that is why i give ...


In [8]:
sentences = ["The quicky brown foxes いち  are jumping over mdadasasds the lazy dogs or dog.",
             "Sheee sells seashells by the seashore."]

cor = grammar_corrector(sentences)
print(sentences)
print("----")
print(cor)

['The quicky brown foxes いち  are jumping over mdadasasds the lazy dogs or dog.', 'Sheee sells seashells by the seashore.']
----
['the quick brown foxes of are jumping over [] the lazy dogs or dog.', 'sheet sells seashells by the seashore.']


### Lemmatizer

In [9]:
sentences = ["The quicky brown foxes a いち are jumping over mddasasd4tgrefwas the lazy dogs or dog.",
             "Sheee sells seashells by the seashore."]

cor = grammar_corrector_v1(sentences)
print(sentences)
print("----")
print(cor)

['The quicky brown foxes a いち are jumping over mddasasd4tgrefwas the lazy dogs or dog.', 'Sheee sells seashells by the seashore.']
----
['the quick brown foxes a of are jumping over mddasasd4tgrefwas the lazy dogs or dog.', 'sheet sells seashells by the seashore.']


In [10]:
### function to help the lemmatizer

import nltk
nltk.download("wordnet") 
nltk.download("averaged_perceptron_tagger")
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

wnl = WordNetLemmatizer()

def get_wordnet_pos(postag):
    if postag.startswith('J'):
        return wordnet.ADJ
    elif postag.startswith('V'):
        return wordnet.VERB
    elif postag.startswith('N'):
        return wordnet.NOUN
    elif postag.startswith('R'):
        return wordnet.ADV
    else:
        # As default pos in lemmatization is Noun
        return wordnet.NOUN
    
### lemmatizer   
def lemmatize_sentencelist(sentencelist):
    lemmatized_sentences = []
    for s in sentencelist: 
        s = s.lower()
        pos_s = nltk.pos_tag(s.split())
        lemmatized_sentences.append(" ".join([wnl.lemmatize(w[0], get_wordnet_pos(w[1])) for w in pos_s]))
    return lemmatized_sentences

df['lemmatized_review'] = df['corrected_review'].apply(lambda x: lemmatize_sentencelist(x))


[nltk_data] Downloading package wordnet to /Users/konst/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/konst/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [11]:
### Testing 

sentences = ["The quick brown foxes are jumping over the lazy dogs or dog.",
             "She sells seashells  いち  by the seashore."]

lemmatized_sentences = lemmatize_sentencelist(sentences)

# Print the original and lemmatized sentences
for original, lemmatized in zip(sentences, lemmatized_sentences):
    print(f"Original: {original}")
    print(f"Lemmatized: {lemmatized}")
    print("---")

Original: The quick brown foxes are jumping over the lazy dogs or dog.
Lemmatized: the quick brown fox be jump over the lazy dog or dog.
---
Original: She sells seashells  いち  by the seashore.
Lemmatized: she sell seashell いち by the seashore.
---


In [12]:
lema1 = lemmatize_with_postag(sentences[0])

for original, lemmatized in zip(sentences, lema1):
    print(f"Original: {original}")
    print(f"Lemmatized: {lemmatized}")
    print("---")


Original: The quick brown foxes are jumping over the lazy dogs or dog.
Lemmatized: T
---
Original: She sells seashells  いち  by the seashore.
Lemmatized: h
---


In [13]:
lema1

'The quick brown fox be jump over the lazy dog or dog'