# Notebook that does grammar (spelling) correction and lemmatization of Google reviews.

In [1]:
### Imports
import imports as i
import importlib
importlib.reload(i)

<module 'imports' from '/Users/konst/Documents/GitHub/Master_DS/data-wild-west/code/imports.py'>

### Grammar Correction

In [8]:
# Load the processed Google reviews and pull the review texts into a plain list.
df = i.pd.read_csv('../data/processed_data/google_reviews.csv')
text_rev = df['text'].to_list()

# Holds the corrected reviews once the grammar-correction cell has run.
corrected_text = []

In [9]:
### function that does grammar correction

# Build the SymSpell index from the English frequency dictionary shipped with symspellpy.
sym_spell = i.SymSpell(max_dictionary_edit_distance=2, prefix_length=7)
dictionary_path = i.pkg_resources.resource_filename("symspellpy", "frequency_dictionary_en_82_765.txt")
# load_dictionary() signals a missing file by returning False instead of raising.
# With an empty dictionary every lookup would come back empty, so fail loudly here.
if not sym_spell.load_dictionary(dictionary_path, term_index=0, count_index=1):
    raise FileNotFoundError(f"SymSpell dictionary could not be loaded: {dictionary_path}")

def _correct_word(word):
    """Return the spell-corrected form of one whitespace-delimited token."""
    # Tokens containing a digit (prices, dates, "5.") are kept exactly as written.
    if any(char.isdigit() for char in word):
        return word

    # Set trailing sentence punctuation aside so it is neither part of the
    # lookup term nor duplicated when it is re-attached afterwards.
    core = word.rstrip('!?.')
    if not core:
        return word  # token was punctuation only, e.g. "..."
    trailing = word[len(core):]

    suggestions = sym_spell.lookup(core.lower(), i.Verbosity.CLOSEST, max_edit_distance=2)
    if not suggestions:
        # No candidate within the edit distance: keep the original token
        # instead of leaking an empty suggestion list into the text.
        return word
    return suggestions[0].term + trailing


def grammar_corrector(text: "str | list[str]") -> "list[str]":
    """
    Spell-correct text word by word using the module-level ``sym_spell``.

    Each line is split on whitespace. Tokens containing a digit are kept
    verbatim. Every other token has its trailing ``!``/``?``/``.`` characters
    set aside, is lower-cased and looked up with ``Verbosity.CLOSEST`` (edit
    distance up to 2); the first suggestion replaces it and the punctuation is
    re-attached. Tokens without any suggestion are kept unchanged. The tokens
    of a line are re-joined with single spaces.

    Args:
        text (str or list): The input text. It can be a single string or a list of strings.

    Returns:
        list: One corrected string per input line. A single input string
        yields a one-element list; this shape is kept because the notebook's
        lemmatizer cell iterates over each stored value.
    """
    lines = [text] if isinstance(text, str) else text

    cleaned_text = []
    for line in lines:
        corrected_words = [_correct_word(word) for word in line.split()]
        cleaned_text.append(' '.join(corrected_words))

    return cleaned_text
    
# Rebuild the list from scratch on every run: appending to a list created in
# another cell would duplicate entries when this cell is re-executed and make
# the column assignment fail on a length mismatch. A named loop variable also
# avoids overwriting IPython's `_` (last output).
corrected_text = [grammar_corrector(review) for review in text_rev]
df['corrected_review'] = corrected_text
#df

### Lemmatizer

In [11]:
### function to help the lemmatizer

import nltk
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

# Fetch the WordNet corpus and the POS-tagger model used by the lemmatizer below.
for resource in ("wordnet", "averaged_perceptron_tagger"):
    nltk.download(resource)

wnl = WordNetLemmatizer()

def get_wordnet_pos(postag):
    """
    Translate a POS tag into the WordNet POS constant used for lemmatization.

    Args:
        postag (str): POS tag of a token (in this notebook, the tag part of a
            ``nltk.pos_tag`` result).

    Returns:
        The ``wordnet`` constant selected by the tag's first letter:
        J -> ADJ, V -> VERB, N -> NOUN, R -> ADV, anything else -> NOUN.
    """
    prefix_to_attr = (('J', 'ADJ'), ('V', 'VERB'), ('N', 'NOUN'), ('R', 'ADV'))
    for prefix, attr in prefix_to_attr:
        if postag.startswith(prefix):
            return getattr(wordnet, attr)
    # As default pos in lemmatization is Noun
    return wordnet.NOUN
    
### lemmatizer   
def lemmatize_sentencelist(sentencelist):
    """
    Lower-case and lemmatize each sentence, returning one string per sentence.

    Every sentence is split on whitespace, POS-tagged with ``nltk.pos_tag`` and
    each token is lemmatized by ``wnl`` using the WordNet POS obtained from
    ``get_wordnet_pos``. Tokens are not separated from attached punctuation
    before lemmatizing.

    Args:
        sentencelist (list or str): Sentences to lemmatize. A bare string is
            treated as one sentence instead of being iterated per character.

    Returns:
        list: Lemmatized sentences in input order, tokens joined by single spaces.
    """
    # Iterating a str yields characters, which would silently produce a list of
    # single letters; wrap it so the function always works on whole sentences.
    if isinstance(sentencelist, str):
        sentencelist = [sentencelist]

    lemmatized_sentences = []
    for s in sentencelist:
        tagged_tokens = nltk.pos_tag(s.lower().split())
        lemmas = [wnl.lemmatize(token, get_wordnet_pos(tag)) for token, tag in tagged_tokens]
        lemmatized_sentences.append(" ".join(lemmas))
    return lemmatized_sentences

# Lemmatize every corrected review; the function is handed to apply() directly.
df['lemmatized_review'] = df['corrected_review'].apply(lemmatize_sentencelist)


[nltk_data] Downloading package wordnet to /Users/konst/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/konst/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [19]:
### Testing 

sentences = [
    "The quick brown foxes are jumping over the lazy dogs or dog.",
    "She sells seashells by the seashore.",
]

lemmatized_sentences = lemmatize_sentencelist(sentences)

# Show each input next to its lemmatized form, followed by a separator line.
for original, lemmatized in zip(sentences, lemmatized_sentences):
    print(f"Original: {original}", f"Lemmatized: {lemmatized}", "---", sep="\n")

Original: The quick brown foxes are jumping over the lazy dogs or dog.
Lemmatized: the quick brown fox be jump over the lazy dog or dog.
---
Original: She sells seashells by the seashore.
Lemmatized: she sell seashell by the seashore.
---
