In [None]:
import json
import nltk
import re
import spacy
import string

from nltk.corpus import stopwords

# Fetch the NLTK 'stopwords' corpus; TextProcessor.__init__ reads its English list.
nltk.download('stopwords')
# Ask spaCy to prefer a GPU; this runs before the pipeline below is loaded.
spacy.prefer_gpu()
# Module-level spaCy pipeline, used by TextProcessor.preprocess_text.
nlp = spacy.load('en_core_web_sm')


class DataLoader:
    """Read a JSON document from a fixed path and write JSON documents out."""

    def __init__(self, path):
        """Remember the location of the JSON file to read.

        Args:
            path: Path of the JSON file that ``load_data`` opens.
        """
        self.path = path

    def load_data(self):
        """Parse the JSON file at ``self.path`` and return its content.

        Returns:
            Whatever ``json.load`` produces for the file.

        Raises:
            OSError: If the file cannot be opened.
            json.JSONDecodeError: If the file is not valid JSON.
        """
        # Decode explicitly as UTF-8 so the result does not depend on the
        # platform's locale default.
        with open(self.path, 'r', encoding='utf-8') as file:
            return json.load(file)

    def save_data(self, data, save_path):
        """Serialize ``data`` as JSON into ``save_path``.

        Args:
            data: JSON-serializable object to write.
            save_path: Destination file; it is created or overwritten.

        Returns:
            None.
        """
        with open(save_path, 'w', encoding='utf-8') as file:
            json.dump(data, file)



class TextProcessor:
    """Token-level and string-level cleanup steps for English text.

    ``clean_text``, ``remove_punct`` and ``remove_stop`` take and return
    lists of token strings. ``special_text`` and ``remove_digits_special``
    stringify their argument with ``str()`` and return a single string.
    """

    # Two literal backslashes followed by "displaystyle".
    _DISPLAYSTYLE_RE = re.compile(r"\\\\displaystyle")
    # "r_{" plus any word characters that directly follow it.
    _SUBSCRIPT_RE = re.compile(r"(r_{(?:\w+)?)")
    # A digit run, an optional "<any char><digits>" group, an optional "+",
    # and then one more character of any kind.
    # NOTE(review): both "." are unescaped wildcards, so the character after
    # a number is always consumed too - confirm that this is intended.
    _NUMBER_RE = re.compile(r"(\d+(?:.\d+)?\+?).")
    _PUNCT_CHARS = frozenset(string.punctuation)
    # Deletion table covering each special character individually.
    _SPECIAL_TABLE = str.maketrans("", "", r"!@#$%^&*()[]{};:,./<>?\|`~-=_+")

    def __init__(self):
        """Build the stop-word set from NLTK's English stop-word list."""
        self.stop = set(stopwords.words('english'))

    def clean_text(self, val):
        """Return the tokens stripped, lower-cased and with spaces removed.

        Args:
            val: Iterable of token strings.

        Returns:
            list: One normalised string per input token.
        """
        return [t.strip().lower().replace(" ", "") for t in val]

    def remove_punct(self, val):
        """Drop tokens that consist solely of ASCII punctuation.

        Membership is tested per character, so multi-character punctuation
        tokens such as ``"..."`` or ``"--"`` are removed as well. Empty
        tokens are dropped too.

        Args:
            val: Iterable of token strings.

        Returns:
            list: The tokens that contain at least one non-punctuation
            character, in their original order.
        """
        punct = self._PUNCT_CHARS
        return [t for t in val if not all(ch in punct for ch in t)]

    def remove_stop(self, val):
        """Drop stop words and tokens made up only of digits.

        The stop-word comparison is an exact, case-sensitive match
        against ``self.stop``.

        Args:
            val: Iterable of token strings.

        Returns:
            list: The remaining tokens in their original order.
        """
        return [t for t in val if t not in self.stop and not t.isdigit()]

    def special_text(self, val):
        """Strip ``\\\\displaystyle`` markers and ``r_{...`` prefixes.

        Args:
            val: Any object; it is converted with ``str()`` first.

        Returns:
            str: The stringified input with both patterns removed.
        """
        text = self._DISPLAYSTYLE_RE.sub('', str(val))
        return self._SUBSCRIPT_RE.sub("", text)

    def remove_digits_special(self, val):
        """Remove number-like spans and special characters.

        Args:
            val: Any object; it is converted with ``str()`` first.

        Returns:
            str: The stringified input without the spans matched by the
            number pattern and without any of the characters
            ``!@#$%^&*()[]{};:,./<>?\\|`~-=_+``.
        """
        text = self._NUMBER_RE.sub("", str(val))
        # Delete every listed character on its own; a plain str.replace with
        # the whole run would only match that exact sequence.
        return text.translate(self._SPECIAL_TABLE)

    def tokenizer(self, val):
        """Return the ``text`` attribute of every item in ``val`` as a list."""
        return [t.text for t in val]

    def lemmatizer(self, val):
        """Return the ``lemma_`` attribute of every item in ``val`` as a list."""
        return [t.lemma_ for t in val]

    def preprocess_text(self, text):
        """Run the whole cleanup pipeline on ``text``.

        Steps, in order: parse with the module-level ``nlp`` pipeline, take
        lemmas, drop stop words and digit tokens, drop punctuation tokens,
        normalise case and whitespace, then apply ``special_text`` and
        ``remove_digits_special``.

        NOTE(review): ``special_text`` receives the token list and converts
        it with ``str()``, so the two regex steps work on the repr of the
        list and the return value is a single ``str``, not a list of
        tokens - confirm that callers expect this.

        Args:
            text: Raw text handed to ``nlp``.

        Returns:
            str: The cleaned text.
        """
        doc = nlp(text)
        lemma = self.lemmatizer(doc)
        remove_sw = self.remove_stop(lemma)
        remove_punct = self.remove_punct(remove_sw)
        clean_text = self.clean_text(remove_punct)
        remove_special = self.special_text(clean_text)
        return self.remove_digits_special(remove_special)


if __name__ == '__main__':
    # Input corpus and the destination of its processed copy.
    source_path = 'data_hub/data.json'
    output_path = 'data_hub/processed_data.json'

    loader = DataLoader(source_path)
    tp = TextProcessor()
    data = loader.load_data()

    # Attach the preprocessed form of each record's 'text' to that record.
    for doc in data:
        doc.update(tokenized_text=tp.preprocess_text(doc['text']))

    loader.save_data(data, output_path)
