# In [2]:
import json
import re
import spacy


# Ask spaCy to use a GPU if one is usable. This runs at import time, before any
# pipeline is loaded in TextProcessor.__init__. The return value is ignored
# here; per the spaCy docs it reports whether a GPU was activated, and spaCy
# stays on CPU otherwise.
spacy.prefer_gpu()

class DataLoader:
    """Read and write JSON files.

    The instance remembers a single input path; the output path is supplied
    on each call to ``save_data``.
    """

    def __init__(self, path):
        """Remember the file that ``load_data`` will read.

        Args:
            path: Location of the JSON file to load.
        """
        self.path = path

    def load_data(self):
        """Parse the JSON file at ``self.path`` and return its contents.

        Returns:
            The decoded JSON value (whatever ``json.load`` produces).

        Raises:
            OSError: If the file cannot be opened.
            json.JSONDecodeError: If the file is not valid JSON.
        """
        # JSON is exchanged as UTF-8; do not depend on the platform's locale
        # encoding, which differs between systems.
        with open(self.path, 'r', encoding='utf-8') as file:
            return json.load(file)

    def save_data(self, data, save_path):
        """Serialize ``data`` as JSON into ``save_path``, overwriting it.

        Args:
            data: Any value ``json.dump`` can serialize.
            save_path: Destination file path.

        Returns:
            None.
        """
        with open(save_path, 'w', encoding='utf-8') as file:
            json.dump(data, file)



class TextProcessor:
    """Clean raw strings and reduce them to lemma lists with spaCy.

    ``special_text`` and ``remove_digits_special`` work on plain strings.
    The remaining methods operate on the tokens produced by the
    ``en_core_web_sm`` pipeline loaded in ``__init__``.
    """

    # Literal LaTeX ``\displaystyle`` command.
    _DISPLAYSTYLE = re.compile(r'\\displaystyle')
    # ``r_{`` + any word characters + the next two characters, whatever they
    # are. NOTE(review): the trailing ``..`` presumably targets the closing
    # brace plus one more character -- confirm against real data.
    _R_SUBSCRIPT = re.compile(r'(r_{(?:\w*)?..)')
    # A digit run, optionally followed by ``<any char><digits>`` and a ``+``,
    # together with the single character after it. Because that last
    # character is mandatory, a lone digit at the very end of the string is
    # not matched. NOTE(review): the unescaped ``.`` inside the group looks
    # like it was meant to be a decimal point -- confirm before tightening.
    _NUMBER = re.compile(r'(\d+(?:.\d+)?\+?).')
    # Deletion table: each of these characters is removed individually.
    _SPECIAL_CHARS = str.maketrans('', '', r'!@#$%^&*()[]{};:,./<>?\|`~-=_+')

    def __init__(self):
        """Load the ``en_core_web_sm`` spaCy pipeline into ``self.nlp``."""
        self.nlp = spacy.load('en_core_web_sm')

    def special_text(self, val):
        r"""Strip ``\displaystyle`` and ``r_{...}`` fragments from ``val``.

        Args:
            val: Any value; it is converted with ``str`` first.

        Returns:
            The string with every match of the two patterns removed.
        """
        text = self._DISPLAYSTYLE.sub('', str(val))
        return self._R_SUBSCRIPT.sub('', text)

    def remove_digits_special(self, val):
        """Remove numbers and special characters from ``val``.

        Numbers are removed first (see ``_NUMBER`` for the exact pattern),
        then every character listed in ``_SPECIAL_CHARS`` is deleted, and
        finally surrounding whitespace is trimmed.

        Args:
            val: Any value; it is converted with ``str`` first.

        Returns:
            The cleaned, stripped string.
        """
        text = self._NUMBER.sub('', str(val))
        # str.replace would only match the whole punctuation run as a single
        # substring; translate deletes each character on its own.
        return text.translate(self._SPECIAL_CHARS).strip()

    def lowercase(self, text):
        """Lowercase ``text`` and run the result through ``self.nlp``.

        Despite the name, the return value is the parsed pipeline output
        (a spaCy ``Doc``), not a string.
        """
        return self.nlp(text.lower())

    def rm_stop_punct(self, text):
        """Return the tokens of ``text`` that are neither punctuation nor stop words.

        Args:
            text: Iterable of tokens exposing ``is_punct`` and ``is_stop``.
        """
        return [t for t in text if not t.is_punct and not t.is_stop]

    def lemmatizer(self, text):
        """Return ``lemma_`` for each token whose ``dep_`` is non-empty.

        Args:
            text: Iterable of tokens exposing ``lemma_`` and ``dep_``.
        """
        return [t.lemma_ for t in text if t.dep_]

    def preprocess_text(self, text):
        """Lowercase, parse, filter and lemmatize ``text``.

        Args:
            text: The raw string to process.

        Returns:
            A list of lemma strings for the tokens that survive stop-word
            and punctuation removal.
        """
        # Parse once, on the lowercased text. Parsing the raw text first only
        # to read its ``.text`` back gives the same string at twice the cost.
        lowered = self.lowercase(text)
        kept = self.rm_stop_punct(lowered)
        return self.lemmatizer(kept)


if __name__ == "__main__":
    # Script entry point: read data.json, attach a lemma list to every record
    # under 'tokenized_text', then write everything to processed_data.json.
    json_io = DataLoader("data.json")
    processor = TextProcessor()
    records = json_io.load_data()

    # Assumes the JSON root is a sequence of objects that each carry a 'text'
    # key -- TODO confirm against the actual data file.
    for record in records:
        lemmas = processor.preprocess_text(record["text"])
        record["tokenized_text"] = lemmas

    json_io.save_data(records, "processed_data.json")