# In [1]:
import json
import pandas as pd
import re
import spacy

from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


# Module-level setup, executed on import.
# Request GPU execution from spaCy before any pipeline is loaded further down.
spacy.prefer_gpu()
# Single TfidfVectorizer instance (configured with min_df=1) shared by
# TFIDFVectorizer.get_tfidfs_vocab and TFIDFVectorizer.query_processing.
vectorizer = TfidfVectorizer(min_df=1)

class DataLoader:
    """Read and write JSON documents on disk.

    Attributes:
        path: Location of the JSON file read by ``load_data``.
    """

    def __init__(self, path):
        """Remember which JSON file ``load_data`` should read.

        Args:
            path: Path of the JSON file to load.
        """
        self.path = path

    def load_data(self):
        """Parse the JSON file at ``self.path``.

        Returns:
            The decoded JSON value (whatever ``json.load`` produces).

        Raises:
            OSError: If the file cannot be opened.
            json.JSONDecodeError: If the content is not valid JSON.
        """
        # JSON is UTF-8 by specification; an explicit encoding keeps decoding
        # independent of the platform's locale default.
        with open(self.path, 'r', encoding='utf-8') as file:
            return json.load(file)

    def save_data(self, data, save_path):
        """Serialize ``data`` as JSON into ``save_path``, overwriting the file.

        Args:
            data: JSON-serializable object to write.
            save_path: Destination file path.

        Returns:
            None.

        Raises:
            OSError: If the file cannot be opened for writing.
            TypeError: If ``data`` contains values ``json`` cannot serialize.
        """
        with open(save_path, 'w', encoding='utf-8') as file:
            json.dump(data, file)


class TextProcessor:
    """Text clean-up helpers built around a spaCy English pipeline.

    ``special_text`` and ``remove_digits_special`` are pure string/regex
    helpers; the remaining methods operate on spaCy ``Doc``/token objects
    produced by ``self.nlp``.
    """

    # Deletion table for the symbol characters stripped by
    # ``remove_digits_special`` (includes both the backslash and the pipe).
    _SPECIAL_CHARS_TABLE = str.maketrans(
        '', '', '!@#$%^&*()[]{};:,./<>?\\|`~-=_+'
    )

    def __init__(self):
        """Load the ``en_core_web_sm`` spaCy pipeline into ``self.nlp``."""
        self.nlp = spacy.load('en_core_web_sm')

    def special_text(self, val):
        """Strip LaTeX residue from ``val``.

        Removes every literal ``\\displaystyle`` and every ``r_{`` fragment
        together with the word characters that follow it plus the next two
        characters.

        Args:
            val: Value to clean; it is converted with ``str`` first.

        Returns:
            The cleaned string.
        """
        text = re.sub(r'\\displaystyle', '', str(val))
        return re.sub(r'(r_{(?:\w*)?..)', '', text)

    def remove_digits_special(self, val):
        """Remove numbers and special characters from ``val``.

        Args:
            val: Value to clean; it is converted with ``str`` first.

        Returns:
            The cleaned string with surrounding whitespace stripped.
        """
        # Same pattern as before, now a raw string so ``\d`` / ``\+`` are not
        # invalid string escapes. It removes a digit run (optionally followed
        # by one character and more digits, and an optional '+') plus the
        # single character right after it.
        text = re.sub(r'(\d+(?:.\d+)?\+?).', '', str(val))
        # str.replace() with the whole symbol string only matched that exact
        # sequence, so nothing was ever removed; delete each character instead.
        text = text.translate(self._SPECIAL_CHARS_TABLE)
        return text.strip()

    def lowercase(self, text):
        """Lower-case ``text`` and run it through the spaCy pipeline.

        Args:
            text: String to process.

        Returns:
            The object returned by ``self.nlp`` for the lower-cased text.
        """
        return self.nlp(text.lower())

    def rm_stop_punct(self, text):
        """Return the tokens of ``text`` that are neither punctuation nor stop words.

        Args:
            text: Iterable of spaCy tokens (e.g. a ``Doc``).
        """
        return [t for t in text if not t.is_punct and not t.is_stop]

    def lemmatizer(self, text):
        """Return ``lemma_`` for every token whose ``dep_`` is non-empty.

        Args:
            text: Iterable of spaCy tokens.
        """
        return [t.lemma_ for t in text if t.dep_]

    def preprocess_text(self, text):
        """Lower-case, drop stop words/punctuation and lemmatize ``text``.

        Args:
            text: Raw string to preprocess.

        Returns:
            List of lemma strings.
        """
        # The previous version parsed ``text`` once only to read ``doc.text``
        # back (spaCy tokenization is non-destructive, so that is the input
        # string) and then parsed again in ``lowercase``; one parse suffices.
        lowered_doc = self.lowercase(text)
        kept_tokens = self.rm_stop_punct(lowered_doc)
        return self.lemmatizer(kept_tokens)


class TFIDFVectorizer:

    """
    The vectorizer in this class was based on workflow by Flavio Clesio described in the link below
    https://flavioclesio.com/cosine-similarity-search-for-new-documents-using-scikit-learn

    ``get_tfidfs_vocab`` and ``query_processing`` operate on the module-level
    ``vectorizer`` instance, so they share (and mutate) its fitted state.
    """

    def __init__(self, text_data):
        """Store the records to vectorize.

        Args:
            text_data: Iterable of records indexable by field name; it is
                also passed to ``pd.DataFrame`` by ``convert_df``.
        """
        self.data = text_data

    def build_corpus_vocabulary(self, text):
        """Count how often each token occurs across all records.

        Args:
            text: Key of the field in every record that holds its tokens.

        Returns:
            ``Counter`` mapping token -> number of occurrences.
        """
        return Counter(word for doc in self.data for word in doc[text])

    def unique_vocab(self, text):
        """Return the distinct keys of ``text`` as a list.

        Args:
            text: Mapping whose keys are the vocabulary terms (e.g. the
                ``Counter`` from ``build_corpus_vocabulary``).

        Returns:
            List of the keys in the mapping's iteration order. Mapping keys
            are already unique, so no ``set`` round-trip is needed, and the
            result no longer changes order between interpreter runs.
        """
        return list(text.keys())

    def process_text(self, text):
        """Return the list of distinct tokens found in field ``text``.

        Args:
            text: Key of the token field in every record.
        """
        return self.unique_vocab(self.build_corpus_vocabulary(text))

    def convert_df(self, col1, col2):
        """Build a DataFrame from the records and add a space-joined text column.

        Args:
            col1: Name of the column to create.
            col2: Name of the existing column holding an iterable of tokens
                per row.

        Returns:
            The DataFrame, where ``col1`` holds each row's ``col2`` items
            converted with ``str`` and joined by single spaces.
        """
        df = pd.DataFrame(self.data)
        df[col1] = [' '.join(map(str, tokens)) for tokens in df[col2]]
        return df

    def get_tfidfs_vocab(self, df, col):
        """Source:
           https://flavioclesio.com/cosine-similarity-search-for-new-documents-using-scikit-learn

        Fit the shared ``vectorizer`` on column ``col`` of ``df``.

        Returns:
            Tuple ``(tf_idf, words)``: the matrix from ``fit_transform`` and
            the fitted feature names.
        """
        tf_idf = vectorizer.fit_transform(df[col].values)
        words = vectorizer.get_feature_names_out()
        return tf_idf, words

    def query_processing(self, vocab, query):
        """Source:
           https://flavioclesio.com/cosine-similarity-search-for-new-documents-using-scikit-learn

        Vectorize ``query`` with the shared ``vectorizer``.

        Args:
            vocab: Iterable of vocabulary terms the query is expressed in.
            query: Query string.

        Returns:
            The matrix produced by ``vectorizer.transform([query])``.
        """
        vocab_terms = list(vocab)
        # When the shared vectorizer is already fitted on exactly this
        # vocabulary (the normal case after get_tfidfs_vocab), reuse it.
        # Refitting on the bare term list would replace the corpus-derived
        # IDF weights, so the query would be weighted differently from the
        # document matrix it is compared against.
        already_fitted = (
            hasattr(vectorizer, 'vocabulary_')
            and list(vectorizer.get_feature_names_out()) == vocab_terms
        )
        if not already_fitted:
            # Fallback keeps the previous behavior: learn the terms from vocab.
            vectorizer.fit(vocab_terms)
        return vectorizer.transform([query])


# Script entry point: load the processed records, build a TF-IDF matrix over
# them and rank the records against one hard-coded query.
if __name__== '__main__':
    loader = DataLoader('../data_hub/processed_data.json')
    data = loader.load_data()
    
    vect = TFIDFVectorizer(data)
    # NOTE(review): `tp` is never used below, yet constructing it loads the
    # spaCy model. Possibly meant for preprocessing the query — confirm.
    tp = TextProcessor()

    # NOTE(review): `vocab` is not used below.
    vocab = vect.process_text('tokenized_text')
    # Add a 'token' column built from each record's 'tokenized_text' field.
    df = vect.convert_df('token', 'tokenized_text')
    tfidf, words = vect.get_tfidfs_vocab(df, 'token')

    q = 'is an epidemic of an infectious disease that has spread across a large region, for instance multiple continents or worldwide, affecting a substantial number of people'
    query = vect.query_processing(words, q)

    # Positions of the five highest cosine similarities, best match first.
    idx = cosine_similarity(query, tfidf).flatten().argsort()[::-1][:5]
    # NOTE(review): df.loc is label-based while idx holds positions; this
    # relies on df keeping the default integer index — confirm. The result is
    # not used below.
    df_results = df.loc[idx, 'text']

    # NOTE(review): every record receives the same full `tfidf` object, not
    # its own row — confirm this is intended.
    for doc in data:
        doc['tf_idf'] = tfidf

    