In [110]:
import pickle
import re

import gensim
import pandas as pd
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis


class Cord19:
    """Slice of the CORD-19 corpus prepared for topic modelling.

    On construction the metadata CSV is loaded and restricted to rows that
    have a sha and whose ``publish_time`` lies strictly between two dates.
    The matching parsed papers are then read, preprocessed, and turned into
    a gensim dictionary and bag-of-words corpus built from the abstracts.

    Attributes:
        data_dir: Root directory of the CORD-19 release being read.
        metadata: Metadata rows of the papers that could be loaded.
        text_df: DataFrame with columns ``sha``, ``abstract`` and ``body``
            holding the preprocessed texts (empty abstracts removed).
        dictionary: ``gensim.corpora.Dictionary`` built from the abstracts.
        bow: One bag-of-words vector per abstract in ``text_df``.
    """

    # Default release directory. Kept as a class attribute so the path
    # helpers also work on instances that did not go through ``__init__``.
    data_dir = "data/cord_19/2022-06-02"

    def __init__(self, start_date: str, end_date: str,
                 data_dir: str = data_dir):
        """Load the metadata csv and build the text frame, dictionary and BoW.

        Args:
            start_date: Exclusive lower bound on ``publish_time``.
            end_date: Exclusive upper bound on ``publish_time``.
            data_dir: Directory containing ``metadata.csv`` and
                ``document_parses/pdf_json/``.
        """
        self.data_dir = data_dir

        metadata = pd.read_csv(f"{data_dir}/metadata.csv", low_memory=False)
        metadata["publish_time"] = pd.to_datetime(metadata["publish_time"])

        published = metadata["publish_time"]
        keep = (
            metadata["sha"].notna()
            & (published > pd.to_datetime(start_date))
            & (published < pd.to_datetime(end_date))
        )
        self.metadata = metadata.loc[keep]

        # Generate a df containing the sha, abstract and body of each paper
        self.gen_text_df()
        # Generate a dictionary and bag of words based on previous df
        self.gen_dictionary()
        self.gen_bow()

    def get_sha_series(self, start_date: str, end_date: str) -> pd.Series:
        """Return the sha values of papers published between two dates.

        Both bounds are exclusive. The result is the ``sha`` column of the
        matching ``self.metadata`` rows (a Series, original index preserved).
        """
        published = self.metadata["publish_time"]
        in_range = (
            (published > pd.to_datetime(start_date))
            & (published < pd.to_datetime(end_date))
        )
        return self.metadata.loc[in_range, "sha"]

    def _load_paper(self, sha) -> pd.DataFrame:
        """Read the parsed JSON of one paper, indexed by top-level key."""
        path = f"{self.data_dir}/document_parses/pdf_json/{sha}.json"
        return pd.read_json(path, orient="index")

    @staticmethod
    def _section_text(paper: pd.DataFrame, section: str) -> str:
        """Concatenate the lower-cased paragraph texts of one section."""
        paragraphs = paper.loc[section].values[0]
        # Paragraphs are joined without a separator, as before.
        return "".join(str(part["text"]).lower() for part in paragraphs)

    def get_abstract(self, sha) -> str:
        """Return the lower-cased abstract of a paper given its sha sum."""
        return self._section_text(self._load_paper(sha), "abstract")

    def get_body(self, sha) -> str:
        """Return the lower-cased body text of a paper given its sha sum."""
        return self._section_text(self._load_paper(sha), "body_text")

    def gen_text_df(self) -> None:
        """Build ``self.text_df`` from the papers listed in ``self.metadata``.

        Rows whose paper file cannot be read are removed from
        ``self.metadata``. Papers with an empty abstract are left out of
        ``self.text_df``. Abstract and body are run through
        ``preprocess_text`` before being stored.
        """
        kept_positions = []
        abstracts = []
        bodies = []

        for position, sha in enumerate(self.metadata["sha"]):
            try:
                # Each file is read once and both sections taken from it.
                paper = self._load_paper(sha)
            except (ValueError, FileNotFoundError):
                # Depending on the pandas version a missing file surfaces as
                # ValueError or FileNotFoundError; either way we have no
                # paper for this sha, so the row is skipped.
                continue
            abstracts.append(self._section_text(paper, "abstract"))
            bodies.append(self._section_text(paper, "body_text"))
            kept_positions.append(position)

        # Positional selection keeps metadata and the text lists aligned and
        # avoids rescanning the frame for every unreadable paper.
        self.metadata = self.metadata.iloc[kept_positions]

        df = pd.DataFrame({
            "sha": self.metadata["sha"],
            "abstract": abstracts,
            "body": bodies,
        })

        # Remove papers with an empty abstract; copy so the column
        # assignments below act on an independent frame.
        df = df[df["abstract"] != ""].copy()

        # Apply preprocessing
        df["abstract"] = self.preprocess_text(df["abstract"])
        df["body"] = self.preprocess_text(df["body"])

        self.text_df = df

    def preprocess_text(self, series):
        """Apply the preprocessing pipeline to a collection of documents.

        Steps, in order: stopword removal, Porter stemming, stripping of
        digits, stripping of punctuation, collapsing of remaining non-word
        characters to a space, and removal of short words.

        Args:
            series: Iterable of document strings.

        Returns:
            A list with one preprocessed string per input document.
        """
        documents = list(series)
        if not documents:
            # Nothing to process; skip building the stemmer.
            return documents

        preprocessing = gensim.parsing.preprocessing

        # Remove stopwords
        documents = [preprocessing.remove_stopwords(doc) for doc in documents]
        # Stem
        # NOTE(review): stemming runs before punctuation is stripped, so
        # tokens may reach the stemmer with punctuation attached - confirm
        # this order is intended before changing it.
        stemmer = gensim.parsing.porter.PorterStemmer()
        documents = stemmer.stem_documents(documents)
        # Remove numeric
        documents = [preprocessing.strip_numeric(doc) for doc in documents]
        # Remove punctuation
        documents = [preprocessing.strip_punctuation(doc)
                     for doc in documents]
        # Remove special characters
        documents = [re.sub(r"\W+", " ", doc) for doc in documents]
        # Remove short words
        documents = [preprocessing.strip_short(doc) for doc in documents]

        return documents

    def _tokenized_abstracts(self):
        """Split every preprocessed abstract in ``text_df`` on whitespace."""
        return [doc.split() for doc in self.text_df["abstract"].tolist()]

    def gen_dictionary(self) -> None:
        """Build ``self.dictionary`` from the tokenized abstracts."""
        self.dictionary = gensim.corpora.Dictionary(
            self._tokenized_abstracts()
        )

    def gen_bow(self) -> None:
        """Build ``self.bow`` by passing each abstract to ``doc2bow``.

        Requires ``self.dictionary`` to have been generated first.
        """
        self.bow = [self.dictionary.doc2bow(tokens)
                    for tokens in self._tokenized_abstracts()]


class LdaModel:
    """Train a gensim LDA topic model and visualise it with pyLDAvis.

    Attributes:
        bow: Bag-of-words corpus the model is trained on.
        dictionary: gensim dictionary mapping token ids to words.
        pickle: Whether ``visualize`` also writes the prepared
            visualisation to ``output/pickle/temp.p``.
        num_topics: Number of topics requested from the model.
        passes: Number of training passes over the corpus.
        random_state: Seed passed to the model.
        model: The trained ``gensim.models.LdaMulticore`` instance.
    """

    def __init__(self, bow, dictionary, pickle: bool, num_topics: int = 10,
                 passes: int = 10, random_state: int = 1337):
        """Store the corpus and settings, then train the model.

        The ``pickle`` argument shadows the ``pickle`` module inside this
        method only; it is stored as a flag and the module is not used here.
        """
        self.bow = bow
        self.dictionary = dictionary
        self.pickle = pickle
        self.num_topics = num_topics
        self.passes = passes
        self.random_state = random_state
        self.model = self.generate()

    def generate(self):
        """Train and return an ``LdaMulticore`` model on ``self.bow``."""
        return gensim.models.LdaMulticore(
            corpus=self.bow, id2word=self.dictionary,
            num_topics=self.num_topics, passes=self.passes,
            random_state=self.random_state,
        )

    def visualize(self):
        """Prepare the pyLDAvis visualisation of the trained model.

        Enables pyLDAvis notebook rendering, prepares the visualisation data
        and, when ``self.pickle`` is true, also pickles it to
        ``output/pickle/temp.p``.

        Returns:
            The prepared visualisation object from ``gensimvis.prepare``.
        """
        pyLDAvis.enable_notebook()
        vis = gensimvis.prepare(self.model, self.bow, self.dictionary)

        if self.pickle:
            # Context manager so the file is flushed and closed even if
            # pickling fails.
            with open("output/pickle/temp.p", "wb") as handle:
                pickle.dump(vis, handle)

        return vis

In [82]:
# Load the corpus slice: papers with a publish_time strictly between
# 2020-01-01 and 2020-01-07, including dictionary and bag-of-words.
cord = Cord19(start_date="2020-01-01", end_date="2020-01-07")

NameError: name 'LdaModel' is not defined

In [111]:
# Train the LDA model on the abstract bag-of-words; pickle=False means the
# visualisation is not written to disk.
lda_model = LdaModel(bow=cord.bow, dictionary=cord.dictionary, pickle=False)
# Last expression of the cell: its value is what the notebook displays.
lda_model.visualize()

  default_term_info = default_term_info.sort_values(
