In [None]:
import io
import string
import requests
import re
from nltk import sent_tokenize
from nltk.corpus import stopwords 
from nltk.stem.wordnet import WordNetLemmatizer
import gensim
from gensim import corpora
import pandas as pd
import numpy as np

In [None]:
from wisdomaiengine import pdfdocumentextracter, wordcloud

In [None]:
# Papers to pull text from. Each extracted document is stored as a one-element
# list, which is the layout frequency_processor reads (it takes entry[0]).
pdf_urls = (
    "https://arxiv.org/pdf/2001.09903.pdf",
    "http://arxiv.org/pdf/1811.04422v1",
    "https://arxiv.org/pdf/2001.09956",
    "https://arxiv.org/pdf/2001.09412.pdf",
    "http://arxiv.org/pdf/1411.6753v1",
    "https://arxiv.org/pdf/2001.10393.pdf",
)

corpus = []
for pdfurl in pdf_urls:
    extracted = pdfdocumentextracter(pdfurl)
    corpus.append([extracted])

In [None]:
# Shared text-cleaning resources used by the helper functions in this notebook.
lemma = WordNetLemmatizer()

# English stop words plus a few extra tokens that should also be dropped.
stop = set(stopwords.words("english"))
stop.update(("et", "al", "le", "eg"))

# ASCII punctuation plus two extra symbols that should also be stripped.
exclude = set(string.punctuation)
exclude.update(("•", "−"))

def clean(doc):
    """Return ``doc`` lower-cased, stop-word filtered, de-punctuated and lemmatized.

    Processing order:
      1. lower-case and split on whitespace, dropping tokens found in ``stop``;
      2. delete every character found in ``exclude``;
      3. pass each remaining whitespace-separated token through
         ``lemma.lemmatize`` and re-join with single spaces.
    """
    kept_tokens = [token for token in doc.lower().split() if token not in stop]
    stripped = "".join(char for char in " ".join(kept_tokens) if char not in exclude)
    return " ".join(map(lemma.lemmatize, stripped.split()))

In [None]:
def frequency_processor(corpus):
    """Build a per-word frequency / TF-IDF table for a corpus.

    Parameters
    ----------
    corpus : iterable of sequences
        The first element of each entry (``entry[0]``) is one document's text.
        All documents are concatenated with single spaces before processing.

    Returns
    -------
    pandas.DataFrame or None
        ``None`` when no usable word survives cleaning (empty corpus, or only
        stop words / tokens of two characters or fewer). Otherwise one row per
        distinct word, ordered by descending count, with the columns
        ``words``, ``frequency``, ``weighted_frequency`` (count divided by the
        largest count), ``idf`` (natural log of sentence count over the number
        of sentences containing the word as a substring), ``lemmatized word``
        and ``tf_idf`` (``frequency * idf``).
    """
    all_text = ' '.join(i[0] for i in corpus)

    # Lower-case, turn every non-word / non-space character into a space, then
    # keep tokens that are not stop words and are longer than two characters.
    formatted_all_text = re.sub(r'[^\w\s]', ' ', all_text.lower())
    split_words = [
        token for token in formatted_all_text.split()
        if token not in stop and len(token) > 2
    ]
    # Nothing left to count: bail out before max() / division see empty data.
    if not split_words:
        return None

    # Sentences come from the raw text; lower-case them once rather than once
    # per word inside the IDF computation.
    lowered_sentences = [sentence.lower() for sentence in sent_tokenize(all_text)]
    sentence_count = len(lowered_sentences)

    # Series.value_counts replaces the deprecated top-level pd.value_counts.
    frequency = pd.Series(split_words).value_counts().reset_index()
    frequency.columns = ["words", "frequency"]
    frequency["weighted_frequency"] = (
        frequency["frequency"] / frequency["frequency"].max()
    )

    def _idf(word):
        # Substring match against each sentence, as in the original logic.
        hits = sum(1 for sentence in lowered_sentences if word in sentence)
        # A word that cannot be located in any sentence is treated as if it
        # occurred in one, avoiding a ZeroDivisionError.
        return np.log(sentence_count / max(hits, 1))

    def _lemmatize(word):
        # Best-effort: fall back to a blank placeholder if lemmatization fails.
        try:
            return lemma.lemmatize(word)
        except Exception:
            return " "

    # Column-wise assignment does not rely on the frame having a 0..n-1 index.
    frequency["idf"] = frequency["words"].map(_idf)
    frequency["lemmatized word"] = frequency["words"].map(_lemmatize)
    frequency["tf_idf"] = frequency["frequency"] * frequency["idf"]
    return frequency

In [None]:
# Query string; the important-words cell splits it on whitespace and skips any
# candidate word that contains one of the resulting tokens.
search_term = "neural network"
# Number of important words the important-words cell tries to collect.
num_words = 10

In [None]:
# Important Words Modeling: rank lemmatized words by their summed TF-IDF and
# keep the top `num_words` that do not contain any token of `search_term`.
# Bound up front so later cells never hit a missing or stale name when the
# corpus is empty.
important_words = []
if corpus:
    frequency = frequency_processor(corpus)
    if frequency is not None:
        top_N = pd.DataFrame(frequency.groupby("lemmatized word")["tf_idf"].sum())
        top_N = top_N.sort_values(by=["tf_idf"], ascending=False)
        search_tokens = search_term.split()
        # Iterating the ranking directly (instead of indexing with a manual
        # counter) stops cleanly when fewer than `num_words` candidates exist.
        for word, score in zip(top_N.index, top_N["tf_idf"].to_numpy()):
            if len(important_words) >= num_words:
                break
            # Skip words that contain any search token as a substring.
            if any(token in word for token in search_tokens):
                continue
            important_words.append([word, score])
        # The previous check compared a [word, score] list with "" (never
        # true) and raised IndexError on an empty list; signal "no result"
        # with None instead.
        if not important_words:
            important_words = None

In [None]:
# Show the [word, tf_idf] pairs collected by the important-words cell.
important_words

In [1]:
from wisdomaiengine import pdfdocumentextracter, wordcloud

In [2]:
# Rebuild the corpus from a single paper; the extracted text is wrapped in a
# one-element list, the same layout used for the multi-paper corpus.
corpus = []
pdfurl = "https://arxiv.org/pdf/2001.09903.pdf"
corpus.append([pdfdocumentextracter(pdfurl)])

In [3]:
# Call wisdomaiengine's wordcloud with the search term and the corpus; the
# result is iterated in the next cell as entries indexed by [0] and [1].
# NOTE(review): presumably the packaged version of the manual important-words
# steps earlier in this notebook — confirm against the wisdomaiengine source.
words = wordcloud("neural network", corpus)

In [4]:
# Print every entry on its own line as "<first item>  -  <second item>".
for entry in words:
    label, score = entry[0], entry[1]
    print(label, " - ", score)

outburst  -  57.60532022606475
ray  -  50.124016458243645
emission  -  40.035536662232005
optical  -  38.65305966459018
aql  -  38.562574190707245
accretion  -  34.09103986456174
star  -  34.01852194787096
kev  -  31.40874999400895
lmxbs  -  29.2208913124815
observation  -  27.12220296926123
