In [None]:
#import packages
import io
import string
import requests
import re
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from textblob import TextBlob
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.text_rank import TextRankSummarizer

In [None]:
# Symbols whose presence marks a sentence as mathematical.
# This is a tuple of whole symbols rather than one long string because k-hat has
# no precomposed code point: it is the letter "k" followed by the combining
# circumflex U+0302. Iterating a plain string character by character would
# therefore test for a bare "k" and discard almost every English sentence.
# Hatted unit vectors are listed in both precomposed and decomposed form so the
# match does not depend on how the PDF text happens to be normalised.
_GREEK_AND_OPERATORS = "αβγδεζηθικλμνξοπρςστυφχψωΓΔΘΛΞΠΣΦΨΩϴ°≤="
_HATTED_UNIT_VECTORS = (
    "\u00ee", "i\u0302",  # i-hat
    "\u0135", "j\u0302",  # j-hat
    "k\u0302",            # k-hat (decomposed form only)
    "\u00fb", "u\u0302",  # u-hat
)
escapes = tuple(_GREEK_AND_OPERATORS) + _HATTED_UNIT_VECTORS

In [None]:
# Download the PDF and wrap its bytes in an in-memory stream for pdfminer.
# The commented-out URLs are alternative papers to try.
#pdfurl = "https://arxiv.org/pdf/2001.09903.pdf"
#pdfurl = "http://arxiv.org/pdf/1811.04422v1"
pdfurl = "https://arxiv.org/pdf/2001.09956"
#pdfurl = "https://arxiv.org/pdf/2001.09412.pdf"
#pdfurl = "http://arxiv.org/pdf/1411.6753v1"
#pdfurl = "https://arxiv.org/pdf/2001.10393.pdf"
# The whole body is read at once through `.content`, so streaming buys nothing;
# the timeout keeps the cell from hanging indefinitely on a stalled connection.
r = requests.get(pdfurl, timeout=30)
# Fail here on a 4xx/5xx response instead of handing an error page to the PDF parser.
r.raise_for_status()
f = io.BytesIO(r.content)

In [None]:
# pdfminer plumbing: pages are interpreted into a text converter that writes
# into an in-memory string buffer (`retstr`).
pagenos = set()       # page filter handed to PDFPage.get_pages later; left empty here
codec = 'utf-8'
laparams = LAParams()
retstr = io.StringIO()
rsrcmgr = PDFResourceManager()
device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
interpreter = PDFPageInterpreter(rsrcmgr, device)

In [None]:
# Run every page of the downloaded PDF through the interpreter and collect the
# text accumulated in `retstr` as the single entry of `content`.
content = []
try:
    for page in PDFPage.get_pages(fp=f, pagenos=pagenos, maxpages=0, caching=True, check_extractable=True):
        interpreter.process_page(page)

    # Must be read before the buffer is closed below.
    text = retstr.getvalue()
    content.append(text)
finally:
    # Release the converter and the buffer even when a page fails to parse.
    device.close()
    retstr.close()

In [None]:
# Break the extracted text into blank-line-separated chunks. A chunk with three
# or more line breaks is kept only when more than two of its lines are longer
# than one character; this discards columns of stray numbers picked up next to
# the body text. Chunks with fewer line breaks are always kept.
splits = content[0].split("\n\n")
text = [
    chunk
    for chunk in splits
    if chunk.count("\n") < 3
    or sum(1 for line in chunk.split("\n") if len(line) > 1) > 2
]

In [None]:
# Discard everything before the abstract, if the document has one.
abstract_idx = next(
    (idx for idx, chunk in enumerate(text) if "abstract" in chunk.lower()),
    None,
)
if abstract_idx is not None:
    if len(text[abstract_idx].split()) < 2:
        # The chunk is only the heading itself: the body starts with the next chunk.
        abstract_idx += 1
    elif text[abstract_idx][:8].lower().startswith("abstract"):
        # Heading and body share a chunk: strip the leading 8-character heading.
        text[abstract_idx] = text[abstract_idx][8:]
    text = text[abstract_idx:]

In [None]:
# Cut the document at the first back-matter heading (references, appendix,
# bibliography, acknowledgements). A chunk counts as such a heading when it
# contains one of the markers and is either very short (< 3 words) or spans
# several lines.
back_matter_markers = (
    "References",
    "Appendix",
    "Bibliography",
    "REFERENCES",
    "acknowledgements",
    "ACKNOWLEDGEMENTS",
)
# Default to keeping everything: a document without any recognised heading must
# not run the scan off the end of the list.
cnt = len(text)
for idx, chunk in enumerate(text):
    if any(marker in chunk for marker in back_matter_markers):
        if len(chunk.split()) < 3 or "\n" in chunk:
            cnt = idx
            break

text = text[:cnt]

In [None]:
# Remove equation number references.
# Only chunks with at least two whitespace-separated tokens are kept. This drops
# stand-alone equation numbers such as "(1)" and bare digits, and also every
# other one-word or empty chunk.
clean1 = [t for t in text if len(t.split()) > 1]

In [None]:
# Remove section headers.
# A short chunk (< 10 words) is treated as a header and dropped when, after
# stripping digits, it is at most one word long, or when it does not end in
# punctuation and the following chunk does not continue in lower case.
clean2 = []
for idx, chunk in enumerate(clean1):
    if len(chunk.split()) >= 10:
        clean2.append(chunk)
        continue

    dummy = re.sub(r"\d", "", chunk).strip()
    if len(dummy.split()) <= 1:
        # Nothing but a section number and/or a single word.
        continue
    if dummy[-1] in string.punctuation:
        # Ends like a sentence, so it is body text rather than a heading.
        clean2.append(chunk)
        continue

    # The last chunk has no successor; an empty string makes it fall into the
    # same "drop" path as a successor that is empty after cleaning.
    following = clean1[idx + 1] if idx + 1 < len(clean1) else ""
    dummy2 = re.sub(r"[^\w\s]", "", re.sub(r"\d", "", following)).strip()
    # A following chunk that starts in lower case means this chunk is the first
    # half of a sentence that was split in two, so it is kept.
    if dummy2 and not dummy2[0].isupper():
        clean2.append(chunk)

In [None]:
# Drop chunks that look like figure captions: those beginning with "Fig", or
# whose lower-cased form begins with "fig" followed by any further character.
clean3 = [
    chunk
    for chunk in clean2
    if not (re.match('^Fig', chunk) or re.match('^fig.', chunk.lower()))
]

In [None]:
# Remove table data and other fragments, judging each chunk by how it ends and
# how the chunk after it begins:
#   * ends with "."  -> dropped when the next chunk (digits ignored) starts in lower case
#   * ends with "-"  -> kept only when the next chunk starts in lower case
#   * contains "©"   -> dropped (copyright line)
#   * anything else  -> kept
clean4 = []
for idx, chunk in enumerate(clean3):
    # The final chunk is processed as well; it has no successor, so an empty
    # string stands in and simply fails both lower-case tests.
    following = clean3[idx + 1] if idx + 1 < len(clean3) else ""
    if chunk.endswith("."):
        # `[:1]` instead of `[0]`: the next chunk may be empty once its digits
        # are removed (for example a row of numbers).
        if not re.sub(r"\d", "", following).strip()[:1].islower():
            clean4.append(chunk)
    elif chunk.endswith("-"):
        if following[:1].islower():
            clean4.append(chunk)
    elif "©" not in chunk:
        clean4.append(chunk)

In [None]:
# Join the chunks into one string and strip extraction artefacts.
clean5 = " ".join(clean4)
# "(cid:NN)" placeholders appear in the extracted text. Only the whole
# parenthesised token is removed; deleting the bare letters "cid" would corrupt
# ordinary words such as "acid" or "decided".
clean5 = re.sub(r"\(cid:\d+\)", "", clean5)
# Numeric citation markers such as "[7]" or "[12]", anywhere in the text. An
# anchored pattern would only ever match at the very start of the joined document.
clean5 = re.sub(r"\[\d{1,2}\]", "", clean5)
# Re-join words hyphenated across a line break, then flatten remaining breaks.
clean5 = re.sub(r"-\n", "", clean5)
clean5 = re.sub(r"\n", " ", clean5)
# Collapse every run of spaces, not just pairs.
clean5 = re.sub(r" {2,}", " ", clean5)

In [None]:
# Split the cleaned text into sentences with TextBlob.
blob = TextBlob(clean5)
sentences = [str(item) for item in blob.sentences]

# Keep only sentences that contain none of the math symbols in `escapes`.
no_math = [
    item for item in sentences
    if not any(symbol in item for symbol in escapes)
]

# Drop sentences carrying a single-digit figure label ("Figure 1:", "Fig. 1:", "Fig 1:").
figure_label_patterns = (r"Figure \d:", r"Fig. \d:", r"Fig \d:")
no_figs = [
    item for item in no_math
    if not any(re.search(pattern, item) for pattern in figure_label_patterns)
]

# Downstream cells expect one string again.
sentences = " ".join(no_figs)

In [None]:
# Rank the remaining sentences with TextRank and print the top `key_points`.
key_points = 5
blob = TextBlob(sentences)
sentences = [str(candidate) for candidate in blob.sentences]

summary = []
for candidate in sentences:
    # Skip fragments that open with ":" and have a "-" within the next two characters.
    if candidate.startswith(":") and "-" in candidate[1:3]:
        continue
    # Ignore near-empty strings and run-on "sentences" of 150 words or more.
    if len(candidate) > 2 and len(candidate.split()) < 150:
        summary.append(candidate)

parser = PlaintextParser.from_string(' '.join(summary), Tokenizer("english"))
summarizer = TextRankSummarizer()
doc_summary = [str(ranked) for ranked in summarizer(parser.document, key_points)]
for sent in doc_summary:
    print(sent, "\n")

In [None]:
from wisdomaiengine import pdfdocumentextracter, summarisepdfdocument

In [2]:
# URL of the PDF passed to the wisdomaiengine functions below; the
# commented-out lines are alternative arXiv papers to try.
#pdfurl = "https://arxiv.org/pdf/2001.09903.pdf"
pdfurl = "http://arxiv.org/pdf/1811.04422v1"
#pdfurl = "https://arxiv.org/pdf/2001.09956"
#pdfurl = "https://arxiv.org/pdf/2001.09412.pdf"
#pdfurl = "http://arxiv.org/pdf/1411.6753v1"
#pdfurl = "https://arxiv.org/pdf/2001.10393.pdf"

In [3]:
# Extract the document via the helper imported from wisdomaiengine.
# NOTE(review): presumably this wraps the manual download/cleaning cells above — confirm.
text = pdfdocumentextracter(pdfurl)

In [4]:
# Summarise the extracted document and print each returned point, one per
# paragraph.
summary = summarisepdfdocument(text)
for point in summary:
    print(point, "\n")

• I describe an optimal control view of adversarial machine learning, where the dynamical system is the machine learner, the input are adversarial actions, and the control costs are deﬁned by the adversary’s goals to do harm and be hard to detect. 

• The system to be controlled is called the plant, which is deﬁned by the system dynamics: where xt ∈ Xt is the state of the system, ut ∈ Ut is the control input, and Ut is the control constraint set. 

• In all cases, the adversary attempts to control the machine learning system, and the control costs reﬂect the adversary’s desire to do harm and be hard to detect. 

• The adversary’s running cost gt then measures the eﬀort in performing the action at step t. One limitation of the optimal control view is that the action cost is assumed to be additive over the steps. 

• The adversary intercepts the environmental reward rIt in each iteration, and may choose to modify (“shape”) the reward into with some ut ∈ R before sending the modiﬁed rewar