-
Notifications
You must be signed in to change notification settings - Fork 0
/
pdf_extractor.py
24 lines (23 loc) · 870 Bytes
/
pdf_extractor.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from io import StringIO
def extract_pdf_content(pdf):
rsrcmgr = PDFResourceManager()
codec = 'utf-8'
outfp = StringIO()
laparams = LAParams()
device = TextConverter(rsrcmgr=rsrcmgr, outfp=outfp, codec=codec, laparams=laparams)
with open(pdf, 'rb') as fp:
interpreter = PDFPageInterpreter(rsrcmgr, device)
password = ""
maxpages = 0
caching = True
pagenos=set()
for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages, password=password,caching=caching, check_extractable=True):
interpreter.process_page(page)
mystr = outfp.getvalue()
device.close()
outfp.close()
return mystr