You signed in with another tab or window. Reload to refresh your session. You signed out in another tab or window. Reload to refresh your session. You switched accounts on another tab or window. Reload to refresh your session. Dismiss alert
import urllib.request
from io import BytesIO
from typing import BinaryIO

from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfparser import PDFParser
def extract_pdf_metadata(pdf_file: BinaryIO) -> dict:
    """Return the document-information metadata of a PDF.

    Parameters:
        pdf_file (BinaryIO): a seekable binary file-like object containing
            the PDF (for example a file opened in ``"rb"`` mode or a
            ``BytesIO``). It is handed directly to ``PDFParser``, so a path
            string is NOT accepted.

    Returns:
        dict: a flat mapping of the PDF info keys to their values, e.g.::

            {
                "Author": "AURORE",
                "CreationDate": "D:20200325185329+01'00'",
                "Creator": "Microsoft Office Word 2007",
                "ModDate": "D:20210311153835+01'00'",
                "Producer": "Microsoft Office Word 2007",
                "Title": "DOSSIER COUP DE POUCE 2020"
            }

        Byte-string values are decoded to ``str``; values of any other type
        are returned unchanged. An empty dict is returned when the document
        carries no info dictionary. Only metadata is returned -- the text
        content of the PDF is not extracted.
    """
    doc = PDFDocument(PDFParser(pdf_file))

    # ``doc.info`` is a list that is empty when no trailer has an Info
    # entry; indexing it unconditionally would raise IndexError.
    if not doc.info:
        return {}

    # Build a fresh dict instead of rewriting the parser's own one in place.
    metadata = {}
    for key, value in doc.info[0].items():
        if isinstance(value, bytes):
            # A leading UTF-16 byte-order mark flags a UTF-16 text string;
            # decoding it as UTF-8 with errors ignored would mangle it.
            encoding = "utf-16" if value.startswith(b"\xfe\xff") else "utf-8"
            value = value.decode(encoding, errors="ignore")
        # Non-bytes values (e.g. object references or name literals) have no
        # ``decode`` method, so they are passed through untouched.
        metadata[key] = value
    return metadata
if __name__ == '__main__':
    # Demo: download a sample PDF and print its metadata.
    # The context manager guarantees the HTTP response is closed even if
    # reading fails.
    with urllib.request.urlopen('http://arxiv.org/pdf/cs/9308101v1') as response:
        pdf_bytes = response.read()

    # Wrap the downloaded bytes in a seekable in-memory stream positioned at
    # the start, which is the kind of object extract_pdf_metadata expects.
    metadata = extract_pdf_metadata(BytesIO(pdf_bytes))
    print(metadata)
No description provided.
The text was updated successfully, but these errors were encountered: