-
Notifications
You must be signed in to change notification settings - Fork 0
/
pdf_util.py
145 lines (128 loc) · 5.28 KB
/
pdf_util.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
import pybtex.database
from pylatexenc.latex2text import LatexNodes2Text # to unescape latex
from pathlib import Path
import re
import PyPDF2
from difflib import SequenceMatcher
def get_title_info(entry):
    """Try to guess title information from a publication.

    :entry: pathlib.Path of a pdf file, or pybtex.database.Entry
    :returns: str -- the title, or None if nothing was found
    :raises: NotImplementedError if entry is neither of the supported types
    """
    if isinstance(entry, Path):
        # Read the metadata once; it is None when the pdf carries no
        # document information dictionary at all.
        info = PyPDF2.PdfFileReader(str(entry)).documentInfo
        if info is not None and '/Title' in info:
            return info['/Title']
        return None

    if isinstance(entry, pybtex.database.Entry):
        # Prefer the entry's own title, fall back to the containing book.
        for key in ('title', 'booktitle'):
            if key in entry.fields:
                return LatexNodes2Text().latex_to_text(entry.fields[key])
        return None

    raise NotImplementedError("Can only handle pdf or bib objects (was %s)"
                              % type(entry))
def get_author_info(entry):
    """Try to extract some form of author string from a pdf file or bibitem.

    :entry: pathlib.Path of a pdf file, or pybtex.database.Entry
    :returns: str -- String with author info or None, if nothing found.
              For a pdf without usable author metadata the file name
              (without suffix) is returned instead.
    :raises: NotImplementedError if entry is neither of the supported types
    """
    if isinstance(entry, Path):
        # documentInfo is None when the pdf has no information dictionary.
        info = PyPDF2.PdfFileReader(str(entry)).documentInfo
        if info is not None and '/Author' in info:
            author = info['/Author']
            if author:
                return author
        # last straw: filename
        return entry.stem

    if isinstance(entry, pybtex.database.Entry):
        parts = []
        # Raw bibtex fields, if the parser left them in place.
        for key in ('author', 'authors'):
            if key in entry.fields:
                parts.append(str(entry.fields[key]))
        # Parsed person lists: use the authors; editors only serve as a
        # fallback so that they never replace real author names.
        for role in ('author', 'editor'):
            if role not in entry.persons:
                continue
            names = [' '.join(p.first_names + p.middle_names + p.last_names)
                     for p in entry.persons[role]]
            if names:
                parts.extend(names)
                break
        # Strip before testing, otherwise whitespace-only text counts as
        # "found" and an empty string is returned instead of None.
        author_info = LatexNodes2Text().latex_to_text(' '.join(parts)).strip()
        return author_info or None

    raise NotImplementedError("Can only handle pdf or bib objects (was %s)"
                              % type(entry))
def pdf_for_pub(entry, pdf_folder, min_confidence=0.5):
    """Attempt to guess which pdf file might belong to a given bibliography entry.

    Every ``.pdf`` file directly inside pdf_folder is scored by how well the
    entry's title is found in its text (weight 3/4) and how similar its
    author information is to the entry's (weight 1/4).

    :entry: pybtex.database.Entry -- bibliography entry to find a pdf for
    :pdf_folder: str denoting folder location
    :min_confidence: float -- score a file must exceed to count as a match
    :returns: pathlib.Path of the best scoring pdf, or None if no file
              scores above min_confidence
    :raises: OSError if pdf_folder does not exist or is not a folder
    """
    pdf_path = Path(pdf_folder)
    if not pdf_path.exists():
        raise OSError('Folder %s does not exist.' % pdf_folder)
    if not pdf_path.is_dir():
        raise OSError('%s is not a folder.' % pdf_folder)

    # Entry-derived information is the same for every candidate file, so
    # look it up once. Either lookup may yield None.
    title_info = get_title_info(entry)
    author_words = (get_author_info(entry) or '').split()

    match = None
    confidence = 0
    for candidate in pdf_path.iterdir():
        # Sub-folders and stray non-pdf files cannot be parsed as pdf.
        if not candidate.is_file() or candidate.suffix.lower() != '.pdf':
            continue

        # Without a title there is nothing to search for in the text.
        if title_info:
            confidence_title = search_for_string(candidate, title_info)
        else:
            confidence_title = 0

        pdf_author_words = (get_author_info(candidate) or '').split()
        if author_words and pdf_author_words:
            confidence_author = SequenceMatcher(None, pdf_author_words,
                                                author_words,
                                                autojunk=False).ratio()
        else:
            # Two empty sequences would compare as a perfect match.
            confidence_author = 0

        new_confidence = (3 * confidence_title + confidence_author) / 4
        if confidence < new_confidence:
            confidence = new_confidence
            match = candidate

    if match is not None and confidence > min_confidence:
        for key in ('title', 'booktitle'):
            if key in entry.fields:
                title = entry.fields[key]
                break
        else:
            title = '<no title>'
        print('Match for %s: %s, score=%f' % (title, match, confidence))
        return match
    return None
def search_for_string(pdf, text):
    """Scan the beginning of a pdf for a word sequence and return the best score.

    A window as long as the query slides over the first fifth of the words
    extracted from the pdf; every plausible window is compared with the
    query using difflib.SequenceMatcher.

    :pdf: PyPDF2.PdfFileReader, or an object whose str() is the path of a pdf
    :text: str or list of str -- words to search for (a str is split on
           whitespace)
    :returns: float -- best similarity ratio found, 0 if nothing matched or
              the query is empty
    """
    if text is not None and not isinstance(text, list):
        text = text.split()
    if not text:
        # None or empty query: nothing to look for, skip parsing the pdf.
        return 0

    if not isinstance(pdf, PyPDF2.PdfFileReader):
        pdf = PyPDF2.PdfFileReader(str(pdf))

    text_set = set(text)
    # Cheap pre-filter: a window must share two distinct words with the
    # query -- or one, when the query has only one distinct word.
    min_shared = min(2, len(text_set))

    # split() already collapses newlines and superfluous spaces
    pdf_text = ' '.join([page.extractText() for page in pdf.pages]).split()
    # title should occur in first fifth (hopefully)
    pdf_text = pdf_text[:len(pdf_text) // 5]

    best_score = 0
    best_match = None
    print("Searching for: %s" % text)
    window = len(text)
    # "+ 1" so that the window ending on the very last word is examined too.
    for start in range(len(pdf_text) - window + 1):
        subsequence = pdf_text[start:start + window]
        if len(text_set.intersection(subsequence)) < min_shared:
            continue
        score = SequenceMatcher(None, subsequence, text,
                                autojunk=False).ratio()
        if score > best_score:
            best_score = score
            best_match = subsequence
    if best_match:
        print("Best match for\n%s:\n%s\nscore=%f" % (text, best_match,
                                                     best_score))
    return best_score