Skip to content

Commit

Permalink
Use Path.rglob() to find EPUB files
Browse files Browse the repository at this point in the history
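The `full-path` attribute in `container.xml` and the `href` attributes in `content.opf` are unreliable, so when a declared path does not exist the file is located by searching the extracted folder with `Path.rglob()`.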
  • Loading branch information
xxyzz committed Mar 8, 2022
1 parent cf2b2f0 commit bcbce59
Show file tree
Hide file tree
Showing 2 changed files with 37 additions and 49 deletions.
6 changes: 3 additions & 3 deletions parse_job.py
Expand Up @@ -184,7 +184,7 @@ def find_lemma(start, text, kw_processor, ll_conn, mobi_codec):
PERSON_LABELS = ['PERSON', 'PER', 'persName']


def find_named_entity(start, x_ray, doc, mobi_codec, lang, xhtml=None):
def find_named_entity(start, x_ray, doc, mobi_codec, lang, xhtml_path=None):
len_limit = 3 if lang == 'en' else 2

for ent in doc.ents:
Expand All @@ -203,10 +203,10 @@ def find_named_entity(start, x_ray, doc, mobi_codec, lang, xhtml=None):
continue

new_start_char = ent.start_char + ent.text.index(text)
if xhtml: # EPUB
if xhtml_path: # EPUB
x_ray.search(text, ent.label_ in PERSON_LABELS,
ent.sent.text, start + new_start_char,
start + new_start_char + len(text), xhtml)
start + new_start_char + len(text), xhtml_path)
continue

selectable_text = text
Expand Down
80 changes: 34 additions & 46 deletions x_ray_epub.py
Expand Up @@ -33,8 +33,8 @@ def __init__(self, book_path, search_people, mediawiki):
self.extract_folder = Path(book_path).with_name('extract')
if self.extract_folder.exists():
shutil.rmtree(self.extract_folder)
self.content_folder = None
self.xhtml_folder = None
self.xhtml_folder = self.extract_folder
self.xhtml_href_has_folder = False

def extract_epub(self):
from lxml import etree
Expand All @@ -45,40 +45,35 @@ def extract_epub(self):
with self.extract_folder.joinpath(
'META-INF/container.xml').open('rb') as f:
root = etree.fromstring(f.read())
self.opf_path = root.find(
opf_path = root.find(
'.//n:rootfile', NAMESPACES).get("full-path")
content_folder = Path(self.opf_path).parent.name
if content_folder:
self.content_folder = content_folder
else:
for folder in ['OEBPS', 'epub']:
if self.extract_folder.joinpath(folder).is_dir():
self.content_folder = folder
with self.extract_folder.joinpath(self.opf_path).open('rb') as opf:
self.opf_path = self.extract_folder.joinpath(opf_path)
if not self.opf_path.exists():
self.opf_path = next(self.extract_folder.rglob(opf_path))
with self.opf_path.open('rb') as opf:
self.opf_root = etree.fromstring(opf.read())
item_path = 'opf:manifest/opf:item' \
'[@media-type="application/xhtml+xml"]'
for item in self.opf_root.findall(item_path, NAMESPACES):
if item.get('properties') == 'nav':
continue
xhtml = item.get("href")
xhtml_folder = Path(xhtml).parent.name
if xhtml_folder and xhtml_folder != self.xhtml_folder \
and xhtml_folder != self.content_folder:
self.xhtml_folder = xhtml_folder
if not xhtml.startswith(self.content_folder):
xhtml = f'{self.content_folder}/{xhtml}'
xhtml_path = self.extract_folder.joinpath(xhtml)
if xhtml_path.exists():
with xhtml_path.open() as f:
xhtml_str = f.read()
body_start = xhtml_str.index('<body')
body_end = xhtml_str.index('</body>') + len('</body>')
body_str = xhtml_str[body_start:body_end]
for m in re.finditer(r'>[^<]+<', body_str):
yield (m.group(0)[1:-1], (m.start() + 1, xhtml))

def search(self, name, is_person, sent, start, end, xhtml):
if not xhtml_path.exists():
xhtml_path = next(self.extract_folder.rglob(xhtml))
if not xhtml_path.parent.samefile(self.extract_folder):
self.xhtml_folder = xhtml_path.parent
if '/' in xhtml:
self.xhtml_href_has_folder = True
with xhtml_path.open() as f:
xhtml_str = f.read()
body_start = xhtml_str.index('<body')
body_end = xhtml_str.index('</body>') + len('</body>')
body_str = xhtml_str[body_start:body_end]
for m in re.finditer(r'>[^<]+<', body_str):
yield (m.group(0)[1:-1], (m.start() + 1, xhtml_path))

def search(self, name, is_person, sent, start, end, xhtml_path):
from rapidfuzz.process import extractOne

if (r := extractOne(
Expand All @@ -99,7 +94,7 @@ def search(self, name, is_person, sent, start, end, xhtml):
self.mediawiki.query(
self.pending_dic, self.update_summary)
self.pending_dic.clear()

self.ent_dic[xhtml_path].append((start, end, name, ent_id))

def update_summary(self, key, summary):
Expand All @@ -112,8 +107,8 @@ def modify_epub(self):
self.mediawiki.save_cache()

def insert_a_tags(self):
for xhtml, ent_list in self.ent_dic.items():
with self.extract_folder.joinpath(xhtml).open() as f:
for xhtml_path, ent_list in self.ent_dic.items():
with xhtml_path.open() as f:
xhtml_str = f.read()
body_start = xhtml_str.index('<body')
body_end = xhtml_str.index('</body>') + len('</body>')
Expand All @@ -129,7 +124,7 @@ def insert_a_tags(self):
s += body_str[last_end:]
new_xhtml_str = xhtml_str[:body_start] + s + xhtml_str[body_end:]

with self.extract_folder.joinpath(xhtml).open('w') as f:
with xhtml_path.open('w') as f:
if NAMESPACES['ops'] not in new_xhtml_str:
# add epub namespace
new_xhtml_str = new_xhtml_str.replace(
Expand Down Expand Up @@ -159,29 +154,22 @@ def create_x_ray_page(self):
{self.mediawiki.source_name}</a>
'''
s += '</aside>'

s += '</body></html>'

if self.xhtml_folder:
x_ray_href = f'{self.xhtml_folder}/x_ray.xhtml'
x_ray_path = self.extract_folder.joinpath(
f'{self.content_folder}/{x_ray_href}')
else:
x_ray_href = f'{self.content_folder}/x_ray.xhtml'
x_ray_path = self.extract_folder.joinpath(x_ray_href)

with x_ray_path.open('w') as f:
with self.xhtml_folder.joinpath('x_ray.xhtml').open('w') as f:
f.write(s)

manifest = self.opf_root.find('opf:manifest', NAMESPACES)
s = f'<item href="{x_ray_href}" id="x_ray.xhtml" '\
if self.xhtml_href_has_folder:
x_ray_href = f'{self.xhtml_folder.name}/x_ray.xhtml'
else:
x_ray_href = 'x_ray.xhtml'
s = f'<item href="{x_ray_href}" id="x_ray.xhtml" ' \
'media-type="application/xhtml+xml"/>'
manifest = self.opf_root.find('opf:manifest', NAMESPACES)
manifest.append(etree.fromstring(s))
spine = self.opf_root.find('opf:spine', NAMESPACES)
s = '<itemref idref="x_ray.xhtml"/>'
spine.append(etree.fromstring(s))

with self.extract_folder.joinpath(self.opf_path).open('w') as f:
with self.opf_path.open('w') as f:
f.write(etree.tostring(self.opf_root, encoding=str))

self.book_path = Path(self.book_path)
Expand Down

0 comments on commit bcbce59

Please sign in to comment.