
Commit

use byte string for KFX books
xxyzz committed Dec 23, 2020
1 parent 91933b4 commit 047ecde
Showing 4 changed files with 16 additions and 23 deletions.
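In short: the KFX path now matches words on UTF-8 byte strings the same way the MOBI path already does, by delegating to a new shared parse_text() helper that yields (offset, word) pairs; parse_book() also switches from return to yield from. A minimal, self-contained sketch of the idea (the helper mirrors parse_text() added in parse_job.py below; the sample entry dict is invented for illustration):

import re

def parse_text(start, text):
    # 'text' is a byte string; find runs of three or more ASCII letters
    # and yield (absolute offset, decoded word) for each match.
    for match_word in re.finditer(b'[a-zA-Z]{3,}', text):
        yield (start + match_word.start(),
               match_word.group(0).decode('utf-8'))

# MOBI content is already raw bytes; a KFX JSON entry's 'content' is a str,
# so it is encoded to UTF-8 first (hypothetical sample entry):
entry = {'position': 100, 'content': 'The quick brown fox'}
print(list(parse_text(entry['position'], entry['content'].encode('utf-8'))))
# [(100, 'The'), (104, 'quick'), (110, 'brown'), (116, 'fox')]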
2 changes: 1 addition & 1 deletion __init__.py
@@ -7,7 +7,7 @@ class WordDumbDumb(InterfaceActionBase):
    description = 'Create Kindle Word Wise file.'
    supported_platforms = ['linux', 'osx', 'windows']
    author = 'xxyzz'
-    version = (1, 8, 0)
+    version = (1, 8, 1)
    minimum_calibre_version = (5, 0, 0) # Python3
    actual_plugin = 'calibre_plugins.worddumb.ui:WordDumb'

1 change: 0 additions & 1 deletion database.py
@@ -54,7 +54,6 @@ def create_lang_layer(asin, book_path):
                ('enDictionaryId', 'kll.en.en'),
                ('sidecarFormat', '1.0')]
    ll_conn.executemany('INSERT INTO metadata VALUES (?, ?)', metadata)
-
    return ll_conn


4 changes: 2 additions & 2 deletions metadata.py
@@ -44,13 +44,13 @@ def check_metadata(db, book_id, update_exth=True):
    if fmt.lower() in ['mobi', 'azw3'] and update_exth:
        with open(book_path, 'r+b') as stream:
            mu = UpdateMobiEXTH(stream)
-            mu.update(mi, asin)
+            mu.update(asin)

    return book_fmt, asin, book_path, mi


class UpdateMobiEXTH(MetadataUpdater):
-    def update(self, mi, asin):
+    def update(self, asin):
        def update_exth_record(rec):
            recs.append(rec)
            if rec[0] in self.original_exth_records:
32 changes: 13 additions & 19 deletions parse_job.py
@@ -36,18 +36,13 @@ def do_job(gui, ids, plugin_path, abort, log, notifications):
    from nltk.corpus import wordnet as wn
    worker_count = os.cpu_count()

-    for data in books:
-        (_, book_fmt, asin, book_path, _) = data
+    for (_, book_fmt, asin, book_path, _) in books:
        ll_conn = create_lang_layer(asin, book_path)
        if ll_conn is None:
            continue

-        data = []
-        for (start, word) in parse_book(book_path, book_fmt):
-            word = word.lower()
-            word = wn.morphy(word)
-            data.append((start, word))
-
+        data = [(start, wn.morphy(word.lower()))
+                for (start, word) in parse_book(book_path, book_fmt)]
        words_each_worker = math.floor(len(data) / worker_count)
        with concurrent.futures.ThreadPoolExecutor() as executor:
            futures = []
@@ -74,9 +69,9 @@ def worker(data):

def parse_book(path_of_book, book_fmt):
    if (book_fmt.lower() == 'kfx'):
-        return parse_kfx(path_of_book)
+        yield from parse_kfx(path_of_book)
    else:
-        return parse_mobi(path_of_book, book_fmt)
+        yield from parse_mobi(path_of_book, book_fmt)


def parse_kfx(path_of_book):
@@ -85,9 +80,8 @@ def parse_kfx(path_of_book):
    book = YJ_Book(path_of_book)
    data = book.convert_to_json_content()
    for entry in json.loads(data)['data']:
-        for match_word in re.finditer('[a-zA-Z]{3,}', entry['content']):
-            word = entry['content'][match_word.start():match_word.end()]
-            yield (entry['position'] + match_word.start(), word)
+        yield from parse_text(entry['position'],
+                              entry['content'].encode('utf-8'))


def parse_mobi(pathtoebook, book_fmt):
@@ -108,12 +102,12 @@ def parse_mobi(pathtoebook, book_fmt):

    # match text between HTML tags
    for match_text in re.finditer(b">[^<>]+<", html):
-        text = html[match_text.start():match_text.end()]
-        # match each word inside text
-        for match_word in re.finditer(b"[a-zA-Z]{3,}", text):
-            word = text[match_word.start():match_word.end()]
-            start = match_text.start() + match_word.start()
-            yield (start, word.decode('utf-8'))
+        yield from parse_text(match_text.start(), match_text.group(0))
+
+
+def parse_text(start, text):
+    for match_word in re.finditer(b'[a-zA-Z]{3,}', text):
+        yield (start + match_word.start(), match_word.group(0).decode('utf-8'))


def install_libs(plugin_path):
