In [16]:
import zipfile
import os
from bs4 import BeautifulSoup
import re

Create class

In [25]:
class BookEpub:
    """
    Helper for reading an EPUB book, which is handled here as a plain ZIP archive.

    Attributes set by the constructor:
        file_path        - path to the EPUB file exactly as passed in.
        folder_path      - directory part of file_path ("" when it has none).
        file_name_ext    - file name with extension.
        file_name_no_ext - file name without extension.
        extracted_path   - folder the book was unpacked to; None until epub_unpack() is called.
        book_parts       - part names found by the last epub_get_part_names_*() call.
    """

    def __init__(self, filename):
        """
        filename - path to an existing EPUB file.
        Raises FileNotFoundError if the file does not exist.
        """
        # Check the filename for existence.
        if not os.path.exists(filename):
            raise FileNotFoundError(f"File {filename} not found")
        self.file_path = filename
        self.folder_path = os.path.dirname(self.file_path)
        self.file_name_ext = os.path.basename(self.file_path)
        self.file_name_no_ext = os.path.splitext(self.file_name_ext)[0]
        self.extracted_path = None
        self.book_parts = []


    def _names_containing(self, *fragments):
        """
        Returns archive member names, in archive order, that contain at least one of the given substrings.
        """
        with zipfile.ZipFile(self.file_path, 'r') as zf:
            return [name for name in zf.namelist() if any(fragment in name for fragment in fragments)]


    def _read_member(self, part_name):
        """
        Returns the content of one archive member decoded as UTF-8.
        Raises FileNotFoundError if the archive has no member with that name.
        """
        with zipfile.ZipFile(self.file_path, 'r') as zf:
            if part_name not in zf.namelist():
                raise FileNotFoundError(f"File {part_name} not found")
            with zf.open(part_name, "r") as f:
                return f.read().decode("utf-8")


    @staticmethod
    def _markup_to_text(markup, separator):
        """
        Strips the HTML markup and returns the stripped text nodes joined with separator.
        """
        return BeautifulSoup(markup, "html.parser").get_text(separator=separator, strip=True)


    def epub_print_content(self):
        """
        Prints the list of files in the EPUB archive, one name per line.
        """
        with zipfile.ZipFile(self.file_path, 'r') as zf:
            for f in zf.namelist():
                print(f)


    def epub_get_content(self):
        """
        Returns unsorted list of files in the EPUB archive.
        """
        with zipfile.ZipFile(self.file_path, 'r') as zf:
            return zf.namelist()


    def epub_get_xhtml_names(self):
        """
        Returns sorted list of archive members whose name contains "xhtml".
        """
        return sorted(self._names_containing("xhtml"))


    def epub_get_html_names(self):
        """
        Returns sorted list of archive members whose name contains "html".
        Note: this is a substring match, so .xhtml files are returned as well.
        """
        return sorted(self._names_containing("html"))


    def epub_unpack(self, b_path=None, b_pwd=None):
        """
        Unpacks the EPUB file into a folder named after the EPUB file (without extension).
        b_path - parent folder for the unpacked book; by default the folder where the EPUB file is located.
        b_pwd - archive password (bytes), if any and if applicable at all.
        The resulting folder is stored in self.extracted_path.
        """
        # Fall back to the book's own folder only when no destination was given.
        if b_path is None:
            b_path = os.path.dirname(self.file_path)
        b_name = os.path.splitext(os.path.basename(self.file_path))[0]
        # os.path.join keeps the path relative when b_path is "" (book in the current directory).
        self.extracted_path = os.path.join(b_path, b_name)
        with zipfile.ZipFile(self.file_path, 'r') as zf:
            zf.extractall(self.extracted_path, pwd=b_pwd)


    def epub_get_part_names_unsorted(self):
        """
        Returns list of book part names (members containing ".xhtml" or ".html") in archive order.
        The list is also stored in self.book_parts.
        """
        self.book_parts = self._names_containing(".xhtml", ".html")
        return self.book_parts


    def epub_get_part_names_sorted(self):
        """
        Returns sorted list of book part names (members containing ".xhtml" or ".html").
        The list is also stored in self.book_parts.
        """
        self.book_parts = sorted(self._names_containing(".xhtml", ".html"))
        return self.book_parts


    def epub_clean(self, text):
        """
        Removes reference marks like [123] and {123} and the space in front of
        a period or comma that is surrounded by spaces. Returns the cleaned text.
        """
        text = re.sub(r"\[\d+\]", "", text)  # clean text from links like [123]
        text = re.sub(r"\{\d+\}", "", text)  # clean text from links like {123}
        text = text.replace(" . ", ". ")    # plain replace is faster than re.sub here
        text = text.replace(" , ", ", ")    # plain replace is faster than re.sub here
        return text


    def epub_get_text_from_parts(self, part_name=None):
        """
        Opens a part of the extracted epub, strips the html code and returns the cleaned text
        with text nodes joined by a single space.
        Needs extracted epub book.
        part_name - full path to the extracted part; asked for interactively when omitted.
        Raises FileNotFoundError if the file does not exist.
        """
        if part_name is None:
            part_name = input("Enter full path to the file you need:\n")
        # Check the filename for existence.
        if not os.path.exists(part_name):
            raise FileNotFoundError(f"File {part_name} not found")
        with open(part_name, "r", encoding="utf-8") as f:
            text = self._markup_to_text(f.read(), " ")
        return self.epub_clean(text)


    def epub_get_text_from_parts_unextracted(self, part_name=None):
        """
        Reads a part directly from the epub archive, strips the html code and returns the cleaned text
        with text nodes separated by newlines.
        Doesn't require extracted epub book.
        part_name - file name of the part in the archived epub book.
        Raises FileNotFoundError if the archive has no such part.
        """
        text = self._markup_to_text(self._read_member(part_name), "\n")
        return self.epub_clean(text)


    def epub_get_text_from_parts_unextracted_list(self, part_name=None):
        """
        Reads a part directly from the epub archive and returns its text as a list of cleaned strings,
        one item per non-empty text node of the markup.
        Doesn't require extracted epub book.
        part_name - file name of the part in the archived epub book.
        Raises FileNotFoundError if the archive has no such part.
        """
        content = self._read_member(part_name)
        # Parse once; stripped_strings yields every non-empty text node already stripped.
        return [self.epub_clean(s) for s in BeautifulSoup(content, "html.parser").stripped_strings]


    def epub_print_text_from_parts(self, part_name=None):
        """
        Opens a part of the extracted epub, strips the html code and prints the cleaned text.
        part_name - full path to the extracted part; asked for interactively when omitted.
        Raises FileNotFoundError if the file does not exist.
        """
        print(self.epub_get_text_from_parts(part_name))


    def epub_get_book_txt(self, book_extracted_path=None, book_parts_list=None):
        """
        Joins the text of the given parts of an extracted epub into one string.
        book_extracted_path - folder with the extracted book; defaults to self.extracted_path
                              and is asked for interactively when neither is known.
        book_parts_list - part names relative to that folder; defaults to epub_get_xhtml_names().
        Returns the cleaned text of all parts joined with a single space.
        Raises FileNotFoundError if the folder or any of the parts does not exist.
        """
        if book_extracted_path is None:
            if self.extracted_path is not None:
                book_extracted_path = self.extracted_path
            else:
                book_extracted_path = input("Enter folder path for the place where the book is located:\n")
        # Check the folder path for existence.
        if not os.path.exists(book_extracted_path):
            raise FileNotFoundError(f"Folder {book_extracted_path} not found")
        book_text = []
        if book_parts_list is None:
            book_parts_list = self.epub_get_xhtml_names()

        for name in book_parts_list:
            # Check the part name for existence.
            book_part_path = os.path.join(book_extracted_path, name)
            if not os.path.exists(book_part_path):
                raise FileNotFoundError(f"File {book_part_path} not found")
            book_text.append(self.epub_get_text_from_parts(book_part_path))
        return " ".join(book_text)

In [26]:
# Open the sample EPUB and print the name of every member of its ZIP archive.
file_book = 'books/vo_vseorujii_1.epub'
b = BookEpub(file_book)
b.epub_print_content()
# Disabled example calls kept for reference.
# NOTE(review): epub_unpack() documents its first argument as a destination folder, not the EPUB path,
# and BookEpub defines epub_get_part_names_sorted()/_unsorted() but no epub_get_part_names().
# The "books/test" paths presumably point at a previously extracted book - confirm before enabling.
# b.epub_unpack(file_book)
# b.epub_get_part_names()
# b.epub_get_text_from_parts("books/test/index_split_001.xhtml")
# b.epub_print_text_from_parts("books/test/index_split_002.xhtml")
# book = b.epub_get_book_txt("books/test")
# print(book[:5000])

mimetype
META-INF/
META-INF/container.xml
styles.css
CoverPage.html
images/20181210110208374.jpg
Header.html
Chapter1.html
Chapter2.html
Chapter3.html
Chapter4.html
Chapter5.html
Chapter6.html
Chapter7.html
Chapter8.html
Chapter9.html
Chapter10.html
Chapter11.html
Chapter12.html
Chapter13.html
Chapter14.html
Chapter15.html
Chapter16.html
Chapter17.html
Chapter18.html
Chapter19.html
Chapter20.html
Chapter21.html
Chapter22.html
Chapter23.html
Chapter24.html
Chapter25.html
Chapter26.html
Chapter27.html
Chapter28.html
Chapter29.html
Chapter30.html
Chapter31.html
Chapter32.html
Chapter33.html
Chapter34.html
Chapter35.html
Chapter36.html
Chapter37.html
Chapter38.html
Chapter39.html
Chapter40.html
Chapter41.html
Chapter42.html
Chapter43.html
Chapter44.html
Chapter45.html
Chapter46.html
Chapter47.html
Chapter48.html
Chapter49.html
Chapter50.html
Chapter51.html
Chapter52.html
Chapter53.html
Chapter54.html
Chapter55.html
Chapter56.html
Chapter57.html
Chapter58.html
Chapter59.html
Chapter60.html


In [27]:
# Collect the archive members whose names contain ".html" or ".xhtml", in archive order.
# The same list is also stored on b.book_parts.
part_names = b.epub_get_part_names_unsorted()
print(part_names)

['CoverPage.html', 'Header.html', 'Chapter1.html', 'Chapter2.html', 'Chapter3.html', 'Chapter4.html', 'Chapter5.html', 'Chapter6.html', 'Chapter7.html', 'Chapter8.html', 'Chapter9.html', 'Chapter10.html', 'Chapter11.html', 'Chapter12.html', 'Chapter13.html', 'Chapter14.html', 'Chapter15.html', 'Chapter16.html', 'Chapter17.html', 'Chapter18.html', 'Chapter19.html', 'Chapter20.html', 'Chapter21.html', 'Chapter22.html', 'Chapter23.html', 'Chapter24.html', 'Chapter25.html', 'Chapter26.html', 'Chapter27.html', 'Chapter28.html', 'Chapter29.html', 'Chapter30.html', 'Chapter31.html', 'Chapter32.html', 'Chapter33.html', 'Chapter34.html', 'Chapter35.html', 'Chapter36.html', 'Chapter37.html', 'Chapter38.html', 'Chapter39.html', 'Chapter40.html', 'Chapter41.html', 'Chapter42.html', 'Chapter43.html', 'Chapter44.html', 'Chapter45.html', 'Chapter46.html', 'Chapter47.html', 'Chapter48.html', 'Chapter49.html', 'Chapter50.html', 'Chapter51.html', 'Chapter52.html', 'Chapter53.html', 'Chapter54.html', 'Ch

If a part's file name doesn't contain a number, it is quite likely not a part of the book.
Clean the list of part names and keep only those that contain a number.

In [28]:
def is_num_in_name(name):
    """Return True if `name` contains at least one of the characters 0-9, otherwise False."""
    ascii_digits = tuple("0123456789")
    return any(char in ascii_digits for char in name)

# Keep only the parts whose file name contains a digit; names without one
# (cover page, header) are treated as not being chapters of the book.
part_names_clean = [name for name in part_names if is_num_in_name(name)]

print(part_names_clean)

['Chapter1.html', 'Chapter2.html', 'Chapter3.html', 'Chapter4.html', 'Chapter5.html', 'Chapter6.html', 'Chapter7.html', 'Chapter8.html', 'Chapter9.html', 'Chapter10.html', 'Chapter11.html', 'Chapter12.html', 'Chapter13.html', 'Chapter14.html', 'Chapter15.html', 'Chapter16.html', 'Chapter17.html', 'Chapter18.html', 'Chapter19.html', 'Chapter20.html', 'Chapter21.html', 'Chapter22.html', 'Chapter23.html', 'Chapter24.html', 'Chapter25.html', 'Chapter26.html', 'Chapter27.html', 'Chapter28.html', 'Chapter29.html', 'Chapter30.html', 'Chapter31.html', 'Chapter32.html', 'Chapter33.html', 'Chapter34.html', 'Chapter35.html', 'Chapter36.html', 'Chapter37.html', 'Chapter38.html', 'Chapter39.html', 'Chapter40.html', 'Chapter41.html', 'Chapter42.html', 'Chapter43.html', 'Chapter44.html', 'Chapter45.html', 'Chapter46.html', 'Chapter47.html', 'Chapter48.html', 'Chapter49.html', 'Chapter50.html', 'Chapter51.html', 'Chapter52.html', 'Chapter53.html', 'Chapter54.html', 'Chapter55.html', 'Chapter56.html', 

Print first part

In [29]:
# Read the first numbered part straight from the archive (no unpacking needed)
# and print its cleaned text, with text nodes separated by newlines.
text = b.epub_get_text_from_parts_unextracted(part_names_clean[0])
print(text)

Во всеоружии_1
Пролог.
— Готовность пять минут!
На 4-ом этаже подземелья Пробкового Острова было собрано сразу восемь топ-игроков. Наиболее титулованным из них был Высший Меч, занимающий 16-ую строку в общеигровом рейтинге. Впрочем, не следовало недооценивать и две сотни элитных членов Гильдии Серебряных Рыцарей.
Вид такого количества профессиональных игроков, объединенной силы которых вполне хватало для организации осады хорошо защищённой крепости, и вправду был захватывающим. Но куда большее волнение вызывали лица собравшихся, на каждом из которых отражалось неподдельное беспокойство и волнение.
— Осталось четыре минуты!
С каждой секундой нервозность игроков становилась всё больше и больше. Некоторые из них нетерпеливо переминались с ноги на ногу, а кое-кто неприглядно грыз свои ногти. Даже Высший Меч чувствовал, как у него вспотели руки.
«Да что же это такое… Никак не могу успокоиться».
Подземелье Пробкового Острова находилось под полным контролем Гильдии Серебряных Рыцарей, а его ф

In [30]:
# Same part as a list of cleaned strings, one item per non-empty text node of the markup.
b.epub_get_text_from_parts_unextracted_list(part_names_clean[0])

['Во всеоружии_1',
 'Пролог.',
 '— Готовность пять минут!',
 'На 4-ом этаже подземелья Пробкового Острова было собрано сразу восемь топ-игроков. Наиболее титулованным из них был Высший Меч, занимающий 16-ую строку в общеигровом рейтинге. Впрочем, не следовало недооценивать и две сотни элитных членов Гильдии Серебряных Рыцарей.',
 'Вид такого количества профессиональных игроков, объединенной силы которых вполне хватало для организации осады хорошо защищённой крепости, и вправду был захватывающим. Но куда большее волнение вызывали лица собравшихся, на каждом из которых отражалось неподдельное беспокойство и волнение.',
 '— Осталось четыре минуты!',
 'С каждой секундой нервозность игроков становилась всё больше и больше. Некоторые из них нетерпеливо переминались с ноги на ногу, а кое-кто неприглядно грыз свои ногти. Даже Высший Меч чувствовал, как у него вспотели руки.',
 '«Да что же это такое… Никак не могу успокоиться».',
 'Подземелье Пробкового Острова находилось под полным контролем Г

Get all parts and write to .txt file using list of strings.  
This approach may be used for AI cleaning and style improving: 
1. keep 2 to 10 strings of the current text in a buffer as reference text for the prompt, then send the request to the AI API
2. get response
3. check whether it is suitable using another model: send the original text, the formatted text and the reference text, and ask it to calculate a quality coefficient from 0 to 100
4. if quality coefficient is above 98, write string to file
5. else repeat steps from 1 to 4

In [33]:
# Export every numbered part to "<book folder>/<book name>.txt", one text fragment per line.
# NOTE: the plain-text export further down targets the same path and overwrites this file.
txt_book = os.path.join(b.folder_path, f"{b.file_name_no_ext}.txt")  # stays relative when folder_path is ""
with open(txt_book, "w", encoding="utf-8") as f:
    for part in part_names_clean:
        text = b.epub_get_text_from_parts_unextracted_list(part)
        # writelines() adds no line separators, so terminate each fragment explicitly.
        f.writelines(f"{line}\n" for line in text)

Get all parts and write to .txt file using plain text

In [35]:
# Export every numbered part to "<book folder>/<book name>.txt" as plain text,
# with a blank line after each part.
txt_book = os.path.join(b.folder_path, f"{b.file_name_no_ext}.txt")  # stays relative when folder_path is ""
with open(txt_book, "w", encoding="utf-8") as f:
    for part in part_names_clean:
        text = b.epub_get_text_from_parts_unextracted(part)
        f.write(f"{text}\n\n")

In [24]:
# Show the path components derived from the EPUB file name, one per line.
print(
    b.file_path,
    b.file_name_no_ext,
    b.file_name_ext,
    b.folder_path,
    sep="\n",
)

books/vo_vseorujii_1.epub
vo_vseorujii_1
vo_vseorujii_1.epub
books
