<a href="https://colab.research.google.com/github/nakamura196/ndl_ocr/blob/main/Python%E3%82%92%E7%94%A8%E3%81%84%E3%81%A6TEI_XML%E3%83%95%E3%82%A1%E3%82%A4%E3%83%AB%E3%82%92EPUB%E3%81%AB%E5%A4%89%E6%8F%9B%E3%81%99%E3%82%8B.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Pythonを用いてTEI/XMLファイルをEPUBに変換する

## ライブラリのインストール

In [5]:
!pip install ebooklib

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting ebooklib
  Downloading EbookLib-0.17.1.tar.gz (111 kB)
[K     |████████████████████████████████| 111 kB 4.9 MB/s 
Building wheels for collected packages: ebooklib
  Building wheel for ebooklib (setup.py) ... [?25l[?25hdone
  Created wheel for ebooklib: filename=EbookLib-0.17.1-py3-none-any.whl size=38184 sha256=1256ebbd6f711ad3d72eefc1749da36596d9b63296d92d50d531e98d17543be3
  Stored in directory: /root/.cache/pip/wheels/43/39/fd/db4f652431a55d28472ba7f5f7c9a8efad03b97f443a48ea2f
Successfully built ebooklib
Installing collected packages: ebooklib
Successfully installed ebooklib-0.17.1


## データのダウンロード

In [1]:
!git clone https://github.com/kouigenjimonogatari/kouigenjimonogatari.github.io.git kouigenjimonogatari

Cloning into 'kouigenjimonogatari'...
remote: Enumerating objects: 112124, done.[K
remote: Total 112124 (delta 0), reused 0 (delta 0), pack-reused 112124[K
Receiving objects: 100% (112124/112124), 71.08 MiB | 28.01 MiB/s, done.
Resolving deltas: 100% (111207/111207), done.
Checking out files: 100% (25788/25788), done.


## XMLファイルを読み込む

In [2]:
import glob

# All TEI/XML files of the cloned repository, in sorted (lexicographic) order.
files = sorted(glob.glob("kouigenjimonogatari/tei/*.xml"))

# This notebook converts the first file only.
file = files[0]

In [3]:
from bs4 import BeautifulSoup

# Parse the selected TEI file with the XML parser. The file is opened in a
# `with` block so the handle is closed after parsing, and the encoding is
# given explicitly instead of relying on the platform default.
with open(file, 'r', encoding='utf-8') as f:
    soup = BeautifulSoup(f, "xml")

# All text nodes of the document (this name is reassigned in the
# page-building cell before it is read).
elements = soup.findChildren(text=True, recursive=True)

## メタデータ

In [6]:
from ebooklib import epub
# Empty EPUB book object; the following cells set its metadata and add
# the stylesheet, pages, spine and NCX to it before it is written out.
book = epub.EpubBook()

In [7]:
# Title and author are taken from the first <title> / <author> elements
# found in the parsed document.
title = soup.find("title").text
author = soup.find("author").text

# Identifier: the file name without its directory, cut at the first dot.
id = file.rsplit("/", 1)[-1].partition(".")[0]

book.set_identifier(id)
book.set_title(title)
book.set_language('ja')

# Reading direction: right to left
book.set_direction('rtl')

book.add_author(author)

### CSSを追加する例

In [8]:
# Stylesheet shared by every page: vertical, right-to-left writing mode
# (with vendor-prefixed variants) and a serif-first font stack.
css_content = "\n".join([
    "html {",
    "-ms-writing-mode: tb-rl;",
    "-epub-writing-mode: vertical-rl;",
    "-webkit-writing-mode: vertical-rl;",
    "writing-mode: vertical-rl;",
    "font-family: serif, sans-serif;",
    "}",
])

# Register the stylesheet as style.css inside the book; the generated pages
# reference it through a <link> element with that file name.
nav_css = epub.EpubItem(
    uid="_style.css",
    file_name="style.css",
    media_type="text/css",
    content=css_content,
)
book.add_item(nav_css)

<ebooklib.epub.EpubItem at 0x7ff6cb76ff90>

### ページを作る

In [9]:
import html  # stdlib: escapes text before it is embedded in XHTML markup


def add_page(book, page, page_content):
    """Build one XHTML page, register it with ``book`` and return it.

    Args:
        book: the ``epub.EpubBook`` the page is added to.
        page: value of the ``n`` attribute of the ``<pb>`` that opened the
            page; used as the title and inside the uid and file name.
        page_content: markup collected for the page (escaped segment texts,
            each followed by ``<br/>``).

    Returns:
        The ``epub.EpubHtml`` item that was added to ``book``.
    """
    # Same character substitution for every page, including the last one.
    page_content = page_content.replace("𠅘", "亭")

    item = epub.EpubHtml(title=page,
                         uid=f"page_{page}",
                         file_name=f'{page}.xhtml',
                         lang='ja')
    # Every page gets the same structure: stylesheet link + one <p> wrapper.
    item.set_content(f'<body><link rel="stylesheet" href="style.css" type="text/css" /><p>{page_content}</p></body>')
    book.add_item(item)
    return item


# All descendant elements of the first <p> inside <body>.
elements = soup.find("body").find("p").findChildren()

pageContent = None  # markup of the page being collected; None until the first <pb>
page = ""           # `n` attribute of the current page
pages = []          # page names in document order (used later to build the spine)

for e in elements:

    if e.name == "pb":
        # A page break closes the page collected so far (if there is one)
        # and opens a new, empty page.
        if pageContent is not None:
            c = add_page(book, page, pageContent)

        pageContent = ""
        page = e["n"]
        pages.append(page)

    elif e.name == "seg":
        if pageContent is None:
            # Without a preceding <pb> there is no page to attach the text to.
            raise ValueError("found a <seg> before the first <pb>; cannot assign it to a page")
        # Escape &, < and > so the segment text cannot break the page markup.
        pageContent += html.escape(e.text, quote=False) + "<br/>"

# Flush the last page; skipped when the document contained no <pb> at all.
if pageContent is not None:
    c = add_page(book, page, pageContent)

<ebooklib.epub.EpubHtml at 0x7ff6cb6ba510>

## Spineを作成

In [10]:
# Reading order: one spine entry per page, referring to the uid
# ("page_<n>") that was assigned when the page was created.
book.spine = [f"page_{name}" for name in pages]

# Add the NCX item to the book.
book.add_item(epub.EpubNcx())

<ebooklib.epub.EpubNcx at 0x7ff6ded7b950>

## EPUBに書き出し

In [11]:
import os

# Output path: kouigenjimonogatari/epub/<id>.epub
opath = f'kouigenjimonogatari/epub/{id}.epub'

# Make sure the output folder exists, then write the book.
out_dir = os.path.dirname(opath)
os.makedirs(out_dir, exist_ok=True)
epub.write_epub(opath, book)