## pandoc + DeepL

### 1. 使用 Homebrew 在 macOS 上安装 pandoc：

```bash
brew install pandoc
```

### 2. 获取 pandoc 的 path 用来为 pypandoc 设置路径：

```bash
$ which pandoc
/opt/homebrew/bin/pandoc
```

### 3. 使用 pypandoc 转换 epub 为 html：

In [None]:
import os
import re
import subprocess

import pypandoc
from bs4 import BeautifulSoup

# Point pypandoc at the Homebrew-installed pandoc binary, unless
# PYPANDOC_PANDOC is already set in the environment.
os.environ.setdefault('PYPANDOC_PANDOC', '/opt/homebrew/bin/pandoc')


# Source EPUB. The generated HTML and the extracted media are written into
# the folder that contains it.
filepath = "/Users/joker/Public/The Computer Book From the Abacus to Artificial Intelligence/The Computer Book From the Abacus to Artificial Intelligence, 250 Milestones in the History of Computer Science by Simson L. Garfinkel, Rachel H. Grunspan (z-lib.org).epub"
pathname, filename = os.path.split(filepath)
targetfilename = 'index.html'


# Convert the EPUB to HTML5. Media is extracted to <pathname>/images and
# --wrap=none is passed through to pandoc; the translation step later in this
# file processes index.html line by line.
pypandoc.convert_file(filepath,
                      format='epub',
                      to='html5',
                      extra_args=[
                          '--read=epub',
                          f'--extract-media={pathname}/images',
                          '--wrap=none'
                      ],
                      encoding='utf-8',
                      outputfile=pathname + '/' + targetfilename,
                      filters=None,
                      verify_format=True
                     )

# Open the output folder with the macOS `open` command. The argument list is
# passed without a shell, so folder names containing quotes or other shell
# metacharacters are handled safely. As before, a failure to open the folder
# is not treated as an error.
subprocess.run(['open', pathname], check=False)

# brew install vscode
# which code -> /opt/homebrew/bin/code
# os.system(f"/opt/homebrew/bin/code '{pathname}'")

### 4. 为 pandoc 转换的 html 添加 header、footer 以及 css：

In [7]:
# Markup written before the pandoc-generated HTML; it links ./style.css.
htmlheader = """
<html>
  <head>
    <meta http-equiv="Content-Type" content="text/html;charset=utf-8" />
    <link href="./style.css" rel="stylesheet" type="text/css" />
    <title></title>
  </head>  
<body>
"""

# Markup written after the pandoc-generated HTML.
htmlfooter = """
</body>
</html>
"""


# Stylesheet saved as style.css next to the HTML file.
# The `.english` / `.chinese` classes are the ones the translation step adds
# to source and translated elements.
# Fixes: `10 px` -> `10px` (a space inside a CSS length makes the whole
# border declaration invalid) and the missing `;` after the link colour.
stylecss = """
body {
  width: 90%;
  margin: 1em auto;
  font-family:"Kaiti SC", Georgia, 'Times New Roman', Times, serif !important;
  font-size: 20px;
  color: #35453F;
  background-color: #C4E7CD;
}

img{
  /* width: 90% !important; */
  text-align: center !important;
  border: #35453F solid 10px;
  padding: 10px;
}

img.inline {
  height: 1em;
  width:auto !important;
  margin-bottom: -0.6em !important;
}

p {
  margin-bottom: 1em !important;
  font-size: 20px !important;
}

h1.english, h2.english, p.english {
  /* display: none; */
}

h1.chinese, h2.chinese, p.chinese {
  letter-spacing: 0.1em;
}

sup {
  margin-right: 0.5em;
}

a {
  color: #232442;
}

strong {
  color: #151741; 
  font-size: 95%;
}

.chinese em, .chinese i {
  font-style: normal;
  font-weight: bold;
  color: #151741; 
  font-size: 95%;
}
"""

# Read the HTML that pandoc produced. pandoc was asked for UTF-8 output, so
# the encoding is stated explicitly instead of relying on the locale default.
html_path = pathname + '/' + targetfilename
with open(html_path, "r", encoding="utf-8") as file:
    html = file.read()

# Per the original note, the conversion writes `pathname` into the HTML
# (in the <img> tags); replace it with '.' so the paths become relative.
# A plain string replacement is used because `pathname` is literal text:
# treating it as a regular expression breaks on folder names containing
# characters such as '(', '+' or '.'.
html = html.replace(pathname, '.')

# Rewrite the file as header + pandoc HTML + footer.
with open(html_path, "w", encoding="utf-8") as file:
    file.write(htmlheader)
    file.write(html)
    file.write(htmlfooter)

# Save the stylesheet that the header links to.
with open(pathname + '/style.css', "w", encoding="utf-8") as file:
    file.write(stylecss)

### 5. 清理 html

提交 DeepL 翻译之前，有必要大致浏览一下 html 文件。

1. 看看有没有什么需要清理的地方（比如，多余的 span tag）
2. 看看需要翻译的都包括哪些 tags？（在下一个代码块中需要指定）

如有必要清理，可使用以下脚本：

In [8]:
# Optional: point `pathname` at another working folder before running, e.g.
# pathname  = "/Users/joker/Public/The future of everything/"
# (the original note asks for the folder name to end with "/")

# The clean-up pass rewrites the pandoc output in place, so the file that is
# read and the file that is written are the same path.
source_filename = f"{pathname}/{targetfilename}"
target_filename = f"{pathname}/{targetfilename}"

def remove_span_with_no_attribute(html):
    """Unwrap every ``<span>`` that has no attributes, keeping its contents.

    Args:
        html: HTML markup as a string.

    Returns:
        The markup re-serialised by Beautiful Soup with attribute-less
        ``<span>`` wrappers removed.
    """
    # Name the parser explicitly: without it Beautiful Soup picks whichever
    # parser happens to be installed (and warns), so the serialised output
    # could differ between machines.
    soup = BeautifulSoup(html, 'html.parser')
    for span in soup.find_all('span'):
        if not span.attrs:
            span.unwrap()
    return str(soup)

def remove_span_with_class_small(html):
    """Unwrap every ``<span class="small">``, keeping its contents.

    Args:
        html: HTML markup as a string.

    Returns:
        The markup re-serialised by Beautiful Soup with those spans removed.
    """
    # Name the parser explicitly: without it Beautiful Soup picks whichever
    # parser happens to be installed (and warns), so the serialised output
    # could differ between machines.
    soup = BeautifulSoup(html, 'html.parser')
    for span in soup.find_all('span', attrs={'class': 'small'}):
        span.unwrap()
    return str(soup)

def remove_br(html):
    """Delete each self-closing ``<br/>`` plus the newline(s) right after it.

    Only ``<br``, optional whitespace and ``/>`` followed by at least one
    ``\\n`` is removed; anything else is returned untouched.
    """
    br_then_newlines = re.compile(r'<br\s*/>\n+')
    return br_then_newlines.sub(r'', html)

def remove_tags(html, tags):
    """Unwrap every element whose tag name is listed in *tags*.

    Args:
        html: HTML markup as a string.
        tags: Iterable of tag names (e.g. ``['big']``). The tags themselves
            are removed; their contents stay in place.

    Returns:
        The markup re-serialised by Beautiful Soup.
    """
    # Name the parser explicitly: without it Beautiful Soup picks whichever
    # parser happens to be installed (and warns), so the serialised output
    # could differ between machines.
    soup = BeautifulSoup(html, 'html.parser')
    for tag in tags:
        for each_tag in soup.find_all(tag):
            each_tag.unwrap()
    return str(soup)
        

# Load the HTML, run the clean-up passes in order, then write the result.
# The encoding is explicit so that reading and writing do not depend on the
# locale default (the rest of this file writes UTF-8).
with open(source_filename, "r", encoding="utf-8") as file:
    html = file.read()

html = remove_span_with_no_attribute(html)
html = remove_span_with_class_small(html)
html = remove_br(html)
html = remove_tags(html, ['big'])

with open(target_filename, "w", encoding="utf-8") as file:
    file.write(html)


### 6. 提交 DeepL 翻译

以下脚本中，line 170-189，需要在执行前指定一些参数，参见以下代码中的备注部分：

```python
# tags_to_be_translated = ['p', 'h1', 'h2', 'h3', 'h4']
tags_to_be_translated = ['p', 'li', 'td', 'h1', 'h2', 'h3', 'h4']
tags_tbt = '|'.join(tags_to_be_translated) # 为了以后在 regular expression 中使用

# pathname  = "/Users/joker/Public/The future of everything/" # 文件夹名称末尾得有 / 

source_filename = "index.html"  # 用 pandoc 转换生成的文件，成为这一步的 “源文件”
                        
target_filename = "index-en-zhcn.html"

lines = open(pathname + '/' + source_filename, "r").readlines()

new_lines = []
line_count = 0
# 指定从哪一行开始翻译
startline = 313
# 指定到哪一行停止翻译
endline = 4746
# 是否是重新尝试
retry = 0
```

In [None]:
import re
import requests
from bs4 import BeautifulSoup

# Helper functions used by the translation step

def translate(text):
    """Translate *text* with the DeepL API and return the translated string.

    Reads the module-level ``auth_key`` and ``target_language``.

    Args:
        text: The markup/text to translate. ``tag_handling="xml"`` asks
            DeepL to treat HTML tags as markup instead of translating them.

    Returns:
        The ``text`` field of the first entry in the response's
        ``translations`` list.

    Raises:
        requests.RequestException: On connection errors, timeouts, or a
            non-2xx HTTP status.
    """
    # POST with a form body rather than GET: whole paragraphs are sent, and
    # a query string is limited in length and leaks the key into URLs/logs.
    # The key goes in the Authorization header, as DeepL documents.
    response = requests.post(
        "https://api.deepl.com/v2/translate",
        headers={"Authorization": f"DeepL-Auth-Key {auth_key}"},
        data={
            "target_lang": target_language,
            "text": text,
            "tag_handling": "xml",
        },
        timeout=60,  # never hang forever on a stalled connection
    )
    # Surface HTTP errors (bad key, quota, ...) as such, instead of as an
    # opaque KeyError/JSON error on the line below.
    response.raise_for_status()
    return response.json()["translations"][0]["text"]

def add_language_tag(html, tag, classname):
    """Append *classname* to the ``class`` of every *tag* element in *html*.

    Args:
        html: An HTML fragment (the caller passes a single line).
        tag: Tag name to look for, e.g. ``'p'``.
        classname: Class to add, e.g. ``'english'`` or ``'chinese'``.

    Returns:
        The fragment re-serialised by Beautiful Soup; existing classes are
        kept and *classname* is appended.
    """
    # The parser must be named here: this function is fed fragments, and if
    # Beautiful Soup is left to guess it may pick a parser (lxml, html5lib)
    # that wraps every fragment in <html><body>...</body></html>.
    soup = BeautifulSoup(html, 'html.parser')
    for the_tag in soup.find_all(tag):
        the_tag['class'] = the_tag.get('class', []) + [classname]
    return str(soup)

def write_into_file(filename, text):
    """Append *text* to *filename* as UTF-8, preceded by a newline."""
    payload = "\n" + text
    with open(filename, mode='a', encoding='utf-8') as out:
        out.write(payload)

# Ordered (pattern, replacement) pairs applied by zh_format().
# The order is significant: later rules work on the output of earlier ones
# (e.g. the quote rules first damage HTML attribute quotes, and the two
# rules right after them repair that damage).
_ZH_FORMAT_RULES = (
    # Straight double quotes -> curly double quotes.
    (r'\s*"(.*?)\s*"', r'“\1”'),
    # Straight single quotes -> curly single quotes.
    (r"\s*'(.*?)\s*'", r'‘\1’'),
    # Restore straight double quotes around HTML attribute values.
    (r'=[“”"](.*?)[“”"]', r'="\1"'),
    # Restore straight single quotes around HTML attribute values.
    (r"=[‘’'](.*?)[‘’']", r"='\1'"),
    # Space between a CJK character and a following opening curly quote.
    (r'([\u4e00-\u9fa5])([“‘])', r'\1 \2'),
    # Space between a closing curly quote and a following CJK character
    # (punctuation lies outside \u4e00-\u9fa5, so it is not affected).
    (r'([’”])([\u4e00-\u9fa5])', r'\1 \2'),
    # Opening <i ...> / <em ...> -> <strong ...>. The lookahead requires the
    # tag name to end right there, so <img>, <input>, <embed>, ... are NOT
    # rewritten (the previous pattern turned "<img" into "<strongmg").
    (r'<(?:i|em)(?=[\s/>])', r'<strong'),
    # Closing </i> / </em> -> </strong>.
    (r'(/i>|/em>)', r'/strong>'),
    # Move opening quotes/brackets that directly follow an opening
    # <strong ...> tag (one with attributes) in front of the tag.
    (r'<strong (.*?)>([《（“‘]+)', r'\2<strong \1>'),
    # Move closing quotes/brackets/punctuation that directly precede
    # </strong> behind the tag.
    (r'([》）”’。，]+)</strong>', r'</strong>\1'),
    # Ellipsis: runs of ". " (optionally mixed with 。) -> "…… ".
    (r'(\. )+\s*。*\s*|。\s*(\. )+', r'…… '),
    # Dashes: &mdash; (also with a full-width semicolon) and "--".
    (r'&mdash；|&mdash;|--', r' —— '),
    # Hyphen between two CJK characters -> middle dot (used in names).
    # Applied three times because a match consumes the character on its
    # right, so adjacent hyphens need further passes.
    (r'([\u4e00-\u9fa5])-([\u4e00-\u9fa5])', r'\1·\2'),
    (r'([\u4e00-\u9fa5])-([\u4e00-\u9fa5])', r'\1·\2'),
    (r'([\u4e00-\u9fa5])-([\u4e00-\u9fa5])', r'\1·\2'),
    # Same, with at most one arbitrary character (e.g. a Latin initial)
    # between the two hyphens.
    (r'([\u4e00-\u9fa5])-(.?)-([\u4e00-\u9fa5])', r'\1·\2·\3'),
    # Full-width percent sign -> ASCII percent sign.
    (r'％', r'%'),
    # Space between a CJK character and a following digit.
    (r'([\u4e00-\u9fa5])(\d)', r'\1 \2'),
    # Space between a digit or % and a following CJK character.
    (r'([\d%])([\u4e00-\u9fa5])', r'\1 \2'),
    # Space between a CJK character and a following Latin letter.
    (r'([\u4e00-\u9fa5])([a-zA-Z])', r'\1 \2'),
    # Space between a Latin letter and a following CJK character.
    (r'([a-zA-Z])([\u4e00-\u9fa5])', r'\1 \2'),
    # Move a full-width comma that precedes a closing curly quote behind it.
    (r'，([”’])', r'\1，'),
    # Drop a single space that follows Chinese punctuation.
    (r'([，。！？》〉】]) ', r'\1'),
    # Space between an ASCII full stop and a following CJK character.
    (r'\.([\u4e00-\u9fa5])', r'. \1'),
    # ASCII "(" (with any whitespace before it) -> full-width "（".
    (r'\s*\(', r'（'),
    # ASCII ")" (with any whitespace after it) -> full-width "）".
    (r'\)\s*', r'）'),
    # Redundant bracket "）。）" (noted as common in DeepL output) -> "。）".
    (r'）。）', r'。）'),
)


def zh_format(html):
    """Normalise the punctuation and spacing of translated Chinese HTML.

    Applies the regular-expression rules in ``_ZH_FORMAT_RULES`` one after
    another: curly quotes, ``<i>``/``<em>`` -> ``<strong>``, ellipses and
    dashes, middle dots in names, spacing between CJK text and
    digits/Latin letters, and full-width brackets.

    Args:
        html: One piece of translated HTML markup.

    Returns:
        The reformatted markup.
    """
    for pattern, replacement in _ZH_FORMAT_RULES:
        html = re.sub(pattern, replacement, html)
    return html

# Settings for the translation run

auth_key = "<your deepl API auth_key>"  # note: the subscription needed is DeepL API Pro
target_language = "ZH"  # can be set to any target language DeepL supports

# tags_to_be_translated = ['p', 'h1', 'h2', 'h3', 'h4']
tags_to_be_translated = ['p', 'li', 'td', 'h1', 'h2', 'h3', 'h4']
tags_tbt = '|'.join(tags_to_be_translated)  # alternation string, for later use in regular expressions

# pathname  = "/Users/joker/Public/The future of everything/"  # original note: the folder name must end with /

source_filename = "index.html"  # the pandoc-generated file is the source file of this step

target_filename = "index-en-zhcn.html"

# Read all source lines up front. The context manager closes the file
# handle, and the encoding is explicit because the file is UTF-8.
with open(pathname + '/' + source_filename, "r", encoding="utf-8") as source_file:
    lines = source_file.readlines()

new_lines = []
line_count = 0
# First line (1-based) to translate
startline = 313
# Last line (1-based) to translate
endline = 4746
# Whether this run is a retry
retry = 0

# Process the source file line by line
import time  # used only for the pause between failed translation attempts

for line_count, line in enumerate(lines, start=1):

    print(line_count)

    # Lines outside [startline, endline] are copied through untranslated.
    # On a retry run they are not written to the target file again.
    if (line_count < startline) or (line_count > endline):
        new_lines.append(line)
        print(line)
        if not retry:
            write_into_file(pathname + '/' + target_filename, line)
        continue

    # Blank lines are only recorded in new_lines, not written to the file
    # (write_into_file already starts every entry with a newline).
    if line.strip() == '':
        new_lines.append(line)
        continue

    # NOTE(review): with a truthy `retry`, lines inside the range are neither
    # translated nor written. That is the pre-existing behaviour and is kept
    # unchanged here; confirm whether a retry is meant to resume translating.
    if retry:
        continue

    # Every line inside the range is handled, including line `startline`
    # itself (it used to be dropped by a strict `>` comparison).
    tags = [tag.name for tag in BeautifulSoup(line, 'html.parser').find_all()]
    matching_tags = [tag for tag in tags if tag in tags_to_be_translated]

    # Nothing to translate: copy the line through. This also covers lines
    # without any opening tag (e.g. a lone closing tag), which were
    # previously lost from the output.
    if not matching_tags:
        new_lines.append(line)
        print(line)
        write_into_file(pathname + '/' + target_filename, line)
        continue

    # As before, the last matching tag on the line receives the class names.
    translating_tag = matching_tags[-1]

    # Normalise the markup once, before the retry loop.
    line = str(BeautifulSoup(line, 'html.parser'))

    # Keep retrying so that a DeepL connection error does not abort the
    # whole run. Only Exception is caught, so Ctrl-C still stops the loop.
    while True:
        try:
            line_translated = translate(line)
        except Exception as exc:
            print(f"translation of line {line_count} failed: {exc!r}; retrying")
            time.sleep(1)
        else:
            # Collapse the result into a single line (no embedded \n).
            line_translated = line_translated.replace("\n", "")
            break

    line = add_language_tag(line, translating_tag, 'english')
    line_translated = add_language_tag(zh_format(line_translated), translating_tag, 'chinese')

    new_lines.append(line)
    print(line)
    write_into_file(pathname + '/' + target_filename, line)

    new_lines.append(line_translated)
    print(line_translated)
    write_into_file(pathname + '/' + target_filename, line_translated + '\n')


print ('finished!')


### 7. 对已翻译段落做批处理

有时需要对已经 “中英交错排版的 html” 中的 “中文段落” 进行批处理。可用以下脚本。

因为已翻译段落，被加上了 `class="chinese"`，所以可以用它作为判断条件（`if '="chinese"' in line:`）。

if 块中的正则表达式可按需求修改：

In [None]:
import re
import requests
from bs4 import BeautifulSoup

# The bilingual file is rewritten in place: all lines are read first, then
# the same file is opened for writing.
source_filename = "index-en-zhcn.html"

target_filename = "index-en-zhcn.html"


with open(pathname + '/' + source_filename, "r", encoding="utf-8") as file:
    lines = file.readlines()


with open(pathname + '/' + target_filename, "w", encoding="utf-8") as file:

    for line in lines:

        # Only translated lines (tagged with class="chinese") are modified.
        if '="chinese"' in line:

            # Opening <i ...> / <em ...> -> <strong ...>. The lookahead
            # requires the tag name to end right there, so <img>, <input>,
            # <embed>, ... are left alone.
            line = re.sub(r'<(?:i|em)(?=[\s/>])', r'<strong', line)

            # Closing </i> / </em> -> </strong>.
            line = re.sub(r'(/i>|/em>)', r'/strong>', line)

        file.write(line)

### 8. 转换 html 为 epub

使用 Calibre 更佳
