This notebook scrapes the CUVS (和合本圣经，简体版) Bible text from `cus.ibibles.net` and writes one YAML file per book.

In [1]:
import requests
from bs4 import BeautifulSoup
import re 
from typing import List 
from definitions import BibleVerse, BibleBook
import yaml 

In [2]:
def is_verse_line(txt: str) -> bool:
    """Tell whether a line begins with a "<digits>:<digits>" verse reference.

    Only the start of the string is inspected; anything may follow the
    reference, and leading whitespace makes the line a non-match.
    """
    return re.match(r"^\d+:\d+", txt) is not None
    
def scrape_raw_text_from_url(url: str, timeout: float = 30.0) -> str:
    """Download a web page and return its plain text content.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the notebook forever.

    Returns:
        The text of the parsed HTML document with all markup removed.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection problems or timeouts.
        UnicodeDecodeError: If the response body is not valid UTF-8.
    """
    # download the raw web content
    response = requests.get(url, timeout=timeout)

    # Fail loudly on an error status; otherwise the error page would be
    # parsed as if it were a book and produce an (almost) empty result.
    response.raise_for_status()

    # The body is decoded explicitly as UTF-8 rather than relying on the
    # encoding guessed from the response headers.
    html_content = response.content.decode("utf-8")

    # parse the HTML and keep only the plain text (raw)
    soup = BeautifulSoup(html_content, "html.parser")
    return soup.text

def process_raw_text_to_verse_lines(raw_text: str) -> List[str]:
    """Extract the cleaned verse lines from the raw text of a page.

    The text is split on newlines and each line is stripped of surrounding
    whitespace. Only lines recognised by ``is_verse_line`` are kept. In the
    kept lines every ideographic space ("\\u3000") is removed and every "．"
    is replaced with "，".

    Args:
        raw_text: Plain text of a scraped page.

    Returns:
        The cleaned verse lines, in their original order.
    """
    stripped_lines = (line.strip() for line in raw_text.split("\n"))
    return [
        line.replace("\u3000", "").replace("．", "，")
        for line in stripped_lines
        if is_verse_line(line)
    ]

def process_verse_lines_to_bible_verses(book: str, verses: List[str]) -> List[BibleVerse]:
    """Turn "<chapter>:<verse> <text>" lines into BibleVerse objects.

    A line that consists only of a "<chapter>:<verse>" reference is treated
    as an empty verse: a warning is printed and the line is skipped.

    Args:
        book: Label of the book; used only in the empty-verse warning.
        verses: Verse lines to convert.

    Returns:
        One BibleVerse per non-empty verse line, in input order. Each carries
        the verse text and integer "chapter" and "verse" metadata.

    Raises:
        AssertionError: If a line is neither an empty verse nor a reference
            followed by whitespace and text.
    """
    parsed_verses = []
    for v in verses:
        # A bare reference with nothing after it carries no text to store.
        if re.match(r"^(\d+):(\d+)$", v) is not None:
            print(f"Warning: empty verse: {book}{v}")
            continue

        verse_match = re.match(r"^(\d+):(\d+)\s+(.*)$", v)
        assert(verse_match is not None), f"Bad verse line: {v}"

        chapter, verse, text = verse_match.groups()
        parsed_verses.append(BibleVerse(
            text=text,
            metadata={"chapter": int(chapter), "verse": int(verse)},
        ))
    return parsed_verses

In [5]:
# Metadata attached to every scraped book (the same dict object is shared).
bible_metadata = {"version": "和合本(简)"}

# Site that hosts one "<page stem>.htm" page per book.
BASE_URL = "https://cus.ibibles.net"

# (page stem on the site, Chinese book name), in canonical order.
# The output key and the URL of each book are both derived from the page
# stem below, so they cannot drift apart.
# NOTE(review): two data fixes versus the previous hand-written table:
#   - the Proverbs key was misspelled "020proberbs" and is now "020proverbs",
#     so its output file name changes accordingly;
#   - Genesis was titled "创世纪" and is now "创世记".
_book_pages = [
    # Old Testament
    ("001Genesis", "创世记"),
    ("002Exodus", "出埃及记"),
    ("003Leviticus", "利未记"),
    ("004Numbers", "民数记"),
    ("005Deuteronomy", "申命记"),
    ("006Joshua", "约书亚记"),
    ("007Judges", "士师记"),
    ("008Ruth", "路得记"),
    ("009Samuel1", "撒母耳记上"),
    ("010Samuel2", "撒母耳记下"),
    ("011Kings1", "列王记上"),
    ("012Kings2", "列王记下"),
    ("013Chronicles1", "历代志上"),
    ("014Chronicles2", "历代志下"),
    ("015Ezra", "以斯拉记"),
    ("016Nehemiah", "尼希米记"),
    ("017Esther", "以斯帖记"),
    ("018Job", "约伯记"),
    ("019Psalms", "诗篇"),
    ("020Proverbs", "箴言"),
    ("021Ecclesiastes", "传道书"),
    ("022Songs", "雅歌"),
    ("023Isaiah", "以赛亚书"),
    ("024Jeremiah", "耶利米书"),
    ("025Lamentations", "耶利米哀歌"),
    ("026Ezekiel", "以西结书"),
    ("027Daniel", "但以理书"),
    ("028Hosea", "何西阿书"),
    ("029Joel", "约珥书"),
    ("030Amos", "阿摩司书"),
    ("031Obadiah", "俄巴底亚书"),
    ("032Jonah", "约拿书"),
    ("033Micah", "弥迦书"),
    ("034Nahum", "那鸿书"),
    ("035Habakkuk", "哈巴谷书"),
    ("036Zephaniah", "西番雅书"),
    ("037Haggai", "哈该书"),
    ("038Zechariah", "撒迦利亚书"),
    ("039Malachi", "玛拉基书"),
    # New Testament
    ("101Matthew", "马太福音"),
    ("102Mark", "马可福音"),
    ("103Luke", "路加福音"),
    ("104John", "约翰福音"),
    ("105Acts", "使徒行传"),
    ("106Romans", "罗马书"),
    ("107Corinthians1", "哥林多前书"),
    ("108Corinthians2", "哥林多后书"),
    ("109Galatians", "加拉太书"),
    ("110Ephesians", "以弗所书"),
    ("111Philippians", "腓立比书"),
    ("112Colossians", "歌罗西书"),
    ("113Thessalonians1", "帖撒罗尼迦前书"),
    ("114Thessalonians2", "帖撒罗尼迦后书"),
    ("115Timothy1", "提摩太前书"),
    ("116Timothy2", "提摩太后书"),
    ("117Titus", "提多书"),
    ("118Philemon", "腓利门书"),
    ("119Hebrews", "希伯来书"),
    ("120James", "雅各书"),
    ("121Peter1", "彼得前书"),
    ("122Peter2", "彼得后书"),
    ("123John1", "约翰一书"),
    ("124John2", "约翰二书"),
    ("125John3", "约翰三书"),
    ("126Jude", "犹大书"),
    ("127Revelation", "启示录"),
]

# One (output key, Chinese name, metadata, page URL) tuple per book.
# The output key is the lower-cased page stem and is used as the YAML file name.
bible_books = [
    (page_stem.lower(), book_name, bible_metadata, f"{BASE_URL}/{page_stem}.htm")
    for page_stem, book_name in _book_pages
]

In [6]:
for book_id, book_name, book_metadata, book_url in bible_books:
    # Download the page and convert its text into structured verses.
    # The book id is passed along only to label empty-verse warnings.
    bible_verses = process_verse_lines_to_bible_verses(
        book_id,
        process_raw_text_to_verse_lines(scrape_raw_text_from_url(book_url)),
    )

    # Assemble the book record.
    bible_book = BibleBook(name=book_name, metadata=book_metadata, verses=bible_verses)

    # Write one YAML file per book, named after the book id;
    # allow_unicode keeps the Chinese text readable instead of escaped.
    output_path = f"../../data/bible_versions/cuvs/{book_id}.yaml"
    with open(output_path, "w", encoding="utf-8") as f:
        yaml.dump(bible_book.model_dump(), f, indent=2, allow_unicode=True)

