This notebook scrapes the CUVS Bible text (和合本圣经，简体版) from `cus.ibibles.net` and writes one YAML file per book.

In [2]:
import requests
from bs4 import BeautifulSoup
import re 
from typing import List 
from bible_study_bot.data.definitions import BibleVerse, BibleVerseMetadata, BibleBook, BibleBookMetadata
import yaml 

In [3]:
def is_verse_line(txt: str) -> bool:
    """Tell whether a line begins with a ``chapter:verse`` reference.

    Args:
        txt: A single line of text.

    Returns:
        True when the line starts with digits, a colon, and more digits
        (e.g. ``"3:16 ..."``); False otherwise.
    """
    # re.match anchors at the start of the string, so a reference that
    # appears later in the line does not count.
    return re.match(r"^\d+:\d+", txt) is not None
    
def scrape_raw_text_from_url(url: str, timeout: float = 30.0) -> str:
    """Download a web page and return its visible text with all markup removed.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The plain text of the page as extracted by BeautifulSoup.

    Raises:
        requests.exceptions.HTTPError: The server answered with a 4xx/5xx status.
        requests.exceptions.Timeout: The server did not respond within ``timeout``.
        UnicodeDecodeError: The response body is not valid UTF-8.
    """
    # requests has no default timeout; without one a stalled connection
    # would block the whole scraping run indefinitely.
    response = requests.get(url, timeout=timeout)

    # Fail loudly on an error response. Otherwise the error page would be
    # parsed like a book and silently produce a file with no verses.
    response.raise_for_status()

    # The body is decoded explicitly as UTF-8 rather than relying on the
    # encoding requests guesses from the response headers.
    html_content = response.content.decode("utf-8")

    # Parse the HTML and keep only the text nodes.
    soup = BeautifulSoup(html_content, "html.parser")
    return soup.text

def process_raw_text_to_verse_lines(raw_text: str) -> List[str]:
    """Extract the cleaned verse lines from the plain text of a book page.

    The text is split on newlines and each line is stripped of surrounding
    whitespace. Lines that do not start with a ``chapter:verse`` reference
    are dropped. In the remaining lines every ideographic space (U+3000) is
    removed and every "．" is replaced with "，".

    Args:
        raw_text: Plain text of a whole page.

    Returns:
        The verse lines, in their original order.
    """
    verse_lines = []
    for raw_line in raw_text.split("\n"):
        candidate = raw_line.strip()
        if not is_verse_line(candidate):
            # Headings, navigation text, blank lines, etc.
            continue
        candidate = candidate.replace("\u3000", "")
        candidate = candidate.replace("．", "，")
        verse_lines.append(candidate)
    return verse_lines

def process_verse_lines_to_bible_verses(book: str, verses: List[str]) -> List[BibleVerse]:
    """Convert ``chapter:verse text`` lines into ``BibleVerse`` records.

    A line that consists of a reference only (no text after it) is reported
    with a printed warning and skipped.

    Args:
        book: Book identifier; used only in the warning message.
        verses: Verse lines, each starting with a ``chapter:verse`` reference.

    Returns:
        One ``BibleVerse`` per non-empty verse line, in input order.

    Raises:
        AssertionError: A line is neither a bare reference nor a reference
            followed by whitespace and text.
    """
    bare_reference = re.compile(r"^(\d+):(\d+)$")
    reference_and_text = re.compile(r"^(\d+):(\d+)\s+(.*)$")

    parsed = []
    for line in verses:
        # A reference with nothing after it carries no verse text.
        if bare_reference.match(line):
            print(f"Warning: empty verse: {book}{line}")
            continue

        found = reference_and_text.match(line)
        assert found is not None, f"Bad verse line: {line}"

        chapter_no, verse_no, verse_text = found.groups()
        metadata = BibleVerseMetadata(chapter=int(chapter_no), verse=int(verse_no))
        parsed.append(BibleVerse(text=verse_text, metadata=metadata.model_dump()))
    return parsed

In [4]:
# Name of the translation; stored in the metadata of every book written out.
bible_version = "和合本(简)"

# Root of the site the book pages are downloaded from.
_SITE_ROOT = "https://cus.ibibles.net"

# (page stem on the site, book name stored in the metadata), in canonical
# order. Stems 001-039 are the Old Testament, 101-127 the New Testament.
_PAGE_STEMS_AND_BOOK_NAMES = [
    ("001Genesis", "genesis"),
    ("002Exodus", "exodus"),
    ("003Leviticus", "leviticus"),
    ("004Numbers", "numbers"),
    ("005Deuteronomy", "deuteronomy"),
    ("006Joshua", "joshua"),
    ("007Judges", "judges"),
    ("008Ruth", "ruth"),
    ("009Samuel1", "1samuel"),
    ("010Samuel2", "2samuel"),
    ("011Kings1", "1kings"),
    ("012Kings2", "2kings"),
    ("013Chronicles1", "1chronicles"),
    ("014Chronicles2", "2chronicles"),
    ("015Ezra", "ezra"),
    ("016Nehemiah", "nehemiah"),
    ("017Esther", "esther"),
    ("018Job", "job"),
    ("019Psalms", "psalms"),
    ("020Proverbs", "proverbs"),
    ("021Ecclesiastes", "ecclesiastes"),
    ("022Songs", "songs"),
    ("023Isaiah", "isaiah"),
    ("024Jeremiah", "jeremiah"),
    ("025Lamentations", "lamentations"),
    ("026Ezekiel", "ezekiel"),
    ("027Daniel", "daniel"),
    ("028Hosea", "hosea"),
    ("029Joel", "joel"),
    ("030Amos", "amos"),
    ("031Obadiah", "obadiah"),
    ("032Jonah", "jonah"),
    ("033Micah", "micah"),
    ("034Nahum", "nahum"),
    ("035Habakkuk", "habakkuk"),
    ("036Zephaniah", "zephaniah"),
    ("037Haggai", "haggai"),
    ("038Zechariah", "zechariah"),
    ("039Malachi", "malachi"),
    ("101Matthew", "matthew"),
    ("102Mark", "mark"),
    ("103Luke", "luke"),
    ("104John", "john"),
    ("105Acts", "acts"),
    ("106Romans", "romans"),
    ("107Corinthians1", "1corinthians"),
    ("108Corinthians2", "2corinthians"),
    ("109Galatians", "galatians"),
    ("110Ephesians", "ephesians"),
    ("111Philippians", "philippians"),
    ("112Colossians", "colossians"),
    ("113Thessalonians1", "1thessalonians"),
    ("114Thessalonians2", "2thessalonians"),
    ("115Timothy1", "1timothy"),
    ("116Timothy2", "2timothy"),
    ("117Titus", "titus"),
    ("118Philemon", "philemon"),
    ("119Hebrews", "hebrews"),
    ("120James", "james"),
    ("121Peter1", "1peter"),
    ("122Peter2", "2peter"),
    ("123John1", "1john"),
    ("124John2", "2john"),
    ("125John3", "3john"),
    ("126Jude", "jude"),
    ("127Revelation", "revelation"),
]

# (output file stem, book name, source URL) for every book. The output stem
# is the lowercased page stem, and the URL is the site root plus "<stem>.htm".
bible_books = [
    (page_stem.lower(), book_name, f"{_SITE_ROOT}/{page_stem}.htm")
    for page_stem, book_name in _PAGE_STEMS_AND_BOOK_NAMES
]

In [6]:
for file_stem, book_name, url in bible_books:
    # Download the page and turn it into verse records.
    raw_text = scrape_raw_text_from_url(url)
    verse_lines = process_raw_text_to_verse_lines(raw_text)
    bible_verses = process_verse_lines_to_bible_verses(file_stem, verse_lines)

    # Attach the book-level metadata to the verses.
    book_metadata = BibleBookMetadata(bible_version=bible_version, book=book_name)
    bible_book = BibleBook(metadata=book_metadata.model_dump(), verses=bible_verses)

    # One YAML file per book. allow_unicode keeps the Chinese text readable
    # in the file instead of writing it as escape sequences.
    output_path = f"../../data/bible_versions/cuvs/{file_stem}.yaml"
    with open(output_path, "w", encoding="utf-8") as out_file:
        yaml.dump(bible_book.model_dump(), out_file, indent=2, allow_unicode=True)

