In [25]:

from bs4 import BeautifulSoup
from urllib.request import Request, urlopen
import re
from pathlib import Path
import json


class WebScraper:
    """Download one HTML page and export its numbered text as a JSON file.

    Typical flow: construct the object (this fetches and parses the page),
    remove unwanted ``div`` elements with :meth:`clean_div` and
    :meth:`clean_subdiv`, reduce the page to plain text with
    :meth:`clean_text`, then write the result with :meth:`save_json`.

    Attributes:
        url: Address of the page to download.
        folder: Directory the JSON file is written into.
        soup: Parsed document; the ``clean_*`` methods modify it in place.
        chapter: Text of the page's ``<title>`` with spaces replaced by
            underscores; used as the JSON file stem.
        raw: Plain text produced by :meth:`clean_text`, or ``None`` until
            that method has run.
    """

    def __init__(self, url: str, folder: str):
        """Fetch and parse ``url`` and remember ``folder`` for the output.

        Raises:
            AttributeError: If the parsed page has no ``<title>`` element
                (the ``find`` result then has no ``.text``).
        """
        self.url = url
        self.folder = folder
        # Defined up front so save_json can tell "not cleaned yet" apart
        # from a missing attribute.
        self.raw = None
        self.soup = self.scrape()
        self.chapter = self.soup.find('title').text.replace(" ", "_")

    def scrape(self):
        """Download ``self.url`` and return it parsed with the lxml parser."""
        req = Request(self.url, headers={'User-Agent': 'Mozilla/5.0'})
        # Context manager closes the HTTP response even if read() fails.
        with urlopen(req) as response:
            url_content = response.read()
        return BeautifulSoup(url_content, "lxml")

    def clean_div(self, *attribs):
        """Remove every ``div`` whose class matches one of ``attribs``.

        Args:
            *attribs: Class values, each handed unchanged to
                ``find_all('div', class_=...)``.
        """
        for attrib in attribs:
            div_tags = self.soup.find_all('div', class_=attrib)
            for div in div_tags:
                div.extract()

    def clean_subdiv(self, div, att, sub_att):
        """Remove one nested element from the first matching parent.

        Args:
            div: Tag name used to look up both the parent and the child.
            att: Class of the parent element.
            sub_att: Class of the child element to remove.

        Only the first parent and, inside it, the first matching child are
        considered; nothing happens when either lookup finds no element.
        """
        parent_div = self.soup.find(div, class_=att)
        if not parent_div:
            return

        subdivision_div = parent_div.find(div, class_=sub_att)
        if subdivision_div:
            subdivision_div.extract()

    def clean_text(self):
        """Store the page's text, trimmed to its main body, in ``self.raw``.

        Keeps what precedes the first '©', then what follows the last '>'
        in that remainder, and strips surrounding whitespace.
        """
        self.raw = (self.soup.text
                    .split('©')[0]
                    .split('>')[-1]
                    .strip()
                    )

    def save_json(self):
        """Write ``self.raw`` as ``{number: text}`` to ``<folder>/<chapter>.json``.

        If :meth:`clean_text` has not been run yet it is run first, so this
        method no longer fails with a bare ``AttributeError`` on ``raw``.
        The folder is created when missing and the file is UTF-8 with
        non-ASCII characters written unescaped.
        """
        # getattr also covers instances on which ``raw`` was never assigned.
        if getattr(self, 'raw', None) is None:
            self.clean_text()

        # Create the folder if it doesn't exist
        folder_path = Path(self.folder)
        folder_path.mkdir(parents=True, exist_ok=True)

        # Construct the file path
        json_path = folder_path / f'{self.chapter}.json'

        # Each entry is a run of digits, whitespace, then text up to the
        # next digit (any digit) or the end of the string. A number that
        # occurs more than once keeps only its last text.
        data = {}
        for number, text in re.findall(r'(\d+)\s+([\s\S]*?)(?=\d|$)', self.raw):
            data[number] = text.strip()

        with open(json_path, 'w', encoding='utf-8') as f:
            json.dump(data, f, ensure_ascii=False)

        print("JSON data saved to:", json_path)

        
        
# Example usage: constructing the scraper downloads and parses the page
# immediately, so this cell needs network access.
url = "https://scriptureearth.org/data/bom/sab/bom/bom-16-NEH-013.html"
scraper = WebScraper(url, 'berom')  # 'berom' is the folder the JSON is written to
txt = scraper.soup  # parsed document; later clean_* calls modify this same object
chpt_name = scraper.chapter  # page title with spaces replaced by underscores

In [26]:
# Remove unwanted divs, reduce the page to plain text, then export it.
attribs = ('ms', 'mt', 'sp')  # div classes to drop from the page
# clean_div takes the class names as separate positional arguments, so the
# tuple is unpacked rather than passed as a single value.
scraper.clean_div(*attribs)
scraper.clean_subdiv('div', 'm', 'c-drop')
scraper.clean_text()
scraper.save_json()
print(scraper.raw)

JSON data saved to: berom/Nɛhɛmiya_13.json
Ba kang yére dyuk detu

 1 E jeng o bá sè basa a BeIsrayɛl Bwok-basâ Mose na gbɔng fwɔ, bá wul kwɔn de ba jɛk ɛ wɔgɔ, be kwon tik WoAmɔn kɛ WoMowap hwak a wuna hɛ na bemât Dagwi wɛt. 2 Ba jɛk ano, yaga BeAmɔn na BeMowap bá kana nɔ̂ng BeIsrayɛl pyɛ̂ re kɛ nshî sɔ wɛt, jeng de yɛn á sè kyè yi e vwêl Misra. Ko BeMowap bá nɔ̂ng Balaam, dyām yaga na a mok a BeIsrayɛl gawey. Ko Dagwi mot á tɛ̀ gawey o ha tɛ́ ɛ a yɛn nerat. 3 Jeng de BeIsrayɛl bá wok bása mo, ba vɛ̌ kang mwa bemât yére dyuk yi na yɛn.
Pyɛ́ ryat de Nɛhɛmiya a ra
4 Ɛliyaship, piris de a vɔk reto e ra begban Gbɔng Duk Re Fongol, a ga ros a sè sɛ̀ pang na Tobiya, 5 ko a simo a yɛ na a ra fwom na duk rwey o hwak de ba tik wɛ yaga tikîs nɔrɔ̂ pyɛ̂ yeneng, na pyɛ̂ nesang hyɔɔlɔl, na pyɛ́ detidetu de be ra mɛ fwom e Gbɔng Duk o, na pyɛ̂ nɔrɔ de be nɔngɔs yaga bepiris tanga pyɔlɔ̂ pyɛ̂ yeneng, na yê nshî anap, na yê něy pwat de ba nɔngɔ̂s BeLevi, na bemât lwɛlɛ tanga vê gyêng ri Gbɔng Duk o. 