In [80]:

from bs4 import BeautifulSoup
from urllib.request import Request, urlopen
import re
from pathlib import Path
import json


class WebScraper:
    """Fetch one HTML page and export its number-prefixed text as JSON.

    Typical call order: construct (downloads and parses the page), remove
    unwanted ``<div>`` elements with :meth:`clean_div` /
    :meth:`clean_subdiv`, then :meth:`clean_text` and :meth:`save_json`.

    Attributes:
        url: Address of the page that was fetched.
        folder: Directory the JSON file is written into.
        soup: Parsed document returned by :meth:`scrape`.
        chapter: Page title with spaces replaced by underscores; used as the
            JSON file stem.
        raw: Text produced by :meth:`clean_text` (``None`` until it runs).
    """

    # A segment is "<digits><whitespace><text>"; the text runs up to the next
    # "<digits><whitespace>" marker or the end of the input.  Requiring the
    # whitespace in the lookahead keeps digits glued to other characters
    # (e.g. "4x") inside the text instead of truncating the segment there.
    _SEGMENT_PATTERN = re.compile(r'(\d+)\s+([\s\S]*?)(?=\d+\s|$)')

    # Chapter name used when the page has no <title> element.
    _DEFAULT_CHAPTER = 'untitled'

    def __init__(self, url: str, folder: str):
        """Download ``url`` immediately and derive the chapter name.

        Args:
            url: Page to download.
            folder: Output directory for :meth:`save_json`.
        """
        self.url = url
        self.folder = folder
        self.soup = self.scrape()
        self.chapter = self._chapter_name(self.soup)
        self.raw = None

    @staticmethod
    def _chapter_name(soup) -> str:
        """Return the page title with spaces turned into underscores.

        Falls back to ``_DEFAULT_CHAPTER`` when there is no ``<title>`` tag
        instead of failing on ``None.text``.
        """
        title_tag = soup.find('title')
        if title_tag is None:
            return WebScraper._DEFAULT_CHAPTER
        return title_tag.text.replace(" ", "_")

    def scrape(self):
        """Request ``self.url`` and return it parsed with the lxml parser.

        A browser-like ``User-Agent`` header is sent with the request.
        """
        req = Request(self.url, headers={'User-Agent': 'Mozilla/5.0'})
        # Context manager guarantees the HTTP response is closed.
        with urlopen(req) as response:
            url_content = response.read()
        return BeautifulSoup(url_content, "lxml")

    def clean_div(self, *attribs):
        """Remove every ``<div>`` whose class matches one of ``attribs``.

        Class names may be given as separate arguments
        (``clean_div('ms', 'mt')``) or as a single list/tuple/set
        (``clean_div(('ms', 'mt'))``); both forms remove the same elements.
        """
        class_names = []
        for attrib in attribs:
            if isinstance(attrib, (list, tuple, set, frozenset)):
                class_names.extend(attrib)
            else:
                class_names.append(attrib)

        for class_name in class_names:
            for div in self.soup.find_all('div', class_=class_name):
                div.extract()

    def clean_subdiv(self, div, att, sub_att):
        """Remove the first ``sub_att`` element nested in the first ``att`` one.

        Args:
            div: Tag name used for both the parent and the nested lookup.
            att: Class of the parent element.
            sub_att: Class of the nested element to remove.

        Does nothing when either element is not found.
        """
        parent_div = self.soup.find(div, class_=att)
        if not parent_div:
            return

        subdivision_div = parent_div.find(div, class_=sub_att)
        if subdivision_div:
            subdivision_div.extract()

    def clean_text(self):
        """Store the trimmed body text of the page in ``self.raw``.

        Keeps what precedes the first ``©``, then what follows the last
        ``>`` of that part, and strips surrounding whitespace.
        """
        self.raw = (self.soup.text
                    .split('©')[0]
                    .split('>')[-1]
                    .strip()
                    )

    def save_json(self):
        """Write ``{number: text}`` pairs parsed from ``self.raw`` to JSON.

        The file is ``<folder>/<chapter>.json`` (UTF-8, non-ASCII kept
        as-is); the folder is created if needed.  If :meth:`clean_text` has
        not been run yet it is run first.  When a number occurs more than
        once, the last occurrence wins.
        """
        if getattr(self, 'raw', None) is None:
            self.clean_text()

        # Specify the filename
        json_name = f'{self.chapter}.json'

        # Create the folder if it doesn't exist
        folder_path = Path(self.folder)
        folder_path.mkdir(parents=True, exist_ok=True)

        # Construct the file path
        json_path = folder_path / json_name

        data = {
            number: text.strip()
            for number, text in self._SEGMENT_PATTERN.findall(self.raw)
        }

        with open(json_path, 'w', encoding='utf-8') as f:
            json.dump(data, f, ensure_ascii=False)

        print("JSON data saved to:", json_path)

        
        
# Example usage:
# Page to scrape; constructing WebScraper below downloads and parses it
# right away, so this cell needs network access.
url = "https://scriptureearth.org/data/bom/sab/bom/bom-22-SNG-001.html"
# 'berom' is the folder the JSON file will later be written into.
scraper = WebScraper(url, 'berom')
# Keep handles on the parsed document and the derived chapter name
# (page title with spaces replaced by underscores) for inspection.
txt = scraper.soup
chpt_name = scraper.chapter

In [81]:
# Classes of the <div> elements to drop from the page before extracting text.
attribs = ('ms', 'mt', 'sp')
# clean_div is declared as clean_div(*attribs), so unpack the tuple to pass
# each class name as its own positional argument.
scraper.clean_div(*attribs)
# Remove the 'c-drop' div nested inside the first 'm' div.
scraper.clean_subdiv('div', 'm', 'c-drop')
# Build scraper.raw from the remaining text, then write it out as JSON.
scraper.clean_text()
scraper.save_json()
print(scraper.raw)

JSON data saved to: berom/Lwɛlɛ_Sɔlɔmɔn_1.json
1 Lwɛlɛ̂ Sɔlɔmɔn de gwa dal lɛlɛ mwa na neta,


2 Ka hwà yang man ranang na me!
Yaga tɛ̂ yey mo ha dal na neta e ra nshî anap.
3 Něy mɔmɔ mo ne ro nesang hyɔɔlɔl,
wôk reza mo zɔng ha sé rat sede wok nesang něy.
Yaga ano de beha mwa be tɛ̀ yey na hwo.
4 Jut me, na wot a nára hot!
À gbɔng gwɔ̀m hom, yel na me e duk pɔlɔ mō,
tik na wot vɔk yey pyɛng na wot ra nzem tɛ̂ yey mo,
wot ê somo tɛ̂ yey mo hɛ dal nshî anap.
A sé tyɛng be tɛ̀ yey na hwo.
5 À behwong Yɛrusalɛm, ma sé nèros bes, ko kasâng-hwong,
nèros sede bûk hey e Kedar,
ko zɛrɛ sede berugû tɔrɔ̂ lɔ Gbɔng Gwɔ̀m Sɔlɔmɔn.
6 Yin lòlo bayis e ra me yaga ma sé nèros wɛt,
yaga e gwi gwɛ gwa tɛ̀ me ano.
Begwa hom ba vɔk shom na me,
ba tɛ̀ me mwât bɛrɛng bě anap,
ko ma bɛrɛng to hom wɛt, sede bwi de ba hala gwɛ.
7 À wò yey hom, ha a me,
kwɔn de hwó gyenges ɛ vyēl mo,
na kwɔn de hwó tik yɛ ye dyɛng ɛ e necam nagwi.
Ano mê kyě rɔ̀nɔs jáma mo e kara mó vè behak wɛt.

8 Ka hwà tɔk wɛt, à hwo de hw