In [118]:

from bs4 import BeautifulSoup
from urllib.request import Request, urlopen
import re
from pathlib import Path
import json


class WebScraper:
    """Download one HTML page and turn its numbered text into a JSON file.

    Typical flow: construct the object (this downloads and parses the page),
    remove unwanted markup with ``clean_div`` / ``clean_subdiv``, call
    ``clean_text`` to build ``self.raw``, then ``save_json`` to write
    ``<folder>/<chapter>.json``.

    Attributes:
        url: Address of the page that is fetched.
        folder: Directory the JSON file is written to.
        soup: Parsed document returned by ``scrape``.
        chapter: Page title with spaces replaced by underscores; used as the
            stem of the JSON file name.
        raw: Cleaned text produced by ``clean_text`` (``None`` until then).
    """

    def __init__(self, url: str, folder: str):
        """Fetch and parse ``url``; remember ``folder`` as the output directory."""
        self.url = url
        self.folder = folder
        # Defined up front so save_json can tell whether clean_text has run.
        self.raw = None
        self.soup = self.scrape()
        self.chapter = self._chapter_name()

    def _chapter_name(self):
        """Return the output file stem: the page title, else the URL's last segment."""
        title_tag = self.soup.find('title')
        if title_tag is not None:
            return title_tag.text.replace(" ", "_")
        # No <title> element: derive a usable name from the URL instead of
        # failing with AttributeError on None.
        last_segment = (self.url
                        .split('?', 1)[0]
                        .split('#', 1)[0]
                        .rstrip('/')
                        .rsplit('/', 1)[-1])
        return Path(last_segment).stem or "untitled"

    def scrape(self):
        """Download ``self.url`` and return it parsed with the ``lxml`` parser."""
        req = Request(self.url, headers={'User-Agent': 'Mozilla/5.0'})
        # Context manager guarantees the HTTP response is closed, even if
        # reading fails part-way.
        with urlopen(req) as response:
            url_content = response.read()
        return BeautifulSoup(url_content, "lxml")

    def clean_div(self, *attribs):
        """Remove every ``<div>`` whose class matches any of ``attribs``.

        Args:
            *attribs: Class names, one per positional argument.
        """
        for attrib in attribs:
            for div in self.soup.find_all('div', class_=attrib):
                div.extract()

    def clean_subdiv(self, div, att, sub_att):
        """Remove the first ``sub_att`` element nested in the first ``att`` element.

        Args:
            div: Tag name used for both the outer and the inner lookup.
            att: Class of the outer element.
            sub_att: Class of the inner element to remove.

        Does nothing when either element is not found.
        """
        parent_div = self.soup.find(div, class_=att)
        if parent_div is None:
            return

        subdivision_div = parent_div.find(div, class_=sub_att)
        if subdivision_div is not None:
            subdivision_div.extract()

    def clean_text(self):
        """Build ``self.raw`` from the current text of ``self.soup``.

        Keeps what lies before the first '©' and after the last '>' of that
        part, trims surrounding whitespace, then deletes parenthesised groups
        of digits/commas such as ``(1,2)`` and any square brackets.
        """
        body = self.soup.text.split('©')[0].split('>')[-1].strip()
        self.raw = re.sub(r'\([\d,]+\)|[\[\]]',
                          '', body)

    def save_json(self):
        """Split ``self.raw`` into numbered entries and write them as JSON.

        Each run of digits followed by whitespace becomes a key; the text up
        to the next digit (or the end of the text) becomes its value. A
        repeated number overwrites the earlier entry. The file is written to
        ``<folder>/<chapter>.json``; the folder is created when missing.
        """
        # clean_text may not have been called yet (or __init__ was bypassed);
        # prepare the text here rather than failing with AttributeError.
        if getattr(self, 'raw', None) is None:
            self.clean_text()

        # Specify the filename
        json_name = f'{self.chapter}.json'

        # Create the folder if it doesn't exist
        Path(self.folder).mkdir(parents=True, exist_ok=True)

        # Construct the file path
        json_path = Path(self.folder) / json_name

        # NOTE: the lookahead stops at ANY digit, so a number inside an
        # entry's text ends that entry early.
        matches = re.findall(r'(\d+)\s+([\s\S]*?)(?=\d|$)', self.raw)
        data = {number: text.strip() for number, text in matches}

        with open(json_path, 'w', encoding='utf-8') as f:
            json.dump(data, f, ensure_ascii=False)

        print("JSON data saved to:", json_path)

        
        
# Example usage: constructing the scraper downloads and parses the page,
# so this cell needs network access.
url = "https://scriptureearth.org/data/bom/sab/bom/bom-45-ACT-028.html"
scraper = WebScraper(url=url, folder='berom')

# Keep the parsed document and the derived chapter name handy for inspection.
txt, chpt_name = scraper.soup, scraper.chapter

In [119]:
# Strip unwanted markup from the parsed page, then extract the text and save it.
attribs = ('ms', 'mt', 'sp', 's')
# clean_div takes each class name as its own positional argument, so the
# tuple must be unpacked rather than passed as a single value.
scraper.clean_div(*attribs)
scraper.clean_subdiv('div', 'm', 'c-drop')
scraper.clean_text()
scraper.save_json()
print(scraper.raw)

JSON data saved to: berom/Tomong_Betom_Yɛsu_28.json
1 Jeng de wot a ye dɔ tyang, bemat ba ha a wot, be tâng kerezik o, à Malta. 2 Bemat de ba sé e hwomo ba dura a wot nerat sizɔng. E jeng o, rwak wa sé rop, ko jey ya vɔk gul. Yaga ano, yɛn a pɛl a wot kya na wot mwa wot a wok, na be sey ot mɛ̀ na yey pyɛng. 3 Bulus a wɔ̀ tula cɔgɔ̂t kya. Ko jeng de a tɛ̀ yɛ̄ e ji kya mo, gàbak de nevɛ̂t kya mo na pat gwɛ gwa nusu yi e cɔ́gɔt o, gwa pe man ɛ e ra vwɔ̂ Bulus, ko gwa rot ɛ. 4 Sede bemat o ba di gwɔ̄m de wa man ɛ e vwɔ̂ Bulus o, yɛn a yâng bemɛn, “Mwin, mwat womo a sé mwât mɔbɔs bemat. Mwa ano de a ga sey tó mɛ e nshi, ko byɛ̂ talâ neves mɛ ya sé rɔɔ raa hɛ; yaga ano, ê ku.” 5 Ko Bulus a digta vwɔ mɛ, gwɔ̄m o wa gaba e ji kya, ko vit a pyɛ de ya té na yɛ. 6 Bemat o ba bare, vwɔ mɛ mo gwô pu, kɛ remo-remo ê vɔ̌ gaba a ku. Yɛn a ros e de bɛrɛng hɛ, ko pyɛ cak ya té na yɛ wɛt. Yaga ano, yɛn a to hwo bare, yɛn a ye, “Mwat womo a sé dagwi hwak!”
7 Bagat na kwɔn de pyɛ yemo ya té ɛ̀, mwat hwak d