## **Web Scraping de la Biblia ${\rightarrow}$ https://www.bible.com/**

### Librerías a usar

In [1]:
import requests
from bs4 import BeautifulSoup
import json
import os
from time import sleep

### Capítulos por libro de la Biblia

In [2]:
# Number of chapters in each book of the Bible, keyed by book code.
# The figures were taken from https://www.bible.com/
# Insertion order is canonical order; the download loops iterate in this order.
BOOKS = {
    # --- Old Testament ---
    "GEN": 50, "EXO": 40, "LEV": 27, "NUM": 36, "DEU": 34,
    "JOS": 24, "JDG": 21, "RUT": 4,
    "1SA": 31, "2SA": 24, "1KI": 22, "2KI": 25,
    "1CH": 29, "2CH": 36, "EZR": 10, "NEH": 13, "EST": 10,
    "JOB": 42, "PSA": 150, "PRO": 31, "ECC": 12, "SNG": 8,
    "ISA": 66, "JER": 52, "LAM": 5, "EZK": 48, "DAN": 12,
    "HOS": 14, "JOL": 3, "AMO": 9, "OBA": 1, "JON": 4, "MIC": 7,
    "NAM": 3, "HAB": 3, "ZEP": 3, "HAG": 2, "ZEC": 14, "MAL": 4,
    # --- New Testament ---
    "MAT": 28, "MRK": 16, "LUK": 24, "JHN": 21, "ACT": 28,
    "ROM": 16, "1CO": 16, "2CO": 13, "GAL": 6, "EPH": 6,
    "PHP": 4, "COL": 4, "1TH": 5, "2TH": 3,
    "1TI": 6, "2TI": 4, "TIT": 3, "PHM": 1,
    "HEB": 13, "JAS": 5, "1PE": 5, "2PE": 3,
    "1JN": 5, "2JN": 1, "3JN": 1, "JUD": 1, "REV": 22,
}

### Función para obtener el capítulo de un libro de la Biblia

In [3]:
def get_bible(bible_code, book, chapter, folder_name, *, timeout=30.0, delay=0.25):
    """Download one Bible chapter from bible.com and save its verses as JSON.

    The page ``https://www.bible.com/bible/{bible_code}/{book}.{chapter}`` is
    fetched, the verse ``<span>`` elements inside the chapter ``<div>`` are
    grouped by their ``data-usfm`` attribute, and the resulting mapping is
    written to
    ``data/raw/bible/{folder_name}/{bible_code}/book={book}_chapter={chapter}.json``.

    Args:
        bible_code: Identifier of the Bible version, as used in the URL.
        book: Book code, as used in the URL (e.g. ``"GEN"``).
        chapter: Chapter number, as used in the URL.
        folder_name: Sub-folder of ``data/raw/bible`` to store the file in.
        timeout: Seconds to wait for the HTTP request before giving up, so a
            stalled connection cannot hang the whole scrape.
        delay: Seconds to pause after every call (successful or not) so the
            server is not overloaded.

    Returns:
        A dict mapping each verse code (``data-usfm`` value) to its text, in
        page order, or ``None`` when the chapter ``<div>`` is missing or any
        error occurs (a message is printed in both cases; nothing is raised).
    """
    headers = {"User-Agent": "Mozilla/5.0"}  # helps to avoid being blocked
    url = f"https://www.bible.com/bible/{bible_code}/{book}.{chapter}"

    try:
        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        chapter_div = soup.find('div', class_='ChapterContent_chapter__uvbXo')  # main <div> of the chapter

        if not chapter_div:
            print(f"No chapter <div> for: {url}")
            return None

        verse_spans = chapter_div.find_all('span', class_='ChapterContent_verse__57FIw')

        # A verse may be split across several spans sharing the same code.
        # Collect the text fragments per code in a single pass; the dict keeps
        # the codes in first-seen order.
        fragments_by_verse = {}
        for verse_span in verse_spans:
            code = verse_span.get("data-usfm")
            if not code:
                continue
            fragments = fragments_by_verse.setdefault(code, [])
            for content_span in verse_span.find_all("span", class_="ChapterContent_content__RrUqA"):
                text = content_span.get_text(strip=True)
                if text:  # skip empty fragments so the join leaves no double spaces
                    fragments.append(text)

        chapter_info = {code: " ".join(parts) for code, parts in fragments_by_verse.items()}

        # Save the chapter to a JSON file
        folder = f"data/raw/bible/{folder_name}/{bible_code}"
        os.makedirs(folder, exist_ok=True)

        filename = f"{folder}/book={book}_chapter={chapter}.json"
        with open(filename, "w", encoding="utf-8") as f:
            # ensure_ascii=False keeps non-ASCII characters readable in the file
            json.dump(chapter_info, f, ensure_ascii=False, indent=4)

        return chapter_info

    except Exception as e:
        # Best-effort scraping: report the failure and let the caller continue.
        print(f"FAILED for {url}: error --> {e}")
        return None

    finally:
        # Runs on every path, including the successful return above, so
        # consecutive requests are always throttled.
        sleep(delay)

* Utilizaremos dos versiones de la Biblia en aymara. El tipo de aymara es aymara central o altiplánico. En este caso, las dos versiones de la Biblia que usaremos son variaciones ortográficas.
* Las dos versiones de la Biblia en español (América Latina) que usaremos presentan una variación léxica (lenguaje formal y lenguaje contemporáneo).

### Obtenemos la Biblia en Aymara con código 293 $\rightarrow$ https://www.bible.com/bible/293

In [4]:
# Aymara Bible ---> id = 293
# Download every chapter of every book listed in BOOKS.
for book, chapters in BOOKS.items():
    for chapter in range(1, chapters + 1):
        get_bible(
            bible_code=293,
            book=book,
            chapter=chapter,
            folder_name="aymara",
        )

### Obtenemos la Biblia en Aymara con código 2250 $\rightarrow$ https://www.bible.com/bible/2250

In [5]:
# Aymara Bible ---> id = 2250
# Download every chapter of every book listed in BOOKS.
for book, chapters in BOOKS.items():
    for chapter in range(1, chapters + 1):
        get_bible(
            bible_code=2250,
            book=book,
            chapter=chapter,
            folder_name="aymara",
        )

### Obtenemos la Biblia en Español con código 4278 $\rightarrow$ https://www.bible.com/bible/4278

In [None]:
# Spanish Bible ---> id = 4278
# Download every chapter of every book listed in BOOKS.
for book, chapters in BOOKS.items():
    for chapter in range(1, chapters + 1):
        get_bible(
            bible_code=4278,
            book=book,
            chapter=chapter,
            folder_name="spanish",
        )

### Obtenemos la Biblia en Español con código 1782 $\rightarrow$ https://www.bible.com/bible/1782

In [7]:
# Spanish Bible ---> id = 1782
# Download every chapter of every book listed in BOOKS.
for book, chapters in BOOKS.items():
    for chapter in range(1, chapters + 1):
        get_bible(
            bible_code=1782,
            book=book,
            chapter=chapter,
            folder_name="spanish",
        )