<a href="https://colab.research.google.com/github/vesaakerman/hello-world/blob/master/notebooks/eBible_Download_projects.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Mount Google Drive at /content/drive so files stored on Drive are reachable
# from this Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Define base folder

In [2]:
# Root folder on the mounted drive; the downloads, metadata and logs folders
# used by this notebook are all located directly beneath it.
base = "/content/drive/Shareddrives/Partnership for Applied Biblical NLP/Data/Corpora/ebible"

# Import modules and define URLs and directory paths

In [3]:
import os
from pathlib import Path
from datetime import date, datetime
from random import randint
import requests
from time import sleep
import shutil
from glob import iglob
from bs4 import BeautifulSoup
from csv import DictReader, DictWriter
import ntpath
import regex

# Remote locations: the folder holding the zipped projects and the CSV that lists them.
eBible_url = r"https://ebible.org/Scriptures/"
eBible_csv_url = r"https://ebible.org/Scriptures/translations.csv"

# Local folders, all rooted at `base`.
downloads = Path(base, "downloads")
metadata = Path(base, "metadata")
metadata_csv = metadata.joinpath("translations.csv")
logs = Path(base, "logs")

# Every project archive on the server is named "<translationId>" + file_suffix.
file_suffix = "_usfm.zip"

# Column names; not referenced by the cells visible in this notebook.
csv_headers = [
    "ID",
    "File",
    "Language",
    "Dialect",
    "Licence Type",
    "Licence Version",
    "CC Licence Link",
    "Copyright Holder",
    "Copyright Years",
    "Translation by",
]

# Echo the resolved settings so a wrong base folder is spotted before downloading.
for setting in (eBible_url, eBible_csv_url, downloads, metadata_csv, logs):
    print(setting)

https://ebible.org/Scriptures/
https://ebible.org/Scriptures/translations.csv
/content/drive/MyDrive/eBible/downloads
/content/drive/MyDrive/eBible/metadata/translations.csv
/content/drive/MyDrive/eBible/logs


# Define helper functions

In [4]:
def log_and_print(s, type='info'):
    """Append a timestamped entry to the run log and echo the message to stdout.

    Relies on the module-level ``log_file`` being an open, writable text file.

    Args:
        s: The message to record.
        type: Severity label. It is upper-cased and used as the line prefix,
            e.g. ``INFO: 2024-01-31 12:00:00 message``. The name shadows the
            builtin but is kept so existing keyword callers keep working.
    """
    timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    log_file.write(f"{type.upper()}: {timestamp} {s}\n")
    # Flush after every entry so the log is not lost if the runtime is
    # interrupted in the middle of a long download run.
    log_file.flush()
    print(s)

def make_directories():
    """Create the downloads, metadata and logs folders, including missing parents.

    Folders that already exist are left untouched, so the call can be repeated.
    """
    for folder in (downloads, metadata, logs):
        os.makedirs(folder, exist_ok=True)


def _download_file(url, headers, save_as, timeout=60):
    """Fetch ``url`` and store the response body at ``save_as``.

    The body is first written to ``<save_as>.part`` and only moved into place
    once it has been written completely, so an interrupted run never leaves a
    truncated file under the final name.

    Args:
        url: Address to download.
        headers: HTTP headers passed to ``requests.get``.
        save_as: Destination path (``str`` or path-like).
        timeout: Seconds to wait for the server before giving up.

    Returns:
        ``save_as`` on success; ``None`` if the request failed or the server
        answered with a status other than 200 OK.
    """
    try:
        r = requests.get(url, headers=headers, timeout=timeout)
    except requests.exceptions.RequestException as exc:
        # One unreachable file must not abort the whole batch; callers
        # already treat a None result as "could not download".
        print(f"Request to {url} failed: {exc}")
        return None

    if r.status_code != requests.codes.ok:
        return None

    partial = os.fspath(save_as) + ".part"
    try:
        with open(partial, "wb") as out_file:
            out_file.write(r.content)
        os.replace(partial, save_as)
    finally:
        # Only still present if writing or renaming failed.
        if os.path.exists(partial):
            os.remove(partial)

    return save_as


def download_csv_file(url, headers, save_as, timeout=60):
    """Download the translations CSV from ``url`` to ``save_as``.

    Args:
        url: Address of the CSV file.
        headers: HTTP headers passed to ``requests.get``.
        save_as: Destination path.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        ``save_as`` on success, otherwise ``None``.
    """
    return _download_file(url, headers, save_as, timeout=timeout)


def download_zip_file(url, headers, save_as, timeout=60):
    """Download a zipped project from ``url`` to ``save_as``.

    Args:
        url: Address of the zip archive.
        headers: HTTP headers passed to ``requests.get``.
        save_as: Destination path.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        ``save_as`` on success, otherwise ``None``.
    """
    return _download_file(url, headers, save_as, timeout=timeout)
    

def get_filenames(metadata_csv):
    """Build the list of project archive names from the translations CSV.

    Rows whose ``Redistributable`` column is exactly ``"True"`` or ``"False"``
    are kept; rows with any other value are counted but ignored. A one-line
    summary of the totals is written through ``log_and_print``.

    Args:
        metadata_csv: Path of the translations CSV file to read.

    Returns:
        One ``translationId + file_suffix`` string per kept row, in file order.
    """
    total_rows = 0
    redistributable_rows = 0
    kept_rows = []

    with open(metadata_csv, encoding="utf-8-sig", newline="") as csvfile:
        for record in DictReader(csvfile, delimiter=",", quotechar='"'):
            total_rows += 1

            flag = record["Redistributable"]
            if flag == "True":
                redistributable_rows += 1
            elif flag != "False":
                # Neither "True" nor "False": counted above, otherwise ignored.
                continue

            # Store the flag as a real boolean on the kept row.
            record["Redistributable"] = flag == "True"
            kept_rows.append(record)

        filenames = []
        for record in kept_rows:
            filenames.append(record["translationId"] + file_suffix)

        log_and_print(f"The translations csv file lists {total_rows} translations and {redistributable_rows} are redistributable.")

        return filenames


# Download eBible projects 

In [6]:
# Create directories if they don't already exist
make_directories()

# log_and_print() writes to this module-level handle, so it has to be open
# before the first message is logged.
log_file = open(logs / f"run_{date.today()}.log", "a", encoding="utf-8")

try:
    # Set the user-agent to Chrome for Requests.
    headers = {"user-agent": "Chrome/51.0.2704.106"}

    # Download the list of translations.
    log_and_print("Starting downloading eBible files...")
    log_and_print(f"Downloading list of translations from {eBible_csv_url} to: {str(metadata_csv)}")
    done = download_csv_file(eBible_csv_url, headers, metadata_csv)

    if not done:
        log_and_print(f"Couldn't download {eBible_csv_url}")
        # Stop here: without a fresh list there is nothing reliable to
        # download. (A bare `exit` is only a name lookup and stops nothing.)
        raise RuntimeError(f"Couldn't download {eBible_csv_url}")

    # Get filenames
    filenames = sorted(get_filenames(metadata_csv))

    # Find which files have already been downloaded:
    already_downloaded = sorted([file.name for file in downloads.glob("*" + file_suffix)])
    log_and_print(f"There are {len(already_downloaded)} files with the suffix {file_suffix} already in {downloads}")

    # Those that require downloading are the filenames - already_downloaded.
    to_download = sorted(set(filenames) - set(already_downloaded))
    log_and_print(f"There are {len(to_download)} files still to download.")

    # Download the zipped USFM file if it doesn't already exist.
    for i, filename in enumerate(to_download):

        # Skip any missing filenames. A row without a translation ID yields
        # just the suffix, because the suffix is always appended to the ID.
        if filename in ("", file_suffix):
            continue

        # Construct the download url and the local file path.
        url = eBible_url + filename
        save_as = downloads / filename

        # Skip existing files that contain data.
        # NOTE(review): names already present in the downloads folder were
        # removed from to_download above regardless of size, so this only
        # triggers for a file that appeared after the folder was listed.
        if save_as.exists() and save_as.stat().st_size > 100:
            log_and_print(f"{i+1}: {save_as} already exists and contains {save_as.stat().st_size} bytes.")
            continue

        log_and_print(f"{i+1}: Downloading from {url} to {save_as}.")
        done = download_zip_file(url, headers, save_as)

        if done:
            log_and_print(f"Saved {url} as {save_as}")
            # Pause for a random number of milliseconds (up to 5 seconds)
            # between successful downloads.
            pause = randint(1, 5000) / 1000
            sleep(pause)
        else:
            log_and_print(f"Could not download {url}\n")

    log_and_print("Finished downloading eBible files")
finally:
    # Always release the log file, even when the run stops with an error.
    log_file.close()

Starting downloading eBible files...
Downloading list of translations from https://ebible.org/Scriptures/translations.csv to: /content/drive/MyDrive/eBible/metadata/translations.csv
The translations csv file lists 1285 translations and 1036 are redistributable.
There are 2 files with the suffix _usfm.zip already in /content/drive/MyDrive/eBible/downloads
There are 1283 files still to download.
1: Downloading from https://ebible.org/Scriptures/aau_usfm.zip to /content/drive/MyDrive/eBible/downloads/aau_usfm.zip.
Saved https://ebible.org/Scriptures/aau_usfm.zip as /content/drive/MyDrive/eBible/downloads/aau_usfm.zip

2: Downloading from https://ebible.org/Scriptures/aaz_usfm.zip to /content/drive/MyDrive/eBible/downloads/aaz_usfm.zip.
Saved https://ebible.org/Scriptures/aaz_usfm.zip as /content/drive/MyDrive/eBible/downloads/aaz_usfm.zip

Finished downloading eBible files
