<a href="https://colab.research.google.com/github/vesaakerman/hello-world/blob/master/notebooks/eBible_Download_projects.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive at /content/drive so the paths used by the rest of the
# notebook (everything under /content/drive/...) are available.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Define base folder

In [None]:
# Root folder on the mounted Drive; the downloads, projects and metadata
# folders used by this notebook are all created beneath it.
base = "/content/drive/MyDrive/eBible/"

# Import modules and directory paths

In [None]:
import logging
import os
from pathlib import Path
from datetime import date, datetime
from random import randint
import requests
from time import sleep
import shutil
from glob import iglob
from bs4 import BeautifulSoup
from csv import DictReader, DictWriter
import ntpath
import regex

# Where the zipped projects and the list of translations are published.
eBible_url = r"https://ebible.org/Scriptures/"
eBible_csv_url = r"https://ebible.org/Scriptures/translations.csv"

# Working folders, all beneath `base`.
zipped = Path(base) / "downloads"
unzipped = Path(base) / "projects"
# Plain-string form of the projects folder, used to build glob patterns by
# concatenation. Derived from `unzipped` so the two always agree, whether or
# not `base` ends with a slash.
ebible_projects = str(unzipped)
metadata = Path(base) / "metadata"
metadata_csv = metadata / "translations.csv"
copyright_file = metadata / "copyrights.csv"

# Every project archive is named <translationId> + file_suffix.
file_suffix = "_usfm.zip"

# Column names, in order, of the copyrights CSV.
headers = [
    "ID",
    "File",
    "Language",
    "Dialect",
    "Licence Type",
    "Licence Version",
    "CC Licence Link",
    "Copyright Holder",
    "Copyright Years",
    "Translation by",
]

# Echo the resolved locations so a wrong `base` is obvious straight away.
print(eBible_url)
print(eBible_csv_url)
print(zipped)
print(unzipped)
print(ebible_projects)
print(metadata_csv)
print(copyright_file)


https://ebible.org/Scriptures/
https://ebible.org/Scriptures/translations.csv
/content/drive/MyDrive/eBible/downloads
/content/drive/MyDrive/eBible/projects
/content/drive/MyDrive/eBible/projects
/content/drive/MyDrive/eBible/metadata/translations.csv
/content/drive/MyDrive/eBible/metadata/copyrights.csv


# Define functions for downloading and unzipping eBibles

In [None]:
def log_and_print(s, type='info'):
    """Append a timestamped line to the open log file and echo the message.

    Relies on the module-level ``log_file`` being an open, writable text file.

    Args:
        s: The message to record.
        type: ``"error"`` writes the line with an ``ERROR:`` prefix; any other
            value (default ``"info"``) writes it with ``INFO:``.
    """
    level = "ERROR" if type == "error" else "INFO"
    log_file.write(f"{level}: {datetime.now()} {s}\n")
    print(s)

def make_directories():
    """Ensure the downloads, projects and metadata folders all exist.

    Folders that are already present are left untouched.
    """
    for folder in (zipped, unzipped, metadata):
        os.makedirs(folder, exist_ok=True)


def download_csv_file(url, headers, save_as):
    """Fetch ``url`` and store the raw response body at ``save_as``.

    Args:
        url: Address to request.
        headers: HTTP request headers passed through to ``requests.get``.
        save_as: Destination file path; written in binary mode.

    Returns:
        ``save_as`` when the server answers with an OK status, otherwise
        ``None`` (and nothing is written).
    """
    response = requests.get(url, headers=headers)
    if response.status_code != requests.codes.ok:
        return None

    # Store the body exactly as received.
    with open(save_as, "wb") as target:
        target.write(response.content)
    return save_as


def save_csv_file(csv_url, save_as):
    """Download ``csv_url`` and write the response body to ``save_as``.

    Args:
        csv_url: Address of the CSV file to request.
        save_as: Destination file path; written in binary mode.

    Returns:
        ``save_as`` when the server answers with an OK status, otherwise
        ``None`` (and nothing is written).
    """
    # Log the real destination; the name previously used here was never
    # defined, so every call failed with NameError before downloading.
    log_and_print(f"Saving file from {csv_url} to {save_as}")

    r = requests.get(csv_url)
    if r.status_code != requests.codes.ok:
        return None

    with open(save_as, "wb") as csv_file:
        csv_file.write(r.content)
    return save_as


def download_zip_file(url, headers, save_as, timeout=60):
    """Download ``url`` and write the response body to ``save_as``.

    Args:
        url: Address of the archive to request.
        headers: HTTP request headers passed through to ``requests.get``.
        save_as: Destination file path; written in binary mode.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the whole batch.

    Returns:
        ``save_as`` when the download succeeded, otherwise ``None``. A network
        failure (connection error, timeout, ...) is reported as ``None`` as
        well, so one bad file does not abort a long run of downloads.
    """
    try:
        r = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException:
        # Treated like a non-OK status: the caller logs the failed URL.
        return None

    if r.status_code != requests.codes.ok:
        return None

    with open(save_as, "wb") as out_file:
        # Write out the content of the response.
        out_file.write(r.content)
    return save_as
    
def get_folder(file):
    """Return the folder, under ``unzipped``, that ``file`` is extracted into.

    The folder name is the zip file's name with the trailing
    ``len(file_suffix)`` characters removed.
    """
    keep = len(file.name) - len(file_suffix)
    project_name = file.name[:keep]
    return unzipped / project_name


def get_tree_size(path):
    """Return the combined size in bytes of everything stored beneath ``path``.

    Sub-directories are descended into; symbolic links are not followed.
    """
    size = 0
    pending = [path]
    while pending:
        for item in os.scandir(pending.pop()):
            if item.is_dir(follow_symlinks=False):
                # Visit this sub-directory on a later pass.
                pending.append(item.path)
            else:
                size += item.stat(follow_symlinks=False).st_size
    return size
    
    
def unzip_ebibles(source_folder, file_suffix, dest_folder):
    """Extract every ``*<file_suffix>`` archive in ``source_folder``.

    Each archive is unpacked into ``dest_folder/<name without the suffix>``.
    An archive is skipped when its folder already exists and the folder's
    contents are larger than the archive itself.

    Args:
        source_folder: ``pathlib.Path`` of the folder holding the archives.
        file_suffix: Trailing part of the archive names, e.g. ``"_usfm.zip"``.
        dest_folder: ``pathlib.Path`` under which the project folders are
            created.
    """
    pattern = "*" + file_suffix
    # Sorted so the extraction order (and the log) is the same on every run.
    zip_files = sorted(source_folder.glob(pattern))

    def extract_folder_for(zip_file):
        # Strip off the suffix so that the subfolder name is the project ID.
        # Built from this function's own arguments rather than module-level
        # settings, so the parameters actually control where files end up.
        return dest_folder / zip_file.name[: len(zip_file.name) - len(file_suffix)]

    extracts = []
    for zip_file in zip_files:
        folder = extract_folder_for(zip_file)
        if not folder.exists() or zip_file.stat().st_size >= get_tree_size(folder):
            extracts.append((zip_file, folder))

    log_and_print(f"Found {len(zip_files)} files in {source_folder} matching pattern: {pattern}")
    log_and_print(f"Found {len(extracts)} that were not yet extracted or are smaller than the zip file.")

    for zip_file, extract in extracts:
        extract.mkdir(parents=True, exist_ok=True)
        log_and_print(f"Extracting to: {extract}")
        shutil.unpack_archive(zip_file, extract)

def get_filenames(metadata_csv):
    """Build the list of archive names from the translations CSV.

    Every row whose ``Redistributable`` column is exactly ``"True"`` or
    ``"False"`` contributes ``translationId + file_suffix``; rows with any
    other value are counted but left out. The totals are logged.

    Args:
        metadata_csv: Path of the translations CSV file.

    Returns:
        The archive file names, in the order the rows appear in the CSV.
    """
    flag_values = {"True": True, "False": False}
    kept_rows = []
    total = 0
    redistributable = 0

    with open(metadata_csv, encoding="utf-8-sig", newline="") as handle:
        for record in DictReader(handle, delimiter=",", quotechar='"'):
            total += 1
            flag = flag_values.get(record["Redistributable"])
            if flag is None:
                # Neither "True" nor "False": not included.
                continue

            record["Redistributable"] = flag
            kept_rows.append(record)
            if flag:
                redistributable += 1

        names = []
        for record in kept_rows:
            names.append(record["translationId"] + file_suffix)
        log_and_print(f"The translations csv file lists {total} translations and {redistributable} are redistributable.")

        return names


# Download and unzip eBible projects 

In [None]:
# Create the working directories first: the log file opened below lives in
# `zipped`, which does not exist yet on a first run.
make_directories()

# Maximum number of archives to fetch in one run. Set to None to download
# every outstanding file.
max_downloads = 1

# Set the user-agent to Chrome for Requests. Kept under its own name so the
# CSV column list `headers` defined with the other settings is not overwritten.
request_headers = {"user-agent": "Chrome/51.0.2704.106"}

with open(zipped / f"run_{date.today()}.log", "a") as log_file:

    # Download the list of translations.
    log_and_print(f"Downloading list of translations from {eBible_csv_url} to: {str(metadata_csv)}")
    done = download_csv_file(eBible_csv_url, request_headers, metadata_csv)
    if not done:
        log_and_print(f"Couldn't download {eBible_csv_url}", "error")
        # Nothing below can work without the list, so stop the cell here.
        raise RuntimeError(f"Couldn't download {eBible_csv_url}")

    # Get filenames
    filenames = get_filenames(metadata_csv)

    # Find which files have already been downloaded:
    already_downloaded = [file.name for file in zipped.glob("*" + file_suffix)]
    log_and_print(f"There are {len(already_downloaded)} files with the suffix {file_suffix} already in {zipped}")

    # Those that require downloading are the filenames - already_downloaded.
    # Sorted so that repeated runs work through the list in the same order.
    to_download = sorted(set(filenames) - set(already_downloaded))
    log_and_print(f"\nThere are {len(to_download)} files still to download.")

    # Download the zipped USFM files, at most `max_downloads` of them.
    for i, filename in enumerate(to_download[:max_downloads]):

        # Skip any missing filenames. A row without a translation ID yields
        # a name that is just the suffix.
        if filename in ("", file_suffix):
            continue

        # Construct the download url and the local file path.
        url = eBible_url + filename
        save_as = zipped / filename

        # Skip existing files that contain data.
        if save_as.exists() and save_as.stat().st_size > 100:
            log_and_print(f"{i+1}: {save_as} already exists and contains {save_as.stat().st_size} bytes.")
            continue

        log_and_print(f"{i+1}: Downloading from {url} to {save_as}.")
        done = download_zip_file(url, request_headers, save_as)

        if done:
            log_and_print(f"Saved {url} as {save_as}\n")
            # Pause for a random number of milliseconds (up to five seconds)
            # between requests.
            pause = randint(1, 5000) / 1000
            sleep(pause)
        else:
            log_and_print(f"Could not download {url}\n", "error")

# Unzipping is logged separately, next to the extracted projects.
with open(unzipped / f"run_{date.today()}.log", "a") as log_file:
    unzip_ebibles(zipped, file_suffix, unzipped)

Downloading list of translations from https://ebible.org/Scriptures/translations.csv to: /content/drive/MyDrive/eBible/metadata/translations.csv
The translations csv file lists 1284 translations and 1035 are redistributable.
There are 2 files with the suffix _usfm.zip already in /content/drive/MyDrive/eBible/downloads

There are 1282 files still to download.
1: Downloading from https://ebible.org/Scriptures/ebk_usfm.zip to /content/drive/MyDrive/eBible/downloads/ebk_usfm.zip.
Saved https://ebible.org/Scriptures/ebk_usfm.zip as /content/drive/MyDrive/eBible/downloads/ebk_usfm.zip

Found 3 files in /content/drive/MyDrive/eBible/downloads matching pattern: *_usfm.zip
Found 1 that were not yet extracted or are smaller than the zip file.
Extracting to: /content/drive/MyDrive/eBible/projects/ebk


# Define functions for creating the copyrights file

In [None]:
def path_leaf(path):
    """Return the final component of ``path``.

    Both ``/`` and ``\\`` are treated as separators, and a trailing separator
    is ignored (``"a/b/"`` gives ``"b"``).
    """
    parent, leaf = ntpath.split(path)
    if leaf:
        return leaf
    # The path ended with a separator: the leaf is the last part of the parent.
    return ntpath.basename(parent)


def read_list_from_file(f_in):
    """Read the UTF-8 text file ``f_in`` and return its lines as a list.

    Line endings are not included in the returned strings.
    """
    with open(f_in, "r", encoding="utf-8") as source:
        contents = source.read()
    return contents.splitlines()


def get_copyright_from_url(url):
    """Fetch ``url`` and return the text of its first ``<td colspan="3">``.

    Args:
        url: Address of the page to request.

    Returns:
        The ``.string`` of the first matching table cell, or ``None`` when the
        request did not return an OK status or the page has no such cell.
    """
    r = requests.get(url)
    if r.status_code != requests.codes.ok:
        return None

    soup = BeautifulSoup(r.content, "lxml")
    cr_item = soup.find("td", colspan="3")
    if cr_item is None:
        # find() returns None when nothing matches; report "no copyright
        # found" rather than failing on the attribute access below.
        return None
    return cr_item.string


# Get copyright info from eBible projects 

In [None]:
# Column names, in order, of the copyrights CSV. Spelled out here instead of
# reusing the module-level `headers`, because the download cell rebinds that
# name to the HTTP request headers.
copyright_columns = [
    "ID",
    "File",
    "Language",
    "Dialect",
    "Licence Type",
    "Licence Version",
    "CC Licence Link",
    "Copyright Holder",
    "Copyright Years",
    "Translation by",
]

# The project ID is the name of the folder that contains copr.htm.
copr_regex = r".*/(?P<id>.*?)/copr.htm"

data = list()

with open(metadata / f"run_{date.today()}.log", "a") as log_file:

    # Check we can write to the output file before the processing:
    with open(copyright_file, "w", encoding="utf-8") as f_out:
        pass

    log_and_print("Collecting eBible copyright information...")
    for copyright_info_file in sorted(iglob(ebible_projects + "/**/copr.htm")):
        log_and_print(copyright_info_file)

        id_match = regex.match(copr_regex, str(copyright_info_file))
        if not id_match:
            log_and_print(f"Could not find a project ID in {copyright_info_file}", "error")
            continue
        project_id = id_match["id"]

        entry = dict()
        entry["ID"] = str(project_id)
        entry["File"] = copyright_info_file

        with open(copyright_info_file, "r", encoding="utf-8") as copr:
            html = copr.read()
            soup = BeautifulSoup(html, "lxml")

        # Creative Commons licence: keep the link and split out type/version.
        cclink = soup.find(href=regex.compile("creativecommons"))
        if cclink:
            ref = cclink.get("href")
            if ref:
                entry["CC Licence Link"] = ref
                cc_match = regex.match(
                    r".*?/licenses/(?P<type>.*?)/(?P<version>.*)/", ref
                )
                if cc_match:
                    entry["Licence Type"] = cc_match["type"]
                    entry["Licence Version"] = cc_match["version"]
                else:
                    # Fallback for links where the version follows "by"
                    # directly, with no slash in between.
                    cc_by_match = regex.match(r".*?/licenses/by(?P<version>.*)/", ref)
                    if cc_by_match:
                        entry["Licence Type"] = "by"
                        entry["Licence Version"] = cc_by_match["version"]

        # NOTE(review): "Vernacular Title" is not one of the CSV columns, so
        # it is collected here but not written out.
        titlelink = soup.find(href=regex.compile(f"https://ebible.org/{project_id}"))
        if titlelink:
            entry["Vernacular Title"] = titlelink.string

        # The details are read from the first paragraph of the page body;
        # a page without one simply contributes no further fields.
        first_paragraph = soup.body.p if soup.body is not None else None
        if first_paragraph is not None:
            copy_strings = list(first_paragraph.stripped_strings)
        else:
            copy_strings = []

        for i, copy_string in enumerate(copy_strings):
            # Some fields take their value from the string after the label;
            # use an empty value if the label is the last string.
            following = copy_strings[i + 1] if i + 1 < len(copy_strings) else ""

            if i == 0 and "copyright ©" in copy_string:
                entry["Copyright Years"] = copy_string
                entry["Copyright Holder"] = following
            if i > 0 and "Language:" in copy_string:
                entry["Language"] = following
            if "Dialect" in copy_string:
                entry["Dialect"] = copy_string
            if "Translation by" in copy_string:
                entry["Translation by"] = copy_string
            if "Public Domain" in copy_string:
                # Must match the "Copyright Years" column name exactly,
                # otherwise the blank value is silently dropped on writing.
                entry["Copyright Years"] = ""
                entry["Copyright Holder"] = "Public Domain"

        data.append(entry)

    # Write the file once, after everything has been collected. newline=""
    # lets the csv module control the line endings itself.
    with open(copyright_file, "w", encoding="utf-8", newline="") as csv_out:
        writer = DictWriter(
            csv_out, copyright_columns, restval="", extrasaction="ignore", dialect="excel"
        )
        writer.writeheader()
        writer.writerows(data)

    log_and_print(f"Wrote copyright info to {copyright_file}")


Collecting eBible copyright information...
/content/drive/MyDrive/eBible/projects/bjvNT/copr.htm
/content/drive/MyDrive/eBible/projects/ebk/copr.htm
/content/drive/MyDrive/eBible/projects/engWycliffe/copr.htm
Wrote copyright info to /content/drive/MyDrive/eBible/metadata/copyrights.csv
