<a href="https://colab.research.google.com/github/vesaakerman/hello-world/blob/master/notebooks/eBible_Download_and_unzip.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Define base folder

In [None]:
base = "/content/drive/MyDrive/eBible/"

# Import modules and directory paths

In [None]:
import csv
import logging
import os
from pathlib import Path
from random import randint
import requests
from time import sleep
import shutil

eBible_url = r"https://ebible.org/Scriptures/"
eBible_csv_url = r"https://ebible.org/Scriptures/translations.csv"

zipped = Path(base) / "downloads"
unzipped = Path(base) / "projects"

metadata_csv = Path(base) / "eBible_metadata/translations.csv"

file_suffix = "_usfm.zip"

print(eBible_url)
print(eBible_csv_url)
print(zipped)
print(unzipped)
print(metadata_csv)


https://ebible.org/Scriptures/
https://ebible.org/Scriptures/translations.csv
/content/drive/MyDrive/eBible/eBible_downloads
/content/drive/MyDrive/eBible/eBible_projects
/content/drive/MyDrive/eBible/eBible_metadata/translations.csv


# Define methods

In [None]:
def download_csv_file(url, headers, save_as):

    r = requests.get(url, headers=headers)
    # If the status is OK continue
    if r.status_code == requests.codes.ok:

        with open(save_as, "wb") as out_file:
            # Write out the content of the page.
            out_file.write(r.content)

        return save_as
    return None


def save_csv_file(csv_url, save_as):
    print(f"Saving file from {csv_url} to {zips_folder}")

    r = requests.get(csv_url)
    # If the status is OK continue
    if r.status_code == requests.codes.ok:

        with open(save_as, "wb") as csv_file:
            csv_file.write(r.content)
        return save_as

    return None


def download_zip_file(url, headers, save_as):

    r = requests.get(url, headers=headers)
    # If the status is OK continue
    if r.status_code == requests.codes.ok:

        with open(save_as, "wb") as out_file:
            # Write out the content of the page.
            out_file.write(r.content)

        return save_as
    return None
    
def get_folder(file):
    # Get the path of the folder to which the zip file should be extracted."
    return unzipped / file.name[0: (len(file.name) - len(file_suffix))]


def get_tree_size(path):
    """Return total size of files in given path and subdirs."""
    total = 0
    for entry in os.scandir(path):
        if entry.is_dir(follow_symlinks=False):
            total += get_tree_size(entry.path)
        else:
            total += entry.stat(follow_symlinks=False).st_size
    return total
    
    
def unzip_ebibles(source_folder, file_suffix, dest_folder):
    pattern = "*" + file_suffix
    zip_files = [zip_file for zip_file in source_folder.glob(pattern)]
    
    # Strip off the pattern so that the subfolder name is the project ID.
    extract_folders = [ (zip_file, get_folder(zip_file)) for zip_file in zip_files ]
    extracts = [ (zip_file, folder) for zip_file, folder in extract_folders if not folder.exists() or zip_file.stat().st_size >= get_tree_size(folder) ]
    print(f"Found {len(zip_files)} files in {source_folder} matching pattern: {pattern}")
    print(f"Found {len(extracts)} that were not yet extracted or are smaller than the zip file.")
    
    for zip_file, extract in extracts:
        extract.mkdir(parents=True, exist_ok=True)
        print(f"Extracting to: {extract}")
        shutil.unpack_archive(zip_file, extract)


# Download and unzip eBible projects 

In [None]:
# Set the user-agent to Chrome for Requests.
headers = {"user-agent": "Chrome/51.0.2704.106"}

# Download the list of translations.
print(f"Downloading list of translations from {eBible_csv_url} to: {str(metadata_csv)}")
done = download_csv_file(eBible_csv_url, headers, metadata_csv)
if not done:
    print(f"Couldn't download {eBible_csv_url}")
    exit

file_infos = []
countall = count_redist = 0

with open(metadata_csv, encoding="utf-8-sig", newline="") as csvfile:
    reader = csv.DictReader(csvfile, delimiter=",", quotechar='"')
    for row in reader:
        countall += 1
        if row["Redistributable"] == "True":
            row["Redistributable"] = True

            file_infos.append(row)
            count_redist += 1

        if row["Redistributable"] == "False":
            row["Redistributable"] = False

            file_infos.append(row)

    filenames = [row["translationId"] + file_suffix for row in file_infos]
    print(f"The translations csv file lists {countall} translations and {count_redist} are redistributable.")

# Find which files have already been downloaded:
already_downloaded = [file.name for file in zipped.glob("*" + file_suffix)]

print(f"There are {len(already_downloaded)} files with the suffix {file_suffix} already in {zipped}")

# Those that require downloading are the filenames - already_downloaded.
to_download = set(filenames) - set(already_downloaded)
print(f"\nThere are {len(to_download)} files still to download.")

# Download the zipped USFM file if it doesn't already exist.

for i, filename in enumerate(to_download):

    if i < 1:

      # Construct the download url and the local file path.
      url = eBible_url + filename
      save_as = zipped / filename

      # Skip any missing filenames.
      if filename == "":
          continue

      # Skip existing files that contain data.
      elif save_as.exists() and save_as.stat().st_size > 100:
          print(f"{i+1}: {save_as} already exists and contains {save_as.stat().st_size} bytes.")
          continue

      else:
          print(f"{i+1}: Downloading from {url} to {save_as}.")
          done = download_zip_file(url, headers, save_as)

          if done:
              print(f"Saved {url} as {save_as}\n")
              # Pause for a random number of miliseconds
              pause = randint(1, 5000) / 1000
              sleep(pause)

          else:
              print(f"Could not download {url}\n")

unzip_ebibles(zipped, file_suffix, unzipped)


The translations csv file exists at :/content/drive/MyDrive/eBible/eBible_metadata/translations.csv
The translations csv file lists 1238 translations and 979 are redistributable.
There are 2 files with the suffix _usfm.zip already in /content/drive/MyDrive/eBible/eBible_downloads

There are 1236 files still to download.
1: Downloading from https://ebible.org/Scriptures/nas_usfm.zip to /content/drive/MyDrive/eBible/eBible_downloads/nas_usfm.zip.
Saved https://ebible.org/Scriptures/nas_usfm.zip as /content/drive/MyDrive/eBible/eBible_downloads/nas_usfm.zip

2: Downloading from https://ebible.org/Scriptures/ino2013_usfm.zip to /content/drive/MyDrive/eBible/eBible_downloads/ino2013_usfm.zip.
Saved https://ebible.org/Scriptures/ino2013_usfm.zip as /content/drive/MyDrive/eBible/eBible_downloads/ino2013_usfm.zip

3: Downloading from https://ebible.org/Scriptures/kqa_usfm.zip to /content/drive/MyDrive/eBible/eBible_downloads/kqa_usfm.zip.
Saved https://ebible.org/Scriptures/kqa_usfm.zip as /co