<a href="https://colab.research.google.com/github/vesaakerman/hello-world/blob/master/notebooks/eBible_Extract_projects.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Mount Google Drive at /content/drive so the corpus folders configured below are reachable.
# This import only exists inside Google Colab runtimes.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Define base folder

In [2]:
# Root folder of the eBible corpus. Every other path in this notebook is derived from it.
# The commented-out line is the alternative location on the shared drive.
# base = "/content/drive/Shareddrives/Partnership for Applied Biblical NLP/Data/Corpora/ebible"
base = "/content/drive/MyDrive/eBible"

# Import modules, define rewrite boolean and directory paths

In [3]:
from pathlib import Path
from datetime import date, datetime
from os import listdir, makedirs, environ
from os.path import exists
from glob import iglob
from bs4 import BeautifulSoup
import pandas as pd
import shutil
import warnings
import xml.etree.ElementTree as ET
import re
import codecs
import regex
import csv

# Suppress FutureWarning messages for the rest of the session.
warnings.simplefilter(action='ignore', category=FutureWarning)

# False: projects that already have an extraction are skipped.
# True: every downloaded project is extracted again.
rewrite = False

# All corpus locations hang off the base folder chosen above.
corpus = Path(base)

ebible_downloads = corpus / 'downloads'
ebible_extractions = corpus / "extractions" / "scripture"
ebible_translations_csv = corpus / 'metadata' / 'translations.csv'
ebible_copyrights_csv = corpus / 'metadata' / 'copyrights.csv'
ebible_logs = corpus / "logs"
ebible_temp = corpus / "temp"

# Echo the resolved configuration, one entry per line, so a run is self-describing.
print(
    ebible_downloads,
    ebible_extractions,
    ebible_translations_csv,
    ebible_copyrights_csv,
    ebible_logs,
    ebible_temp,
    sep="\n",
)
print(f"rewrite = {rewrite}")

/content/drive/MyDrive/eBible/downloads
/content/drive/MyDrive/eBible/extractions/scripture
/content/drive/MyDrive/eBible/metadata/translations.csv
/content/drive/MyDrive/eBible/metadata/copyrights.csv
/content/drive/MyDrive/eBible/logs
/content/drive/MyDrive/eBible/temp
rewrite = False


# Install packages and clone the silnlp repo

In [4]:
# Install the Python packages into the runtime. Presumably these are the dependencies
# silnlp needs - TODO confirm. Versions are not pinned, so each run takes whatever PyPI
# currently serves (the captured output shows sil-machine replacing the preinstalled regex).
!pip install python-dotenv
!pip install sil-machine
!pip install boto3
!pip install s3path
!pip install requests

# Clone silnlp into the current working directory; the extraction cell later puts
# /content/silnlp on PYTHONPATH so `python -m silnlp...` can be run from it.
!git clone https://github.com/sillsdev/silnlp

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting python-dotenv
  Downloading python_dotenv-0.21.0-py3-none-any.whl (18 kB)
Installing collected packages: python-dotenv
Successfully installed python-dotenv-0.21.0
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sil-machine
  Downloading sil_machine-0.8.4-py3-none-any.whl (206 kB)
[K     |████████████████████████████████| 206 kB 5.0 MB/s 
Collecting regex<2022.0.0,>=2021.7.6
  Downloading regex-2021.11.10-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (749 kB)
[K     |████████████████████████████████| 749 kB 56.4 MB/s 
Installing collected packages: regex, sil-machine
  Attempting uninstall: regex
    Found existing installation: regex 2022.6.2
    Uninstalling regex-2022.6.2:
      Successfully uninstalled regex-2022.6.2
Successfully installed regex-2021.11.10 sil-machine-0.8.4


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting boto3
  Downloading boto3-1.26.19-py3-none-any.whl (132 kB)
[K     |████████████████████████████████| 132 kB 5.1 MB/s 
[?25hCollecting s3transfer<0.7.0,>=0.6.0
  Downloading s3transfer-0.6.0-py3-none-any.whl (79 kB)
[K     |████████████████████████████████| 79 kB 6.9 MB/s 
[?25hCollecting jmespath<2.0.0,>=0.7.1
  Downloading jmespath-1.0.1-py3-none-any.whl (20 kB)
Collecting botocore<1.30.0,>=1.29.19
  Downloading botocore-1.29.19-py3-none-any.whl (10.1 MB)
[K     |████████████████████████████████| 10.1 MB 63.4 MB/s 
[?25hCollecting urllib3<1.27,>=1.25.4
  Downloading urllib3-1.26.13-py2.py3-none-any.whl (140 kB)
[K     |████████████████████████████████| 140 kB 59.1 MB/s 
Installing collected packages: urllib3, jmespath, botocore, s3transfer, boto3
  Attempting uninstall: urllib3
    Found existing installation: urllib3 1.24.3
    Uninstalling urllib3-1.24.3:
      Succe

# Define helper functions

In [5]:
def improve_column_names(df):
    """Normalize the column labels of ``df`` in place.

    Each label is stripped of surrounding whitespace, lower-cased, has double
    quotes, single quotes and parentheses removed, and has spaces replaced by
    underscores (e.g. ``' "Translation Id" '`` becomes ``'translation_id'``).

    Args:
        df: pandas DataFrame whose ``columns`` are rewritten.

    Returns:
        None. ``df`` is modified in place.
    """
    cleaned = df.columns.str.strip().str.lower()
    # regex=False: these are literal characters. Without it, '(' and ')' are
    # interpreted according to the pandas version's default for `regex`, which
    # has changed over time and used to emit a FutureWarning.
    for unwanted in ('"', "'", '(', ')'):
        cleaned = cleaned.str.replace(unwanted, '', regex=False)
    df.columns = cleaned.str.replace(' ', '_', regex=False)


def log_and_print(s, type='info'):
    """Write a timestamped line to the run log and echo the message to stdout.

    The log line has the form ``"<TYPE>: <YYYY-MM-DD HH:MM:SS> <message>"``.

    Args:
        s: Message to record.
        type: Severity label; it is upper-cased in the log line. The default
            was previously spelled with an accented ``í``, which produced
            ``ÍNFO`` in the log file.

    Note:
        Writes to the module-level ``log_file`` handle, which must already be open.
    """
    timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    log_file.write(f"{type.upper()}: {timestamp} {s}\n")
    print(s)


def write_copyrights(translation_id, license_type, copyright_holder):
    """Insert or update one translation's entry in the copyrights CSV.

    The CSV at ``ebible_copyrights_csv`` has the columns ``translationId``,
    ``licenseType`` and ``copyrightHolder``. If a row for ``translation_id``
    exists, its license and holder are overwritten; otherwise a new row is
    added. The file is rewritten sorted by ``translationId`` and is created if
    it does not exist yet.

    Args:
        translation_id: Identifier of the translation (row key).
        license_type: License label to store; may be None.
        copyright_holder: Copyright holder to store; may be None.
    """
    header = ['translationId', 'licenseType', 'copyrightHolder']

    if exists(ebible_copyrights_csv):
        # dtype=str keeps ids textual and stops an all-empty column from being
        # read back as float, which would clash with string values.
        existing = pd.read_csv(ebible_copyrights_csv, dtype=str)
    else:
        existing = pd.DataFrame(columns=header)

    # Work on plain records: DataFrame.append was removed in pandas 2.0, and
    # this also avoids concatenating with an empty frame.
    rows = existing.to_dict("records")
    updated = False
    for row in rows:
        if row.get('translationId') == translation_id:
            row['licenseType'] = license_type
            row['copyrightHolder'] = copyright_holder
            updated = True
    if not updated:
        rows.append({
            'translationId': translation_id,
            'licenseType': license_type,
            'copyrightHolder': copyright_holder,
        })

    # Keep any columns already in the file and make sure the three standard ones exist.
    columns = list(existing.columns) + [c for c in header if c not in existing.columns]
    df = pd.DataFrame(rows, columns=columns)
    df = df.sort_values(by=['translationId'])
    df.to_csv(ebible_copyrights_csv, index=False)


def get_copyrights(project):
    """Read license and copyright-holder information from a project's ``copr.htm``.

    The license type is taken from the first link whose ``href`` contains
    "creativecommons": for ``.../licenses/<type>/<version>/`` it is ``<type>``,
    and for a ``.../licenses/by.../`` link that does not fit that shape it is
    ``"by"``. The copyright holder is read from the text of the first ``<p>``
    in the body: when the first string contains "copyright ©" the holder is the
    string that follows it, and any string containing "Public Domain" sets the
    holder to "Public Domain".

    Args:
        project: Path of the unpacked project folder containing ``copr.htm``.

    Returns:
        Tuple ``(license_type, copyright_holder)``; either item is None when it
        could not be determined.
    """
    copyright_info_file = project / "copr.htm"

    license_type = None
    copyright_holder = None

    with open(copyright_info_file, "r", encoding="utf-8") as copr:
        soup = BeautifulSoup(copr.read(), "lxml")

    cclink = soup.find(href=regex.compile("creativecommons"))
    ref = cclink.get("href") if cclink else None
    if ref:
        cc_match = regex.match(
            r".*?/licenses/(?P<type>.*?)/(?P<version>.*)/", ref
        )
        if cc_match:
            license_type = cc_match["type"]
        elif regex.match(r".*?/licenses/by(?P<version>.*)/", ref):
            license_type = "by"

    # A page without <body> or without a <p> used to raise AttributeError here.
    first_paragraph = soup.body.p if soup.body else None
    copy_strings = list(first_paragraph.stripped_strings) if first_paragraph else []

    for i, copy_string in enumerate(copy_strings):
        # The holder is the string after the "copyright ©" marker. Check that it
        # exists; indexing past the end used to raise IndexError.
        if i == 0 and "copyright ©" in copy_string and len(copy_strings) > 1:
            copyright_holder = copy_strings[1]
        if "Public Domain" in copy_string:
            copyright_holder = "Public Domain"

    return license_type, copyright_holder


def get_extracted_projects(dir_extracted):
    """List the project ids that already have an extraction file.

    A file counts as an extraction when its name ends in ``-<id>.txt``; the
    returned id is the part between the last ``-`` and the ``.txt`` suffix.

    Args:
        dir_extracted: Directory holding the extraction files.

    Returns:
        List of project ids, in the order the files are listed.
    """
    # The dot before "txt" is escaped so that only real ".txt" names match;
    # unescaped it accepted any character there (e.g. "abc-deftxt").
    pattern = re.compile(r".+-(.+)\.txt$")
    matches = map(pattern.search, listdir(dir_extracted))
    return [m.group(1) for m in matches if m]


def get_books_type(files):
    """Classify a project by the books present in its file names.

    Args:
        files: Iterable of file names in the project folder.

    Returns:
        "OT+NT" if any name contains "GEN" or "JON", otherwise "NT".
    """
    has_ot_marker = any("GEN" in name or "JON" in name for name in files)
    return "OT+NT" if has_ot_marker else "NT"


def get_conclusion(versification):
    """Return the detected versification code, falling back to English.

    Args:
        versification: Detected code, or "" when detection was inconclusive.

    Returns:
        ``versification`` unchanged unless it equals "", in which case "4".
    """
    # "4" is the code for English versification
    return "4" if versification == "" else versification


def conclude_versification_from_OT(dan_3, dan_5, dan_13):
    """Infer the versification code from last-verse numbers in Daniel.

    Args:
        dan_3: Last verse number of Daniel 3.
        dan_5: Last verse number of Daniel 5.
        dan_13: Last verse number of Daniel 13.

    Returns:
        The versification code as a string, or "" when no rule applies.
    """
    if dan_3 == 30:
        return "4"  # English
    if dan_3 == 33:
        if dan_5 == 30:
            return "1"  # Original
        if dan_5 == 31:
            return "5"  # Russian Protestant
        return ""
    if dan_3 == 97:
        return "2"  # Septuagint
    if dan_3 == 100:
        # Vulgate when Daniel 13 ends at verse 65, otherwise Russian Orthodox
        return "3" if dan_13 == 65 else "6"
    return ""


def conclude_versification_from_NT(jhn_6, act_19, rom_16):
    """Infer the versification code from last-verse numbers in the New Testament.

    The rules are checked in order and the first one that applies wins.

    Args:
        jhn_6: Last verse number of John 6.
        act_19: Last verse number of Acts 19.
        rom_16: Last verse number of Romans 16.

    Returns:
        The versification code as a string, or "" when no rule applies.
    """
    if jhn_6 == 72:
        return "3"  # Vulgate
    if act_19 == 41:
        return "4"  # English
    if rom_16 == 24:
        return "6"  # Russian Orthodox (same as Russian Protestant)
    if jhn_6 == 71 and act_19 == 40:
        return "1"  # Original (Same as Septuagint)
    return ""


def get_last_verse(project, book, chapter):
    """Return the last verse number of a chapter in a project's book file.

    The first file in ``project`` whose name contains ``book`` and that can be
    opened is scanned line by line. A ``\\c <n>`` marker switches the current
    chapter, and every ``\\v <n>`` marker seen while inside the requested
    chapter updates the result (only the first ``\\v`` on a line is read).

    Args:
        project: Project folder (path or string).
        book: Book code to look for in the file names, e.g. "DAN".
        chapter: Chapter number to inspect.

    Returns:
        The last verse number as an int; 0 if the chapter has no verses or is
        absent from the file; None if no matching file could be opened or
        reading the file failed.
    """
    ch = str(chapter)
    chapter_marker = re.compile(r"\\c ? ?([0-9]+).*")
    verse_marker = re.compile(r"\\v ? ?([0-9]+).*")

    for book_file in iglob(f"{project}/*{book}*"):
        last_verse = "0"
        try:
            f = codecs.open(book_file, "r", encoding="utf-8", errors="ignore")
        except Exception as e:
            print(f"Could not open {book_file}, reason:  {e}")
            continue
        try:
            # `with` closes the handle on every path. Previously it was never
            # closed, leaking one descriptor per call.
            with f:
                in_chapter = False
                for line in f:
                    m = chapter_marker.search(line)
                    if m:
                        in_chapter = m.group(1) == ch

                    m = verse_marker.search(line)
                    if m and in_chapter:
                        last_verse = m.group(1)
        except Exception as e:
            print(f"Something went wrong in reading {book_file}, reason:  {e}")
            return None
        # last_verse is "0" or a run of ASCII digits captured by the regex,
        # so the conversion cannot fail.
        return int(last_verse)

    # No matching file, or none that could be opened.
    return None


def get_checkpoints_OT(project):
    """Collect the Old Testament checkpoint values for versification detection.

    Args:
        project: Project folder to inspect.

    Returns:
        Tuple of the last verse numbers of Daniel 3, Daniel 5 and Daniel 13,
        in that order, as returned by ``get_last_verse``.
    """
    return tuple(get_last_verse(project, "DAN", chapter) for chapter in (3, 5, 13))


def get_checkpoints_NT(project):
    """Collect the New Testament checkpoint values for versification detection.

    Args:
        project: Project folder to inspect.

    Returns:
        Tuple of the last verse numbers of John 6, Acts 19 and Romans 16,
        in that order, as returned by ``get_last_verse``.
    """
    checkpoints = (("JHN", 6), ("ACT", 19), ("ROM", 16))
    return tuple(get_last_verse(project, book, chapter) for book, chapter in checkpoints)


def get_versification(project):
    """Detect the versification code of a project.

    When the project contains Old Testament books, the Daniel checkpoints are
    tried first. If they give no answer, or the project is NT-only, the New
    Testament checkpoints are used.

    Args:
        project: Project folder to inspect.

    Returns:
        The versification code as a string, or "" when it cannot be concluded.
    """
    book_scope = get_books_type(listdir(project))

    if book_scope == "OT+NT":
        from_ot = conclude_versification_from_OT(*get_checkpoints_OT(project))
        if from_ot:
            return from_ot

    return conclude_versification_from_NT(*get_checkpoints_NT(project))


def add_settings_file(project, language_code):
    """Write a ``Settings.xml`` file into the project folder.

    The file has a ``ScriptureText`` root holding the detected versification
    (with the English fallback applied), the language code followed by
    ``":::"``, and a ``Naming`` element describing the book file names.

    Args:
        project: Path of the project folder; its name is used in ``PostPart``.
        language_code: Language code of the translation.
    """
    root = ET.Element("ScriptureText")

    versification_node = ET.SubElement(root, "Versification")
    versification_node.text = get_conclusion(get_versification(project))

    language_node = ET.SubElement(root, "LanguageIsoCode")
    language_node.text = language_code + ":::"

    naming_attributes = {
        "BookNameForm": "41-MAT",
        "PostPart": project.name + ".usfm",
        "PrePart": "",
    }
    ET.SubElement(root, "Naming", naming_attributes)

    settings_path = project / "Settings.xml"
    ET.ElementTree(root).write(settings_path)


def get_language_code(project):
    """Look up a project's language code in the translations CSV.

    The CSV at ``ebible_translations_csv`` is loaded, its column names are
    normalized with ``improve_column_names``, and the row whose
    ``translationid`` equals the project folder name is read.

    Args:
        project: Path of the project folder; its name is the translation id.

    Returns:
        The value of the ``languagecode`` column for that translation.
    """
    table = pd.read_csv(ebible_translations_csv)
    improve_column_names(table)
    by_translation = table.set_index("translationid")
    project_row = by_translation.loc[project.name]
    return project_row['languagecode']


def is_redistributable(project, licence_type, copyright_holder):
    """Decide whether a project may be extracted for redistribution.

    A project qualifies when the translations CSV marks it as redistributable
    and either its license type is one of "by-nc-nd", "by-nd" or "by-sa", or
    its copyright holder is "Public Domain".

    Args:
        project: Path of the project folder; its name is the translation id.
        licence_type: License type found for the project, or None.
        copyright_holder: Copyright holder found for the project, or None.

    Returns:
        The (falsy) ``redistributable`` value from the CSV when that is falsy,
        otherwise a bool telling whether the license or holder is acceptable.
    """
    table = pd.read_csv(ebible_translations_csv)
    improve_column_names(table)
    flagged = table.set_index("translationid").loc[project.name]['redistributable']

    if not flagged:
        return flagged

    accepted_licences = ("by-nc-nd", "by-nd", "by-sa")
    return licence_type in accepted_licences or copyright_holder == "Public Domain"


def unzip(zip, unzip):
    """Unpack an archive into a target folder, creating the folder if needed.

    Args:
        zip: Path of the archive to unpack.
        unzip: ``pathlib.Path`` of the destination folder; missing parent
            folders are created as well.
    """
    # Create the destination first so the archive is unpacked straight into it.
    unzip.mkdir(exist_ok=True, parents=True)
    shutil.unpack_archive(filename=zip, extract_dir=unzip)

# Extract projects

In [9]:
from pandas._libs.lib import get_level_sorter
log_file = open(ebible_logs / f"run_{date.today()}.log", "a")
log_and_print(f"Starting extracting eBible projects...")

# Tell the SIL NLP tools where to find the resources
environ['SIL_NLP_DATA_PATH'] = base
environ['SIL_NLP_MT_DIR'] = "extractions"

# Tell Python where to find our repo
environ['PYTHONPATH'] = "/env/python:/content/silnlp"

makedirs(ebible_extractions, exist_ok=True)
makedirs(ebible_temp, exist_ok=True)

extracted = get_extracted_projects(ebible_extractions)
nr_extracted = len(extracted)

for download in sorted(ebible_downloads.glob("[a-zA-Z0-9]*")):
    name = download.name[0:download.name.find("_usfm.zip")]
    if not name in extracted or rewrite:
        project = ebible_temp / name
        unzip(download, project)

        language_code = get_language_code(project)
        licence_type, copyright_holder = get_copyrights(project)
        write_copyrights(name, licence_type, copyright_holder)

        if is_redistributable(project, licence_type, copyright_holder):
            log_and_print(f"extracting {project}")
            add_settings_file(project, language_code)
            !python -m silnlp.common.extract_corpora "{project}"
            shutil.rmtree(project)

log_and_print(f"{len(get_extracted_projects(ebible_extractions)) - nr_extracted} new eBible projects extracted")
log_and_print(f"Rewrite {rewrite}")
log_file.close()
shutil.rmtree(ebible_temp)

Starting extracting eBible projects...
extracting /content/drive/MyDrive/eBible/temp/aai
2022-11-30 13:08:36,398 - silnlp.common.environment - INFO - Using workspace: /content/drive/MyDrive/eBible as per environment variable SIL_NLP_DATA_PATH.
2022-11-30 13:08:36,450 - silnlp.common.extract_corpora - INFO - Extracting /content/drive/MyDrive/eBible/temp/aai...
2022-11-30 13:08:42,348 - silnlp.common.extract_corpora - INFO - # of Verses: 41899
2022-11-30 13:08:42,349 - silnlp.common.extract_corpora - INFO - # of Terms: 0
2022-11-30 13:08:42,349 - silnlp.common.extract_corpora - INFO - Done.
extracting /content/drive/MyDrive/eBible/temp/aaz
2022-11-30 13:08:44,399 - silnlp.common.environment - INFO - Using workspace: /content/drive/MyDrive/eBible as per environment variable SIL_NLP_DATA_PATH.
2022-11-30 13:08:44,421 - silnlp.common.extract_corpora - INFO - Extracting /content/drive/MyDrive/eBible/temp/aaz...
2022-11-30 13:08:49,815 - silnlp.common.extract_corpora - INFO - # of Verses: 418