<a href="https://colab.research.google.com/github/vesaakerman/hello-world/blob/master/notebooks/eBible_Extract_projects.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive at /content/drive so that paths below that mount point can be used.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Define base folder

In [None]:
# Root folder of the eBible corpus on the mounted shared drive.
base = "/content/drive/Shareddrives/Partnership for Applied Biblical NLP/Data/Corpora/ebible"

# Import modules, define rewrite boolean and directory paths

In [None]:
from pathlib import Path
from datetime import date, datetime
from os import listdir, makedirs, environ
from os.path import exists
from glob import iglob
from bs4 import BeautifulSoup
import pandas as pd
import shutil
import warnings
import xml.etree.ElementTree as ET
import re
import codecs
import regex

# Silence FutureWarning messages for the rest of the session.
warnings.simplefilter(action="ignore", category=FutureWarning)

# When True, a project is extracted again even if an extraction already exists.
rewrite = False

corpus = Path(base)

# Locations inside the corpus folder.
ebible_downloads = corpus / "downloads"
ebible_extractions = corpus / "extractions" / "scripture"
ebible_translations_csv = corpus / "metadata" / "translations.csv"
ebible_logs = corpus / "logs"
ebible_temp = corpus / "temp"

# Echo the configuration, one value per line, so it shows up in the cell output.
print(
    ebible_downloads,
    ebible_extractions,
    ebible_translations_csv,
    ebible_logs,
    ebible_temp,
    f"rewrite = {rewrite}",
    sep="\n",
)

/content/drive/Shareddrives/Partnership for Applied Biblical NLP/Data/Corpora/ebible/downloads
/content/drive/Shareddrives/Partnership for Applied Biblical NLP/Data/Corpora/ebible/extractions/scripture
/content/drive/Shareddrives/Partnership for Applied Biblical NLP/Data/Corpora/ebible/metadata/translations.csv
/content/drive/Shareddrives/Partnership for Applied Biblical NLP/Data/Corpora/ebible/logs
/content/drive/Shareddrives/Partnership for Applied Biblical NLP/Data/Corpora/ebible/temp
rewrite = False


# Install packages and clone the silnlp repo

In [None]:
!pip install python-dotenv
!pip install sil-machine
!pip install boto3
!pip install s3path
!pip install requests

!git clone https://github.com/sillsdev/silnlp

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
fatal: destination path 'silnlp' already exists and is not an empty directory.


# Define methods

In [None]:
def improve_column_names(df):
    """Normalise the column names of ``df`` in place.

    Each name is stripped of surrounding whitespace, lower-cased, freed of
    double quotes, single quotes and parentheses, and finally has its spaces
    replaced by underscores.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose ``columns`` attribute is overwritten.

    Returns
    -------
    None
    """
    cleaned = df.columns.str.strip().str.lower()
    # regex=False makes every pattern a literal on all pandas versions; "(" and
    # ")" are regex metacharacters and the default of ``regex`` has changed
    # between pandas releases.
    for unwanted in ('"', "'", "(", ")"):
        cleaned = cleaned.str.replace(unwanted, "", regex=False)
    df.columns = cleaned.str.replace(" ", "_", regex=False)


def log_and_print(s, type='info'):
    """Append a timestamped line to the open log file and echo ``s`` to stdout.

    Parameters
    ----------
    s : str
        Message to record.
    type : str, optional
        Severity label; it is written upper-cased at the start of the log
        line. Defaults to ``'info'``.

    Notes
    -----
    Writes to the module-level ``log_file`` object, which must already be open.
    """
    timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    log_file.write(f"{type.upper()}: {timestamp} {s}\n")
    print(s)


def get_copyrights(project):
    """Read license type and copyright holder from a project's ``copr.htm``.

    Parameters
    ----------
    project : pathlib.Path
        Project directory; ``copr.htm`` is looked up directly inside it.

    Returns
    -------
    tuple
        ``(license_type, copyright_holder)``. ``license_type`` is the part of
        the first Creative Commons link that follows ``/licenses/`` (for
        example ``"by-sa"``), or ``None``. ``copyright_holder`` is the string
        following a leading "copyright ©" string in the first paragraph,
        ``"Public Domain"`` when that phrase occurs in the paragraph, or
        ``None``.
    """
    copyright_info_file = project / "copr.htm"

    license_type = None
    copyright_holder = None

    # Without a copyright page nothing can be determined; report "unknown"
    # instead of raising so that one incomplete project does not abort a batch.
    if not exists(copyright_info_file):
        print(f"No copyright file found: {copyright_info_file}")
        return license_type, copyright_holder

    with open(copyright_info_file, "r", encoding="utf-8") as copr:
        soup = BeautifulSoup(copr.read(), "lxml")

    # License type: taken from the first link whose href mentions creativecommons.
    cclink = soup.find(href=regex.compile("creativecommons"))
    ref = cclink.get("href") if cclink else None
    if ref:
        cc_match = regex.match(
            r".*?/licenses/(?P<type>.*?)/(?P<version>.*)/", ref
        )
        if cc_match:
            license_type = cc_match["type"]
        elif regex.match(r".*?/licenses/by(?P<version>.*)/", ref):
            # Fallback for links where "by" is not followed by a separate
            # version path segment.
            license_type = "by"

    # Copyright holder: taken from the text of the first paragraph, if any.
    first_paragraph = soup.body.p if soup.body else None
    copy_strings = list(first_paragraph.stripped_strings) if first_paragraph else []

    for i, copy_string in enumerate(copy_strings):
        # The holder is the string right after the leading "copyright ©";
        # only read it when such a string actually exists.
        if i == 0 and "copyright ©" in copy_string and len(copy_strings) > 1:
            copyright_holder = copy_strings[1]
        if "Public Domain" in copy_string:
            copyright_holder = "Public Domain"

    return license_type, copyright_holder


def get_extracted_projects(dir_extracted):
    """List the project ids that already have an extraction file.

    Extraction files are expected to be named ``<prefix>-<project>.txt``. The
    prefix ends at the FIRST hyphen, so project ids that themselves contain a
    hyphen (``en-eng-kjv.txt`` -> ``eng-kjv``) are returned whole.
    NOTE(review): this assumes the prefix never contains a hyphen - confirm
    against the names the extraction tool produces.

    Parameters
    ----------
    dir_extracted : str or pathlib.Path
        Directory containing the extraction files.

    Returns
    -------
    list of str
        Project ids, in the order ``listdir`` returns the files.
    """
    extracted = []
    for file_name in listdir(dir_extracted):
        # "\." matches only a literal dot, so names merely ending in "?txt" are ignored.
        m = re.fullmatch(r"[^-]+-(.+)\.txt", file_name)
        if m:
            extracted.append(m.group(1))

    return extracted


def get_books_type(files):
    """Classify a project by the book files it contains.

    Returns ``"OT+NT"`` as soon as one file name contains ``GEN`` or ``JON``,
    otherwise ``"NT"``.
    """
    has_ot_book = any(re.search(r".*GEN|JON.*", file_name) for file_name in files)
    return "OT+NT" if has_ot_book else "NT"


def get_conclusion(versification):
    """Return ``versification`` unchanged, or ``"4"`` (English) when it is the empty string."""
    return "4" if versification == "" else versification


def conclude_versification_from_OT(dan_3, dan_5, dan_13):
    """Derive the versification code from the last verse numbers of Daniel 3, 5 and 13.

    Returns one of the codes ``"1"`` .. ``"6"`` as a string, or ``""`` when the
    verse counts match none of the known patterns.
    """
    if dan_3 == 30:
        return "4"  # English
    if dan_3 == 33:
        if dan_5 == 30:
            return "1"  # Original
        if dan_5 == 31:
            return "5"  # Russian Protestant
        return ""
    if dan_3 == 97:
        return "2"  # Septuagint
    if dan_3 == 100:
        # Daniel 13 tells Vulgate apart from Russian Orthodox.
        return "3" if dan_13 == 65 else "6"
    return ""


def conclude_versification_from_NT(jhn_6, act_19, rom_16):
    """Derive the versification code from the last verse numbers of John 6, Acts 19 and Romans 16.

    The checks are applied in a fixed order and the first hit wins. Returns a
    code as a string, or ``""`` when nothing matches.
    """
    if jhn_6 == 72:
        return "3"  # Vulgate
    if act_19 == 41:
        return "4"  # English
    if rom_16 == 24:
        return "6"  # Russian Orthodox (same as Russian Protestant)
    if jhn_6 == 71 and act_19 == 40:
        return "1"  # Original (Same as Septuagint)
    return ""


def get_last_verse(project, book, chapter):
    r"""Return the number of the last ``\v`` marker in one chapter of a book.

    The first file matching ``<project>/*<book>*`` that can be opened is
    scanned line by line: a ``\c`` marker switches the "inside the requested
    chapter" state on or off, and every ``\v`` marker seen while inside the
    chapter updates the result.

    Parameters
    ----------
    project : str or pathlib.Path
        Directory holding the book files.
    book : str
        Text that must occur in the book's file name, e.g. ``"DAN"``.
    chapter : int or str
        Chapter number to look for.

    Returns
    -------
    int or None
        The last verse number found, ``0`` when the chapter (or any verse in
        it) is absent from the file, and ``None`` when no file matched or the
        file could not be read.
    """
    ch = str(chapter)

    for book_file in iglob(f"{project}/*{book}*"):
        last_verse = "0"
        try:
            f = codecs.open(book_file, "r", encoding="utf-8", errors="ignore")
        except Exception as e:
            print(f"Could not open {book_file}, reason:  {e}")
            continue
        try:
            # The with-block guarantees the handle is closed on every exit path.
            with f:
                in_chapter = False
                for line in f:
                    m = re.search(r"\\c ? ?([0-9]+)", line)
                    if m:
                        in_chapter = m.group(1) == ch

                    if in_chapter:
                        # A line may carry several verses; the last one counts.
                        verses = re.findall(r"\\v ? ?([0-9]+)", line)
                        if verses:
                            last_verse = verses[-1]
        except Exception as e:
            print(f"Something went wrong in reading {book_file}, reason:  {e}")
            return None
        try:
            return int(last_verse)
        except Exception as e:
            print(f"Could not convert {last_verse} into an integer in {book_file}, reason:  {e}")
            return None

    # No file matched, or none of the matching files could be opened.
    return None


def get_checkpoints_OT(project):
    """Return the last verse numbers of Daniel 3, 5 and 13 as a 3-tuple."""
    return tuple(get_last_verse(project, "DAN", chapter) for chapter in (3, 5, 13))


def get_checkpoints_NT(project):
    """Return the last verse numbers of John 6, Acts 19 and Romans 16 as a 3-tuple."""
    checkpoints = (("JHN", 6), ("ACT", 19), ("ROM", 16))
    return tuple(get_last_verse(project, book, chapter) for book, chapter in checkpoints)


def get_versification(project):
    """Determine the versification code of a project.

    When the project contains Old Testament books, the Daniel checkpoints are
    tried first; if they give no answer (or there is no Old Testament), the
    New Testament checkpoints decide. Returns the code as a string, or ``""``
    when neither set of checkpoints matches.
    """
    if get_books_type(listdir(project)) == "OT+NT":
        from_ot = conclude_versification_from_OT(*get_checkpoints_OT(project))
        if from_ot:
            return from_ot

    return conclude_versification_from_NT(*get_checkpoints_NT(project))


def add_settings_file(project, language_code):
    """Write a ``Settings.xml`` file into the project directory.

    The file has a ``ScriptureText`` root holding the project's versification
    code, the language code followed by ``:::``, and a ``Naming`` element that
    describes the book file names of the project.
    """
    versification = get_conclusion(get_versification(project))

    settings = ET.Element("ScriptureText")

    versification_node = ET.SubElement(settings, "Versification")
    versification_node.text = versification

    language_node = ET.SubElement(settings, "LanguageIsoCode")
    language_node.text = language_code + ":::"

    naming_attributes = {
        "BookNameForm": "41-MAT",
        "PostPart": project.name + ".usfm",
        "PrePart": "",
    }
    ET.SubElement(settings, "Naming", naming_attributes)

    ET.ElementTree(settings).write(project / "Settings.xml")


def get_language_code(project):
    """Look up the language code of a project in the translations CSV.

    The CSV is read anew on every call, its column names are normalised, and
    the row whose ``translationid`` equals the project's directory name
    supplies the ``languagecode`` value that is returned.
    """
    translations = pd.read_csv(ebible_translations_csv)
    improve_column_names(translations)
    by_translation_id = translations.set_index("translationid")

    return by_translation_id.loc[project.name]['languagecode']


def is_redistributable(project):
    """Tell whether a project may be redistributed.

    A project qualifies when the translations CSV marks it as redistributable
    AND its copyright page either carries one of the accepted Creative Commons
    license types or names "Public Domain" as the copyright holder.
    """
    ok_copyrights = ["by-nc-nd", "by-nd", "by-sa"]

    translations = pd.read_csv(ebible_translations_csv)
    improve_column_names(translations)
    by_translation_id = translations.set_index("translationid")
    redistributable = by_translation_id.loc[project.name]['redistributable']

    licence_type, copyright_holder = get_copyrights(project)
    has_accepted_license = licence_type in ok_copyrights

    return redistributable and (has_accepted_license or copyright_holder == "Public Domain")


def unzip(zip, unzip):
    """Unpack the archive ``zip`` into the directory ``unzip``.

    The target directory, including missing parents, is created first; an
    already existing directory is accepted.
    """
    unzip.mkdir(exist_ok=True, parents=True)
    shutil.unpack_archive(filename=zip, extract_dir=unzip)

# Extract projects

In [None]:
log_file = open(ebible_logs / f"run_{date.today()}.log", "a")
log_and_print(f"Starting extracting eBible projects...")

# Tell the SIL NLP tools where to find the resources
environ['SIL_NLP_DATA_PATH'] = base
environ['SIL_NLP_MT_DIR'] = "extractions"

# Tell Python where to find our repo
environ['PYTHONPATH'] = "/env/python:/content/silnlp"

makedirs(ebible_extractions, exist_ok=True)
makedirs(ebible_temp, exist_ok=True)

extracted = get_extracted_projects(ebible_extractions)
nr_extracted = len(extracted)

for download in sorted(ebible_downloads.glob("[a-zA-Z0-9]*")):
    name = download.name[0:download.name.find("_usfm.zip")]
    if not name in extracted or rewrite:
        project = ebible_temp / name
        unzip(download, project)
        if is_redistributable(project):
            log_and_print(f"extracting {project}")
            add_settings_file(project, get_language_code(project))
            !python -m silnlp.common.extract_corpora "{project}"
            shutil.rmtree(project)
            break

log_and_print(f"{len(get_extracted_projects(ebible_extractions)) - nr_extracted} new eBible projects extracted")
log_and_print(f"Rewrite {rewrite}")
log_file.close()
shutil.rmtree(ebible_temp)

Starting extracting eBible projects...
These are the initial column names: Index(['languageCode', 'translationId', 'languageName',
       'languageNameInEnglish', 'dialect', 'homeDomain', 'title',
       'description', 'Redistributable', 'Copyright', 'UpdateDate',
       'publicationURL', 'OTbooks', 'OTchapters', 'OTverses', 'NTbooks',
       'NTchapters', 'NTverses', 'DCbooks', 'DCchapters', 'DCverses', 'FCBHID',
       'Certified', 'inScript', 'swordName', ' "rodCode"', ' "textDirection"',
       ' "downloadable"', ' "font"', ' "shortTitle"', ' "PODISBN"',
       ' "script"'],
      dtype='object')
These are the fixed column names: Index(['languagecode', 'translationid', 'languagename',
       'languagenameinenglish', 'dialect', 'homedomain', 'title',
       'description', 'redistributable', 'copyright', 'updatedate',
       'publicationurl', 'otbooks', 'otchapters', 'otverses', 'ntbooks',
       'ntchapters', 'ntverses', 'dcbooks', 'dcchapters', 'dcverses', 'fcbhid',
       'certifi

FileNotFoundError: ignored