<a href="https://colab.research.google.com/github/vesaakerman/hello-world/blob/master/notebooks/eBible_Extract_projects.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Mount Google Drive at /content/drive; every path used in this notebook lives under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Define base folder

In [4]:
# Root folder of the eBible corpus on the mounted Drive. The directory paths are
# built from it, and it is also exported as SIL_NLP_DATA_PATH before extraction.
base = "/content/drive/MyDrive/eBible/"

# Import modules and define the rewrite flag and directory paths

In [6]:
from pathlib import Path
from datetime import date, datetime
from os import listdir, makedirs, environ
from os.path import exists
import pandas as pd
import shutil
import warnings
import xml.etree.ElementTree as ET
import re

# FutureWarning messages are suppressed for the rest of the session.
warnings.simplefilter('ignore', FutureWarning)

# When True, a project that already exists in the target location is overwritten.
rewrite = True

# Locations inside the eBible corpus folder.
corpus = Path(base)
ebible_projects = corpus.joinpath('projects')
ebible_metadata = corpus.joinpath('metadata')
ebible_translations_csv = ebible_metadata.joinpath('translations.csv')
ebible_copyright_csv = ebible_metadata.joinpath('copyrights.csv')
ebible_redistributable = corpus.joinpath("redistributable/projects")
ebible_extractions = corpus.joinpath("MT/scripture")

# Shared-drive folder that holds the Paratext projects.
paratext_projects = Path("/content/drive/Shareddrives/Partnership for Applied Biblical NLP/Data/Corpora/Paratext_projects")

# Echo the configuration, one value per line.
print(
    ebible_projects,
    ebible_metadata,
    ebible_translations_csv,
    ebible_copyright_csv,
    ebible_redistributable,
    ebible_extractions,
    paratext_projects,
    sep="\n",
)
print(f"rewrite = {rewrite}")

/content/drive/MyDrive/eBible/projects
/content/drive/MyDrive/eBible/metadata
/content/drive/MyDrive/eBible/metadata/translations.csv
/content/drive/MyDrive/eBible/metadata/copyrights.csv
/content/drive/MyDrive/eBible/redistributable/projects
/content/drive/MyDrive/eBible/MT/scripture
/content/drive/Shareddrives/Partnership for Applied Biblical NLP/Data/Corpora/Paratext_projects
rewrite = True


# Install packages and clone the silnlp repo

In [None]:
# Install the required packages into the runtime (no versions are pinned).
!pip install python-dotenv
!pip install sil-machine
!pip install boto3
!pip install s3path
!pip install requests

# Fetch the silnlp source; the extraction cell puts /content/silnlp on PYTHONPATH.
!git clone https://github.com/sillsdev/silnlp

# Define methods

In [8]:
# Columns are easier to use if they are valid python identifiers:
def improve_column_names(df):
    """Normalise the column labels of ``df`` in place.

    Each label is stripped of surrounding whitespace, lower-cased, has double
    quotes, single quotes and parentheses removed, and has every space
    replaced by an underscore.

    All replacements are explicitly literal (``regex=False``), so "(" and ")"
    are never interpreted as regular-expression syntax regardless of the
    default used by the installed pandas version.

    Args:
        df: pandas DataFrame whose ``columns`` are rewritten; the labels must
            be strings.

    Returns:
        None. ``df`` is modified in place.
    """
    cleaned = df.columns.str.strip().str.lower()
    for unwanted in ('"', "'", "(", ")"):
        cleaned = cleaned.str.replace(unwanted, "", regex=False)
    df.columns = cleaned.str.replace(" ", "_", regex=False)


def log_and_print(s, type='info'):
    """Append a timestamped entry to the run log and echo the message to stdout.

    Args:
        s: message text.
        type: "error" writes the entry with an ``ERROR:`` prefix; any other
            value writes it with ``INFO:``. The parameter name shadows the
            builtin ``type`` but is kept so keyword callers keep working.

    Relies on the module-level ``log_file`` being an open, writable text file.
    """
    level = "ERROR" if type == "error" else "INFO"
    log_file.write(f"{level}: {datetime.now()} {s}\n")
    print(s)


def add_settings_file(project, language_code):
    """Create a minimal ``Settings.xml`` inside a project folder.

    The file declares versification ``4``, the given language code followed
    by ``:::``, and a ``41-MAT`` book-naming scheme with an empty prefix and
    the suffix ``<project folder name>.usfm``.

    The XML is built in memory and written once, so no file handle is left
    open and no half-written file has to be parsed back.

    Args:
        project: ``pathlib.Path`` of the existing project folder; its ``name``
            becomes part of the file-name suffix.
        language_code: language code stored in ``LanguageIsoCode``.

    Raises:
        OSError: if ``Settings.xml`` cannot be written, e.g. the folder does
            not exist (``FileNotFoundError``).
    """
    # The language code and the PostPart suffix are placeholders filled in below.
    settings_template = """<ScriptureText>
    <Versification>4</Versification>
    <LanguageIsoCode>aak:::</LanguageIsoCode>
    <Naming BookNameForm="41-MAT" PostPart=".usfm" PrePart="" />
</ScriptureText>"""

    root = ET.fromstring(settings_template)

    # ".usfm" becomes "<project name>.usfm".
    naming = root.find('.//Naming')
    naming.attrib['PostPart'] = naming.attrib['PostPart'].replace('.', project.name + '.')

    root.find('.//LanguageIsoCode').text = f"{language_code}:::"

    ET.ElementTree(root).write(project / 'Settings.xml')


def get_matching_paratext_project_name(paratext_projects, project_name):
    """Find the Paratext project folder that corresponds to a project name.

    Candidates are tried in order and the first one that exists wins:
      1. the name unchanged;
      2. for names starting with "eng-" only: the name with "eng-" replaced
         by "eng";
      3. the part of that replaced name that precedes its first run of digits.

    Args:
        paratext_projects: ``pathlib.Path`` of the folder holding the
            Paratext projects.
        project_name: project name to look up.

    Returns:
        The matching folder name, or None when no candidate exists.
    """
    if exists(paratext_projects / project_name):
        return project_name
    if not project_name.startswith("eng-"):
        return None

    compact_name = project_name.replace("eng-", "eng")
    if (paratext_projects / compact_name).exists():
        return compact_name

    digits_match = re.search('(.*?)[0-9]+', compact_name)
    if digits_match is None:
        return None

    stem = digits_match.group(1)
    if stem and (paratext_projects / stem).exists():
        return stem
    return None


def get_matching_paratext_projects():
    """Pair each entry of the redistributable folder with its Paratext project.

    Returns:
        A list of ``(entry name, Paratext project name)`` tuples containing
        only the entries for which a matching Paratext project was found.
    """
    candidate_pairs = (
        (entry.name, get_matching_paratext_project_name(paratext_projects, entry.name))
        for entry in ebible_redistributable.glob("*")
    )
    return [pair for pair in candidate_pairs if pair[1]]


def get_versification_from_settings_file(settings_file):
    """Read the text of the ``Versification`` element of an XML settings file.

    A missing or malformed file is logged as an error instead of stopping the
    run; an absent element is handled with an explicit check rather than a
    bare ``except``.

    Args:
        settings_file: path of the settings file to read.

    Returns:
        The element text, or None when the file is missing, is not
        well-formed XML, or has no ``Versification`` element.
    """
    try:
        tree = ET.parse(settings_file)
    except FileNotFoundError:
        log_and_print(f"could not find file {settings_file}", "error")
        return None
    except ET.ParseError as error:
        log_and_print(f"could not parse {settings_file}: {error}", "error")
        return None

    # find() returns None (it does not raise) when the element is absent.
    versification_elem = tree.find('.//Versification')
    if versification_elem is None:
        return None
    return versification_elem.text


def modify_versification(settings_file, versification_code):
    """Set the ``Versification`` element of a settings file and save it in place.

    A missing file, a file that is not well-formed XML, or a file without a
    ``Versification`` element is reported through ``log_and_print`` as an
    error and left untouched.

    Args:
        settings_file: path of the XML settings file to update.
        versification_code: new text for the ``Versification`` element.
    """
    try:
        tree = ET.parse(settings_file)
    except FileNotFoundError:
        log_and_print(f"could not find file {settings_file}", "error")
        return
    except ET.ParseError as error:
        log_and_print(f"could not parse {settings_file}: {error}", "error")
        return

    # find() returns None (it does not raise) when the element is absent.
    versification = tree.find('.//Versification')
    if versification is None:
        log_and_print(f"no Versification element in {settings_file}", "error")
        return

    versification.text = versification_code
    tree.write(settings_file)


def copy_paratext_versification_info(matching_projects):
    """Carry versification details over from matching Paratext projects.

    For every pair, a versification other than "4" found in the Paratext
    project's ``Settings.xml`` is written into the eBible project's
    ``Settings.xml``, and a ``custom.vrs`` file in the Paratext project is
    copied into the eBible project folder.

    Args:
        matching_projects: iterable of ``(eBible project name, Paratext
            project name)`` pairs.
    """
    log_and_print("copying paratext versification information for matching projects...")

    for eBible_project, paratext_project in matching_projects:
        source_dir = paratext_projects / paratext_project
        target_dir = ebible_redistributable / eBible_project
        settings_file = source_dir / "Settings.xml"
        custom_vrs = source_dir / "custom.vrs"

        if settings_file.exists():
            versification = get_versification_from_settings_file(settings_file)
            # Nothing to copy when no value was read or it is already "4".
            if versification not in (None, "", "4"):
                log_and_print(f"copying versification {versification} from Paratext for project {eBible_project}")
                modify_versification(target_dir / "Settings.xml", versification)

        if custom_vrs.exists():
            log_and_print(f"copying custom versification file from Paratext for project {eBible_project}")
            shutil.copy(custom_vrs, target_dir)


def get_redistributable_projects():
    """Build the mapping of redistributable translations to language codes.

    Reads the translations and copyrights CSV files, normalises their column
    names, left-joins the copyright rows onto the translations by translation
    id, and keeps the rows flagged redistributable whose licence type is in
    the accepted list or whose copyright holder is "Public Domain".

    Returns:
        dict mapping ``translationid`` to ``languagecode``.
    """
    accepted_licences = ["by-nc-nd", "by-nd", "by-sa"]

    translations = pd.read_csv(ebible_translations_csv)
    copyrights = pd.read_csv(ebible_copyright_csv)
    improve_column_names(translations)
    improve_column_names(copyrights)

    # The copyrights file calls its key column "id"; align it for the join.
    copyrights = copyrights.rename(columns={'id': 'translationid'})
    merged = translations.merge(copyrights, on='translationid', how='left')

    projects = {}
    for _, record in merged.iterrows():
        if not record["redistributable"]:
            continue
        licence_accepted = record["licence_type"] in accepted_licences
        if licence_accepted or record["copyright_holder"] == "Public Domain":
            projects[record["translationid"]] = record["languagecode"]
    return projects

def copy_to_working_directory(project, language_code):
    """Copy a project into the redistributable folder and add its settings file.

    An existing copy is deleted and replaced when the module-level ``rewrite``
    flag is true; otherwise the existing copy is left alone and nothing is
    done.

    Args:
        project: ``pathlib.Path`` of the source project folder.
        language_code: language code passed on to ``add_settings_file``.
    """
    folder = ebible_redistributable / project.name

    if exists(folder):
        if not rewrite:
            return
        shutil.rmtree(folder)

    log_and_print(f"copying {project.name} to {ebible_redistributable}")
    shutil.copytree(project, folder)
    add_settings_file(folder, language_code)


# Prepare redistributable projects for extraction

In [20]:
# Create the target directory first: the run-log path below goes through it
# ("<target>/../run_<date>.log"), so it must exist before the log is opened.
makedirs(ebible_redistributable, exist_ok=True)

# `log_file` is the module-level handle that log_and_print() writes to; the
# `with` block closes it even when one of the steps below raises.
with open(ebible_redistributable / f"../run_{date.today()}.log", "a") as log_file:

    # Make dictionary of copyright free projects in eBible.
    redistributable = get_redistributable_projects()

    # Copy redistributable eBible projects into working directory, and add settings files
    for project in ebible_projects.iterdir():
        if project.name in redistributable:
            copy_to_working_directory(project, redistributable[project.name])

    # Modify versification in settings file if found in matching Paratext project,
    # and copy custom versification file from matching Paratext project if found.
    copy_paratext_versification_info(get_matching_paratext_projects())

    log_and_print(f"Number of eBible projects: {len(listdir(ebible_projects))}")
    log_and_print(f"Number of redistributable eBible projects: {len(redistributable)}")
    log_and_print(f"Number of eBible projects to be extracted: {len(listdir(ebible_redistributable))}")
    log_and_print(f"Files extracted to {ebible_redistributable}")


copying bjvNT to /content/drive/MyDrive/eBible/redistributable/projects
copying paratext versification information for matching projects...
Number of eBible projects: 2
Number of redistributable eBible projects: 1
Number of eBible projects to be extracted: 1
Files extracted to /content/drive/MyDrive/eBible/redistributable/projects


# Extract projects

In [21]:
# Make sure the extraction folder exists: the run-log path below goes through
# it ("<extractions>/../run_<date>.log").
makedirs(ebible_extractions, exist_ok=True)

# Tell the SIL NLP tools where to find the resources
environ['SIL_NLP_DATA_PATH'] = base

# Tell Python where to find our repo
environ['PYTHONPATH'] = "/env/python:/content/silnlp"

# `log_file` is the module-level handle that log_and_print() writes to; the
# `with` block closes it even when an extraction raises.
with open(ebible_extractions / f"../run_{date.today()}.log", "a") as log_file:
    i = 0  # number of projects handed to extract_corpora
    for project in ebible_redistributable.glob("*"):
        # `project` is an absolute path, so only its name is joined onto the
        # extraction folder, and exists() is actually called (the bare
        # attribute `.exists` is always truthy).
        # NOTE(review): this looks for an entry named exactly like the project;
        # confirm that this matches how extract_corpora names its output.
        if rewrite or not (ebible_extractions / project.name).exists():
            !python -m silnlp.common.extract_corpora "{project}"
            i += 1

    log_and_print(f"{i} eBible projects extracted.")

2022-10-17 13:05:57,522 - silnlp.common.environment - INFO - Using workspace: /content/drive/MyDrive/eBible/ as per environment variable SIL_NLP_DATA_PATH.
2022-10-17 13:05:57,545 - silnlp.common.extract_corpora - INFO - Extracting /content/drive/MyDrive/eBible/redistributable/projects/bjvNT...
2022-10-17 13:06:02,617 - silnlp.common.extract_corpora - INFO - # of Verses: 41899
2022-10-17 13:06:02,618 - silnlp.common.extract_corpora - INFO - # of Terms: 0
2022-10-17 13:06:02,618 - silnlp.common.extract_corpora - INFO - Done.
1 eBible projects extracted.
