<a href="https://colab.research.google.com/github/vesaakerman/hello-world/blob/master/Prepare_folder_for_extractions.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive at /content/drive so the notebook can reach files stored there.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Define base folder

In [None]:
# Root folder; the eBible input folders and the output folder are resolved relative to it.
base = "/content/drive/MyDrive/"

# Import modules, define directory paths and logging file

In [None]:
from pathlib import Path
from datetime import date, datetime
from os import listdir
from os.path import exists
import pandas as pd
import shutil
import warnings
import xml.etree.ElementTree as ET
import re

# Suppress FutureWarning messages for the rest of the session.
warnings.simplefilter('ignore', FutureWarning)

corpus = Path(base)

# --- Inputs -----------------------------------------------------------------
ebible_projects_folder = corpus / 'eBible_projects'
ebible_metadata_folder = corpus / 'eBible_metadata'
ebible_translations_csv = ebible_metadata_folder / 'translations.csv'
ebible_copyright_csv = ebible_metadata_folder / 'copyrights.csv'
paratext_projects_folder = Path(
    "/content/drive/Shareddrives/Partnership for Applied Biblical NLP/Data/Corpora/Paratext_projects"
)

# --- Outputs ----------------------------------------------------------------
redistributable_folder = corpus / "eBible_redistributable"
working_dir_for_extraction = redistributable_folder / "projects"
# Creating the working directory also creates its parent, where the log file goes.
working_dir_for_extraction.mkdir(parents=True, exist_ok=True)

# One log file per calendar day.
log_file_path = redistributable_folder / f"run_{date.today()}.log"

# Echo the resolved locations so a wrong base folder is spotted early.
for configured_path in (
    ebible_projects_folder,
    ebible_metadata_folder,
    ebible_translations_csv,
    ebible_copyright_csv,
    working_dir_for_extraction,
    log_file_path,
    paratext_projects_folder,
):
    print(configured_path)

/content/drive/MyDrive/eBible_projects
/content/drive/MyDrive/eBible_metadata
/content/drive/MyDrive/eBible_metadata/translations.csv
/content/drive/MyDrive/eBible_metadata/copyrights.csv
/content/drive/MyDrive/eBible_redistributable/projects
/content/drive/MyDrive/eBible_redistributable/run_2022-10-06.log
/content/drive/Shareddrives/Partnership for Applied Biblical NLP/Data/Corpora/Paratext_projects


# Define helper functions

In [None]:
# Columns are easier to use if they are valid python identifiers:
def improve_column_names(df):
    """Normalise the column labels of *df* in place.

    Each label is stripped of surrounding whitespace, lower-cased, has every
    double quote, single quote and parenthesis removed, and has its remaining
    spaces replaced by underscores.

    Args:
        df: DataFrame whose column labels are strings. It is modified in
            place; nothing is returned.
    """
    cleaned = df.columns.str.strip().str.lower()
    # regex=False makes every pattern a literal on all pandas versions. The
    # default for ``regex`` differs between pandas releases, and '(' / ')'
    # are regex metacharacters, so relying on the default is fragile.
    for unwanted in ('"', "'", '(', ')'):
        cleaned = cleaned.str.replace(unwanted, '', regex=False)
    df.columns = cleaned.str.replace(' ', '_', regex=False)


def log_and_print(s, type='info'):
    """Append a timestamped line to the run log and echo the message.

    Args:
        s: Message text.
        type: ``"error"`` writes the line with an ``ERROR:`` prefix; any other
            value writes it with an ``INFO:`` prefix. (The parameter name
            shadows the builtin ``type``; it is kept so existing keyword
            callers continue to work.)

    Relies on the module-level ``log_file`` being an open, writable file.
    Only the bare message is printed; the prefix and timestamp go to the log.
    """
    level = "ERROR" if type == "error" else "INFO"
    log_file.write(f"{level}: {datetime.now()} {s}\n")
    print(s)


def add_settings_file(project, language_code):
    """Write a minimal ``Settings.xml`` into the *project* directory.

    The file is built from a fixed template and then customised:

    * ``LanguageIsoCode`` is set to ``"<language_code>:::"``.
    * The ``PostPart`` attribute of ``Naming`` becomes
      ``"<project folder name>.usfm"``.
    * ``Versification`` is left at the template value ``4``.

    Args:
        project: ``pathlib.Path`` of an existing project directory; an
            existing ``Settings.xml`` in it is overwritten.
        language_code: Language code to store in ``LanguageIsoCode``.

    Raises:
        FileNotFoundError: If the *project* directory does not exist.
    """
    fake_setting_file = """<ScriptureText>
    <Versification>4</Versification>
    <LanguageIsoCode>aak:::</LanguageIsoCode>
    <Naming BookNameForm="41-MAT" PostPart=".usfm" PrePart="" />
</ScriptureText>"""

    # Build the tree in memory and write it once. This avoids writing the
    # template to disk only to parse it straight back, and leaves no file
    # handle to leak.
    root = ET.fromstring(fake_setting_file)

    naming = root.find('.//Naming')
    # ".usfm" -> "<project name>.usfm"
    naming.attrib['PostPart'] = naming.attrib['PostPart'].replace('.', project.name + '.')

    root.find('.//LanguageIsoCode').text = f"{language_code}:::"

    ET.ElementTree(root).write(project / 'Settings.xml')


def get_matching_paratext_project_name(paratext_projects, project_name):
    """Find the folder in *paratext_projects* that corresponds to *project_name*.

    Lookup order:

    1. A folder named exactly *project_name*.
    2. Only for names starting with ``"eng-"``: the name with ``"eng-"``
       replaced by ``"eng"``.
    3. That rewritten name cut off just before its first run of digits.

    Args:
        paratext_projects: ``pathlib.Path`` of the folder holding the
            Paratext projects.
        project_name: Name of the eBible project folder.

    Returns:
        The matching folder name, or ``None`` when nothing matches.
    """
    if exists(paratext_projects / project_name):
        return project_name

    if not project_name.startswith("eng-"):
        return None

    candidate = project_name.replace("eng-", "eng")
    if (paratext_projects / candidate).exists():
        return candidate

    digits_match = re.search('(.*?)[0-9]+', candidate)
    if digits_match:
        stem = digits_match.group(1)
        # An empty stem would resolve to the projects folder itself; skip it.
        if stem and (paratext_projects / stem).exists():
            return stem

    return None


def get_matching_paratext_projects(paratext_projects, ebible_projects):
    """Pair each entry of *ebible_projects* with its Paratext counterpart.

    Args:
        paratext_projects: ``pathlib.Path`` of the folder holding the
            Paratext projects.
        ebible_projects: ``pathlib.Path`` of the folder whose entries are
            looked up.

    Returns:
        A list of ``(ebible_name, paratext_name)`` tuples, one for every
        entry for which a Paratext match was found.
    """
    # Lazily pair every entry name with its lookup result ...
    lookups = (
        (entry.name, get_matching_paratext_project_name(paratext_projects, entry.name))
        for entry in ebible_projects.glob("*")
    )
    # ... and keep only the pairs where a match was found.
    return [(ebible_name, match) for ebible_name, match in lookups if match]


def get_versification_from_settings_file(settings_file):
    """Read the text of the ``Versification`` element of a settings file.

    Args:
        settings_file: Path of the XML settings file to read.

    Returns:
        The text of the first ``Versification`` element, or ``None`` when the
        file is missing, is not well-formed XML, has no such element, or the
        element is empty. A missing or malformed file is reported through
        ``log_and_print`` as an error.
    """
    try:
        tree = ET.parse(settings_file)
    except FileNotFoundError:
        log_and_print(f"could not find file {settings_file}", "error")
        return None
    except ET.ParseError as err:
        # A single malformed settings file should not abort the whole run.
        log_and_print(f"could not parse {settings_file}: {err}", "error")
        return None

    versification_elem = tree.find('.//Versification')
    # find() returns None when the element is absent; check for that
    # explicitly instead of hiding it behind a bare ``except``.
    if versification_elem is None:
        return None
    return versification_elem.text


def modify_versification(settings_file, versification_code):
    """Overwrite the ``Versification`` value stored in a settings file.

    Args:
        settings_file: Path of the XML settings file to rewrite in place.
        versification_code: New text for the ``Versification`` element
            (a string).

    A missing file, malformed XML, or a file without a ``Versification``
    element is reported through ``log_and_print`` as an error, and the file
    is left untouched.
    """
    try:
        tree = ET.parse(settings_file)
    except FileNotFoundError:
        log_and_print(f"could not find file {settings_file}", "error")
        return
    except ET.ParseError as err:
        log_and_print(f"could not parse {settings_file}: {err}", "error")
        return

    versification = tree.find('.//Versification')
    if versification is None:
        # Previously this case raised an uncaught AttributeError.
        log_and_print(f"no Versification element in {settings_file}", "error")
        return

    versification.text = versification_code
    tree.write(settings_file)


def copy_paratext_versification_info(matching_projects, paratext_projects, target_dir):
    """Carry versification details from Paratext projects over to their eBible copies.

    For every ``(eBible name, Paratext name)`` pair:

    * if the Paratext project has a ``Settings.xml`` whose versification is
      set and is not ``"4"``, that value is written into the eBible
      project's ``Settings.xml``;
    * if the Paratext project has a ``custom.vrs`` file, it is copied into
      the eBible project folder.

    Args:
        matching_projects: Iterable of ``(eBible name, Paratext name)`` pairs.
        paratext_projects: ``pathlib.Path`` of the folder holding the
            Paratext projects.
        target_dir: ``pathlib.Path`` of the folder holding the eBible
            project copies.
    """
    log_and_print("copying paratext versification information for matching projects...")

    for ebible_name, paratext_name in matching_projects:
        source_dir = paratext_projects / paratext_name
        destination_dir = target_dir / ebible_name

        source_settings = source_dir / "Settings.xml"
        if source_settings.exists():
            code = get_versification_from_settings_file(source_settings)
            # "4" is the value the generated settings files already carry,
            # so there is nothing to change in that case.
            if code and code != "4":
                log_and_print(f"copying versification {code} from Paratext for project {ebible_name}")
                modify_versification(destination_dir / "Settings.xml", code)

        source_vrs = source_dir / "custom.vrs"
        if source_vrs.exists():
            log_and_print(f"copying custom versification file from Paratext for project {ebible_name}")
            shutil.copy(source_vrs, destination_dir)


def get_redistributable_projects():
    """Collect the eBible translations that may be redistributed.

    Reads the translations CSV and the copyrights CSV, normalises their
    column names, and left-joins the copyright rows onto the translations
    by translation id. A translation is kept when its ``redistributable``
    value is truthy and either its ``licence_type`` is one of ``by-nc-nd``,
    ``by-nd``, ``by-sa`` or its ``copyright_holder`` equals
    ``"Public Domain"``.

    Returns:
        dict mapping translation id to language code.
    """
    ok_copyrights = ("by-nc-nd", "by-nd", "by-sa")

    translations_info = pd.read_csv(ebible_translations_csv)
    improve_column_names(translations_info)

    copyright_info = pd.read_csv(ebible_copyright_csv)
    improve_column_names(copyright_info)
    # The copyright table calls its key "id"; align it with the translations table.
    copyright_info = copyright_info.rename(columns={'id': 'translationid'})

    combined = translations_info.merge(copyright_info, on='translationid', how='left')

    def may_redistribute(row):
        if not row["redistributable"]:
            return False
        return row["licence_type"] in ok_copyrights or row["copyright_holder"] == "Public Domain"

    return {
        row["translationid"]: row["languagecode"]
        for _, row in combined.iterrows()
        if may_redistribute(row)
    }

def copy_to_working_directory(project):
    """Place a fresh copy of *project* inside the extraction working directory.

    Args:
        project: ``pathlib.Path`` of the project folder to copy. The copy
            keeps the folder name.
    """
    destination = working_dir_for_extraction / project.name
    log_and_print(f"copying {project.name} to {working_dir_for_extraction}")

    # copytree will not write into an existing folder by default, so remove
    # any copy left behind by an earlier run first.
    if exists(destination):
        shutil.rmtree(destination)

    shutil.copytree(project, destination)
  


# Copy eBible .usfm files into a working directory and add settings files. When a corresponding Paratext project is found, the versification is adjusted and any custom versification file is copied.

In [None]:
# Open the run log in append mode. ``log_and_print`` writes to this
# module-level ``log_file``; the ``with`` block closes it even when one of
# the steps below raises.
with open(log_file_path, "a") as log_file:

    # Make dictionary of copyright free projects in eBible.
    redistributable = get_redistributable_projects()

    # Copy redistributable eBible projects into working directory, and add settings files
    for project in ebible_projects_folder.iterdir():
        if project.name in redistributable:
            copy_to_working_directory(project)
            add_settings_file(working_dir_for_extraction / project.name, redistributable[project.name])

    # Modify versification in settings file if found in matching Paratext project,
    # and copy custom versification file from matching Paratext project if found.
    matching_projects = get_matching_paratext_projects(paratext_projects_folder, working_dir_for_extraction)
    copy_paratext_versification_info(matching_projects, paratext_projects_folder, working_dir_for_extraction)

    # Summary of the run.
    log_and_print(f"Number of redistributable eBible projects: {len(redistributable)}")
    log_and_print(f"Number of eBible projects: {len(listdir(ebible_projects_folder))}")
    log_and_print(f"Number of matching Paratext projects: {len(matching_projects)}")
    log_and_print(f"Number of eBible projects to be extracted: {len(listdir(working_dir_for_extraction))}")
    log_and_print(f"Files extracted to {working_dir_for_extraction}")
    log_and_print(f"Log file: {log_file_path}")


copying aau to /content/drive/MyDrive/eBible_redistributable/projects
copying aak to /content/drive/MyDrive/eBible_redistributable/projects
copying aaz to /content/drive/MyDrive/eBible_redistributable/projects
copying paratext versification information for matching projects...
Number of redistributable eBible projects: 3
Number of eBible projects: 3
Number of matching Paratext projects: 3
Number of eBible projects to be extracted: 3
Files extracted to /content/drive/MyDrive/eBible_redistributable/projects
Log file: /content/drive/MyDrive/eBible_redistributable/run_2022-10-06.log
