<a href="https://colab.research.google.com/github/vesaakerman/hello-world/blob/master/notebooks/eBible_Tools.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# List number of different versifications in Paratext projects & number of custom versifications

In [None]:
from pathlib import Path
import xml.etree.ElementTree as ET

pfolder = Path("/content/drive/Shareddrives/Partnership for Applied Biblical NLP/Data/Corpora/Paratext_projects")

def get_versification_from_settings_file(settings_file):
    versification = None
    try:
        tree = ET.parse(settings_file)
        versification_elem = tree.find('.//Versification')
        try:
            versification = versification_elem.text
        except:
            pass
    except FileNotFoundError:
        print(f"      could not find file {settings_file}")
    except IndexError:
        print(f"      IndexError finding Versification in {settings_file}")
        pass
    return versification


def custom_versification_file_exists(project):
    custom = project  / "custom.vrs"
    return custom.exists()


def get_versifications(dir):
    amounts = {"1": 0, "2": 0, "3": 0, "4": 0, "5": 0, "6": 0, "None": 0, "1cu": 0, "2cu": 0, "3cu": 0, "4cu": 0, "5cu": 0, "6cu": 0, "Nonecu": 0}
    for project in dir.iterdir():
        settings_file = project / "Settings.xml"
        versification = get_versification_from_settings_file(settings_file)
        if not versification:
            versification = "None"
        amounts[versification] += 1
        if custom_versification_file_exists(project):
            amounts[versification + "cu"] += 1
        

    print(f"\nVersification 1: {amounts['1']},  of which {amounts['1cu']} has custom versification file")
    print(f"Versification 2: {amounts['2']},    of which {amounts['2cu']}   has custom versification file")
    print(f"Versification 3: {amounts['3']},   of which {amounts['3cu']}   has custom versification file")
    print(f"Versification 4: {amounts['4']}, of which {amounts['4cu']} has custom versification file")
    print(f"Versification 5: {amounts['5']},   of which {amounts['5cu']}  has custom versification file")
    print(f"Versification 6: {amounts['6']},   of which {amounts['6cu']}  has custom versification file")
    print(f"Versific. None : {amounts['None']},   of which {amounts['Nonecu']}  has custom versification file")


get_versifications(pfolder)


# Conclude versification based on versification of Daniel

In [25]:
from glob import iglob
import re
from os.path import exists
from pathlib import Path
import csv

paratext_projects = "/content/drive/Shareddrives/Partnership for Applied Biblical NLP/Data/Corpora/Paratext_projects/"
output = Path("/content/drive/MyDrive/versification/")
headers = ['CONCLUDED', 'PARATEXT', 'CUSTOM', 'PROJECT', 'REMARK', '', 'CONCLUSION']


def conclude_versification(last_verses_ch_3_5,last_verse_ch_13):
    if last_verses_ch_3_5 == "3:30 5:31":
        versification = "4"  # English
    elif last_verses_ch_3_5 == "3:33 5:30":
        versification = "1"  # Original
    elif last_verses_ch_3_5 == "3:33 5:31":
        versification = "5"  # Russian Protestant
    elif last_verses_ch_3_5 == "3:100 5:31":
        if last_verse_ch_13 == 65:
            versification = "2?"  # Vulgate
        else:
            versification = "3"  # Russian Orthodox
    else:
        versification = ""

    return versification


def get_paratext_versification(daniel):
    m = re.search(r".*/(.*)/.*", daniel)
    if m:
        project = m.group(1)
        settings = paratext_projects + project + "/Settings.xml"
        pt_versification = ""
        if exists(settings):
            try:
                f = open(settings, "r")
                for line in f:
                    m = re.search(r"<Versification>(\d)</Versification>.*", line)
                    if m:
                        pt_versification = m.group(1)
            except Exception as e:
                print(f"Could not open {settings}, reason:  {e}")

        return pt_versification, project


def get_custom_versification(project):
    custom = paratext_projects + project + "/custom.vrs"
    if exists(custom):
        try:
            f = open(custom, "r")
            for line in f:
                m = re.search(r"DAN.*(3:\d*).*(5:\d*).*", line)
                if m:
                    last_verses_ch_3_5 = f"{m.group(1)} {m.group(2)}"
                    return conclude_versification(last_verses_ch_3_5, 0)
        except Exception as e:
            print(f"Could not open {custom}, reason:  {e}")

    return ""


def get_star(versification, pt_versification, custom_versification):
    star = ""
    if custom_versification != "":
        if versification != custom_versification:
            star = "*"
    else:
        if versification != pt_versification:
            star = "*"

    return star


def get_conclusion(versification, custom_versification, last_verses_ch_3_5):

    if versification != "":
        return versification
    elif last_verses_ch_3_5.startswith("3:100"):
        return "3"  # Russian Orthodox
    else:
        return "4" # English


def get_versifications(code):

    with open(output / 'versifications.csv', 'w', encoding='UTF8', newline='') as csv_file:
        writer = csv.DictWriter(csv_file, fieldnames=headers)
        writer.writeheader()

        for daniel in sorted(iglob(paratext_projects + f"**/*{code}*")):
            try:
                f = open(daniel, "r")
            except Exception as e:
                print(f"Could not open {daniel}, reason:  {e}")
                continue

            last_verse = 0
            last_verse_ch_13 = 0
            last_verses_ch_3_5 = ""
            try:
                for line in f:
                    m = re.search(r".*\\v (\d*).*", line)
                    if m:
                        last_verse = m.group(1)
                    m = re.search(r".*\\c (\d*).*", line)
                    if m:
                        last_chapter = m.group(1)
                        if last_chapter == "4":
                            last_verses_ch_3_5 += "3:" + last_verse
                        elif last_chapter == "6":
                            last_verses_ch_3_5 += " 5:" + last_verse
                        elif last_chapter == "14":
                            last_verse_ch_13 = last_verse
            except Exception as e:
                print(f"Something went wrong in reading {daniel}, reason:  {e}")

            versification = conclude_versification(last_verses_ch_3_5, last_verse_ch_13)
            pt_versification, project = get_paratext_versification(daniel)
            custom_versification = get_custom_versification(project)
            star = get_star(versification, pt_versification, custom_versification)
            conclusion = get_conclusion(versification, custom_versification, last_verses_ch_3_5)

            if versification != "":
                extra = ""
            else:
                extra = last_verses_ch_3_5
            row = {"CONCLUDED": versification, "PARATEXT": pt_versification, "CUSTOM": custom_versification, "PROJECT": project, "REMARK": star, "": extra, "CONCLUSION": conclusion}
            writer.writerow(row)

get_versifications("DAN")


Could not open /content/drive/Shareddrives/Partnership for Applied Biblical NLP/Data/Corpora/Paratext_projects/B2000/custom.vrs, reason:  'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
Could not open /content/drive/Shareddrives/Partnership for Applied Biblical NLP/Data/Corpora/Paratext_projects/NRSV/custom.vrs, reason:  'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
Could not open /content/drive/Shareddrives/Partnership for Applied Biblical NLP/Data/Corpora/Paratext_projects/SB1917/custom.vrs, reason:  'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
Something went wrong in reading /content/drive/Shareddrives/Partnership for Applied Biblical NLP/Data/Corpora/Paratext_projects/pma/27DANPMA.SFM, reason:  'utf-8' codec can't decode byte 0x91 in position 1843: invalid start byte
