<a href="https://colab.research.google.com/github/vesaakerman/hello-world/blob/master/notebooks/eBible_Tools.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive at /content/drive (requires the Colab runtime).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Count Paratext projects per versification setting, and how many of each have a custom versification file

In [None]:
from pathlib import Path
import xml.etree.ElementTree as ET

# Folder whose direct children are scanned as Paratext projects.
pfolder = Path("/content/drive/Shareddrives/Partnership for Applied Biblical NLP/Data/Corpora/Paratext_projects")

def get_versification_from_settings_file(settings_file):
    """Return the text of the first <Versification> element of a settings file.

    Args:
        settings_file: Path of the XML settings file to parse.

    Returns:
        The element's text, or None when the file is missing, cannot be read,
        is not well-formed XML, or contains no <Versification> element.
    """
    try:
        tree = ET.parse(settings_file)
    except FileNotFoundError:
        print(f"      could not find file {settings_file}")
        return None
    except (OSError, ET.ParseError) as e:
        # One unreadable or malformed settings file must not abort a scan
        # over many projects, so report it and carry on.
        print(f"      could not read {settings_file}: {e}")
        return None

    versification_elem = tree.find('.//Versification')
    if versification_elem is None:
        return None
    return versification_elem.text


def custom_versification_file_exists(project):
    """Tell whether the project folder contains an entry named ``custom.vrs``."""
    return project.joinpath("custom.vrs").exists()


def get_versifications(dir):
    """Print a tally of versification settings over all projects in a folder.

    For every entry of ``dir`` the versification is read from its
    ``Settings.xml``; a missing or empty value is counted as "None". For each
    versification the number of projects that also have a ``custom.vrs`` file
    is counted separately.

    Args:
        dir: Path-like object providing ``iterdir()``; its children are treated
            as project folders.
    """
    known = ("1", "2", "3", "4", "5", "6", "None")
    amounts = {key + suffix: 0 for suffix in ("", "cu") for key in known}
    # Values outside the expected set are tallied separately instead of
    # raising KeyError and aborting the whole scan.
    unexpected = {}

    for project in dir.iterdir():
        settings_file = project / "Settings.xml"
        versification = get_versification_from_settings_file(settings_file)
        if not versification:
            versification = "None"
        if versification not in known:
            unexpected[versification] = unexpected.get(versification, 0) + 1
            continue
        amounts[versification] += 1
        if custom_versification_file_exists(project):
            amounts[versification + "cu"] += 1

    print(f"\nVersification 1: {amounts['1']},  of which {amounts['1cu']} has custom versification file")
    print(f"Versification 2: {amounts['2']},    of which {amounts['2cu']}   has custom versification file")
    print(f"Versification 3: {amounts['3']},   of which {amounts['3cu']}   has custom versification file")
    print(f"Versification 4: {amounts['4']}, of which {amounts['4cu']} has custom versification file")
    print(f"Versification 5: {amounts['5']},   of which {amounts['5cu']}  has custom versification file")
    print(f"Versification 6: {amounts['6']},   of which {amounts['6cu']}  has custom versification file")
    print(f"Versific. None : {amounts['None']},   of which {amounts['Nonecu']}  has custom versification file")
    for value, count in sorted(unexpected.items()):
        print(f"Unexpected versification {value!r}: {count}")


# Run the tally over the projects folder and print the summary.
get_versifications(pfolder)


# Infer each project's versification from the contents of its books

In [None]:
from glob import iglob
import re
from os.path import exists
from pathlib import Path
import csv
import os
import codecs

# Folder whose entries are scanned as Paratext projects (trailing slash required: a glob pattern is appended to it).
paratext_projects = "/content/drive/Shareddrives/Partnership for Applied Biblical NLP/Data/Corpora/Paratext_projects/"
# Folder that receives versifications.csv.
output = Path("/content/drive/MyDrive/versification/")
# CSV column names; the unnamed column carries the checkpoint verse counts for rows flagged in REMARK.
headers = ['CONCLUDED', 'PARATEXT', 'CUSTOM', 'PROJECT', 'REMARK', 'BOOKS', '', 'CONCLUSION']


def get_paratext_versification(project):
    """Read the single-digit versification number from a project's Settings.xml.

    Args:
        project: Path of the project folder.

    Returns:
        The digit found in ``<Versification>N</Versification>`` as a string
        (the last occurrence wins), or "" when the file does not exist, cannot
        be read, or has no such line.
    """
    settings = f"{project}/Settings.xml"
    pt_versification = ""
    if not exists(settings):
        return pt_versification

    try:
        # The context manager closes the handle even if reading fails.
        with open(settings, "r", encoding="utf-8") as f:
            for line in f:
                m = re.search(r"<Versification>(\d)</Versification>.*", line)
                if m:
                    pt_versification = m.group(1)
    except (OSError, UnicodeError) as e:
        print(f"Could not open {settings}, reason:  {e}")

    return pt_versification


def get_custom_versification(project):
    """Derive a versification number from the DAN line of a project's custom.vrs.

    The first line that carries verse counts for Daniel chapters 3 and 5 is
    passed to ``conclude_versification_from_OT`` (chapter 13 is given as 0).

    Args:
        project: Path of the project folder.

    Returns:
        The concluded versification as a string, or "" when the file does not
        exist, cannot be read, or contains no usable DAN line.
    """
    custom = f"{project}/custom.vrs"
    if not exists(custom):
        return ""

    try:
        # The context manager closes the handle on every exit path.
        with codecs.open(custom, "r", encoding="utf-8", errors="ignore") as f:
            for line in f:
                m = re.search(r"DAN.*3:(\d*).*5:(\d*).*", line)
                # The pattern allows empty digit groups; such a line carries no
                # usable counts, so keep scanning instead of failing on int("").
                if m and m.group(1) and m.group(2):
                    return conclude_versification_from_OT(int(m.group(1)), int(m.group(2)), 0)
    except OSError as e:
        print(f"Something went wrong in reading {custom}, reason:  {e}")

    return ""


def get_books_type(files):
    """Classify a list of file names as "OT+NT" or "NT".

    The result is "OT+NT" as soon as any name contains "GEN" or "JON";
    otherwise (including for an empty list) it is "NT".
    """
    has_ot_marker = any("GEN" in name or "JON" in name for name in files)
    return "OT+NT" if has_ot_marker else "NT"



def get_star(versification, pt_versification, custom_versification):
    """Return "*" when the declared versification disagrees with the concluded one.

    With nothing concluded (""), either declared value (custom or Paratext)
    that is neither empty nor "4" raises the flag. Otherwise the custom value
    is compared when present, and the Paratext value only when there is no
    custom value. Returns "" when there is no disagreement.
    """
    if versification == "":
        declared = (custom_versification, pt_versification)
        flagged = any(value not in ("", "4") for value in declared)
    elif custom_versification != "":
        flagged = custom_versification != versification
    else:
        flagged = pt_versification not in ("", versification)

    return "*" if flagged else ""


def get_conclusion(versification):
    """Return the given versification, falling back to "4" (English) when it is ""."""
    return "4" if versification == "" else versification


def conclude_versification_from_OT(dan_3, dan_5, dan_13):
    """Map the last-verse numbers of Daniel 3, 5 and 13 to a versification number.

    Returns the versification as a string, or "" when the counts match none of
    the known patterns.
    """
    if dan_3 == 30:
        return "4"  # English
    if dan_3 == 33:
        if dan_5 == 30:
            return "1"  # Original
        if dan_5 == 31:
            return "5"  # Russian Protestant
        return ""
    if dan_3 == 97:
        return "2"  # Septuagint
    if dan_3 == 100:
        # Chapter 13 separates Vulgate from Russian Orthodox.
        return "3" if dan_13 == 65 else "6"
    return ""

def conclude_versification_from_NT(jhn_6, act_19, rom_16):
    """Map the last-verse numbers of John 6, Acts 19 and Romans 16 to a versification number.

    The checks are applied in order and the first hit wins. Returns the
    versification as a string, or "" when nothing matches.
    """
    if jhn_6 == 72:
        return "3"  # Vulgate
    if act_19 == 41:
        return "4"  # English
    if rom_16 == 24:
        return "6"  # Russian Orthodox (same as Russian Protestant)
    if jhn_6 == 71 and act_19 == 40:
        return "1"  # Original (Same as Septuagint)
    return ""


def get_last_verse(project, book, chapter):
    """Return the number of the last verse marker seen in one chapter of a book.

    The first file in ``project`` whose name contains ``book`` and that can be
    opened is scanned line by line for chapter and verse markers.

    Args:
        project: Path of the project folder.
        book: Book code expected to appear in the file name (e.g. "DAN").
        chapter: Chapter number to inspect.

    Returns:
        The last verse number found in that chapter, 0 when the chapter has no
        verse markers or is absent, or None when no matching file could be
        opened or reading failed.
    """
    ch = str(chapter)

    for book_file in iglob(f"{project}/*{book}*"):
        try:
            f = codecs.open(book_file, "r", encoding="utf-8", errors="ignore")
        except OSError as e:
            print(f"Could not open {book_file}, reason:  {e}")
            continue

        last_verse = 0
        in_chapter = False
        try:
            # The context manager closes the handle on every exit path.
            with f:
                for line in f:
                    m = re.search(r"\\c ? ?([0-9]+).*", line)
                    if m:
                        in_chapter = m.group(1) == ch

                    # NOTE(review): a bridged verse such as "\v 30-31" is
                    # recorded as 30 - confirm that is intended.
                    m = re.search(r"\\v ? ?([0-9]+).*", line)
                    if m and in_chapter:
                        # [0-9]+ guarantees int() cannot fail here.
                        last_verse = int(m.group(1))
        except OSError as e:
            print(f"Something went wrong in reading {book_file}, reason:  {e}")
            return None

        # Only the first readable match is used.
        return last_verse

    # No file for this book could be opened.
    return None


def get_checkpoints_OT(project):
    """Return the last-verse numbers of Daniel chapters 3, 5 and 13 as a 3-tuple."""
    return tuple(get_last_verse(project, "DAN", chapter) for chapter in (3, 5, 13))


def get_checkpoints_NT(project):
    """Return the last-verse numbers of John 6, Acts 19 and Romans 16 as a 3-tuple."""
    checkpoints = (("JHN", 6), ("ACT", 19), ("ROM", 16))
    return tuple(get_last_verse(project, book, chapter) for book, chapter in checkpoints)


def get_versification(project):
    """Conclude a project's versification from the verse counts in its books.

    The Old Testament checkpoints are tried first when the project is
    classified as "OT+NT"; if they give no answer (or the project is "NT"),
    the New Testament checkpoints decide.

    Returns:
        A tuple ``(versification, books, based_on)`` where ``versification``
        may be "", ``books`` is "OT+NT" or "NT", and ``based_on`` is "OT" or
        "NT" depending on which checkpoints produced the result.
    """
    books = get_books_type(os.listdir(project))

    if books == "OT+NT":
        from_ot = conclude_versification_from_OT(*get_checkpoints_OT(project))
        if from_ot:
            return from_ot, books, "OT"

    from_nt = conclude_versification_from_NT(*get_checkpoints_NT(project))
    return from_nt, books, "NT"


def get_versifications():
    """Write versifications.csv with one row per project folder.

    For each project the row holds the versification concluded from its
    contents, the one declared in Settings.xml, the one derived from
    custom.vrs, a "*" remark when they disagree, the book coverage, and the
    final conclusion. Flagged rows also carry the checkpoint verse counts.
    Each project name is printed as it is processed.
    """
    # open() does not create missing parent folders.
    output.mkdir(parents=True, exist_ok=True)

    with open(output / 'versifications.csv', 'w', encoding='UTF8', newline='') as csv_file:
        writer = csv.DictWriter(csv_file, fieldnames=headers)
        writer.writeheader()

        for project in sorted(iglob(paratext_projects + "**")):
            # Without recursive=True the pattern also yields plain files;
            # they are not projects and would make os.listdir() fail.
            if not Path(project).is_dir():
                continue

            print(Path(project).name)
            versification, books, based_on = get_versification(project)
            pt_versification = get_paratext_versification(project)
            custom_versification = get_custom_versification(project)
            star = get_star(versification, pt_versification, custom_versification)
            conclusion = get_conclusion(versification)

            # Flagged rows get the raw checkpoint counts for manual review.
            if star == "":
                extra = ""
            elif based_on == "OT":
                dan_3, dan_5, dan_13 = get_checkpoints_OT(project)
                extra = f"DAN_3:{dan_3}, DAN_5:{dan_5}, DAN_13:{dan_13}"
            else:
                jhn_6, act_19, rom_16 = get_checkpoints_NT(project)
                extra = f"JHN_6:{jhn_6}, ACT_19:{act_19}, ROM_16:{rom_16}"

            row = {
                "CONCLUDED": versification,
                "PARATEXT": pt_versification,
                "CUSTOM": custom_versification,
                "PROJECT": Path(project).name,
                "REMARK": star,
                "BOOKS": books,
                "": extra,
                "CONCLUSION": conclusion,
            }
            writer.writerow(row)

# Process every project and write the CSV report.
get_versifications()



ABG1992
ABN
ABN2012
ABNPOL
ABSNT
AERV
AGR
AKP
ALGNT
AMATNT
AMD
AMP
ANT
APC
APD1927
APSD
APSD-CEB
ARA
ARC
ARNNT96
AROM
ARP
ARUM
ASN1943
ASN1943G
ASN1961
ASVBT
ATB1938
ATB1951
ATCNTM
ATMK
ATN1951
AVD
AVDDV
AVU
AYBOL1986
AYZ
AZB09
AdgNT14
AgxLUK
AkgPor10
AltNT2017
Amh05
Amh62
AnyuB
ApMaNP08
AruBib20
AruNT04
AvrNT
B2000
BALTROM
BAN
BASSARDC
BAV
BBC
BBL
BBR
BCI
BCL
BCND
BCW
BDS
BEA1886
BEA1968
BEAVSY
BEN1882R
BEN1921R
BENCLBSI
BENGALI-BSI
BFZNT
BG2001
BGB01
BGC-NT
BGDC
BGHNT
BGPU
BHD-NT
BHNSB
BHTI
BIB
BIB1819
BIM
BIMK
BIMK85
BIRD
BIV
BK98
BLA
BLA1890
BLA1979
BLAMAT
BLNT74
BLP
BLP2018
BLPH
BLY
BLYDC
BMK
BMNT
BNP70
BNT
BOV
BOVBSI
BPH
BRE1897
BREGEN
BSB
BSC2010
BTI
BTN
BTQ
BTS
BTX
BUY1904
BW
BWM
BYSB
BYZ1904
BandiNT
Bassa02
BedAp
Bedell
BlkNTP2000
BskNT14
BurNT10
CAC2012
CALO1837
CALO1872
CARS
CARSA
CARST
CASS
CBCWNT
CBCWOT
CBSWNT
CCB
CCB-T
CCCbst
CCL
CCSWNT
CDHNT
CEVUK
CEVUS06
CEWUNT
CGJMNT
CGJMP
CGJWNT
CHIOJNT
CHRNT
CHTW
CHUJSM
CHUJSSC
CHV
CIEC1905
CIEC1916
CIMB
CIMRNT
CKK
CKOu
CLNA
CLTT
CMD