<a href="https://colab.research.google.com/github/vesaakerman/hello-world/blob/master/notebooks/Prepare_eBible_projects_licenses.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Define base folder

In [None]:
base = "/content/drive/MyDrive/eBible"

# Import modules and define directory paths

In [None]:
from glob import iglob
from bs4 import BeautifulSoup
from csv import DictWriter
import ntpath
from pathlib import Path
import regex
import requests

ebible_folder = base + "eBible_projects"
copyright_file = base + "eBible_metadata/copyrights.csv"

print(ebible_folder)
print(copyright_file)


/content/drive/MyDrive/eBible_projects
/content/drive/MyDrive/eBible_metadata/copyrights.csv


# Define methods

In [None]:
def path_leaf(path):
    head, tail = ntpath.split(path)
    return tail or ntpath.basename(head)


def read_list_from_file(f_in):
    lines = list()
    with open(f_in, "r", encoding="utf-8") as infile:
        for line in infile.read().splitlines():
            lines.append(line)
    return lines


def get_copyright_from_url(url):
    r = requests.get(url)
    # If the status is OK continue
    if r.status_code == requests.codes.ok:
        soup = BeautifulSoup(r.content, "lxml")
        cr_item = soup.find("td", colspan="3")
        return cr_item.string
    else:
        return None


# Get copyright info from eBible projects 

In [None]:
data = list()
copr_regex = r".*/(?P<id>.*?)/copr.htm"

# Check we can write to the output file before the processing:
with open(copyright_file, 'w',encoding='utf-8') as f_out:
    pass

print("Collecting eBible copyright information...")
for copyright_info_file in sorted(iglob(ebible_folder + "/**/copr.htm")):
    print(copyright_info_file)

    id_match = regex.match(copr_regex, str(copyright_info_file))
    id = id_match["id"]

    entry = dict()
    entry["ID"] = str(id)
    entry["File"] = copyright_info_file

    with open(copyright_info_file, "r", encoding="utf-8") as copr:
        html = copr.read()
        soup = BeautifulSoup(html, "lxml")

    cclink = soup.find(href=regex.compile("creativecommons"))
    if cclink:
        ref = cclink.get("href")
        if ref:
            entry["CC Licence Link"] = ref
            cc_match = regex.match(
                r".*?/licenses/(?P<type>.*?)/(?P<version>.*)/", ref
            )
            if cc_match:
                entry["Licence Type"] = cc_match["type"]
                entry["Licence Version"] = cc_match["version"]
            else:
                cc_by_match = regex.match(r".*?/licenses/by(?P<version>.*)/", ref)
                if cc_by_match:
                    # print(f'Licence version = {cc_by_match["version"]}')
                    entry["Licence Type"] = "by"
                    entry["Licence Version"] = cc_by_match["version"]

    cclink = None

    titlelink = soup.find(href=regex.compile(f"https://ebible.org/{id}"))
    if titlelink:
        entry["Vernacular Title"] = titlelink.string
    titlelink = None

    copy_strings = [s for s in soup.body.p.stripped_strings]
    
    
    for i, copy_string in enumerate(copy_strings):
        if i == 0 and "copyright ©" in copy_string:
            entry["Copyright Years"] = copy_string
            entry["Copyright Holder"] = copy_strings[i + 1]
        if i > 0 and "Language:" in copy_string:
            entry["Language"] = copy_strings[i + 1]
        if "Dialect" in copy_string:
            entry["Dialect"] = copy_string
        if "Translation by" in copy_string:
            entry["Translation by"] = copy_string
        if "Public Domain" in copy_string:
            entry["Copyright Year"] = ""
            entry["Copyright Holder"] = "Public Domain"
    
    data.append(entry)

    headers = [
        "ID",
        "File",
        "Language",
        "Dialect",
        "Licence Type",
        "Licence Version",
        "CC Licence Link",
        "Copyright Holder",
        "Copyright Years",
        "Translation by",
        # "Copyright from URL",
    ]

    with open(copyright_file, "w", encoding="utf-8") as csv_out:
        writer = DictWriter(
            csv_out, headers, restval="", extrasaction="ignore", dialect="excel"
        )
        writer.writeheader()
        writer.writerows(data)

print(f"Wrote copyright info to {copyright_file}")




Collecting eBible copyright information...
/content/drive/MyDrive/eBible_projects/aak/copr.htm
/content/drive/MyDrive/eBible_projects/aau/copr.htm
/content/drive/MyDrive/eBible_projects/aaz/copr.htm
Wrote copyright info to /content/drive/MyDrive/eBible_metadata/copyrights.csv
