# Validate and prepare YAML policy stubs

In [1]:
import copy
import pathlib
import json
import os
import re

import lxml.etree
import networkx
import pandas
from pykwalify.core import (
    Rule,
    Core as Kwalify,
)
import ruamel.yaml

## Configuration

In [2]:
# Schema file used when validating policy files
schema_path = 'schema.yml'
# String paths of every *.yml file found directly inside test-policies/
test_paths = [
    str(policy_file)
    for policy_file in pathlib.Path('test-policies').glob('*.yml')
]

In [3]:
# YAML handler used for the yaml.dump calls in this notebook;
# the indent settings control how mappings and sequences are laid out.
yaml = ruamel.yaml.YAML()
yaml.indent(mapping=2, sequence=4, offset=2)

## Perform tests

In [4]:
# Validate each test policy against the schema.
# raise_exception=False means validation errors do not raise here.
for test_path in test_paths:
    kore = Kwalify(source_file=test_path, schema_files=[schema_path], strict_rule_validation=True)
    data = kore.validate(raise_exception=False)

validation.invalid
 --- All found errors ---
["Value '11' is not of type 'str'. Path: '/policy-id'"]
Errors found but will not raise exception...
validation.invalid
 --- All found errors ---
["Key 'id' was not defined. Path: ''"]
Errors found but will not raise exception...


## Create template

In [5]:
def schema_to_template(rule):
    """
    Recursively build a blank template object from a schema rule.

    - 'map' rules become a CommentedMap with one key per entry of
      `rule.mapping`, each holding the template of its child rule.
    - 'seq' rules become a CommentedSeq with one item per entry of
      `rule.sequence`.
    - Any other rule type yields None (an empty value to be filled in).

    When a child rule has a non-empty `desc`, it is attached as a comment
    before the corresponding key/item using
    `yaml_set_comment_before_after_key`.
    """
    kind = rule.type

    if kind == 'seq':
        template_seq = ruamel.yaml.comments.CommentedSeq()
        for position, child in enumerate(rule.sequence):
            template_seq.append(schema_to_template(child))
            if not child.desc:
                continue
            template_seq.yaml_set_comment_before_after_key(position, before=child.desc)
        return template_seq

    if kind == 'map':
        template_map = ruamel.yaml.comments.CommentedMap()
        for name, child in rule.mapping.items():
            template_map[name] = schema_to_template(child)
            if not child.desc:
                continue
            # Leading newline puts a blank line ahead of each key's comment
            template_map.yaml_set_comment_before_after_key(name, before='\n' + child.desc)
        return template_map

    # Scalar (or otherwise unhandled) rule: leave the value blank
    return None

In [6]:
# Create template
# Read the schema from the configured schema_path (rather than a second
# hard-coded copy of the file name) with an explicit safe, pure-Python loader;
# the module-level ruamel.yaml.safe_load helper is deprecated.
with open(schema_path) as read_file:
    schema = ruamel.yaml.YAML(typ='safe', pure=True).load(read_file)
# Wrap the raw schema in a pykwalify Rule so schema_to_template can walk it
root_rule = Rule(schema)
template = schema_to_template(root_rule)
# str.lstrip removes leading whitespace from the dumped text before writing
# (comments attached to map keys start with a newline).
with open('template.yml', 'w') as write_file:
    yaml.dump(template, write_file, transform=str.lstrip)

## Create stubs

In [7]:
# Load the RoMEO policy ontology JSON and turn it into a networkx graph
with open('../romeo/data/ontology.json') as read_file:
    data = json.load(read_file)
graph = networkx.node_link_graph(data)

# Parse the raw RoMEO XML into an lxml tree
tree = lxml.etree.parse('../romeo/downloads/policies-full.xml')

In [8]:
# Matches anything shaped like a markup tag, e.g. "<b>" or "</num>"
tag_remover = re.compile(r'<[^>]+>')

def get_clean_text(element):
    """
    Get the text of an element, strip whitespace, and remove tags.

    `element` is any object exposing a `text` attribute. When `text` is
    None, an empty string is returned instead of raising AttributeError.
    Modified from https://tutorialedge.net/python/removing-html-from-string/
    """
    text = element.text
    if text is None:
        return ''
    # Strip first, then drop tags, keeping the original order of operations
    return tag_remover.sub('', text.strip())

def extract_romeo_fields(romeo_id):
    """
    Extract information from an lxml tree object of policies-full.xml.

    Looks up the publisher element whose `id` attribute equals `romeo_id`
    in the module-level `tree` and returns a dict with the keys:

    - 'prearchiving': text of preprints/prearchiving (None if absent)
    - 'prerestrictions': cleaned text of each preprint restriction
    - 'conditions': cleaned text of each condition
    - 'copyright-urls': raw text of each copyright link URL

    Raises ValueError when no publisher with that id exists in the tree.
    """
    policy = tree.find(f'publishers/publisher[@id="{romeo_id}"]')
    if policy is None:
        # find() returns None on no match; fail with a message naming the id
        # rather than an AttributeError on the lookups below.
        raise ValueError(f'No publisher with id {romeo_id!r} found in policies-full.xml')
    return {
        'prearchiving': policy.findtext('preprints/prearchiving'),
        'prerestrictions': [
            get_clean_text(x)
            for x in policy.findall('preprints/prerestrictions/prerestriction')
        ],
        'conditions': [
            get_clean_text(x) for x in policy.findall('conditions/condition')
        ],
        'copyright-urls': [
            x.text
            for x in policy.findall('copyrightlinks/copyrightlink/copyrightlinkurl')
        ],
    }

In [9]:
# Mapping consulted below to look up the journals recorded for a policy id
policy_id_to_journals = json.loads(
    pathlib.Path('../romeo/data/romeo_id-to-journals.json').read_text()
)

In [10]:
# Create stubs for each record
# Explicit safe, pure-Python loader for existing policy files; the
# module-level ruamel.yaml.safe_load helper is deprecated.
safe_yaml = ruamel.yaml.YAML(typ='safe', pure=True)

for node, data in graph.nodes(data=True):
    # Nodes with a 'Journal Title' get no stub (presumably journal nodes
    # rather than policy nodes -- confirm against the ontology).
    if 'Journal Title' in data:
        continue
    # Set path for this policy
    path = pathlib.Path(f'policies/{node}.yml').resolve()

    # Set policy to blank template
    record = copy.deepcopy(template)

    # Update policy with existing annotations
    if path.is_file():
        with path.open() as read_file:
            # An empty file loads as None; treat it as having no annotations
            existing_record = safe_yaml.load(read_file) or {}
        for key, value in existing_record.items():
            # Keep the template's blank value when nothing was filled in
            if not value:
                continue
            record[key] = value

    # Set RoMEO fields for policy
    record['policy-id'] = node
    record['publisher'] = data['Publisher']
    record['policy-heading'] = data['Policy Heading']
    journals = policy_id_to_journals.get(node, [])
    # Graph ancestors without a 'Journal Title' are recorded as child policies.
    # graph.nodes[...] replaces graph.node[...], which was removed in networkx 2.4.
    child_policies = [
        inheritor
        for inheritor in networkx.ancestors(graph, node)
        if 'Journal Title' not in graph.nodes[inheritor]
    ]
    record['journals'] = sorted(journals)
    # Union of the journals recorded for every child policy
    child_journals = set()
    for child in child_policies:
        child_journals.update(policy_id_to_journals.get(child, []))
    record['child-journals'] = sorted(child_journals)
    record['child-policies'] = sorted(child_policies)
    record['parent-policies'] = sorted(networkx.descendants(graph, node))
    # Extract information from the RoMEO full-policies XML
    record.update(extract_romeo_fields(data['RoMEO Record ID']))

    # Validate output meets schema (validate() is called with its default arguments)
    kwalify = Kwalify(schema_data=schema, source_data=record)
    validated_record = kwalify.validate()
    if os.environ.get('TRAVIS', 'false') == 'true':
        # Skip writing files on Travis.
        # Was getting "FileNotFoundError: [Errno 2] No such file or directory". See
        # https://travis-ci.com/transpose-publishing/policies-database/builds/72971278#L756
        continue
    with path.open('w') as write_file:
        yaml.dump(record, write_file, transform=str.lstrip)