# Evaluation: Conformance and Consistency
Part III of the computational evaluation of AI-generated linked data for [Linking Anthropology's Data and Archives (LADA)](https://ischool.umd.edu/projects/building-a-sustainable-future-for-anthropologys-archives-researching-primary-source-data-lifecycles-infrastructures-and-reuse/). This part focuses on conformance to ontologies (i.e., CIDOC-CRM, Schema.org, Dublin Core) and on the consistency of that conformance across generated data points (e.g., is all data on one or two lines, or does each tag, subtag, etc. appear on its own line?).

---

**Table of Contents:**

I. [Data Loading](#data-loading)

II. [Dublin Core](#dublin-core)

III. [Schema.org](#schemaorg)

IV. [CIDOC-CRM](#cidoc-crm)

---

## Data Loading

In [None]:
import utils
import config
import pandas as pd
import numpy as np
import urllib.request
import urllib
import xml.etree.ElementTree as ET
import json
from lxml import etree
import rdflib
from rdflib.namespace import DC, SDO # Dublin Core, Schema.org
from pathlib import Path
import os
import re

# Candidate validation tools:
# sax - to check that XML is well-formed
# xml.etree.ElementTree - to validate text between tags
# lxml.etree + lxml.etree.XMLSchema's validate() - to validate XML against an XML schema
#   (NOTE: the standard-library xml.etree.ElementTree has no XMLSchema class)
# lxml etree.XMLParser - to check well-formedness and validate against an input XML schema while parsing
# json_checker - to validate Python data types (incl. but not limited to those obtained from JSON)
# jsonschema.validate
# ShEx - for RDF graphs, ShExJ for JSON - NOTE: couldn't install package
# OntoME - for CIDOC-CRM ontology alignment

Create variables to reference the existing directories of cleaned data files.

In [None]:
# Sub-directory of the cleaned records for each ontology, appended to each data root below.
dublin_path = "cleaned/dublin_core/"  # XML data files
schema_path = "cleaned/schema_org/"   # JSON data files
cidoc_path = "cleaned/cidoc_crm/"     # JSON data files

# Task 1 data root
dublin_t1_dir, schema_t1_dir, cidoc_t1_dir = (
    config.task1_data + subdir for subdir in (dublin_path, schema_path, cidoc_path)
)

# Playground 1 data root
dublin_p1_dir, schema_p1_dir, cidoc_p1_dir = (
    config.playgrd1_data + subdir for subdir in (dublin_path, schema_path, cidoc_path)
)

# Playground 3 data root
dublin_p3_dir, schema_p3_dir, cidoc_p3_dir = (
    config.playgrd3_data + subdir for subdir in (dublin_path, schema_path, cidoc_path)
)

Create variables to reference the directories for automatically corrected files, and create those directories if they do not already exist.

In [None]:
# Sub-directory of the automatically corrected records for each ontology.
# These rebind the *_path names, so from here on they refer to "corrected/".
dublin_path = "corrected/dublin_core/"  # XML data files
schema_path = "corrected/schema_org/"   # JSON data files
cidoc_path = "corrected/cidoc_crm/"     # JSON data files

# Task 1 data root
dublin_t1_corrected_dir, schema_t1_corrected_dir, cidoc_t1_corrected_dir = (
    config.task1_data + subdir for subdir in (dublin_path, schema_path, cidoc_path)
)

# Playground 1 data root
dublin_p1_corrected_dir, schema_p1_corrected_dir, cidoc_p1_corrected_dir = (
    config.playgrd1_data + subdir for subdir in (dublin_path, schema_path, cidoc_path)
)

# Playground 3 data root
dublin_p3_corrected_dir, schema_p3_corrected_dir, cidoc_p3_corrected_dir = (
    config.playgrd3_data + subdir for subdir in (dublin_path, schema_path, cidoc_path)
)

corrected_dirs = [
    dublin_t1_corrected_dir, schema_t1_corrected_dir, cidoc_t1_corrected_dir,
    dublin_p1_corrected_dir, schema_p1_corrected_dir, cidoc_p1_corrected_dir,
    dublin_p3_corrected_dir, schema_p3_corrected_dir, cidoc_p3_corrected_dir,
]

# Create every output directory (and any missing parents); existing ones are left alone.
for corrected_dir in corrected_dirs:
    target = Path(corrected_dir)
    target.mkdir(parents=True, exist_ok=True)

## Dublin Core

***Note:***

*The Dublin Core schemas' URLs below are from [dublincore.org](https://www.dublincore.org/schemas/xmls/) under "Latest versions are always available as: ..."*

In [None]:
# Dublin Core XSD schema URLs; all three live under the same dublincore.org folder.
_dc_schema_base_url = "https://www.dublincore.org/schemas/xmls/qdc/"
dc_elements_schema_url = _dc_schema_base_url + "dc.xsd"
dc_terms_schema_url = _dc_schema_base_url + "dcterms.xsd"
dc_mitype_schema_url = _dc_schema_base_url + "dcmitype.xsd"

In [None]:
# Fifteen element names, each written with the "dcterms:" prefix.
dc_tags = [
    f"dcterms:{element}"
    for element in (
        "creator", "contributor", "date", "title", "publisher",
        "language", "format", "subject", "description", "identifier",
        "relation", "source", "type", "coverage", "rights",
    )
]

In [None]:
# Select the schema URL that the following cell downloads (here: the Dublin Core elements XSD).
url = dc_elements_schema_url

In [None]:
# Download the XSD at `url` and compile it into an lxml schema validator.
# The HTTP response is opened as a context manager so the connection is closed
# as soon as the schema document has been parsed (it was previously left open).
parser = etree.XMLParser()
with urllib.request.urlopen(url) as content:
    dc_elements_tree = etree.parse(content, parser)
dc_elements_schema = etree.XMLSchema(dc_elements_tree)

# Alternative kept for reference: build the schema from a local copy of the XSD.
# NOTE(review): dc_elements_schema_path is not defined in the cells shown here.
# dc_elements_tree = etree.parse(dc_elements_schema_path)
# dc_elements_schema = etree.XMLSchema(dc_elements_tree)

In [None]:
# Validate one Dublin Core record against the compiled dc_elements_schema.
# NOTE(review): dublin_file_paths is not defined in any cell visible here --
# confirm it is built before this cell runs.
record_path = dublin_file_paths[1] # valid   #[0] - invalid as expected
try:
    xml_doc = etree.parse(record_path)
except etree.XMLSyntaxError as err:
    # A record that is not well-formed XML (e.g. an undeclared namespace prefix)
    # cannot be schema-validated; record it as non-conformant instead of letting
    # the exception abort the evaluation.
    xml_doc = None
    result = False
    print(f"Not well-formed XML: {record_path}: {err}")
else:
    result = dc_elements_schema.validate(xml_doc)
    if not result:
        # Report why the well-formed document failed schema validation.
        print(dc_elements_schema.error_log)

XMLSyntaxError: Namespace prefix dc on title is not defined, line 1, column 10 (dc_record_005.xml, line 1)

In [None]:
# Display the path stored at index 1 of dublin_file_paths.
dublin_file_paths[1]

'data/data_playground_task1/cleaned/dublin_core/dc_record_001.xml'

## Schema.org

## CIDOC-CRM