# Evaluation: Syntax

Part I of the computational evaluation of AI-generated linked data for [Linking Anthropology's Data and Archives (LADA)](https://ischool.umd.edu/projects/building-a-sustainable-future-for-anthropologys-archives-researching-primary-source-data-lifecycles-infrastructures-and-reuse/), focused on syntax (e.g., do the metadata adhere to the expected serialization formats?).

---

**Table of Contents:**

I. [Data Loading](#data-loading)

II. [Syntax](#syntax)

  * [XML](#xml)
  
  * [JSON](#json)

---

## Data Loading

In [None]:
import utils
import config
import pandas as pd
import xml.etree.ElementTree as ET
import json
from lxml import etree
from pathlib import Path
import os
import re

Create variables to reference existing directories and files.

In [None]:
# Sub-paths of the cleaned records for each metadata standard; they are
# appended to each dataset root taken from the project's `config` module.
dublin_path = "cleaned/dublin_core/"  # XML data files
schema_path = "cleaned/schema_org/"   # JSON data files
cidoc_path = "cleaned/cidoc_crm/"     # JSON data files
_standard_paths = (dublin_path, schema_path, cidoc_path)

# One directory per (dataset root, metadata standard) pair, built by plain
# string concatenation.
# NOTE(review): presumably each config root ends with a path separator -- confirm.
dublin_t1_dir, schema_t1_dir, cidoc_t1_dir = [
    config.task1_data + sub_path for sub_path in _standard_paths
]
dublin_p1_dir, schema_p1_dir, cidoc_p1_dir = [
    config.playgrd1_data + sub_path for sub_path in _standard_paths
]
dublin_p3_dir, schema_p3_dir, cidoc_p3_dir = [
    config.playgrd3_data + sub_path for sub_path in _standard_paths
]

Create a folder to store the error reports in.

In [None]:
# Every report written by this notebook goes under this folder; create it
# (and any missing parents) up front, tolerating an existing folder.
d = "syntax"
report_dir = f"data/error_reports/{d}/"
os.makedirs(report_dir, exist_ok=True)

## Syntax

### XML

First, read and evaluate only the files with a `.xml` extension.

In [None]:
extension = ".xml"

# File names (not paths) carrying the target extension, per source directory.
dublin_files_t1 = [name for name in os.listdir(dublin_t1_dir) if name.endswith(extension)]
dublin_files_p1 = [name for name in os.listdir(dublin_p1_dir) if name.endswith(extension)]
dublin_files_p3 = [name for name in os.listdir(dublin_p3_dir) if name.endswith(extension)]

# Prefix each name with its directory and keep one sorted list of all paths.
dublin_file_paths = sorted(
    [dublin_t1_dir + name for name in dublin_files_t1]
    + [dublin_p1_dir + name for name in dublin_files_p1]
    + [dublin_p3_dir + name for name in dublin_files_p3]
)
total_dcxml_files = len(dublin_file_paths)
print(f"Total Dublin Core {extension[1:].upper()} files:", total_dcxml_files)

In [None]:
# Try to parse every Dublin Core XML file.  For each file that fails, record
# the path, the exception class and the exception message in `syntax_errors`,
# and the path alone in `errored_files`.
syntax_errors, errored_files = [], []
for file_path in dublin_file_paths:
    try:
        # Only whether parsing succeeds matters here; the tree is discarded.
        etree.parse(file_path)
    except Exception as e:
        # Deliberately broad: any failure to parse a file is reported rather
        # than allowed to stop the evaluation of the remaining files.
        syntax_errors.append(
            {"file": file_path, "exception_type": type(e), "exception_message": str(e)}
        )
        errored_files.append(file_path)

# Avoid a ZeroDivisionError when no XML files were found at all.
error_rate = len(errored_files) / total_dcxml_files if total_dcxml_files else 0.0
print("Files with errors:",
      len(errored_files), "of", total_dcxml_files,
      f"({error_rate*100:.2f}%)")

In [None]:
# Tabulate the parser errors.  Naming the columns explicitly keeps the frame
# usable (and the column lookups below valid) even when no file failed.
df_se = pd.DataFrame(
    syntax_errors, columns=["file", "exception_type", "exception_message"]
)

# Longest leading run of non-digit characters that ends in a comma.  A raw
# string is required so that `\D` reaches the regex engine instead of being
# treated as an (invalid) string escape.
pattern = r"^[\D]+,"


def _exception_subtype(message):
    """Return the leading digit-free clause of *message*, minus its comma.

    When the message does not start with such a clause, the whole message is
    returned so that one unexpected message cannot abort the report.
    """
    found = re.match(pattern, message)
    return found.group(0)[:-1] if found else message


new_exception_col = df_se["exception_message"].apply(_exception_subtype)
# Place the subtype just before the (last) message column.
df_se.insert(len(df_se.columns)-1, "exception_subtype", new_exception_col)
df_se.head()

Next, evaluate every record by reading the TXT files to:
- check whether relevant DC namespace(s) are present
- check whether RDF namespace is present
- check whether a prolog is present
- check whether a prolog with UTF-8 encoding is present

In [None]:
extension = ".txt"

# File names (not paths) carrying the target extension, per source directory.
dublin_files_t1 = [name for name in os.listdir(dublin_t1_dir) if name.endswith(extension)]
dublin_files_p1 = [name for name in os.listdir(dublin_p1_dir) if name.endswith(extension)]
dublin_files_p3 = [name for name in os.listdir(dublin_p3_dir) if name.endswith(extension)]

# Prefix each name with its directory and keep one sorted list of all paths.
dublin_file_paths = sorted(
    [dublin_t1_dir + name for name in dublin_files_t1]
    + [dublin_p1_dir + name for name in dublin_files_p1]
    + [dublin_p3_dir + name for name in dublin_files_p3]
)
total_dctxt_files = len(dublin_file_paths)
print(f"Total Dublin Core {extension[1:].upper()} files:", total_dctxt_files)

In [None]:
# Custom checks applied, in this order, to the text of every record.  Each
# entry is (predicate from `utils`, exception subtype, exception message); a
# falsy predicate result is recorded as an error.
_custom_checks = [
    (utils.hasDCNamespaces, "Missing namespace", "Missing Dublin Core namespace(s)"),
    (utils.hasRDFNamespace, "Missing namespace", "Missing RDF namespace"),
    (utils.hasProlog, "Missing prolog", "Missing prolog"),
    (utils.hasPrologWithEncoding, "Missing prolog", "Missing prolog with UTF-8 encoding"),
]

custom_syntax_errors, more_errored_files = [], []
for txt_path in dublin_file_paths:
    with open(txt_path, "r") as f:
        f_string = f.read()
    # Errors are reported under the `.xml` name so they line up with the
    # paths recorded by the XML parsing step.
    xml_path = txt_path.replace(".txt", ".xml")
    for passes_check, subtype, message in _custom_checks:
        if passes_check(f_string):
            continue
        custom_syntax_errors.append({
            "file": xml_path,
            "exception_type": "Custom syntax check",
            "exception_subtype": subtype,
            "exception_message": message,
        })
        # One entry per error, so a file may appear here more than once.
        more_errored_files.append(xml_path)
print(len(custom_syntax_errors), "additional syntax errors found across", len(set(more_errored_files)), "out of", total_dctxt_files, "files")

In [None]:
# Append the custom-check errors to the parser errors, then split the path
# into a `file_path` column and a `file_name` column (text after the last "/").
custom_errors_df = pd.DataFrame.from_dict(custom_syntax_errors)
df_se = pd.concat([df_se, custom_errors_df])
new_file_col = df_se["file"].map(lambda path: path.rsplit("/", 1)[-1])
df_se = df_se.rename(columns={"file": "file_path"})
df_se.insert(1, "file_name", new_file_col)
df_se.tail()

In [None]:
# Count how many recorded errors fall under each exception type.
df_se.exception_type.value_counts()

In [None]:
# Restrict to the errors raised by the custom checks.
is_custom = df_se["exception_type"] == "Custom syntax check"
custom_rows = df_se.loc[is_custom]

# Occurrences of each custom message, as a one-column frame.
custom_message_report = custom_rows["exception_message"].value_counts().to_frame()

# One row per distinct (type, subtype, message), with its count joined on.
custom_df = (
    custom_rows[["exception_type", "exception_subtype", "exception_message"]]
    .drop_duplicates()
    .set_index("exception_message")
    .join(custom_message_report)
    .reset_index()
)
custom_report = custom_df[["exception_type", "exception_subtype", "exception_message", "count"]]
custom_report

The `count` column refers to the total occurrence of each exception, so the sum of that column may exceed the total number of files.

In [None]:
# Write the custom-check statistics to
# <report_dir>/<standard>_<serialization>_<report type>.csv
metadata_standard = "dublin_core"
data_serialization = "xml"
report_type = "syntax_error_stats_custom_subtypes"
report_name = f"{metadata_standard}_{data_serialization}_{report_type}.csv"
custom_report.to_csv(report_dir + report_name, index=False)

In [None]:
# Restrict to the errors raised while parsing (everything but the custom checks).
is_parser_error = df_se["exception_type"] != "Custom syntax check"
parser_rows = df_se.loc[is_parser_error]

# Occurrences of each exception subtype, as a one-column frame.
xml_syntax_subtypes = parser_rows["exception_subtype"].value_counts().to_frame()

# One row per distinct (type, subtype), with its count joined on.
syntax_df = (
    parser_rows[["exception_type", "exception_subtype"]]
    .drop_duplicates()
    .set_index("exception_subtype")
    .join(xml_syntax_subtypes)
    .reset_index()
)
xml_syntax_error_report = syntax_df[["exception_type", "exception_subtype", "count"]]
xml_syntax_error_report

In [None]:
# Write the parser-error subtype statistics to
# <report_dir>/<standard>_<serialization>_<report type>.csv
metadata_standard = "dublin_core"
data_serialization = "xml"
report_type = "syntax_error_stats_subtypes"
report_name = f"{metadata_standard}_{data_serialization}_{report_type}.csv"
xml_syntax_error_report.to_csv(report_dir + report_name, index=False)

In [None]:
# Collapse repeated errors so each file appears once per (type, subtype).
subtype_columns = ["file_path", "file_name", "exception_type", "exception_subtype"]
df_se_subtypes = df_se.loc[:, subtype_columns].drop_duplicates()
df_se_subtypes.tail()

In [None]:
# Number of de-duplicated rows (files) per (exception type, subtype) pair.
subtype_report = (
    df_se_subtypes[["exception_type", "exception_subtype"]]
    .value_counts()
    .to_frame()
    .rename(columns={"count": "file_count"})
)
subtype_report


In [None]:
# Distinct files flagged by either the parser or the custom checks.
all_errored_files = set(errored_files) | set(more_errored_files)

# Two summary rows shaped like the subtype report so they can be appended to it.
totals_report = pd.DataFrame([
    {"exception_type": "TOTAL FILES", "exception_subtype": "NA",
     "file_count": total_dcxml_files},
    {"exception_type": "FILES WITH EXCEPTION", "exception_subtype": "NA",
     "file_count": len(all_errored_files)},
])
totals_report

In [None]:
# Peek at the first path flagged by the custom checks (raises IndexError if
# no file was flagged).
more_errored_files[0]

In [None]:
# Stack the per-subtype counts on top of the summary rows.
xml_report = pd.concat([subtype_report.reset_index(), totals_report])

# Express every count as a share of all Dublin Core XML files.
share_of_files = xml_report["file_count"] / total_dcxml_files
xml_report.insert(
    len(xml_report.columns),
    "proportion_of_all_files",
    [f"{share*100:.2f}%" for share in share_of_files],
)

# The concatenation repeats index labels; renumber the rows.
xml_report = xml_report.reset_index(drop=True)
xml_report

Save the reports as CSV files.

In [None]:
# Write the overall XML statistics to
# <report_dir>/<standard>_<serialization>_<report type>.csv
metadata_standard = "dublin_core"
data_serialization = "xml"
report_type = "syntax_error_stats"
report_name = f"{metadata_standard}_{data_serialization}_{report_type}.csv"
xml_report.to_csv(report_dir + report_name, index=False)

In [None]:
# Write the full list of recorded XML errors to
# <report_dir>/<standard>_<serialization>_<report type>.csv
metadata_standard = "dublin_core"
data_serialization = "xml"
report_type = "syntax_errors"
report_name = f"{metadata_standard}_{data_serialization}_{report_type}.csv"
df_se.to_csv(report_dir + report_name, index=False)

### JSON

First, read and evaluate only the files with a `.json` extension.

In [None]:
extension = ".json"

# File names (not paths) carrying the target extension, per source directory.
cidoc_files_t1 = [name for name in os.listdir(cidoc_t1_dir) if name.endswith(extension)]
cidoc_files_p1 = [name for name in os.listdir(cidoc_p1_dir) if name.endswith(extension)]
cidoc_files_p3 = [name for name in os.listdir(cidoc_p3_dir) if name.endswith(extension)]

# Prefix each name with its directory and keep one sorted list of all paths.
cidoc_file_paths = sorted(
    [cidoc_t1_dir + name for name in cidoc_files_t1]
    + [cidoc_p1_dir + name for name in cidoc_files_p1]
    + [cidoc_p3_dir + name for name in cidoc_files_p3]
)
print("Total CIDOC-CRM JSON files:", len(cidoc_file_paths))

In [None]:
# Peek at the first CIDOC-CRM path to sanity-check how the paths were built.
cidoc_file_paths[0]

In [None]:
extension = ".json"

# Full directory listings; the extension filter is applied when the paths
# are built below.
schema_files_t1 = os.listdir(schema_t1_dir)
schema_files_p1 = os.listdir(schema_p1_dir)
schema_files_p3 = os.listdir(schema_p3_dir)

# Keep only the JSON files, prefix each with its directory, and sort.
schema_file_paths = sorted(
    [schema_t1_dir + name for name in schema_files_t1 if name.endswith(extension)]
    + [schema_p1_dir + name for name in schema_files_p1 if name.endswith(extension)]
    + [schema_p3_dir + name for name in schema_files_p3 if name.endswith(extension)]
)
print("Total Schema.org JSON files:", len(schema_file_paths))

In [None]:
# Peek at the first Schema.org path to sanity-check how the paths were built.
schema_file_paths[0]

In [None]:
# All JSON files to evaluate: CIDOC-CRM paths first, then Schema.org paths.
json_file_paths = [*cidoc_file_paths, *schema_file_paths]
total_json_files = len(json_file_paths)
print(total_json_files)

In [None]:
# Try to load every JSON file.  For each file that fails, record the path,
# the exception class and the exception message in `syntax_errors`.
syntax_errors = []
for json_f in json_file_paths:
    # Decode explicitly as UTF-8 (the encoding JSON is exchanged in) instead
    # of relying on the platform's default text encoding.  The `with` block
    # closes the file, so no explicit close() is needed.
    with open(json_f, encoding="utf-8") as f:
        try:
            # Only whether loading succeeds matters; the data is discarded.
            json.load(f)
        except Exception as e:
            # Deliberately broad: any failure to load a file (including a
            # decoding error raised while reading it) is reported rather
            # than allowed to stop the evaluation of the remaining files.
            syntax_errors.append(
                {"file": json_f, "exception_type": type(e), "exception_message": str(e)}
            )

# Avoid a ZeroDivisionError when no JSON files were found at all.
error_rate = len(syntax_errors) / len(json_file_paths) if json_file_paths else 0.0
print(
    "Files with errors:",
    len(syntax_errors), "of", len(json_file_paths),
    f"({error_rate*100:.2f}%)"
    )

In [None]:
# Tabulate the JSON errors, then split the path into a `file_path` column
# and a `file_name` column (text after the last "/").
df_se = pd.DataFrame.from_dict(syntax_errors)
new_file_col = df_se["file"].map(lambda path: path.rsplit("/", 1)[-1])
df_se = df_se.rename(columns={"file": "file_path"})
df_se.insert(1, "file_name", new_file_col)
df_se

In [None]:
# List the distinct exception types recorded for the JSON files.
df_se.exception_type.unique()

In [None]:
# List the distinct exception messages recorded for the JSON files.
df_se.exception_message.unique()

Looking at the files that triggered the above error messages, it seems that often what's missing is the final curly brace.  Also, in one file, the quotes surrounding strings were doubled (e.g., `""@context"":...` instead of `"@context"`) and in another file, an attempt at a comment was made using `//`.  While the error messages are useful in locating the general source of the error within a file, they're less informative for distinguishing what needs to be changed to correct the file.

Export reports about the JSON errors.

In [None]:
# Drop the messages and collapse repeats: one row per (file, exception type).
error_key_columns = ["file_path", "file_name", "exception_type"]
subdf_se = df_se.loc[:, error_key_columns].drop_duplicates()
subdf_se.tail()

In [None]:
# Number of recorded errors per exception type.
type_report = (
    df_se[["exception_type"]]
    .value_counts()
    .to_frame()
    .rename(columns={"count": "file_count"})
)
type_report

In [None]:
# Two summary rows shaped like the type report so they can be appended to it.
totals_report = pd.DataFrame([
    {"exception_type": "TOTAL FILES", "file_count": len(json_file_paths)},
    {"exception_type": "FILES WITH EXCEPTION", "file_count": len(syntax_errors)},
])

In [None]:
# Stack the per-type counts on top of the summary rows.
json_report = pd.concat([type_report.reset_index(), totals_report])

# Express every count as a share of all JSON files.
share_of_files = json_report["file_count"] / len(json_file_paths)
json_report.insert(
    len(json_report.columns),
    "proportion_of_all_files",
    [f"{share*100:.2f}%" for share in share_of_files],
)

# The concatenation repeats index labels; renumber the rows.
json_report = json_report.reset_index(drop=True)
json_report


Calculate how many files, and how many of the files with errors, are meant to adhere to the Schema.org and CIDOC-CRM models.

In [None]:
# Split the errored files by model, using the marker in each file name.
is_sdo_file = subdf_se["file_name"].str.contains("sdo")
is_cidoc_file = subdf_se["file_name"].str.contains("cidoccrm")
sdo_error_df = subdf_se[is_sdo_file]
cidoc_error_df = subdf_se[is_cidoc_file]

In [None]:
# The per-model breakdown that follows handles a single exception type, so
# stop here if the JSON errors contain anything else.
exception_types = df_se["exception_type"].unique()
assert len(exception_types) == 1, "There are multiple exception types in the JSON files: {}".format(exception_types)

# Errors of that one type, split by model using the marker in each file name.
only_type = exception_types[0]
df_type = df_se[df_se["exception_type"] == only_type]
sdo_type_df = df_type[df_type["file_name"].str.contains("sdo")]
cidoc_type_df = df_type[df_type["file_name"].str.contains("cidoccrm")]

In [None]:
def _as_percentages(counts, denominator):
    """Format each count as a percentage of *denominator* with two decimals."""
    return [f"{share*100:.2f}%" for share in counts / denominator]


def _append_column(name, values):
    """Add *values* to `json_report` as a new right-most column called *name*."""
    json_report.insert(len(json_report.columns), name, values)


# Each three-value list below supplies one value per row of `json_report`:
# errors of the single exception type, total files, files with an exception.

# Schema.org: counts, share of the Schema.org files, share of all JSON files.
_append_column(
    "schema_org_files",
    [sdo_type_df.shape[0], len(schema_file_paths), sdo_error_df.shape[0]],
)
_append_column(
    "proportion_of_schema_files",
    _as_percentages(json_report["schema_org_files"], len(schema_file_paths)),
)
_append_column(
    "schema_proportion_of_all_files",
    _as_percentages(json_report["schema_org_files"], len(json_file_paths)),
)

# CIDOC-CRM: counts, share of the CIDOC-CRM files, share of all JSON files.
_append_column(
    "cidoc_crm_files",
    [cidoc_type_df.shape[0], len(cidoc_file_paths), cidoc_error_df.shape[0]],
)
_append_column(
    "proportion_of_cidoc_files",
    _as_percentages(json_report["cidoc_crm_files"], len(cidoc_file_paths)),
)
_append_column(
    "cidoc_proportion_of_all_files",
    _as_percentages(json_report["cidoc_crm_files"], len(json_file_paths)),
)

json_report

Save the reports as CSV files.

In [None]:
# File-name components shared by the JSON report exports that follow.
metadata_standard, data_serialization = "cidoc-and-sdo", "json"

In [None]:
# Write the JSON statistics to
# <report_dir>/<standard>_<serialization>_<report type>.csv
report_type = "syntax_error_stats"
report_name = f"{metadata_standard}_{data_serialization}_{report_type}.csv"
json_report.to_csv(report_dir + report_name, index=False)

In [None]:
# Write the full list of recorded JSON errors to
# <report_dir>/<standard>_<serialization>_<report type>.csv
report_type = "syntax_errors"
report_name = f"{metadata_standard}_{data_serialization}_{report_type}.csv"
df_se.to_csv(report_dir + report_name, index=False)