# Automated Correction: Schema.org and CIDOC-CRM JSON-LD Syntax

Part V of the computational evaluation of AI-generated linked data for [Linking Anthropology's Data and Archives (LADA)](https://ischool.umd.edu/projects/building-a-sustainable-future-for-anthropologys-archives-researching-primary-source-data-lifecycles-infrastructures-and-reuse/), focused on syntax (e.g., do the metadata adhere to the expected serialization formats?).

---

**Table of Contents:**

I. [Data Loading](#data-loading)

II. [Automated Correction](#automated-correction)

  * [Schema.org](#schema-org)
  
  * [CIDOC-CRM](#cidoc-crm)

---

## Data Loading

In [None]:
import utils
import config
import pandas as pd
import numpy as np
import urllib.request
import urllib
import xml.etree.ElementTree as ET
import json
from lxml import etree
import rdflib
from rdflib.namespace import DC, SDO # Dublin Core, Schema.org
from pathlib import Path
import os
import re

Create variables to reference existing directories and files.

In [None]:
# Sub-paths of the cleaned records, appended to each dataset prefix from config.
dublin_path = "cleaned/dublin_core/"  # XML data files
schema_path = "cleaned/schema_org/"   # JSON data files
cidoc_path = "cleaned/cidoc_crm/"     # JSON data files

# Dublin Core directories (task 1, playground 1, playground 3).
dublin_t1_dir = config.task1_data+dublin_path
dublin_p1_dir = config.playgrd1_data+dublin_path
dublin_p3_dir = config.playgrd3_data+dublin_path

# Schema.org directories (task 1, playground 1, playground 3).
schema_t1_dir = config.task1_data+schema_path
schema_p1_dir = config.playgrd1_data+schema_path
schema_p3_dir = config.playgrd3_data+schema_path

# CIDOC-CRM directories (task 1, playground 1, playground 3).
cidoc_t1_dir = config.task1_data+cidoc_path
cidoc_p1_dir = config.playgrd1_data+cidoc_path
cidoc_p3_dir = config.playgrd3_data+cidoc_path

Create directories to store automatically corrected files and variables to reference them.

In [None]:
# NOTE: the three *_path names are rebound here, from the "cleaned" sub-paths
# to the "corrected" sub-paths.
dublin_path = "corrected/dublin_core/"  # XML data files
schema_path = "corrected/schema_org/"   # JSON data files
cidoc_path = "corrected/cidoc_crm/"     # JSON data files

# Dublin Core output directories.
dublin_t1_corrected_dir = config.task1_data+dublin_path
dublin_p1_corrected_dir = config.playgrd1_data+dublin_path
dublin_p3_corrected_dir = config.playgrd3_data+dublin_path

# Schema.org output directories.
schema_t1_corrected_dir = config.task1_data+schema_path
schema_p1_corrected_dir = config.playgrd1_data+schema_path
schema_p3_corrected_dir = config.playgrd3_data+schema_path

# CIDOC-CRM output directories.
cidoc_t1_corrected_dir = config.task1_data+cidoc_path
cidoc_p1_corrected_dir = config.playgrd1_data+cidoc_path
cidoc_p3_corrected_dir = config.playgrd3_data+cidoc_path

# Every output directory, listed dataset by dataset.
corrected_dirs = [
    dublin_t1_corrected_dir, schema_t1_corrected_dir, cidoc_t1_corrected_dir,
    dublin_p1_corrected_dir, schema_p1_corrected_dir, cidoc_p1_corrected_dir,
    dublin_p3_corrected_dir, schema_p3_corrected_dir, cidoc_p3_corrected_dir,
]

# Create each directory (and any missing parents); existing ones are left alone.
for corrected_dir in corrected_dirs:
    target = Path(corrected_dir)
    target.mkdir(parents=True, exist_ok=True)

## Automated Correction

In [None]:
# File paths listed in df_se (presumably the syntax-error table built earlier
# -- TODO confirm against the cell that defines it).
errored_files = list(df_se.file_path)

# Derive the ".txt" counterpart of every errored ".json" path.
txt_errored_files = []
for json_path in errored_files:
    txt_errored_files.append(json_path.replace(".json", ".txt"))

# Show one example path as a sanity check.
print(txt_errored_files[0])

data/data_playground_task1/cleaned/cidoc_crm/cidoccrm_record_018.txt


In [None]:
# Collect every CIDOC-CRM and Schema.org JSON path that is NOT in errored_files.
# A set gives constant-time membership tests instead of scanning the list for
# every path; CIDOC-CRM paths still come first, followed by Schema.org paths.
errored_lookup = set(errored_files)
correct_json_files = []
for file_paths in (cidoc_file_paths, schema_file_paths):
    correct_json_files.extend(
        path for path in file_paths if path not in errored_lookup
    )
print("Total correct JSON files:", len(correct_json_files))
print("Sample:", correct_json_files[0])

Total correct JSON files: 209
Sample: data/data_playground_task1/cleaned/cidoc_crm/cidoccrm_record_003.json


In [None]:
# Run the project's correction helper on the ".txt" versions of the errored
# files. It returns three collections, unpacked here as: the files reported as
# still incorrect, the comments found in the files, and the syntax errors
# recorded during the run. (Helper internals are not shown in this notebook.)
still_incorrect, comments_found, new_syntax_errors = utils.correctJSON(txt_errored_files)

In [None]:
# Summarize what utils.correctJSON returned: files it reported as still
# incorrect, comments it found, and the number of newly recorded errors.
print(len(still_incorrect), "still incorrect:", still_incorrect)
print(len(comments_found), "comment(s) found:", comments_found)
print("New errors:", len(new_syntax_errors))


4 still incorrect: ['data/data_playground_task1/cleaned/cidoc_crm/cidoccrm_record_018.txt', 'data/data_playground_task1/cleaned/cidoc_crm/cidoccrm_record_070.txt', 'data/data_playground_task1/cleaned/schema_org/sdo_record_018.txt', 'data/data_task1/cleaned/schema_org/sdo_record_006.txt']
1 comments(s) found: [{'errored_file': 'data/data_playground_task1/cleaned/cidoc_crm/cidoccrm_record_018.txt', 'comment': ['\n    // Additional members go here']}]
New errors: 4


In [None]:
# Print each recorded error on its own line for inspection.
for error_record in new_syntax_errors:
    print(error_record)

{'file': 'data/data_playground_task1/cleaned/cidoc_crm/cidoccrm_record_018.json', 'exception_type': <class 'io.UnsupportedOperation'>, 'exception_message': 'not readable'}
{'file': 'data/data_playground_task1/cleaned/cidoc_crm/cidoccrm_record_070.json', 'exception_type': <class 'io.UnsupportedOperation'>, 'exception_message': 'not readable'}
{'file': 'data/data_playground_task1/cleaned/schema_org/sdo_record_018.json', 'exception_type': <class 'io.UnsupportedOperation'>, 'exception_message': 'not readable'}
{'file': 'data/data_task1/cleaned/schema_org/sdo_record_006.json', 'exception_type': <class 'io.UnsupportedOperation'>, 'exception_message': 'not readable'}


In [None]:
# Re-validate the corrected copy of every file that correctJSON reported as
# still incorrect, by parsing it with the standard JSON parser.
errors = []
for txt_f in still_incorrect:
    # Map the ".txt" path back to the ".json" file in the "corrected" directory.
    json_f = txt_f.replace(".txt", ".json")
    new_json_path = json_f.replace("cleaned", "corrected")
    # The context manager closes the file; no explicit close() is needed.
    with open(new_json_path) as f:
        try:
            json.load(f)  # only parseability matters; the result is discarded
        except Exception as e:
            # Record any failure (type and message) rather than stopping.
            errors.append({
                "file": json_f,
                "exception_type": type(e),
                "exception_message": str(e),
            })
print(
    "Files with errors:",
    len(errors), "of", len(json_file_paths),
    f"({len(errors) / len(json_file_paths):.2%})"
    )

Files with errors: 0 of 213 (0.00%)


In [None]:
# Stop here if any corrected file still fails to parse; the message reports the
# same list the condition checks.
assert len(errors) == 0, f"There are still {len(errors)} files that need correcting."

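**Open question:** the corrected files all parse cleanly here, so why did the first try/except loop (presumably the one inside `utils.correctJSON`) report errors for them? Every entry in `new_syntax_errors` is an `io.UnsupportedOperation` with the message `not readable`, which Python raises when code tries to read from a file handle opened in write-only mode. The errors therefore likely come from how the file is opened during that first validation, not from invalid JSON (to be confirmed in `utils.correctJSON`).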

Update the report to show all resolved errors.

In [None]:
# Append a row recording how many files still fail to parse after automated
# correction. The count comes from `errors` (files that actually failed
# re-validation), not from `still_incorrect` (files flagged by the helper).
# NOTE(review): this stores the proportion as a float; confirm it matches the
# format of the existing "proportion_of_all_files" values in json_report.
remaining_errors = len(errors)
updated = pd.concat([
    json_report,
    pd.DataFrame({
        "dimension_counted": "errored_files_after_auto_correction",
        "exception": "NA",
        "count": remaining_errors,
        "proportion_of_all_files": remaining_errors / total_json_files,
    }, index=[json_report.shape[0]])  # next positional index after the existing rows
])
updated

Unnamed: 0,dimension_counted,exception,count,proportion_of_all_files
0,exception_type,<class 'lxml.etree.XMLSyntaxError'>,43,40.19%
1,exception_subtype,Namespace prefix dc on title is not defined,33,30.84%
2,exception_subtype,Namespace prefix rdf for about on Description ...,7,6.54%
3,exception_subtype,xmlns:dc: Empty XML namespace is not allowed,1,0.93%
4,exception_subtype,Namespace prefix rdf on Description is not def...,1,0.93%
5,exception_subtype,xmlParseEntityRef: no name,1,0.93%
6,total_files,,107,100.00%
7,files_with_error,,43,40.19%
8,errored_files_after_auto_correction,,0,0.0


In [None]:
# Write the UPDATED report (including the post-correction row) to CSV; the
# original cell saved the unmodified json_report, which dropped the new row.
metadata_standard = "cidoc-and-sdo"
data_serialization = "json"
report_type = "syntax_error_stats"
report_file = f"{metadata_standard}_{data_serialization}_{report_type}.csv"
updated.to_csv(report_dir+report_file, index=False)

Put a copy of all the initially correct files in the same `corrected` directory as the corrected files.

In [None]:
# Copy every initially-correct JSON file into the matching "corrected"
# directory so that directory holds the complete data set. This iterates
# correct_json_files (built earlier in this notebook) rather than the
# Dublin Core list.
corrected_dir_name = "corrected"
for correct_json in correct_json_files:
    # The context managers close both files; no explicit close() is needed.
    with open(correct_json, "r") as f:
        content = f.read()
    new_path = correct_json.replace("cleaned", corrected_dir_name)
    with open(new_path, "w") as f:
        f.write(content)
print(f"Copied the rest of the correct files into the {corrected_dir_name} directory!")

Copied the rest of the correct files into the corrected directory!


**Open question (unresolved):** if the corrected files parse without errors, why does the first try/except loop still report errors for them?