# Evaluation: Syntax

Part I of the computational evaluation of AI-generated linked data for [Linking Anthropology's Data and Archives (LADA)](https://ischool.umd.edu/projects/building-a-sustainable-future-for-anthropologys-archives-researching-primary-source-data-lifecycles-infrastructures-and-reuse/), focused on syntax (e.g., do the metadata adhere to the expected serialization formats?).

---

**Table of Contents:**

I. [Data Loading](#data-loading)

II. [Syntax](#syntax)

  * [XML](#xml)
  
    * [Automated Correction](#automated-correction)

  * [JSON](#json)

    * [Automated Correction](#automated-correction)

---

## Data Loading

In [1]:
import utils
import config
import pandas as pd
import numpy as np
import urllib.request
import urllib
import xml.etree.ElementTree as ET
import json
from lxml import etree
import rdflib
from rdflib.namespace import DC, SDO # Dublin Core, Schema.org
from pathlib import Path
import os
import re

# sax - to validate XML well-formed
# xml.etree.ElementTree - to validate text between tags
# xml.etree.ElementTree + xml.etree.ElementTree.XMLSchema's validate() - to validate XML well-formed
# lxml etree.XMLParser - to validate well-formed based on input XML schema
# json_checker - to validate Python data types (incl. but not limited to those obtained from JSON)
# jsonschema.validate
# ShEx - for RDF graphs, ShExJ for JSON - NOTE: couldn't install package
# OntoME - for CIDOC-CRM ontology alignment

  last_close_tag = re.findall("<\/[a-z]+>$", f_string)
  has_prolog = re.findall('<\?xml version="1.0"[^<]*>', f_string)
  comments = re.findall("\n\s*\/\/\s*\w.+|\n\s*\/\*\s*.+\s*\*\/|\n\s*#\s*.+", f_string)
  double_quotes = re.findall("""[@\w]+"":[\s\[\{]*"".+""", f_string)
  open_brace = re.findall("\{", f_string)
  close_brace = re.findall("\}", f_string)


Create variables to reference existing directories and files.

In [2]:
dublin_path = "cleaned/dublin_core/"  # XML data files
schema_path = "cleaned/schema_org/"   # JSON data files
cidoc_path = "cleaned/cidoc_crm/"     # JSON data files

dublin_t1_dir = config.task1_data+dublin_path
schema_t1_dir = config.task1_data+schema_path
cidoc_t1_dir = config.task1_data+cidoc_path

dublin_p1_dir = config.playgrd1_data+dublin_path
schema_p1_dir = config.playgrd1_data+schema_path
cidoc_p1_dir = config.playgrd1_data+cidoc_path

dublin_p3_dir = config.playgrd3_data+dublin_path
schema_p3_dir = config.playgrd3_data+schema_path
cidoc_p3_dir = config.playgrd3_data+cidoc_path

Create variables to reference automatically corrected files and their directories.

In [3]:
dublin_path = "corrected/dublin_core/"  # XML data files
schema_path = "corrected/schema_org/"   # JSON data files
cidoc_path = "corrected/cidoc_crm/"     # JSON data files

dublin_t1_corrected_dir = config.task1_data+dublin_path
schema_t1_corrected_dir = config.task1_data+schema_path
cidoc_t1_corrected_dir = config.task1_data+cidoc_path

dublin_p1_corrected_dir = config.playgrd1_data+dublin_path
schema_p1_corrected_dir = config.playgrd1_data+schema_path
cidoc_p1_corrected_dir = config.playgrd1_data+cidoc_path

dublin_p3_corrected_dir = config.playgrd3_data+dublin_path
schema_p3_corrected_dir = config.playgrd3_data+schema_path
cidoc_p3_corrected_dir = config.playgrd3_data+cidoc_path

corrected_dirs = [dublin_t1_corrected_dir, schema_t1_corrected_dir, cidoc_t1_corrected_dir,
                  dublin_p1_corrected_dir, schema_p1_corrected_dir, cidoc_p1_corrected_dir,
                  dublin_p3_corrected_dir, schema_p3_corrected_dir, cidoc_p3_corrected_dir
                  ]
for corrected_dir in corrected_dirs:
    Path(corrected_dir).mkdir(parents=True, exist_ok=True)

## Syntax

### XML

First, read and evaluate only the files with a `.xml` extension.

In [14]:
extension = ".xml"
dublin_file_paths = []
dublin_files_t1 = [f for f in os.listdir(dublin_t1_dir) if f.endswith(extension)]
dublin_file_paths += [dublin_t1_dir+f for f in dublin_files_t1]
dublin_files_p1 = [f for f in os.listdir(dublin_p1_dir) if f.endswith(extension)]
dublin_file_paths += [dublin_p1_dir+f for f in dublin_files_p1]
dublin_files_p3 = [f for f in os.listdir(dublin_p3_dir) if f.endswith(extension)]
dublin_file_paths += [dublin_p3_dir+f for f in dublin_files_p3]
dublin_file_paths.sort()
total_dcxml_files = len(dublin_file_paths)
print(f"Total Dublin Core {extension[1:].upper()} files:", total_dcxml_files)

Total Dublin Core XML files: 107


In [21]:
syntax_errors, errored_files = [], []
for file_path in dublin_file_paths:
    try:
        tree = etree.parse(file_path)
    except Exception as e:
        f_error = {"file": file_path, "exception_type": type(e), "exception_message": str(e)}
        syntax_errors += [f_error]
        errored_files += [file_path]
print("Files with errors:", 
      len(errored_files), "of", total_dcxml_files,
      f"({(len(errored_files)/total_dcxml_files)*100:.2f}%)")

Files with errors: 43 of 107 (40.19%)


In [None]:
df_se = pd.DataFrame.from_dict(syntax_errors)
pattern = "^[\D]+,"
new_exception_col = df_se["exception_message"].apply(lambda x: re.findall(pattern, x)[0][:-1])
df_se.insert(len(df_se.columns)-1, "exception_subtype", new_exception_col)
df_se.head()

  pattern = "^[\D]+,"


Unnamed: 0,file,exception_type,exception_subtype,exception_message
0,data/data_playground_task1/cleaned/dublin_core...,<class 'lxml.etree.XMLSyntaxError'>,Namespace prefix dc on title is not defined,"Namespace prefix dc on title is not defined, l..."
1,data/data_playground_task1/cleaned/dublin_core...,<class 'lxml.etree.XMLSyntaxError'>,Namespace prefix dc on title is not defined,"Namespace prefix dc on title is not defined, l..."
2,data/data_playground_task1/cleaned/dublin_core...,<class 'lxml.etree.XMLSyntaxError'>,Namespace prefix dc on title is not defined,"Namespace prefix dc on title is not defined, l..."
3,data/data_playground_task1/cleaned/dublin_core...,<class 'lxml.etree.XMLSyntaxError'>,Namespace prefix dc on title is not defined,"Namespace prefix dc on title is not defined, l..."
4,data/data_playground_task1/cleaned/dublin_core...,<class 'lxml.etree.XMLSyntaxError'>,Namespace prefix dc on title is not defined,"Namespace prefix dc on title is not defined, l..."


Next, evaluate every record by reading the TXT files to:
- check whether relevnt DC namespace(s) are present
- check whether RDF namespace is present
- check whether a prolog is present
- check whether a prolog with UTF-8 encoding is present

In [None]:
# errored_files = list(df_se.file_path)
# error_list = list(df_se.exception_subtype)
# assert (len(error_list) == len(errored_files)), "Error list and errored files lists should be of the same length"
# txt_errored_files = [f.replace(".xml", ".txt") for f in errored_files]
# print(txt_errored_files[0])
# print(error_list[0])

In [22]:
extension = ".txt"
dublin_file_paths = []
dublin_files_t1 = [f for f in os.listdir(dublin_t1_dir) if f.endswith(extension)]
dublin_file_paths += [dublin_t1_dir+f for f in dublin_files_t1]
dublin_files_p1 = [f for f in os.listdir(dublin_p1_dir) if f.endswith(extension)]
dublin_file_paths += [dublin_p1_dir+f for f in dublin_files_p1]
dublin_files_p3 = [f for f in os.listdir(dublin_p3_dir) if f.endswith(extension)]
dublin_file_paths += [dublin_p3_dir+f for f in dublin_files_p3]
dublin_file_paths.sort()
total_dctxt_files = len(dublin_file_paths)
print(f"Total Dublin Core {extension[1:].upper()} files:", total_dctxt_files)

Total Dublin Core TXT files: 107


In [25]:
custom_syntax_errors, more_errored_files = [], []
for file_path in dublin_file_paths:
    with open(file_path, "r") as f:
        f_string = f.read()
        if not utils.hasDCNamespaces(f_string):
            f_error = {"file": file_path, "exception_type": "Custom syntax check", "exception_subtype": "Missing namespace", "exception_message": "Missing Dublin Core namespace(s)"}
            custom_syntax_errors += [f_error]
            more_errored_files += [file_path]
        if not utils.hasRDFNamespace(f_string):
            f_error = {"file": file_path, "exception_type": "Custom syntax check", "exception_subtype": "Missing namespace", "exception_message": "Missing RDF namespace"}
            custom_syntax_errors += [f_error]
            more_errored_files += [file_path]
        if not utils.hasProlog(f_string):
            f_error = {"file": file_path, "exception_type": "Custom syntax check", "exception_subtype": "Missing prolog", "exception_message": "Missing prolog"}
            custom_syntax_errors += [f_error]
            more_errored_files += [file_path]
        if not utils.hasPrologWithEncoding(f_string):
            f_error = {"file": file_path, "exception_type": "Custom syntax check", "exception_subtype": "Missing prolog", "exception_message": "Missing prolog with UTF-8 encoding"}
            custom_syntax_errors += [f_error]
            more_errored_files += [file_path]
print(len(custom_syntax_errors), "additional syntax errors found across", len(set(more_errored_files)), "out of", total_dctxt_files, "files")

334 additional syntax errors found across 107 out of 107 files


In [None]:
df_se = pd.concat([df_se, pd.DataFrame.from_dict(custom_syntax_errors)])
new_file_col = df_se["file"].apply(lambda x: x.split("/")[-1])
df_se = df_se.rename(columns={"file":"file_path"})
df_se.insert(1, "file_name", new_file_col)
df_se.tail()

Unnamed: 0,file_path,file_name,exception_type,exception_subtype,exception_message
329,data/data_task1/cleaned/dublin_core/dc_record_...,dc_record_027.txt,Custom syntax check,Missing prolog,Missing prolog with UTF-8 encoding
330,data/data_task1/cleaned/dublin_core/dc_record_...,dc_record_028.txt,Custom syntax check,Missing namespace,Missing Dublin Core namespace(s)
331,data/data_task1/cleaned/dublin_core/dc_record_...,dc_record_028.txt,Custom syntax check,Missing namespace,Missing RDF namespace
332,data/data_task1/cleaned/dublin_core/dc_record_...,dc_record_028.txt,Custom syntax check,Missing prolog,Missing prolog
333,data/data_task1/cleaned/dublin_core/dc_record_...,dc_record_028.txt,Custom syntax check,Missing prolog,Missing prolog with UTF-8 encoding


In [28]:
subtype_report = pd.DataFrame(df_se.exception_subtype.value_counts()).reset_index()
subtype_report = subtype_report.rename(columns={"exception_subtype":"exception"})
subtype_report.insert(0, "dimension_counted", ["exception_subtype"]*subtype_report.shape[0])

In [29]:
type_report = pd.DataFrame(df_se.exception_type.value_counts()).reset_index()
type_report = type_report.rename(columns={"exception_type":"exception"})
type_report.insert(0, "dimension_counted", ["exception_type"]*type_report.shape[0])

In [33]:
all_errored_files = set(errored_files + more_errored_files)
totals_report = pd.DataFrame({
    "dimension_counted": ["total_files", "files_with_error"],
    "exception": ["NA", "NA"],
    "count": [len(dublin_file_paths), len(all_errored_files)]
    })

In [None]:
##############################################################################
# TO DO:
# Count actual number of files with each exception type and subtype 
# so can provide proper "count" and "propotion" values in DF below!!!
################################################################################

In [None]:
xml_report = pd.concat([type_report, subtype_report, totals_report])
proportions = (xml_report[["count"]]/total_dcxml_files).values
percentages = [f"{proportion[0]*100:.2f}%" for proportion in proportions]
xml_report.insert(len(xml_report.columns), "proportion_of_all_files", percentages)
xml_report = xml_report.reset_index().drop(columns=["index"])
xml_report


Unnamed: 0,dimension_counted,exception,count,proportion_of_all_files
0,exception_type,Custom syntax check,334,312.15%
1,exception_type,<class 'lxml.etree.XMLSyntaxError'>,43,40.19%
2,exception_subtype,Missing prolog,171,159.81%
3,exception_subtype,Missing namespace,163,152.34%
4,exception_subtype,Namespace prefix dc on title is not defined,33,30.84%
5,exception_subtype,Namespace prefix rdf for about on Description ...,7,6.54%
6,exception_subtype,xmlns:dc: Empty XML namespace is not allowed,1,0.93%
7,exception_subtype,Namespace prefix rdf on Description is not def...,1,0.93%
8,exception_subtype,xmlParseEntityRef: no name,1,0.93%
9,total_files,,107,100.00%


Save the reports as CSV files.

In [35]:
report_dir = "data/error_reports/"
Path(report_dir).mkdir(parents=True, exist_ok=True)

In [13]:
metadata_standard = "dublin_core"
data_serialization = "xml"
report_type = "syntax_error_stats"
xml_report.to_csv(
    report_dir+"{metadata_standard}_{data_serialization}_{report_type}.csv".format(
        metadata_standard=metadata_standard,
        data_serialization=data_serialization,
        report_type=report_type
        ), index=False
    )

In [36]:
metadata_standard = "dublin_core"
data_serialization = "xml"
report_type = "syntax_errors"
df_se.to_csv(
    report_dir+"{metadata_standard}_{data_serialization}_{report_type}.csv".format(
        metadata_standard=metadata_standard,
        data_serialization=data_serialization,
        report_type=report_type
        ), index=False
    )

##### Automated Correction

Try correcting undefined namespace prefix errors automatically, reading the errored files' equivalents with `.txt` extensions and saving the corrected files that can be parsed with an XML parser to a new directory, where each corrected file has a `.xml` extension.

In [13]:
df_se.exception_subtype.unique()

array(['Namespace prefix dc on title is not defined',
       'Namespace prefix rdf for about on Description is not defined',
       'xmlns:dc: Empty XML namespace is not allowed',
       'Namespace prefix rdf on Description is not defined',
       'xmlParseEntityRef: no name'], dtype=object)

In [14]:
errored_files = list(df_se.file_path)
error_list = list(df_se.exception_subtype)
assert (len(error_list) == len(errored_files)), "Error list and errored files lists should be of the same length"

In [15]:
txt_errored_files = [f.replace(".xml", ".txt") for f in errored_files]
print(txt_errored_files[0])
print(error_list[0])

data/data_playground_task1/cleaned/dublin_core/dc_record_005.txt
Namespace prefix dc on title is not defined


In [16]:
still_incorrect = utils.correctXML(txt_errored_files, error_list)
print(f"Files that still need correcting: {still_incorrect}.")  #assert len(still_incorrect) == 0, 

Files that still need correcting: [{'file': 'data/data_playground_task1/cleaned/dublin_core/dc_record_005.txt', 'exception_type': <class 'xml.etree.ElementTree.ParseError'>, 'exception_message': 'unbound prefix: line 1, column 0'}, {'file': 'data/data_playground_task1/cleaned/dublin_core/dc_record_006.txt', 'exception_type': <class 'xml.etree.ElementTree.ParseError'>, 'exception_message': 'unbound prefix: line 3, column 0'}, {'file': 'data/data_playground_task1/cleaned/dublin_core/dc_record_007.txt', 'exception_type': <class 'xml.etree.ElementTree.ParseError'>, 'exception_message': 'unbound prefix: line 1, column 0'}, {'file': 'data/data_playground_task1/cleaned/dublin_core/dc_record_008.txt', 'exception_type': <class 'xml.etree.ElementTree.ParseError'>, 'exception_message': 'unbound prefix: line 2, column 0'}, {'file': 'data/data_playground_task1/cleaned/dublin_core/dc_record_009.txt', 'exception_type': <class 'xml.etree.ElementTree.ParseError'>, 'exception_message': 'unbound prefix: 

In [None]:
more_df_se = pd.DataFrame.from_dict(still_incorrect)
new_file_col = df_se["file_path"].apply(lambda x: x.split("/")[-1])
more_df_se.insert(1, "file_name", new_file_col)
more_df_se.head()

Unnamed: 0,file,file_name,exception_type,exception_message
0,data/data_playground_task1/cleaned/dublin_core...,dc_record_005.xml,Malformed XML,No closing tag found for outermost element.
1,data/data_playground_task1/cleaned/dublin_core...,dc_record_006.xml,Malformed XML,No closing tag found for outermost element.
2,data/data_playground_task1/cleaned/dublin_core...,dc_record_007.xml,Malformed XML,No closing tag found for outermost element.
3,data/data_playground_task1/cleaned/dublin_core...,dc_record_008.xml,Malformed XML,No closing tag found for outermost element.
4,data/data_playground_task1/cleaned/dublin_core...,dc_record_009.xml,Malformed XML,No closing tag found for outermost element.


Great!  We corrected all the Dublin Core XML metadata!

Update the report to show this.

In [None]:
updated = pd.concat([
    xml_report, 
    pd.DataFrame({
        "dimension_counted":"errored_files_after_auto_correction",
        "exception": "NA",
        "count":len(still_incorrect),
        "proportion_of_all_files":(len(still_incorrect)/total_dcxml_files)
    }, index=[xml_report.shape[0]])
])
updated

Unnamed: 0,dimension_counted,exception,count,proportion_of_all_files
0,exception_type,<class 'lxml.etree.XMLSyntaxError'>,43,40.19%
1,exception_subtype,Namespace prefix dc on title is not defined,33,30.84%
2,exception_subtype,Namespace prefix rdf for about on Description ...,7,6.54%
3,exception_subtype,xmlns:dc: Empty XML namespace is not allowed,1,0.93%
4,exception_subtype,Namespace prefix rdf on Description is not def...,1,0.93%
5,exception_subtype,xmlParseEntityRef: no name,1,0.93%
6,total_files,,107,100.00%
7,files_with_error,,43,40.19%
8,errored_files_after_auto_correction,,0,0.0


In [24]:
metadata_standard = "dublin_core"
data_serialization = "xml"
report_type = "syntax_error_stats"
xml_report.to_csv(
    report_dir+"{metadata_standard}_{data_serialization}_{report_type}.csv".format(
        metadata_standard=metadata_standard,
        data_serialization=data_serialization,
        report_type=report_type
        ), index=False
    )

Put a copy of all the initially correct files in the same `corrected` directory as the corrected files.

In [None]:
correct_dc_files = []
for f in dublin_file_paths:
    if f not in errored_files:
        correct_dc_files += [f]
print("Files with correct syntax:", len(correct_dc_files), "of", len(dublin_file_paths))

In [57]:
corrected_dir_name = "corrected"
for correct_dc in correct_dc_files:
    with open(correct_dc, "r") as f:
        content = f.read()
        f.close()
    new_path = correct_dc.replace("cleaned", corrected_dir_name)
    with open(new_path, "w") as f:
        f.write(content)
        f.close()
print(f"Copied the rest of the correct files into the {corrected_dir_name} directory!")

Copied the rest of the correct files into the corrected directory!


### JSON

First, read and evaluate only the files with a `.json` extension.

In [75]:
cidoc_file_paths = []
cidoc_files_t1 = [f for f in os.listdir(cidoc_t1_dir) if f.endswith(".json")]
cidoc_file_paths += [cidoc_t1_dir+f for f in cidoc_files_t1]
cidoc_files_p1 = [f for f in os.listdir(cidoc_p1_dir) if f.endswith(".json")]
cidoc_file_paths += [cidoc_p1_dir+f for f in cidoc_files_p1]
cidoc_files_p3 = [f for f in os.listdir(cidoc_p3_dir) if f.endswith(".json")]
cidoc_file_paths += [cidoc_p3_dir+f for f in cidoc_files_p3]
cidoc_file_paths.sort()
print("Total CIDOC-CRM JSON files:", len(cidoc_file_paths))

Total CIDOC-CRM JSON files: 97


In [76]:
cidoc_file_paths[0]

'data/data_playground_task1/cleaned/cidoc_crm/cidoccrm_record_003.json'

In [77]:
schema_file_paths = []
schema_files_t1 = os.listdir(schema_t1_dir)
schema_file_paths += [schema_t1_dir+f for f in schema_files_t1 if f.endswith(".json")]
schema_files_p1 = os.listdir(schema_p1_dir)
schema_file_paths += [schema_p1_dir+f for f in schema_files_p1 if f.endswith(".json")]
schema_files_p3 = os.listdir(schema_p3_dir)
schema_file_paths += [schema_p3_dir+f for f in schema_files_p3 if f.endswith(".json")]
schema_file_paths.sort()
print("Total Schema.org JSON files:", len(schema_file_paths))

Total Schema.org JSON files: 116


In [78]:
schema_file_paths[0]

'data/data_playground_task1/cleaned/schema_org/sdo_record_003.json'

In [108]:
json_file_paths = cidoc_file_paths + schema_file_paths
total_json_files = len(json_file_paths)
print(len(json_file_paths))

213


In [80]:
syntax_errors = []
for json_f in json_file_paths:
    with open(json_f) as f:
        try:
            data = json.load(f)
        except Exception as e:
            f_error = {"file": json_f, "exception_type": type(e), "exception_message": str(e)}
            syntax_errors += [f_error]
        f.close()
print(
    "Files with errors:", 
    len(syntax_errors), "of", len(json_file_paths),
    f"({(len(syntax_errors)/len(json_file_paths))*100:.2f}%)"
    )

Files with errors: 4 of 213 (1.88%)


In [81]:
df_se = pd.DataFrame.from_dict(syntax_errors)
new_file_col = df_se["file"].apply(lambda x: x.split("/")[-1])
df_se = df_se.rename(columns={"file":"file_path"})
df_se.insert(1, "file_name", new_file_col)
df_se

Unnamed: 0,file_path,file_name,exception_type,exception_message
0,data/data_playground_task1/cleaned/cidoc_crm/c...,cidoccrm_record_018.json,<class 'json.decoder.JSONDecodeError'>,"Expecting ',' delimiter: line 99 column 5 (cha..."
1,data/data_playground_task1/cleaned/cidoc_crm/c...,cidoccrm_record_070.json,<class 'json.decoder.JSONDecodeError'>,"Expecting ',' delimiter: line 35 column 1 (cha..."
2,data/data_playground_task1/cleaned/schema_org/...,sdo_record_018.json,<class 'json.decoder.JSONDecodeError'>,"Expecting ',' delimiter: line 27 column 1 (cha..."
3,data/data_task1/cleaned/schema_org/sdo_record_...,sdo_record_006.json,<class 'json.decoder.JSONDecodeError'>,Expecting ':' delimiter: line 2 column 5 (char 6)


In [82]:
df_se.exception_type.unique()

array([<class 'json.decoder.JSONDecodeError'>], dtype=object)

In [83]:
df_se.exception_message.unique()

array(["Expecting ',' delimiter: line 99 column 5 (char 3125)",
       "Expecting ',' delimiter: line 35 column 1 (char 1407)",
       "Expecting ',' delimiter: line 27 column 1 (char 914)",
       "Expecting ':' delimiter: line 2 column 5 (char 6)"], dtype=object)

Looking at the files that triggered the above error messages, it seems that often what's missing is the final curly brace.  Also, in one file, the quotes surrounding strings were doubled (e.g., `""@context"":...` instead of `"@context"`) and in another file, an attempt at a comment was made using `//`.  While the error messages are useful in locating the general source of the error within a file, they're less informative for distinguishing what needs to be changed to correct the file.

Export a reports about the JSON errors.

In [84]:
type_report = pd.DataFrame(df_se.exception_type.value_counts()).reset_index()
type_report = type_report.rename(columns={"exception_type":"exception"})
type_report.insert(0, "dimension_counted", ["exception_type"]*type_report.shape[0])

In [85]:
totals_report = pd.DataFrame({
    "dimension_counted": ["total_files", "files_with_error"],
    "exception": ["NA", "NA"],
    "count": [len(json_file_paths), len(syntax_errors)]
    })

In [86]:
json_report = pd.concat([type_report, totals_report])
proportions = (json_report[["count"]]/(len(json_file_paths))).values
percentages = [f"{proportion[0]*100:.2f}%" for proportion in proportions]
# print(proportions)
# print(percentages)
json_report.insert(len(json_report.columns), "proportion_of_all_files", percentages)
json_report = json_report.reset_index().drop(columns=["index"])
json_report


Unnamed: 0,dimension_counted,exception,count,proportion_of_all_files
0,exception_type,<class 'json.decoder.JSONDecodeError'>,4,1.88%
1,total_files,,213,100.00%
2,files_with_error,,4,1.88%


Save the reports as CSV files.

In [87]:
report_dir = "data/error_reports/"
Path(report_dir).mkdir(parents=True, exist_ok=True)

In [None]:
metadata_standard = "cidoc-and-sdo"
data_serialization = "json"

In [None]:
report_type = "syntax_error_stats"
json_report.to_csv(
    report_dir+"{metadata_standard}_{data_serialization}_{report_type}.csv".format(
        metadata_standard=metadata_standard,
        data_serialization=data_serialization,
        report_type=report_type
        ), index=False
    )

In [None]:
report_type = "syntax_errors"
df_se.to_csv(
    report_dir+"{metadata_standard}_{data_serialization}_{report_type}.csv".format(
        metadata_standard=metadata_standard,
        data_serialization=data_serialization,
        report_type=report_type
        ), index=False
    )

#### Automated Correction

In [91]:
errored_files = list(df_se.file_path)
txt_errored_files = [f.replace(".json", ".txt") for f in errored_files]
print(txt_errored_files[0])

data/data_playground_task1/cleaned/cidoc_crm/cidoccrm_record_018.txt


In [92]:
correct_json_files = []
for f in cidoc_file_paths:
    if f not in errored_files:
        correct_json_files += [f]
for f in schema_file_paths:
    if f not in errored_files:
        correct_json_files += [f]
print("Total correct JSON files:", len(correct_json_files))
print("Sample:", correct_json_files[0])

Total correct JSON files: 209
Sample: data/data_playground_task1/cleaned/cidoc_crm/cidoccrm_record_003.json


In [None]:
still_incorrect, comments_found, new_syntax_errors = utils.correctJSON(txt_errored_files)

In [103]:
print(len(still_incorrect), "still incorrect:", still_incorrect)
print(len(comments_found), "comments(s) found:", comments_found)
print("New errors:", len(new_syntax_errors))


4 still incorrect: ['data/data_playground_task1/cleaned/cidoc_crm/cidoccrm_record_018.txt', 'data/data_playground_task1/cleaned/cidoc_crm/cidoccrm_record_070.txt', 'data/data_playground_task1/cleaned/schema_org/sdo_record_018.txt', 'data/data_task1/cleaned/schema_org/sdo_record_006.txt']
1 comments(s) found: [{'errored_file': 'data/data_playground_task1/cleaned/cidoc_crm/cidoccrm_record_018.txt', 'comment': ['\n    // Additional members go here']}]
New errors: 4


In [104]:
for new_e in new_syntax_errors:
    print(new_e)

{'file': 'data/data_playground_task1/cleaned/cidoc_crm/cidoccrm_record_018.json', 'exception_type': <class 'io.UnsupportedOperation'>, 'exception_message': 'not readable'}
{'file': 'data/data_playground_task1/cleaned/cidoc_crm/cidoccrm_record_070.json', 'exception_type': <class 'io.UnsupportedOperation'>, 'exception_message': 'not readable'}
{'file': 'data/data_playground_task1/cleaned/schema_org/sdo_record_018.json', 'exception_type': <class 'io.UnsupportedOperation'>, 'exception_message': 'not readable'}
{'file': 'data/data_task1/cleaned/schema_org/sdo_record_006.json', 'exception_type': <class 'io.UnsupportedOperation'>, 'exception_message': 'not readable'}


In [106]:
errors = []
for txt_f in still_incorrect:
    json_f = txt_f.replace(".txt", ".json")
    new_json_path = json_f.replace("cleaned", "corrected")
    with open(new_json_path) as f:
        try:
            data = json.load(f)
        except Exception as e:
            f_error = {"file": json_f, "exception_type": type(e), "exception_message": str(e)}
            errors += [f_error]
        f.close()
print(
    "Files with errors:", 
    len(errors), "of", len(json_file_paths),
    f"({(len(errors)/len(json_file_paths))*100:.2f}%)"
    )

Files with errors: 0 of 213 (0.00%)


In [107]:
assert len(errors) == 0, f"There are still {len(still_incorrect)} files that need correcting."

THEN WHY AM I GETTING ERRORS FROM THE FIRST TRY/EXCEPT LOOP???

Update the report to show all resolved errors.

In [None]:
updated = pd.concat([
    json_report, 
    pd.DataFrame({
        "dimension_counted":"errored_files_after_auto_correction",
        "exception": "NA",
        "count":len(still_incorrect),
        "proportion_of_all_files":(len(still_incorrect)/total_json_files)
    }, index=[json_report.shape[0]])
])
updated

Unnamed: 0,dimension_counted,exception,count,proportion_of_all_files
0,exception_type,<class 'lxml.etree.XMLSyntaxError'>,43,40.19%
1,exception_subtype,Namespace prefix dc on title is not defined,33,30.84%
2,exception_subtype,Namespace prefix rdf for about on Description ...,7,6.54%
3,exception_subtype,xmlns:dc: Empty XML namespace is not allowed,1,0.93%
4,exception_subtype,Namespace prefix rdf on Description is not def...,1,0.93%
5,exception_subtype,xmlParseEntityRef: no name,1,0.93%
6,total_files,,107,100.00%
7,files_with_error,,43,40.19%
8,errored_files_after_auto_correction,,0,0.0


In [None]:
metadata_standard = "cidoc-and-sdo"
data_serialization = "json"
report_type = "syntax_error_stats"
json_report.to_csv(
    report_dir+"{metadata_standard}_{data_serialization}_{report_type}.csv".format(
        metadata_standard=metadata_standard,
        data_serialization=data_serialization,
        report_type=report_type
        ), index=False
    )

Put a copy of all the initially correct files in the same `corrected` directory as the corrected files.

In [None]:
corrected_dir_name = "corrected"
for correct_dc in correct_dc_files:
    with open(correct_dc, "r") as f:
        content = f.read()
        f.close()
    new_path = correct_dc.replace("cleaned", corrected_dir_name)
    with open(new_path, "w") as f:
        f.write(content)
        f.close()
print(f"Copied the rest of the correct files into the {corrected_dir_name} directory!")

Copied the rest of the correct files into the corrected directory!


THEN WHY AM I GETTING ERRORS FROM THE FIRST TRY/EXCEPT LOOP???