# Evaluation: Completeness

Part II of the computational evaluation of AI-generated linked data for [Linking Anthropology's Data and Archives (LADA)](https://ischool.umd.edu/projects/building-a-sustainable-future-for-anthropologys-archives-researching-primary-source-data-lifecycles-infrastructures-and-reuse/), focused on completeness (e.g., metadata fields are not empty or 'unknown').

---

**Table of Contents:**

I. [Data Loading](#data-loading)

II. [Completeness](#completeness)

  * [Content of fields](#content-of-fields)

    * [Dublin Core](#dublin-core)

    * [JSON-LD](#json-ld)

  * [Comparison to transcription???](#comparison-to-transcription)

---

## Data Loading

In [1]:
import utils
import config
import pandas as pd
import numpy as np
import urllib
import urllib.request
from urllib.parse import urlparse   # urlparse(URL_TO_CHECK)
import xml.etree.ElementTree as ET
import json
from lxml import etree
import rdflib
from rdflib.namespace import DC, SDO # Dublin Core, Schema.org
from pathlib import Path
import os
import re

  last_close_tag = re.findall("<\/[a-z]+>$", f_string)
  has_prolog = re.findall('<\?xml version="1.0"[^<]*>', f_string)
  comments = re.findall("\n\s*\/\/\s*\w.+|\n\s*\/\*\s*.+\s*\*\/|\n\s*#\s*.+", f_string)
  double_quotes = re.findall("""[@\w]+"":[\s\[\{]*"".+""", f_string)
  open_brace = re.findall("\{", f_string)
  close_brace = re.findall("\}", f_string)


Create variables to reference existing directories and files.

In [2]:
dublin_path = "cleaned/dublin_core/"  # XML data files
schema_path = "cleaned/schema_org/"   # JSON data files
cidoc_path = "cleaned/cidoc_crm/"     # JSON data files

dublin_t1_dir = config.task1_data+dublin_path
schema_t1_dir = config.task1_data+schema_path
cidoc_t1_dir = config.task1_data+cidoc_path

dublin_p1_dir = config.playgrd1_data+dublin_path
schema_p1_dir = config.playgrd1_data+schema_path
cidoc_p1_dir = config.playgrd1_data+cidoc_path

dublin_p3_dir = config.playgrd3_data+dublin_path
schema_p3_dir = config.playgrd3_data+schema_path
cidoc_p3_dir = config.playgrd3_data+cidoc_path

Create a directory to store the error reports in.

In [3]:
d = "completeness"
report_dir = f"data/error_reports/{d}/"
Path(report_dir).mkdir(parents=True, exist_ok=True)

# report_dir = f"data/error_reports/{d}/after_correction/"
# Path(report_dir).mkdir(parents=True, exist_ok=True)

## Content of Fields

### Dublin Core

In [4]:
# Read the TXT files so all generated metadata can be read, whether or not the XML is well-formed.
extension = ".txt"
dublin_file_paths = []
dublin_files_t1 = [f for f in os.listdir(dublin_t1_dir) if f.endswith(extension)]
dublin_file_paths += [dublin_t1_dir+f for f in dublin_files_t1]
dublin_files_p1 = [f for f in os.listdir(dublin_p1_dir) if f.endswith(extension)]
dublin_file_paths += [dublin_p1_dir+f for f in dublin_files_p1]
dublin_files_p3 = [f for f in os.listdir(dublin_p3_dir) if f.endswith(extension)]
dublin_file_paths += [dublin_p3_dir+f for f in dublin_files_p3]
dublin_file_paths.sort()
total_dc_files = len(dublin_file_paths)
print(f"Total Dublin Core {extension[1:].upper()} files:", total_dc_files)

Total Dublin Core TXT files: 107


Check for empty metadata fields.

In [34]:
empty = re.compile("<[a-z:]+>\s*</[a-z:]+>|<[a-z:]+>(unknown|none|na|\"\"|\?)</[a-z:]+>")

  empty = re.compile("<[a-z:]+>\s*</[a-z:]+>|<[a-z:]+>(unknown|none|na|\"\"|\?)</[a-z:]+>")


In [44]:
files_with_empty, empty_fields_per_file, fields_per_file = [], [], []
for file_path in dublin_file_paths:
    with open(file_path, "r") as f:
        f_string = f.read().lower()
        # Look for empty fields in the file
        is_empty = re.finditer(empty, f_string)
        # Save the empty fields, including the opening and closing tags and any text in between
        empty_fields = [field[0] for field in is_empty]
        fields_per_file += [empty_fields]
        # Save the file path to the XML version of the file
        file_path.replace(".txt", ".xml")
        files_with_empty += [file_path]
        # Save the number of empty fields in the file
        empty_fields_per_file += [len(empty_fields)]
            
print(sum(empty_fields_per_file), "empty field(s) in", len(files_with_empty), "files found.")

118 empty field(s) in 107 files found.


So every file has empty data fields, and some files have multiple empty fields.

In [49]:
df_empty = pd.DataFrame.from_dict({"file_path":files_with_empty, "empty_field_count":empty_fields_per_file, "fields":fields_per_file}).sort_values(by="empty_field_count", ascending=False)
df_empty.head()

Unnamed: 0,file_path,empty_field_count,fields
1,data/data_playground_task1/cleaned/dublin_core...,6,"[<dc:creator>unknown</dc:creator>, <dc:publish..."
78,data/data_playground_task3/cleaned/dublin_core...,5,"[<dc:creator>unknown</dc:creator>, <dc:publish..."
15,data/data_playground_task1/cleaned/dublin_core...,5,"[<dc:creator>unknown</dc:creator>, <dc:publish..."
7,data/data_playground_task1/cleaned/dublin_core...,5,"[<dc:description>\n</dc:description>, <dc:date..."
60,data/data_playground_task1/cleaned/dublin_core...,4,"[<dc:contributor>\n</dc:contributor>, <dc:desc..."


In [50]:
df_empty.tail()

Unnamed: 0,file_path,empty_field_count,fields
69,data/data_playground_task3/cleaned/dublin_core...,0,[]
70,data/data_playground_task3/cleaned/dublin_core...,0,[]
71,data/data_playground_task3/cleaned/dublin_core...,0,[]
72,data/data_playground_task3/cleaned/dublin_core...,0,[]
106,data/data_task1/cleaned/dublin_core/dc_record_...,0,[]


In [55]:
empty_field_count_report = pd.DataFrame(df_empty.empty_field_count.value_counts()).rename(columns={"count":"file_count"})
empty_field_count_report

Unnamed: 0_level_0,file_count
empty_field_count,Unnamed: 1_level_1
0,45
1,36
2,10
3,7
4,5
5,3
6,1


In [59]:
df_empty_exploded = df_empty.explode("fields").drop(columns=["empty_field_count"])
df_empty_exploded.head()

Unnamed: 0,file_path,fields
1,data/data_playground_task1/cleaned/dublin_core...,<dc:creator>unknown</dc:creator>
1,data/data_playground_task1/cleaned/dublin_core...,<dc:publisher>unknown</dc:publisher>
1,data/data_playground_task1/cleaned/dublin_core...,<dc:contributor>unknown</dc:contributor>
1,data/data_playground_task1/cleaned/dublin_core...,<dc:date>unknown</dc:date>
1,data/data_playground_task1/cleaned/dublin_core...,<dc:relation>none</dc:relation>


In [None]:
df_empty_exploded = df_empty_exploded.dropna() # Remove files without any empty fields
df_empty_exploded.tail()

Unnamed: 0,file_path,fields
33,data/data_playground_task1/cleaned/dublin_core...,<dc:description>\n</dc:description>
11,data/data_playground_task1/cleaned/dublin_core...,<dc:creator>unknown</dc:creator>
95,data/data_task1/cleaned/dublin_core/dc_record_...,<dc:description>\n</dc:description>
94,data/data_task1/cleaned/dublin_core/dc_record_...,<dc:description>\n</dc:description>
38,data/data_playground_task1/cleaned/dublin_core...,<dc:description>\n</dc:description>


In [None]:
fields = (list(df_empty_exploded.fields))
tags = [re.search("<([a-z:]+)>", field)[0][1:-1] for field in fields]
df_empty_exploded.insert(len(df_empty_exploded.columns), "tag", tags)
df_empty_exploded.tail()

Unnamed: 0,file_path,fields,tag
33,data/data_playground_task1/cleaned/dublin_core...,<dc:description>\n</dc:description>,dc:description
11,data/data_playground_task1/cleaned/dublin_core...,<dc:creator>unknown</dc:creator>,dc:creator
95,data/data_task1/cleaned/dublin_core/dc_record_...,<dc:description>\n</dc:description>,dc:description
94,data/data_task1/cleaned/dublin_core/dc_record_...,<dc:description>\n</dc:description>,dc:description
38,data/data_playground_task1/cleaned/dublin_core...,<dc:description>\n</dc:description>,dc:description


In [69]:
tag_counts = pd.DataFrame(df_empty_exploded.tag.value_counts()).reset_index()
tag_counts

Unnamed: 0,tag,count
0,dc:description,35
1,dc:creator,17
2,dc:rights,15
3,dc:contributor,11
4,dc:publisher,10
5,dc:date,6
6,dc:relation,5
7,dcterms:creator,4
8,creator,2
9,dcterms:source,2


In [72]:
tag_values = list(tag_counts.tag)
tag_cats = []
for t in tag_values:
    if ":" in t:
        tag_cats += [t.split(":")[-1]]
    else:
        tag_cats += [t]
tag_counts.insert(1, "tag_category", tag_cats)
tag_counts

Unnamed: 0,tag,tag_category,count
0,dc:description,description,35
1,dc:creator,creator,17
2,dc:rights,rights,15
3,dc:contributor,contributor,11
4,dc:publisher,publisher,10
5,dc:date,date,6
6,dc:relation,relation,5
7,dcterms:creator,creator,4
8,creator,creator,2
9,dcterms:source,source,2


In [86]:
df_cats = tag_counts.groupby(["tag_category"]).transform("sum")
df_cats.insert(0, "tag_category", tag_counts.tag_category)
df_cats = df_cats.drop(columns=["tag"]).drop_duplicates()
df_cats

Unnamed: 0,tag_category,count
0,description,37
1,creator,23
2,rights,15
3,contributor,11
4,publisher,12
5,date,6
6,relation,5
9,source,3
11,coverage,2
12,identifier,2


Save the reports as CSV files.

In [51]:
metadata_standard = "dublin_core"
data_serialization = "xml"
report_type = "empty_field_counts"
df_empty.to_csv(
    report_dir+"{metadata_standard}_{data_serialization}_{report_type}.csv".format(
        metadata_standard=metadata_standard,
        data_serialization=data_serialization,
        report_type=report_type
        ), index=False
    )

In [56]:
metadata_standard = "dublin_core"
data_serialization = "xml"
report_type = "files_per_empty_field_count"
empty_field_count_report.to_csv(
    report_dir+"{metadata_standard}_{data_serialization}_{report_type}.csv".format(
        metadata_standard=metadata_standard,
        data_serialization=data_serialization,
        report_type=report_type
        ), index=False
    )

In [None]:
metadata_standard = "dublin_core"
data_serialization = "xml"
report_type = "empty_fields_by_file"
df_empty_exploded.to_csv(
    report_dir+"{metadata_standard}_{data_serialization}_{report_type}.csv".format(
        metadata_standard=metadata_standard,
        data_serialization=data_serialization,
        report_type=report_type
        ), index=False
    )

In [76]:
metadata_standard = "dublin_core"
data_serialization = "xml"
report_type = "empty_field_tag_counts"
tag_counts.to_csv(
    report_dir+"{metadata_standard}_{data_serialization}_{report_type}.csv".format(
        metadata_standard=metadata_standard,
        data_serialization=data_serialization,
        report_type=report_type
        ), index=False
    )

In [87]:
metadata_standard = "dublin_core"
data_serialization = "xml"
report_type = "empty_field_tag_category_counts"
df_cats.to_csv(
    report_dir+"{metadata_standard}_{data_serialization}_{report_type}.csv".format(
        metadata_standard=metadata_standard,
        data_serialization=data_serialization,
        report_type=report_type
        ), index=False
    )

Check that URLs are well-formed, that they exist, and that they connect to a relevant web resource.

### JSON-LD

Check for empty metadata fields.

Check that URLs are well-formed, that they exist, and that they connect to a relevant web resource.