# Evaluation: Completeness

Part II of the computational evaluation of AI-generated linked data for [Linking Anthropology's Data and Archives (LADA)](https://ischool.umd.edu/projects/building-a-sustainable-future-for-anthropologys-archives-researching-primary-source-data-lifecycles-infrastructures-and-reuse/), focused on completeness (e.g., metadata fields are not empty or 'unknown').

---

**Table of Contents:**

I. [Data Loading](#data-loading)

II. [Completeness](#completeness)

  * [Content of Fields](#content-of-fields): check for emptiness and URL validity (and that URL provided is relevant???  Or is that conformance???)

    * [Dublin Core](#dublin-core)

    * [JSON-LD](#json-ld)

  * [Comparison to transcription???](#comparison-to-transcription)

---

## Data Loading

In [1]:
import utils
import config
import pandas as pd
import numpy as np
import urllib
import urllib.request
from urllib.parse import urlparse   # urlparse(URL_TO_CHECK)
import xml.etree.ElementTree as ET
import json
from lxml import etree
import rdflib
from rdflib.namespace import DC, SDO # Dublin Core, Schema.org
from pathlib import Path
import os
import re

Create variables to reference existing directories and files.

In [2]:
dublin_path = "cleaned/dublin_core/"  # XML data files
schema_path = "cleaned/schema_org/"   # JSON data files
cidoc_path = "cleaned/cidoc_crm/"     # JSON data files

dublin_t1_dir = config.task1_data+dublin_path
schema_t1_dir = config.task1_data+schema_path
cidoc_t1_dir = config.task1_data+cidoc_path

dublin_p1_dir = config.playgrd1_data+dublin_path
schema_p1_dir = config.playgrd1_data+schema_path
cidoc_p1_dir = config.playgrd1_data+cidoc_path

dublin_p3_dir = config.playgrd3_data+dublin_path
schema_p3_dir = config.playgrd3_data+schema_path
cidoc_p3_dir = config.playgrd3_data+cidoc_path

Create a directory to store the error reports in.

In [3]:
d = "completeness"
report_dir = f"data/error_reports/{d}/"
Path(report_dir).mkdir(parents=True, exist_ok=True)

# report_dir = f"data/error_reports/{d}/after_correction/"
# Path(report_dir).mkdir(parents=True, exist_ok=True)

## Content of Fields

### Dublin Core

#### Empty Fields

In [4]:
# Read the TXT files so all generated metadata can be read, whether or not the XML is well-formed.
extension = ".txt"
dublin_file_paths = []
dublin_files_t1 = [f for f in os.listdir(dublin_t1_dir) if f.endswith(extension)]
dublin_file_paths += [dublin_t1_dir+f for f in dublin_files_t1]
dublin_files_p1 = [f for f in os.listdir(dublin_p1_dir) if f.endswith(extension)]
dublin_file_paths += [dublin_p1_dir+f for f in dublin_files_p1]
dublin_files_p3 = [f for f in os.listdir(dublin_p3_dir) if f.endswith(extension)]
dublin_file_paths += [dublin_p3_dir+f for f in dublin_files_p3]
dublin_file_paths.sort()
total_dc_files = len(dublin_file_paths)
print(f"Total Dublin Core {extension[1:].upper()} files:", total_dc_files)

Total Dublin Core TXT files: 107


Check for empty metadata fields.

In [5]:
empty = re.compile('<[a-z:]+>\s*</[a-z:]+>|<[a-z:]+>[a-z ]*(unknown|none|na|\"\"|\?|not specified)</[a-z:]+>|<[a-z:]+ [a-z\="]+>[a-z ]*(unknown|none|na|\"\"|\?|\s*|not specified)</[a-z]+>')

  empty = re.compile('<[a-z:]+>\s*</[a-z:]+>|<[a-z:]+>[a-z ]*(unknown|none|na|\"\"|\?|not specified)</[a-z:]+>|<[a-z:]+ [a-z\="]+>[a-z ]*(unknown|none|na|\"\"|\?|\s*|not specified)</[a-z]+>')


In [6]:
files_with_empty, empty_fields_per_file, fields_per_file = [], [], []
for file_path in dublin_file_paths:
    with open(file_path, "r") as f:
        f_string = f.read().lower()
        # Look for empty fields in the file
        is_empty = re.finditer(empty, f_string)
        # Save the empty fields, including the opening and closing tags and any text in between
        empty_fields = [field[0] for field in is_empty]
        fields_per_file += [empty_fields]
        # Save the file path to the XML version of the file
        file_path.replace(".txt", ".xml")
        files_with_empty += [file_path]
        # Save the number of empty fields in the file
        empty_fields_per_file += [len(empty_fields)]
        f.close()
print(sum(empty_fields_per_file), "empty field(s) in", len(files_with_empty), "files found.")

124 empty field(s) in 107 files found.


So every file has empty data fields, and some files have multiple empty fields.

In [7]:
df_empty = pd.DataFrame.from_dict({"file_path":files_with_empty, "empty_field_count":empty_fields_per_file, "fields":fields_per_file}).sort_values(by="empty_field_count", ascending=False)
df_empty.head()

Unnamed: 0,file_path,empty_field_count,fields
1,data/data_playground_task1/cleaned/dublin_core...,6,"[<dc:creator>unknown</dc:creator>, <dc:publish..."
78,data/data_playground_task3/cleaned/dublin_core...,5,"[<dc:creator>unknown</dc:creator>, <dc:publish..."
15,data/data_playground_task1/cleaned/dublin_core...,5,"[<dc:creator>unknown</dc:creator>, <dc:publish..."
7,data/data_playground_task1/cleaned/dublin_core...,5,"[<dc:description>\n</dc:description>, <dc:date..."
60,data/data_playground_task1/cleaned/dublin_core...,4,"[<dc:contributor>\n</dc:contributor>, <dc:desc..."


In [8]:
df_empty.tail()

Unnamed: 0,file_path,empty_field_count,fields
69,data/data_playground_task3/cleaned/dublin_core...,0,[]
70,data/data_playground_task3/cleaned/dublin_core...,0,[]
71,data/data_playground_task3/cleaned/dublin_core...,0,[]
72,data/data_playground_task3/cleaned/dublin_core...,0,[]
106,data/data_task1/cleaned/dublin_core/dc_record_...,0,[]


In [9]:
empty_field_count_report = pd.DataFrame(df_empty.empty_field_count.value_counts()).rename(columns={"count":"file_count"})
empty_field_count_report

Unnamed: 0_level_0,file_count
empty_field_count,Unnamed: 1_level_1
0,44
1,35
2,11
4,7
3,6
5,3
6,1


In [10]:
df_empty_exploded = df_empty.explode("fields").drop(columns=["empty_field_count"])
df_empty_exploded.head()

Unnamed: 0,file_path,fields
1,data/data_playground_task1/cleaned/dublin_core...,<dc:creator>unknown</dc:creator>
1,data/data_playground_task1/cleaned/dublin_core...,<dc:publisher>unknown</dc:publisher>
1,data/data_playground_task1/cleaned/dublin_core...,<dc:contributor>unknown</dc:contributor>
1,data/data_playground_task1/cleaned/dublin_core...,<dc:date>unknown</dc:date>
1,data/data_playground_task1/cleaned/dublin_core...,<dc:relation>none</dc:relation>


In [11]:
df_empty_exploded = df_empty_exploded.dropna() # Remove files without any empty fields
df_empty_exploded.tail()

Unnamed: 0,file_path,fields
95,data/data_task1/cleaned/dublin_core/dc_record_...,<dc:description>\n</dc:description>
94,data/data_task1/cleaned/dublin_core/dc_record_...,<dc:description>\n</dc:description>
38,data/data_playground_task1/cleaned/dublin_core...,<dc:description>\n</dc:description>
33,data/data_playground_task1/cleaned/dublin_core...,<dc:description>\n</dc:description>
92,data/data_task1/cleaned/dublin_core/dc_record_...,<dcterms:creator>unknown</dcterms:creator>


In [12]:
fields = (list(df_empty_exploded.fields))
tags = [re.search('(?<=<)([a-z:]+)(?=>)|(?<=")[a-z]+(?=")', field)[0] for field in fields]
df_empty_exploded.insert(len(df_empty_exploded.columns), "tag", tags)
df_empty_exploded.tail()

Unnamed: 0,file_path,fields,tag
95,data/data_task1/cleaned/dublin_core/dc_record_...,<dc:description>\n</dc:description>,dc:description
94,data/data_task1/cleaned/dublin_core/dc_record_...,<dc:description>\n</dc:description>,dc:description
38,data/data_playground_task1/cleaned/dublin_core...,<dc:description>\n</dc:description>,dc:description
33,data/data_playground_task1/cleaned/dublin_core...,<dc:description>\n</dc:description>,dc:description
92,data/data_task1/cleaned/dublin_core/dc_record_...,<dcterms:creator>unknown</dcterms:creator>,dcterms:creator


In [13]:
tag_counts = pd.DataFrame(df_empty_exploded.tag.value_counts()).reset_index()
tag_counts

Unnamed: 0,tag,count
0,dc:description,35
1,dc:rights,18
2,dc:creator,17
3,dc:contributor,11
4,dc:publisher,10
5,dc:date,6
6,dc:relation,5
7,dcterms:creator,4
8,dc:identifier,2
9,dc:coverage,2


In [14]:
tag_values = list(tag_counts.tag)
tag_cats = []
for t in tag_values:
    if ":" in t:
        tag_cats += [t.split(":")[-1]]
    else:
        tag_cats += [t]
tag_counts.insert(1, "tag_category", tag_cats)
tag_counts

Unnamed: 0,tag,tag_category,count
0,dc:description,description,35
1,dc:rights,rights,18
2,dc:creator,creator,17
3,dc:contributor,contributor,11
4,dc:publisher,publisher,10
5,dc:date,date,6
6,dc:relation,relation,5
7,dcterms:creator,creator,4
8,dc:identifier,identifier,2
9,dc:coverage,coverage,2


In [15]:
df_cats = tag_counts.groupby(["tag_category"]).transform("sum")
df_cats.insert(0, "tag_category", tag_counts.tag_category)
df_cats = df_cats.drop(columns=["tag"]).drop_duplicates()
df_cats

Unnamed: 0,tag_category,count
0,description,37
1,rights,18
2,creator,23
3,contributor,11
4,publisher,13
5,date,6
6,relation,5
8,identifier,2
9,coverage,2
11,source,3


Save the reports as CSV files.

In [16]:
metadata_standard = "dublin_core"
data_serialization = "xml"
report_type = "empty_field_counts"
df_empty.to_csv(
    report_dir+"{metadata_standard}_{data_serialization}_{report_type}.csv".format(
        metadata_standard=metadata_standard,
        data_serialization=data_serialization,
        report_type=report_type
        ), index=False
    )

In [17]:
metadata_standard = "dublin_core"
data_serialization = "xml"
report_type = "files_per_empty_field_count"
empty_field_count_report.to_csv(
    report_dir+"{metadata_standard}_{data_serialization}_{report_type}.csv".format(
        metadata_standard=metadata_standard,
        data_serialization=data_serialization,
        report_type=report_type
        ), index=False
    )

In [18]:
metadata_standard = "dublin_core"
data_serialization = "xml"
report_type = "empty_fields_by_file"
df_empty_exploded.to_csv(
    report_dir+"{metadata_standard}_{data_serialization}_{report_type}.csv".format(
        metadata_standard=metadata_standard,
        data_serialization=data_serialization,
        report_type=report_type
        ), index=False
    )

In [19]:
metadata_standard = "dublin_core"
data_serialization = "xml"
report_type = "empty_field_tag_counts"
tag_counts.to_csv(
    report_dir+"{metadata_standard}_{data_serialization}_{report_type}.csv".format(
        metadata_standard=metadata_standard,
        data_serialization=data_serialization,
        report_type=report_type
        ), index=False
    )

In [20]:
metadata_standard = "dublin_core"
data_serialization = "xml"
report_type = "empty_field_tag_category_counts"
df_cats.to_csv(
    report_dir+"{metadata_standard}_{data_serialization}_{report_type}.csv".format(
        metadata_standard=metadata_standard,
        data_serialization=data_serialization,
        report_type=report_type
        ), index=False
    )

#### URLs

Check that URLs are well-formed and that they exist.

In [7]:
url_pattern = re.compile('[a-z]+:[a-z]+=[^>]+( [^>])*(?=>)')

In [22]:
# Find all the URLs
files_with_urls, url_count_per_file, urls_per_file = [], [], []
for file_path in dublin_file_paths:
    with open(file_path, "r") as f:
        f_string = f.read().lower()
        
        # Look for URLs in the file
        has_urls = re.finditer(url_pattern, f_string)
        # Save the URLs in a list per file
        file_urls = []
        for match in has_urls:
            url = match[0]
            if " " in url:
                multiple = url.split(" ")
                file_urls = file_urls + multiple
                # print(file_urls)
            else:
                file_urls += [url]
        urls_per_file += [file_urls]
        url_count_per_file += [len(file_urls)]
        
        if len(file_urls) > 0:
            # Save the file path to the XML version of the file
            file_path.replace(".txt", ".xml")
            files_with_urls += [file_path]
        
        # Save the number of empty fields in the file
        empty_fields_per_file += [len(empty_fields)]

        f.close()

print(sum(url_count_per_file), "URLs found in", len(files_with_urls), "files.")

95 URLs found in 67 files.


In [46]:
url_df = pd.DataFrame.from_dict({"file_path":dublin_file_paths, "url_count":url_count_per_file, "urls":urls_per_file}).sort_values(by="url_count", ascending=False)
url_df.head()

Unnamed: 0,file_path,url_count,urls
27,data/data_playground_task1/cleaned/dublin_core...,4,"[xmlns:dc=""http://purl.org/dc/elements/1.1/"", ..."
48,data/data_playground_task1/cleaned/dublin_core...,4,"[xmlns:dc=""http://purl.org/dc/elements/1.1/"", ..."
57,data/data_playground_task1/cleaned/dublin_core...,4,"[xmlns:dc=""http://purl.org/dc/elements/1.1/"", ..."
58,data/data_playground_task1/cleaned/dublin_core...,4,"[xmlns:dc=""http://purl.org/dc/elements/1.1/"", ..."
13,data/data_playground_task1/cleaned/dublin_core...,4,"[xmlns:dc=""http://purl.org/dc/elements/1.1/"", ..."


In [47]:
url_df = url_df.loc[url_df["url_count"] > 0]  # Keep only files with URLs
url_df_exploded = url_df.explode("urls").drop(columns=["url_count"])
url_df_exploded.head()

Unnamed: 0,file_path,urls
27,data/data_playground_task1/cleaned/dublin_core...,"xmlns:dc=""http://purl.org/dc/elements/1.1/"""
27,data/data_playground_task1/cleaned/dublin_core...,"xmlns:xsi=""http://www.w3.org/2001/xmlschema-in..."
27,data/data_playground_task1/cleaned/dublin_core...,"xsi:schemalocation=""http://purl.org/dc/element..."
27,data/data_playground_task1/cleaned/dublin_core...,http://dublincore.org/schemas/xmls/qdc/2008/02...
48,data/data_playground_task1/cleaned/dublin_core...,"xmlns:dc=""http://purl.org/dc/elements/1.1/"""


In [48]:
urls = list(url_df_exploded.urls)
print(urls[:3])

['xmlns:dc="http://purl.org/dc/elements/1.1/"', 'xmlns:xsi="http://www.w3.org/2001/xmlschema-instance"', 'xsi:schemalocation="http://purl.org/dc/elements/1.1/']


Check that each URL is preceded by a namespace and surrounded in quotes (i.e., `xmlns:dc="[URL_GOES_HERE]"`), otherwise the URL was incorrectly included in the metadata record.

In [49]:
correct_pattern = '[a-z]+:[a-z]+="https?://[a-z0-9\-._~:/?#@!$&\'()*+,;=%]+"'

  correct_pattern = '[a-z]+:[a-z]+="https?://[a-z0-9\-._~:/?#@!$&\'()*+,;=%]+"'


In [50]:
is_valid = []
for url in urls:
    if re.match(correct_pattern, url):
        is_valid += [True]
    else:
        is_valid += [False]
url_df_exploded.insert(len(url_df_exploded.columns), "is_valid", is_valid)
url_df_exploded.tail()

Unnamed: 0,file_path,urls,is_valid
92,data/data_task1/cleaned/dublin_core/dc_record_...,"xmlns:dcterms=""http://purl.org/dc/terms/""",True
94,data/data_task1/cleaned/dublin_core/dc_record_...,"xmlns:dc=""http://purl.org/dc/elements/1.1/""",True
95,data/data_task1/cleaned/dublin_core/dc_record_...,"xmlns:dc=""http://purl.org/dc/elements/1.1/""",True
23,data/data_playground_task1/cleaned/dublin_core...,"xmlns:dc=""http://purl.org/dc/elements/1.1/""",True
22,data/data_playground_task1/cleaned/dublin_core...,"xmlns:dc=""http://purl.org/dc/elements/1.1/""",True


In [109]:
total_urls = url_df_exploded.shape[0]
print("Total URLs:", total_urls)

Total URLs: 95


In [115]:
url_status = pd.DataFrame(url_df_exploded.is_valid.value_counts()).rename(columns={"count":"total_urls"})
proportions = (url_status[["total_urls"]]/total_urls).values
percentages = [f"{proportion[0]*100:.2f}%" for proportion in proportions]
url_status.insert(len(url_status.columns), "proportion_of_urls", percentages)
url_status

Unnamed: 0_level_0,total_urls,proportion_of_urls
is_valid,Unnamed: 1_level_1,Unnamed: 2_level_1
True,75,78.95%
False,20,21.05%


In [116]:
file_url_status = url_df_exploded.drop(columns=["urls"]).drop_duplicates()
file_url_status = pd.DataFrame(file_url_status.is_valid.value_counts()).rename(columns={"count":"file_count"})
df_url_status = url_status.join(file_url_status)
proportions = (df_url_status[["file_count"]]/total_dc_files).values
percentages = [f"{proportion[0]*100:.2f}%" for proportion in proportions]
df_url_status.insert(len(df_url_status.columns), "proportion_of_files", percentages)
df_url_status

Unnamed: 0_level_0,total_urls,proportion_of_urls,file_count,proportion_of_files
is_valid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
True,75,78.95%,71,66.36%
False,20,21.05%,13,12.15%


Extract the URLs provided, even if not in a valid format within a metadata record, and then check whether the URL exists.

In [101]:
# Check that each URL is a valid URL
os.environ["no_proxy"] = "*"  #https://docs.python.org/3/library/urllib.request.html 
request_errors = []
for url in urls:
    clean = re.findall('https?:\/\/[^>"]+', url)
    if len(clean) > 0:
        clean_url = clean[0]
        clean_url = clean_url.strip('"')
        clean_url = clean_url.strip(' ')
        try:
            url_request = urllib.request.Request(clean_url, headers={'User-Agent': 'Mozilla/8.0'})
            html = urllib.request.urlopen(url_request, timeout=5).read()
            request_errors += [None]  # No error
        except Exception as e:
            request_errors += [str(e)]
    else:
        request_errors += ["Invalid format (no request made)"]
print("Finished requests!")

  clean = re.findall('https?:\/\/[^>"]+', url)


Finished requests!


In [102]:
print(request_errors[:5])

[None, 'HTTP Error 300: Multiple Choices', None, None, None]


In [103]:
url_df_exploded = url_df_exploded.drop(columns=["request_error"])
url_df_exploded.insert(len(url_df_exploded.columns), "request_error", request_errors)
url_df_exploded.tail()

Unnamed: 0,file_path,urls,is_valid,request_error
92,data/data_task1/cleaned/dublin_core/dc_record_...,"xmlns:dcterms=""http://purl.org/dc/terms/""",True,
94,data/data_task1/cleaned/dublin_core/dc_record_...,"xmlns:dc=""http://purl.org/dc/elements/1.1/""",True,
95,data/data_task1/cleaned/dublin_core/dc_record_...,"xmlns:dc=""http://purl.org/dc/elements/1.1/""",True,
23,data/data_playground_task1/cleaned/dublin_core...,"xmlns:dc=""http://purl.org/dc/elements/1.1/""",True,
22,data/data_playground_task1/cleaned/dublin_core...,"xmlns:dc=""http://purl.org/dc/elements/1.1/""",True,


In [104]:
url_errors_df = url_df_exploded.loc[url_df_exploded.is_valid == False]
url_errors_df = url_errors_df.loc[url_errors_df.request_error != None]
url_errors_df.shape

(20, 4)

In [122]:
url_errors_df

Unnamed: 0,file_path,urls,is_valid,request_error
27,data/data_playground_task1/cleaned/dublin_core...,"xsi:schemalocation=""http://purl.org/dc/element...",False,
27,data/data_playground_task1/cleaned/dublin_core...,http://dublincore.org/schemas/xmls/qdc/2008/02...,False,
48,data/data_playground_task1/cleaned/dublin_core...,"xsi:schemalocation=""http://purl.org/dc/element...",False,
48,data/data_playground_task1/cleaned/dublin_core...,http://dublincore.org/schemas/xmls/simpledc200...,False,
57,data/data_playground_task1/cleaned/dublin_core...,"xsi:schemalocation=""http://purl.org/dc/element...",False,
57,data/data_playground_task1/cleaned/dublin_core...,http://dublincore.org/schemas/xmls/simpledc200...,False,
58,data/data_playground_task1/cleaned/dublin_core...,"xsi:schemalocation=""http://purl.org/dc/element...",False,
58,data/data_playground_task1/cleaned/dublin_core...,http://dublincore.org/schemas/xmls/simpledc200...,False,
13,data/data_playground_task1/cleaned/dublin_core...,"xsi:schemalocation=""http://purl.org/dc/element...",False,
13,data/data_playground_task1/cleaned/dublin_core...,http://dublincore.org/schemas/xmls/simpledc200...,False,


In [128]:
validity_by_url = pd.DataFrame(url_df_exploded.request_error.value_counts()).rename(columns={"count":"url_count"})
validity_by_file = pd.DataFrame(url_df_exploded.drop(columns=["urls", "is_valid"]).drop_duplicates().request_error.value_counts()).rename(columns={"count":"file_count"})
validity_stats = validity_by_url.join(validity_by_file, how="outer").reset_index()
validity_stats = validity_stats.rename(columns={"request_error":"url_error_type"})
validity_stats

Unnamed: 0,url_error_type,url_count,file_count
0,HTTP Error 300: Multiple Choices,9,9
1,HTTP Error 404: Not Found,3,3
2,Invalid format (no request made),4,4


In [129]:
invalid_ref_urls = url_errors_df.loc[url_errors_df.is_valid == False].shape[0]
invalid_ref_files = url_errors_df.drop(columns=["urls"]).drop_duplicates()
invalid_ref_files = invalid_ref_files.loc[invalid_ref_files.is_valid == False].shape[0]
invalid_ref_df = pd.DataFrame({"url_error_type":["Invalid reference to URL"], "url_count":[invalid_ref_urls], "file_count":[invalid_ref_files]})
validity_stats = pd.concat([validity_stats, invalid_ref_df], ignore_index=True)
validity_stats

Unnamed: 0,url_error_type,url_count,file_count
0,HTTP Error 300: Multiple Choices,9,9
1,HTTP Error 404: Not Found,3,3
2,Invalid format (no request made),4,4
3,Invalid reference to URL,20,13


Save the reports as CSV files.

In [25]:
metadata_standard = "dublin_core"
data_serialization = "xml"
report_type = "url_counts"
url_df.to_csv(
    report_dir+"{metadata_standard}_{data_serialization}_{report_type}.csv".format(
        metadata_standard=metadata_standard,
        data_serialization=data_serialization,
        report_type=report_type
        ), index=False
    )

In [118]:
metadata_standard = "dublin_core"
data_serialization = "xml"
report_type = "url_validity_counts"
df_url_status.to_csv(
    report_dir+"{metadata_standard}_{data_serialization}_{report_type}.csv".format(
        metadata_standard=metadata_standard,
        data_serialization=data_serialization,
        report_type=report_type
        ), index=False
    )

In [119]:
metadata_standard = "dublin_core"
data_serialization = "xml"
report_type = "url_errors"
url_df_exploded.to_csv(
    report_dir+"{metadata_standard}_{data_serialization}_{report_type}.csv".format(
        metadata_standard=metadata_standard,
        data_serialization=data_serialization,
        report_type=report_type
        ), index=False
    )

In [130]:
metadata_standard = "dublin_core"
data_serialization = "xml"
report_type = "url_errors_stats"
validity_stats.to_csv(
    report_dir+"{metadata_standard}_{data_serialization}_{report_type}.csv".format(
        metadata_standard=metadata_standard,
        data_serialization=data_serialization,
        report_type=report_type
        ), index=False
    )

### JSON-LD

Check for empty metadata fields.

Check that URLs are well-formed, that they exist, and that they connect to a relevant web resource.