# Data Preparation 

### Linking Anthropology's Data and Archives (LADA)

### AI-Generated Linked Data Evaluation (part I)

In [24]:
import config
import utils
import pandas as pd
import numpy as np
from pathlib import Path
import os
import re

Read in CSV data as a pandas DataFrame, where `f` is the name of the data file:

In [25]:
# Input location: choose the folder and the CSV file this notebook will clean.
#
# Uncomment one of the following code lines or replace with your own
# path to the folder (directory) where the data file f is located.
# Keep the trailing path separator shown in the example path below: cells in
# this notebook build file paths by string concatenation with data_dir.
# NOTE(review): the config.* attributes are presumably path strings defined in
# the project's config module -- confirm there.
# ------------------------------------
# data_dir = config.task1_data
data_dir = config.playgrd1_data
# data_dir = config.playgrd3_data
# data_dir = "path/to/your/data/"

# The name of the data file (a CSV file located inside data_dir)
# ------------------------------------
# f = "4-HDataExperimentAssignmentsAndOutcomes_Outcomes_Task1.csv"
f = "4-HDataExperimentAssignmentsAndOutcomes_Playground_Task1.csv"
# f = "4-HDataExperimentAssignmentsAndOutcomes_Playground_Task3.csv"
# f = "your_data_file_name.csv"

In [26]:
# Load the raw CSV into a DataFrame.
# The path is built with os.path.join so the read works whether or not
# data_dir ends with a path separator (plain `data_dir + f` silently yields a
# wrong path such as "path/to/datafile.csv" when the separator is missing).
df = pd.read_csv(os.path.join(data_dir, f))
# Preview the first three rows to confirm the file loaded as expected
df.head(3)

Unnamed: 0,ID,Filename,"Transkribus Transcription (or link to separate doc, if too long)","GPT Transcription (or link to separate doc, if too long)",GPT Prompt,most accurate transcript?,Metadata record,Schema.org Record,CIDOC-CRM Record,prompts to GPT,...,Unnamed: 18,Unnamed: 19,Unnamed: 20,Unnamed: 21,Unnamed: 22,Unnamed: 23,Unnamed: 24,Unnamed: 25,Unnamed: 26,Unnamed: 27
0,,"Ann Aldrich, Mushmilling Hundred, Kent County,...",part County Deliway State\nIn quesereson taken...,Kent County Delivery State\nInquisition taken ...,"""transcribe this""",ChatGPT,<dublincore>\n <dc:title>Inquisition Report...,"{\n ""@context"": ""https://schema.org"",\n ""@ty...","{\n ""@context"": {\n ""crm"": ""http://www.cid...",“create a Dublin Core metadata record”\n“creat...,...,,,,,,,,,,
1,,"Edward Allen, No Hundred Listed, Kent County, ...",Personly Append before Jame Richarson\nCa...,Kent County\n\nPersonally appeared before Jame...,"""transcribe this""",ChatGPT,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<metad...","{\n ""@context"": ""https://schema.org"",\n ""@ty...","{\n ""@context"": {\n ""crm"": ""http://www.cid...",“create a Dublin Core metadata record in xml”\...,...,,,,,,,,,,
2,,"Mirandy Messick, Nanticoke Hundred, Sussex Cou...",r Mutlean Indented & taken in Nantcke\nty\nSup...,Inquest convened\nState of Alabama\n\nA pre-mi...,"""transcribe this""",ChatGPT,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<metad...","{\n ""@context"": ""https://schema.org"",\n ""@ty...","{\n ""@context"": ""https://www.cidoc-crm.org/ci...",“create a Dublin Core metadata record in XML b...,...,,,,,,,,,,


Remove empty columns from the DataFrame:

In [27]:
# Report the (rows, columns) shape before and after discarding every column
# that contains no values at all.
shape_with_empty_cols = df.shape
df = df.dropna(axis="columns", how="all")
print(shape_with_empty_cols)
print(df.shape)
df.head(3)

(999, 28)
(999, 9)


Unnamed: 0,Filename,"Transkribus Transcription (or link to separate doc, if too long)","GPT Transcription (or link to separate doc, if too long)",GPT Prompt,most accurate transcript?,Metadata record,Schema.org Record,CIDOC-CRM Record,prompts to GPT
0,"Ann Aldrich, Mushmilling Hundred, Kent County,...",part County Deliway State\nIn quesereson taken...,Kent County Delivery State\nInquisition taken ...,"""transcribe this""",ChatGPT,<dublincore>\n <dc:title>Inquisition Report...,"{\n ""@context"": ""https://schema.org"",\n ""@ty...","{\n ""@context"": {\n ""crm"": ""http://www.cid...",“create a Dublin Core metadata record”\n“creat...
1,"Edward Allen, No Hundred Listed, Kent County, ...",Personly Append before Jame Richarson\nCa...,Kent County\n\nPersonally appeared before Jame...,"""transcribe this""",ChatGPT,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<metad...","{\n ""@context"": ""https://schema.org"",\n ""@ty...","{\n ""@context"": {\n ""crm"": ""http://www.cid...",“create a Dublin Core metadata record in xml”\...
2,"Mirandy Messick, Nanticoke Hundred, Sussex Cou...",r Mutlean Indented & taken in Nantcke\nty\nSup...,Inquest convened\nState of Alabama\n\nA pre-mi...,"""transcribe this""",ChatGPT,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<metad...","{\n ""@context"": ""https://schema.org"",\n ""@ty...","{\n ""@context"": ""https://www.cidoc-crm.org/ci...",“create a Dublin Core metadata record in XML b...


Remove empty rows from the DataFrame:

In [28]:
# Discard rows in which every cell is empty, then show the new shape and the
# last three remaining rows.
df = df.dropna(axis="index", how="all")
print(df.shape)
df.tail(3)

(30, 9)


Unnamed: 0,Filename,"Transkribus Transcription (or link to separate doc, if too long)","GPT Transcription (or link to separate doc, if too long)",GPT Prompt,most accurate transcript?,Metadata record,Schema.org Record,CIDOC-CRM Record,prompts to GPT
27,Rumsey 8,,,,,,,,
28,Rumsey 9,,,,,,,,
29,Rumsey 10,,,,,,,,


If there's no column with unique identifiers per row, make one:

In [29]:
# Make sure the DataFrame has a column holding one unique identifier per row.
#
# A column counts as an identifier column when its lower-cased name is "id" or
# "identifier", or contains "_id".
#
# Fix: the previous check joined the three name tests with `or`
# (col != "id" or col != "identifier" or "_id" not in col), which is true for
# every possible column name. As a result an "id" column was always inserted
# on the first loop iteration, the uniqueness check was never reached, and a
# DataFrame that already had an "id" column made df.insert raise ValueError.
col_list = list(df.columns)
col_list_lower = [str(col).lower() for col in col_list]
# print(col_list_lower)
id_cols = [
    col
    for col, col_lower in zip(col_list, col_list_lower)
    if col_lower in ("id", "identifier") or "_id" in col_lower
]

# One sequential integer per row (0 .. number of rows - 1)
new_ids = list(range(df.shape[0]))

if not id_cols:
    # No identifier column exists: create a unique identifier per row
    df.insert(0, "id", new_ids)
else:
    # Later cells read df["id"]; if the existing identifier column has another
    # name (e.g. "ID"), map it to "id" in the column-renaming cell below.
    for col in id_cols:
        # Replace the column's values with a unique value so there's one
        # identifier per row whenever the existing values are duplicated
        # or missing.
        if (not df[col].is_unique) or df[col].isna().any():
            df[col] = new_ids

In [30]:
# Report the DataFrame's dimensions after cleaning
row_count, column_count = df.shape
print("Rows:", row_count)
print("Columns:", column_count)

Rows: 30
Columns: 10


In [31]:
# Preview the first five rows, now including the identifier column
df.head(n=5)

Unnamed: 0,id,Filename,"Transkribus Transcription (or link to separate doc, if too long)","GPT Transcription (or link to separate doc, if too long)",GPT Prompt,most accurate transcript?,Metadata record,Schema.org Record,CIDOC-CRM Record,prompts to GPT
0,0,"Ann Aldrich, Mushmilling Hundred, Kent County,...",part County Deliway State\nIn quesereson taken...,Kent County Delivery State\nInquisition taken ...,"""transcribe this""",ChatGPT,<dublincore>\n <dc:title>Inquisition Report...,"{\n ""@context"": ""https://schema.org"",\n ""@ty...","{\n ""@context"": {\n ""crm"": ""http://www.cid...",“create a Dublin Core metadata record”\n“creat...
1,1,"Edward Allen, No Hundred Listed, Kent County, ...",Personly Append before Jame Richarson\nCa...,Kent County\n\nPersonally appeared before Jame...,"""transcribe this""",ChatGPT,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<metad...","{\n ""@context"": ""https://schema.org"",\n ""@ty...","{\n ""@context"": {\n ""crm"": ""http://www.cid...",“create a Dublin Core metadata record in xml”\...
2,2,"Mirandy Messick, Nanticoke Hundred, Sussex Cou...",r Mutlean Indented & taken in Nantcke\nty\nSup...,Inquest convened\nState of Alabama\n\nA pre-mi...,"""transcribe this""",ChatGPT,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<metad...","{\n ""@context"": ""https://schema.org"",\n ""@ty...","{\n ""@context"": ""https://www.cidoc-crm.org/ci...",“create a Dublin Core metadata record in XML b...
3,3,"James Abbott, Mispillion Hundred, Kent County,...",Renl County by Anedmstion indented and take\nS...,"\nKent County, State of Delaware\n\nAn inquisi...","""transcribe this""",ChatGPT,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<rdf:R...","{\n ""@context"": ""http://schema.org"",\n ""@typ...","{\n ""@context"": {\n ""@vocab"": ""http://www....",“create a Dublin core record in xml based on b...
4,4,"Merritt Allee, Little Creek Hundred, Kent Coun...",Jient Crupty S.\nAn Inquiſte Aaten Le Ealeenth...,Front County S.S.\nInquisition taken at Greenw...,"""transcribe this""",ChatGPT,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<metad...","{\n ""@context"": ""https://schema.org"",\n ""@ty...","{\n ""@context"": ""https://www.cidoc-crm.org/ci...",“create a Dublin Core record in XML”\n“create ...


Replace the column names for clarity, consistency, and conciseness:

In [None]:
# Mapping from the column names found in the data file (keys) to the column
# names that the rest of this notebook expects (values).
# If your data header is different, edit the keys to match your data
# (i.e., "ID" might become "identifier"); leave the values unchanged.
column_renames = {
    # "ID": "id",                             # Unique identifier for each row
    "Filename": "filename",
    "Metadata record": "dc_record",           # Dublin Core metadata record
    # "Transcription or caption (or link to separate doc, if too long)": "transcription_or_caption",
    "Schema.org Record": "sdo_record",        # Schema.org metadata record
    "CIDOC-CRM Record": "cidoccrm_record",    # CIDOC-CRM metadata record
}
df = df.rename(columns=column_renames)
df.head()

Unnamed: 0,id,filename,"Transkribus Transcription (or link to separate doc, if too long)","GPT Transcription (or link to separate doc, if too long)",GPT Prompt,most accurate transcript?,dc_record,sdo_record,cidoccrm_record,prompts to GPT
0,0,"Ann Aldrich, Mushmilling Hundred, Kent County,...",part County Deliway State\nIn quesereson taken...,Kent County Delivery State\nInquisition taken ...,"""transcribe this""",ChatGPT,<dublincore>\n <dc:title>Inquisition Report...,"{\n ""@context"": ""https://schema.org"",\n ""@ty...","{\n ""@context"": {\n ""crm"": ""http://www.cid...",“create a Dublin Core metadata record”\n“creat...
1,1,"Edward Allen, No Hundred Listed, Kent County, ...",Personly Append before Jame Richarson\nCa...,Kent County\n\nPersonally appeared before Jame...,"""transcribe this""",ChatGPT,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<metad...","{\n ""@context"": ""https://schema.org"",\n ""@ty...","{\n ""@context"": {\n ""crm"": ""http://www.cid...",“create a Dublin Core metadata record in xml”\...
2,2,"Mirandy Messick, Nanticoke Hundred, Sussex Cou...",r Mutlean Indented & taken in Nantcke\nty\nSup...,Inquest convened\nState of Alabama\n\nA pre-mi...,"""transcribe this""",ChatGPT,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<metad...","{\n ""@context"": ""https://schema.org"",\n ""@ty...","{\n ""@context"": ""https://www.cidoc-crm.org/ci...",“create a Dublin Core metadata record in XML b...
3,3,"James Abbott, Mispillion Hundred, Kent County,...",Renl County by Anedmstion indented and take\nS...,"\nKent County, State of Delaware\n\nAn inquisi...","""transcribe this""",ChatGPT,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<rdf:R...","{\n ""@context"": ""http://schema.org"",\n ""@typ...","{\n ""@context"": {\n ""@vocab"": ""http://www....",“create a Dublin core record in xml based on b...
4,4,"Merritt Allee, Little Creek Hundred, Kent Coun...",Jient Crupty S.\nAn Inquiſte Aaten Le Ealeenth...,Front County S.S.\nInquisition taken at Greenw...,"""transcribe this""",ChatGPT,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<metad...","{\n ""@context"": ""https://schema.org"",\n ""@ty...","{\n ""@context"": ""https://www.cidoc-crm.org/ci...",“create a Dublin Core record in XML”\n“create ...


Create a directory to store the cleaner version of the data:

In [33]:
# Store the cleaned data in a "cleaned/" subfolder of the input folder.
# From here on data_dir points at that subfolder; later cells create their
# output folders inside it.
#
# Fix: data_dir used to be extended unconditionally, so re-running this cell
# nested the folder ("cleaned/cleaned/..."). The guard makes the cell safe to
# re-run, and os.path.join inserts a separator if data_dir lacks a trailing one.
cleaned_subdir = "cleaned/"
if not data_dir.endswith(cleaned_subdir):
    data_dir = os.path.join(data_dir, cleaned_subdir)
Path(data_dir).mkdir(parents=True, exist_ok=True)
# Save under the same file name as the input (the row index is written too,
# since to_csv is called with its default settings)
df.to_csv(data_dir + f)

In [34]:
# Collect the row identifiers and show how many there are, plus one example
record_ids = df["id"].tolist()
id_count = len(record_ids)
print("Total IDs (rows):", id_count)
print("Sample ID:", record_ids[2])

Total IDs (rows): 30
Sample ID: 2


### Dublin Core
Write the [Dublin Core](https://www.dublincore.org) (DC) records as XML files.

In [35]:
# Remove rows with an empty value in the dc_record column
df_dc = df[df["dc_record"].notna()]
dc_records_ids = list(df_dc["id"])

In [36]:
# Pull the Dublin Core records out as a plain list and print the third one
# as a sample
dc_records = df_dc["dc_record"].tolist()
sample_dc_record = dc_records[2]
print(sample_dc_record)

<?xml version="1.0" encoding="UTF-8"?>
<metadata xmlns:dc="http://purl.org/dc/elements/1.1/">
    <dc:title>Inquest Records from Alabama, 1822</dc:title>
    <dc:creator>Jonathan Myrick (Coroner)</dc:creator>
    <dc:contributor>John Spivey, Josiah C. Nail, Thomas Howard, Harmon Howard, Mac Maxwell, William Cole, Josiah Elliott, Elias Taylor, Warner Spivey, James Roberts, Isaac Dolby, Upham Wootten, Anthony Egan, Delilah Phillips, Joseph Elliott, James Roberts</dc:contributor>
    <dc:subject>Coroner Inquests, Accidental Deaths, Historical Legal Records</dc:subject>
    <dc:description>These documents record the inquests into the deaths of Thomas Maxwell and Mirandy Myrick in Alabama in 1822. Thomas Maxwell died from injuries sustained when a sapling fell on him, while Mirandy Myrick was killed when a pine sapling, cut by her brothers, fell on her head.</dc:description>
    <dc:publisher>State of Alabama</dc:publisher>
    <dc:date>1822-04-01</dc:date>
    <dc:type>Text</dc:type>
    <

In [37]:
# Folder for the Dublin Core XML files (created if it does not exist yet)
dc_path = f"{data_dir}dublin_core/"
dc_dir = Path(dc_path)
dc_dir.mkdir(exist_ok=True, parents=True)

In [38]:
# Write the Dublin Core records to dc_path using the project's utils helper,
# passing the row ids, the record strings, the output folder, and a file-name
# prefix. The recorded output shows one file per record, named like
# "dc_record_000.xml".
# NOTE(review): naming and any XML validation happen inside utils.write_xml,
# which is not visible in this notebook -- confirm there.
utils.write_xml(dc_records_ids, dc_records, dc_path, file_prefix="dc_record_")

Wrote dc_record_000.xml!
Wrote dc_record_001.xml!
Wrote dc_record_002.xml!
Wrote dc_record_003.xml!
Wrote dc_record_004.xml!
Wrote dc_record_005.xml!
Wrote dc_record_006.xml!
Wrote dc_record_007.xml!
Wrote dc_record_008.xml!
Wrote dc_record_009.xml!
Wrote dc_record_015.xml!
Wrote dc_record_016.xml!
Wrote dc_record_017.xml!
Wrote dc_record_018.xml!
Wrote dc_record_019.xml!
Wrote dc_record_020.xml!
Wrote dc_record_021.xml!
Wrote dc_record_022.xml!
Wrote dc_record_023.xml!
Wrote dc_record_024.xml!
Wrote dc_record_025.xml!
Wrote dc_record_026.xml!


**Note:** The formatting of the DC records is inconsistent! For example:

```
<?xml version='1.0' encoding='utf-8'?>
<dublin_core><dc element="title">Turnin' Timez: Original Student Poems</dc>
    ...
</dublin_core>
```
---
```
<?xml version="1.0"?>
<metadata xmlns:dc="http://purl.org/dc/elements/1.1/">
<dc:title>National 4-H Center Major Pledges, Contributions, and Grants</dc:title>
    ...
</metadata>
```
---
```
<?xml version="1.0" encoding="UTF-8"?>
<metadata xmlns:dcterms="http://purl.org/dc/terms/" xmlns:dc="http://purl.org/dc/elements/1.1/">
<dc:title>4-H National Youth Science Day</dc:title>
    ...
</metadata>
```
---
```
<dc:title>Climbing Up: Fun Activities for You and Your Cat</dc:title>
   ...
<dc:rights>Unknown</dc:rights>
```

Also note that [DCMI documentation](https://www.dublincore.org/specifications/dublin-core/dcmi-terms/) encourages the use of the `http://purl.org/dc/terms/` namespace over the `http://purl.org/dc/elements/1.1/` namespace.

### Schema.org
Write the [Schema.org](https://schema.org) records as JSON-LD files.

In [39]:
# Remove rows with an empty value in the sdo_record column
df_sdo = df[df["sdo_record"].notna()]
sdo_records_ids = list(df_sdo["id"])

In [40]:
# Folder for the Schema.org JSON-LD files (created if it does not exist yet)
sdo_path = f"{data_dir}schema_org/"
sdo_dir = Path(sdo_path)
sdo_dir.mkdir(exist_ok=True, parents=True)

In [41]:
# Pull the Schema.org records out as a plain list and print the third one
# as a sample
sdo_records = df_sdo["sdo_record"].tolist()
sample_sdo_record = sdo_records[2]
print(sample_sdo_record)

{
  "@context": "https://schema.org",
  "@type": "CreativeWork",
  "name": "Inquest Records from Alabama, 1822",
  "author": {
    "@type": "Person",
    "name": "Jonathan Myrick (Coroner)"
  },
  "contributor": [
    {"@type": "Person", "name": "John Spivey"},
    {"@type": "Person", "name": "Josiah C. Nail"},
    {"@type": "Person", "name": "Thomas Howard"},
    {"@type": "Person", "name": "Harmon Howard"},
    {"@type": "Person", "name": "Mac Maxwell"},
    {"@type": "Person", "name": "William Cole"},
    {"@type": "Person", "name": "Josiah Elliott"},
    {"@type": "Person", "name": "Elias Taylor"},
    {"@type": "Person", "name": "Warner Spivey"},
    {"@type": "Person", "name": "James Roberts"},
    {"@type": "Person", "name": "Isaac Dolby"},
    {"@type": "Person", "name": "Upham Wootten"},
    {"@type": "Person", "name": "Anthony Egan"},
    {"@type": "Person", "name": "Delilah Phillips"},
    {"@type": "Person", "name": "Joseph Elliott"},
    {"@type": "Person", "name": "James 

In [42]:
# Write the Schema.org records to sdo_path using the project's utils helper,
# passing the row ids, the record strings, the output folder, and a file-name
# prefix. The recorded output shows one file per record, named like
# "sdo_record_000.json".
# NOTE(review): naming and any JSON parsing/validation happen inside
# utils.write_json, which is not visible in this notebook -- confirm there.
utils.write_json(sdo_records_ids, sdo_records, sdo_path, "sdo_record_")

Wrote sdo_record_000.json!
Wrote sdo_record_001.json!
Wrote sdo_record_002.json!
Wrote sdo_record_003.json!
Wrote sdo_record_004.json!
Wrote sdo_record_005.json!
Wrote sdo_record_006.json!
Wrote sdo_record_007.json!
Wrote sdo_record_008.json!
Wrote sdo_record_009.json!
Wrote sdo_record_010.json!
Wrote sdo_record_011.json!
Wrote sdo_record_012.json!
Wrote sdo_record_013.json!
Wrote sdo_record_014.json!
Wrote sdo_record_015.json!
Wrote sdo_record_016.json!
Wrote sdo_record_017.json!
Wrote sdo_record_018.json!
Wrote sdo_record_019.json!
Wrote sdo_record_020.json!
Wrote sdo_record_021.json!
Wrote sdo_record_022.json!
Wrote sdo_record_023.json!
Wrote sdo_record_024.json!
Wrote sdo_record_025.json!
Wrote sdo_record_026.json!


### CIDOC-CRM
Write the [CIDOC-CRM](https://cidoc-crm.org) records as JSON-LD files.

In [43]:
# Remove rows with an empty value in the cidoccrm_record column
df_cidoc = df[df["cidoccrm_record"].notna()]
cidoc_records_ids = list(df_cidoc["id"])

In [44]:
# Folder for the CIDOC-CRM JSON-LD files (created if it does not exist yet)
cidoc_path = f"{data_dir}cidoc_crm/"
cidoc_dir = Path(cidoc_path)
cidoc_dir.mkdir(exist_ok=True, parents=True)


In [45]:
# Pull the CIDOC-CRM records out as a plain list and print the third one
# as a sample
cidoc_records = df_cidoc["cidoccrm_record"].tolist()
sample_cidoc_record = cidoc_records[2]
print(sample_cidoc_record)

{
  "@context": "https://www.cidoc-crm.org/cidoc-crm/",
  "@type": "E31_Document",
  "P1_is_identified_by": "Inquest Records from Alabama, 1822",
  "P14_carried_out_by": {
    "@type": "E39_Actor",
    "P131_is_identified_by": "Jonathan Myrick (Coroner)"
  },
  "P70_documents": [
    {
      "@type": "E7_Activity",
      "P2_has_type": "Coroner Inquest",
      "P4_has_time-span": "1822-04-01",
      "P7_took_place_at": "Alabama, United States"
    }
  ],
  "P67_refers_to": [
    {
      "@type": "E21_Person",
      "P131_is_identified_by": "Thomas Maxwell"
    },
    {
      "@type": "E21_Person",
      "P131_is_identified_by": "Mirandy Myrick"
    }
  ],
  "P129_is_about": [
    {
      "@type": "E55_Type",
      "P131_is_identified_by": "Accidental Death"
    },
    {
      "@type": "E55_Type",
      "P131_is_identified_by": "Historical Legal Records"
    }
  ],
  "P94_has_created": {
    "@type": "E73_Information_Object",
    "P3_has_note": "These documents record the inquests into 

In [46]:
# Write the CIDOC-CRM records to cidoc_path using the project's utils helper,
# passing the row ids, the record strings, the output folder, and a file-name
# prefix. The recorded output shows one file per record, named like
# "cidoccrm_record_000.json".
# NOTE(review): naming and any JSON parsing/validation happen inside
# utils.write_json, which is not visible in this notebook -- confirm there.
utils.write_json(cidoc_records_ids, cidoc_records, cidoc_path, "cidoccrm_record_")

Wrote cidoccrm_record_000.json!
Wrote cidoccrm_record_001.json!
Wrote cidoccrm_record_002.json!
Wrote cidoccrm_record_003.json!
Wrote cidoccrm_record_004.json!
Wrote cidoccrm_record_005.json!
Wrote cidoccrm_record_006.json!
Wrote cidoccrm_record_007.json!
Wrote cidoccrm_record_008.json!
Wrote cidoccrm_record_009.json!
Wrote cidoccrm_record_010.json!
Wrote cidoccrm_record_011.json!
Wrote cidoccrm_record_012.json!
Wrote cidoccrm_record_013.json!
Wrote cidoccrm_record_014.json!
Wrote cidoccrm_record_015.json!
Wrote cidoccrm_record_016.json!
Wrote cidoccrm_record_017.json!
Wrote cidoccrm_record_018.json!
Wrote cidoccrm_record_019.json!
