# Data Preparation 

### Linking Anthropology's Data and Archives (LADA)

### AI-Generated Linked Data Evaluation (part I)

In [46]:
import config
import utils
import pandas as pd
import numpy as np
from pathlib import Path
import os
import re

Read in CSV data as a pandas DataFrame, where `f` is the name of the data file:

In [47]:
# Folder (directory) where the data file f is located.
# Keep exactly one data_dir line uncommented, or replace it with your own path.
# The path must end with a "/" because later cells build file paths by
# concatenating strings onto data_dir (e.g. data_dir + "cleaned/").
# ------------------------------------
# data_dir = config.task1_data
# data_dir = config.playgrd1_data
data_dir = config.playgrd3_data
# data_dir = "path/to/your/data/"

# The name of the CSV data file inside data_dir.
# Keep exactly one f line uncommented, or replace it with your own file name.
# ------------------------------------
# f = "4-HDataExperimentAssignmentsAndOutcomes_Outcomes_Task1.csv"
# f = "4-HDataExperimentAssignmentsAndOutcomes_Playground_Task1.csv"
f = "4-HDataExperimentAssignmentsAndOutcomes_Playground_Task3.csv"
# f = "your_data_file_name.csv"

In [48]:
# Join the folder and file name with os.path.join so the file is found
# whether or not data_dir ends with a path separator, then preview 10 rows.
df = pd.read_csv(os.path.join(data_dir, f))
df.head(10)

Unnamed: 0,ID,Model,Original Catalog Link,Name of Collection,File Name/Link to File,Metadata record,Identifiers?,Schema.org Record,CIDOC-CRM Record,Unnamed: 9,...,Unnamed: 17,Unnamed: 18,Unnamed: 19,Unnamed: 20,Unnamed: 21,Unnamed: 22,Unnamed: 23,Unnamed: 24,Unnamed: 25,Unnamed: 26
0,1.0,Model pre-1,,,,,,,,,...,,,,,,,,,,
1,2.0,Model 1,,,,,,,,,...,,,,,,,,,,
2,3.0,Model 2,,,,,,,,,...,,,,,,,,,,
3,4.0,Model 3,,,,,,,,,...,,,,,,,,,,
4,5.0,Model 4,,,,,,,,,...,,,,,,,,,,
5,6.0,Model pre-1,https://archives.library.unt.edu/repositories/...,Adam Makowicz Collection,https://archives.library.unt.edu/repositories/...,"<metadata xmlns:dc=""http://purl.org/dc/element...",Here are some relevant **Wikidata identifiers ...,"{\n ""@context"": ""https://schema.org"",\n ""@ty...","{\n ""@context"": {\n ""E78"": ""http://www.cid...",,...,,,,,,,,,,
6,7.0,Model 1,https://archives.lib.ku.edu/repositories/3/res...,"Wyandotte Nation correspondence, documents, an...",https://archives.lib.ku.edu/repositories/3/res...,"<metadata xmlns:dc=""http://purl.org/dc/element...",Here are some useful **Wikidata identifiers (Q...,"{\n ""@context"": ""https://schema.org"",\n ""@ty...","{\n ""@context"": {\n ""E78"": ""http://www.cid...",,...,,,,,,,,,,
7,8.0,Model 2,https://findingaids.library.nyu.edu/archives/m...,"Derrick A. Bell, Jr. Papers",https://archives.lib.ku.edu/repositories/3/res...,"<metadata xmlns:dc=""http://purl.org/dc/element...",Here are relevant **Wikidata identifiers (QIDs...,"{\n ""@context"": ""https://schema.org"",\n ""@ty...","{\n ""@context"": {\n ""E78"": ""http://www.cid...",,...,,,,,,,,,,
8,9.0,Model 3,,,,,,,,,...,,,,,,,,,,
9,10.0,Model 4,,,,,,,,,...,,,,,,,,,,


Remove empty columns from the DataFrame:

In [49]:
# Remember the shape before dropping so the before/after shapes can be compared
shape_with_empty_cols = df.shape
# Drop every column whose values are all missing
df = df.dropna(axis="columns", how="all")
print(shape_with_empty_cols)
print(df.shape)
df.head()

(999, 27)
(999, 9)


Unnamed: 0,ID,Model,Original Catalog Link,Name of Collection,File Name/Link to File,Metadata record,Identifiers?,Schema.org Record,CIDOC-CRM Record
0,1.0,Model pre-1,,,,,,,
1,2.0,Model 1,,,,,,,
2,3.0,Model 2,,,,,,,
3,4.0,Model 3,,,,,,,
4,5.0,Model 4,,,,,,,


Remove rows without a metadata record from the DataFrame:

In [50]:
# Keep only the rows that have a value in the "Metadata record" column
required_cols = ["Metadata record"]
df.dropna(subset=required_cols, inplace=True)
print(df.shape)
df.tail()

(14, 9)


Unnamed: 0,ID,Model,Original Catalog Link,Name of Collection,File Name/Link to File,Metadata record,Identifiers?,Schema.org Record,CIDOC-CRM Record
20,21.0,Model pre-1,https://archivesfiles.delaware.gov/Online-Guid...,Town of Clayton,Model pre-1: Town of Clayton (DE Public Archives),"<metadata xmlns:dc=""http://purl.org/dc/element...","Based on the contents of the document, I will ...","{\n ""@context"": ""https://schema.org"",\n ""@ty...","{\n ""@context"": {\n ""crm"": ""http://www.cid..."
21,22.0,Model 1,https://archivesfiles.delaware.gov/Online-Guid...,Kent County Republican Committee Records,Model 1: Kent County Republican Committee Reco...,"<metadata xmlns:dc=""http://purl.org/dc/element...","Upon reviewing the full document, it primarily...","{\n ""@context"": ""https://schema.org"",\n ""@ty...",Here’s an example of a CIDOC-CRM record in JSO...
22,23.0,Model 2,https://sova.si.edu/record/nmah.ac.0060.s01.01...,Warshaw Collection of Business Americana Subje...,Model 2 - Warshaw women .xml Model 2: Warshaw ...,"<metadata xmlns:dc=""http://purl.org/dc/element...",Here are potential Wikidata identifiers (QIDs)...,"{\n ""@context"": ""https://schema.org"",\n ""@ty...","{\n ""@context"": {\n ""crm"": ""http://www.cid..."
23,24.0,Model 3,https://sova.si.edu/record/acma.06-124.11,Arthur Ellis Photographs,Model 3: Arthur Ellis Photographs (SOVA),"<dublin_core xmlns:dc=""http://purl.org/dc/elem...",Here are some potential Wikidata identifiers f...,"{\n ""@context"": ""https://schema.org"",\n ""@ty...","{\n ""@context"": {\n ""crm"": ""http://www.cid..."
24,25.0,Model 4,https://transcription.si.edu/project/23301,Maidenform (item only),Model 4: Maidenform (item) (Smithsonian),"<metadata xmlns:dc=""http://purl.org/dc/element...",Here are the potential Wikidata identifiers fo...,"{\n ""@context"": ""https://schema.org"",\n ""@ty...","{\n ""@context"": {\n ""crm"": ""http://www.cid..."


If there's no column with unique identifiers per row, make one:

In [51]:
col_list = list(df.columns)
# Name of an existing column usable as the unique row identifier ("" if none is found)
identifier_col = ""
for col in col_list:
    col_name = col.lower()
    if col_name in ("id", "identifier") or "_id" in col_name:
        # Accept the column only if every row has an ID and no ID repeats.
        # Series.is_unique counts repeated missing values as duplicates, whereas
        # comparing len(list) to len(set(list)) can treat each NaN as a distinct
        # value and so accept a column that has missing IDs.
        if df[col].notna().all() and df[col].is_unique:
            # If several columns qualify, the last one in column order is kept
            identifier_col = col
if len(identifier_col) == 0:
    # No usable identifier column: create a 0-based unique identifier per row.
    # NOTE: DataFrame.insert raises a ValueError if a column named "id" already exists.
    new_ids = list(range(0, df.shape[0]))
    df.insert(0, "id", new_ids)

In [52]:
# Report the size of the DataFrame after the cleaning steps above
row_count, column_count = df.shape
print("Rows:", row_count)
print("Columns:", column_count)

Rows: 14
Columns: 9


Replace the column names for clarity, consistency, and conciseness:

In [53]:
# Note: if your data header is different, replace the keys below (the strings before
# each colon) with the column names of your data (i.e., "ID" might become "identifier").
# The values (the strings after each colon) are the new column names that the code
# below will expect.
new_column_names = {
    "Filename": "filename",
    "Metadata record": "dc_record",         # Dublin Core metadata record
    #"Transcription or caption (or link to separate doc, if too long)":"transcription_or_caption",
    "Schema.org Record": "sdo_record",      # Schema.org metadata record
    "CIDOC-CRM Record": "cidoccrm_record",  # CIDOC-CRM metadata record
}
# An identifier column only needs renaming when an existing one was detected
if len(identifier_col) > 0:
    new_column_names[identifier_col] = "id"  # Unique identifier for each row
df.rename(columns=new_column_names, inplace=True)

df.head()


Unnamed: 0,id,Model,Original Catalog Link,Name of Collection,File Name/Link to File,dc_record,Identifiers?,sdo_record,cidoccrm_record
5,6.0,Model pre-1,https://archives.library.unt.edu/repositories/...,Adam Makowicz Collection,https://archives.library.unt.edu/repositories/...,"<metadata xmlns:dc=""http://purl.org/dc/element...",Here are some relevant **Wikidata identifiers ...,"{\n ""@context"": ""https://schema.org"",\n ""@ty...","{\n ""@context"": {\n ""E78"": ""http://www.cid..."
6,7.0,Model 1,https://archives.lib.ku.edu/repositories/3/res...,"Wyandotte Nation correspondence, documents, an...",https://archives.lib.ku.edu/repositories/3/res...,"<metadata xmlns:dc=""http://purl.org/dc/element...",Here are some useful **Wikidata identifiers (Q...,"{\n ""@context"": ""https://schema.org"",\n ""@ty...","{\n ""@context"": {\n ""E78"": ""http://www.cid..."
7,8.0,Model 2,https://findingaids.library.nyu.edu/archives/m...,"Derrick A. Bell, Jr. Papers",https://archives.lib.ku.edu/repositories/3/res...,"<metadata xmlns:dc=""http://purl.org/dc/element...",Here are relevant **Wikidata identifiers (QIDs...,"{\n ""@context"": ""https://schema.org"",\n ""@ty...","{\n ""@context"": {\n ""E78"": ""http://www.cid..."
12,13.0,Model 2,https://nmdc.unm.edu/digital/collection/fapecf...,Fideicomiso Archivos Plutarco Elias Calles Y F...,Finding Aid_Fideicomiso Archivos Plutarco Elia...,<?xml version='1.0' encoding='utf-8'?>\n<dubli...,Wikidata:\nÁlvaro Obregón (Q517972)\nFernando ...,"{\n ""@context"": ""https://schema.org"",\n ""@ty...","{\n ""@context"": {\n ""cidoc"": ""http://www.c..."
15,16.0,Model pre-1,https://digital.case.edu/islandora/object/ksl:...,Western Reserve Historical Society Manuscript ...,wrhsms06185.pdf,"<metadata xmlns:dc=""http://purl.org/dc/element...",Task 3_Wikidata Identifiers_Model Pre-1,"{\n ""@context"": ""https://schema.org"",\n ""@ty...","{\n ""@context"": {\n ""crm"": ""http://www.cid..."


Create a directory to store the cleaned version of the data, then save the DataFrame there as a CSV file:

In [54]:
# Point data_dir at the "cleaned/" subfolder of the input folder.
# The guard keeps this cell idempotent: without it, every re-run would append
# another "cleaned/" and write to .../cleaned/cleaned/.
if not data_dir.endswith("cleaned/"):
    data_dir = data_dir + "cleaned/"
Path(data_dir).mkdir(parents=True, exist_ok=True)
# Save the cleaned DataFrame under the same file name as the input file
df.to_csv(data_dir + f)

In [55]:
# Collect the row identifiers and show how many there are, plus the third one as a sample
record_ids = df["id"].tolist()
id_total = len(record_ids)
print("Total IDs (rows):", id_total)
print("Sample ID:", record_ids[2])

Total IDs (rows): 14
Sample ID: 8.0


### Dublin Core
Write the [Dublin Core](https://www.dublincore.org) (DC) records as XML files.

In [56]:
# Keep only the rows whose dc_record column has a value, and collect their IDs
has_dc_record = df["dc_record"].notna()
df_dc = df[has_dc_record]
dc_records_ids = df_dc["id"].tolist()

In [57]:
# Dublin Core records as a list, in the same row order as dc_records_ids
dc_records = df_dc["dc_record"].tolist()
# Show the third record as a sample
print(dc_records[2])

<metadata xmlns:dc="http://purl.org/dc/elements/1.1/">
  <dc:title>Derrick A. Bell, Jr. Papers</dc:title>
  <dc:creator>Bell, Derrick A., 1930-2011</dc:creator>
  <dc:contributor>New York University Archives</dc:contributor>
  <dc:date>1922-2011</dc:date>
  <dc:description>
    The papers document the personal and professional life of Derrick Bell, a pioneering civil rights attorney, legal scholar, and professor. The collection includes correspondence, legal documents, writings, teaching materials, and photographs, with significant content related to civil rights, critical race theory, and higher education.
  </dc:description>
  <dc:language>en</dc:language>
  <dc:identifier>MC.138</dc:identifier>
  <dc:publisher>New York University Archives</dc:publisher>
  <dc:rights>
    FERPA restrictions may apply. Contact NYU Archives for specific access and use conditions.
  </dc:rights>
  <dc:format>189 boxes of textual and born-digital materials</dc:format>
  <dc:type>Archival collection</dc:t

In [58]:
# Folder for the Dublin Core files; created (with any missing parents) if absent
dc_path = f"{data_dir}dublin_core/"
dc_dir = Path(dc_path)
dc_dir.mkdir(parents=True, exist_ok=True)

In [59]:
# Write every Dublin Core record twice: first with an .xml extension, then with .txt
for extension in (".xml", ".txt"):
    utils.write_xml(dc_records_ids, dc_records, dc_path, "dc_record_", extension)

Wrote dc_record_006.xml!
Wrote dc_record_007.xml!
Wrote dc_record_008.xml!
Wrote dc_record_013.xml!
Wrote dc_record_016.xml!
Wrote dc_record_017.xml!
Wrote dc_record_018.xml!
Wrote dc_record_019.xml!
Wrote dc_record_020.xml!
Wrote dc_record_021.xml!
Wrote dc_record_022.xml!
Wrote dc_record_023.xml!
Wrote dc_record_024.xml!
Wrote dc_record_025.xml!
Wrote dc_record_006.txt!
Wrote dc_record_007.txt!
Wrote dc_record_008.txt!
Wrote dc_record_013.txt!
Wrote dc_record_016.txt!
Wrote dc_record_017.txt!
Wrote dc_record_018.txt!
Wrote dc_record_019.txt!
Wrote dc_record_020.txt!
Wrote dc_record_021.txt!
Wrote dc_record_022.txt!
Wrote dc_record_023.txt!
Wrote dc_record_024.txt!
Wrote dc_record_025.txt!


**Note:** The DC records are not formatted consistently. For example:

```
<?xml version='1.0' encoding='utf-8'?>
<dublin_core><dc element="title">Turnin' Timez: Original Student Poems</dc>
    ...
</dublin_core>
```
---
```
<?xml version="1.0"?>
<metadata xmlns:dc="http://purl.org/dc/elements/1.1/">
<dc:title>National 4-H Center Major Pledges, Contributions, and Grants</dc:title>
    ...
</metadata>
```
---
```
<?xml version="1.0" encoding="UTF-8"?>
<metadata xmlns:dcterms="http://purl.org/dc/terms/" xmlns:dc="http://purl.org/dc/elements/1.1/">
<dc:title>4-H National Youth Science Day</dc:title>
    ...
</metadata>
```
---
```
<dc:title>Climbing Up: Fun Activities for You and Your Cat</dc:title>
   ...
<dc:rights>Unknown</dc:rights>
```

Also note that the [DCMI documentation](https://www.dublincore.org/specifications/dublin-core/dcmi-terms/) encourages the use of the `http://purl.org/dc/terms/` namespace over the legacy `http://purl.org/dc/elements/1.1/` namespace.

### Schema.org
Write the [Schema.org](https://schema.org) records as JSON-LD files.

In [60]:
# Remove rows with an empty value in the sdo_record column
df_sdo = df[df["sdo_record"].notna()]
sdo_records_ids = list(df_sdo["id"])

In [61]:
# Folder for the Schema.org files; created (with any missing parents) if absent
sdo_path = f"{data_dir}schema_org/"
sdo_dir = Path(sdo_path)
sdo_dir.mkdir(parents=True, exist_ok=True)

In [62]:
# Schema.org records as a list, in the same row order as sdo_records_ids
sdo_records = df_sdo["sdo_record"].tolist()
# Show the third record as a sample
print(sdo_records[2])

{
  "@context": "https://schema.org",
  "@type": "ArchiveComponent",
  "name": "Derrick A. Bell, Jr. Papers",
  "identifier": "MC.138",
  "creator": {
    "@type": "Person",
    "name": "Derrick A. Bell",
    "birthDate": "1930",
    "deathDate": "2011",
    "sameAs": "https://www.wikidata.org/wiki/Q5267493"
  },
  "description": "The collection documents the legal, academic, and civil rights work of Derrick A. Bell. It includes correspondence, writings, legal cases, teaching materials, and photographs.",
  "temporalCoverage": "1922/2011",
  "inLanguage": "en",
  "materialExtent": "189 boxes",
  "holdingArchive": {
    "@type": "ArchiveOrganization",
    "name": "New York University Archives",
    "url": "https://findingaids.library.nyu.edu/archives/mc_138/"
  },
  "accessMode": "Partially restricted under FERPA",
  "about": [
    { "@type": "Thing", "name": "Critical race theory" },
    { "@type": "Thing", "name": "Civil rights" },
    { "@type": "Place", "name": "United States" }
  ]

In [63]:
# Write every Schema.org record twice: first with a .json extension, then with .txt
for extension in (".json", ".txt"):
    utils.write_json(sdo_records_ids, sdo_records, sdo_path, "sdo_record_", extension)

Wrote sdo_record_006.json!
Wrote sdo_record_007.json!
Wrote sdo_record_008.json!
Wrote sdo_record_013.json!
Wrote sdo_record_016.json!
Wrote sdo_record_017.json!
Wrote sdo_record_018.json!
Wrote sdo_record_019.json!
Wrote sdo_record_020.json!
Wrote sdo_record_021.json!
Wrote sdo_record_022.json!
Wrote sdo_record_023.json!
Wrote sdo_record_024.json!
Wrote sdo_record_025.json!
Wrote sdo_record_006.txt!
Wrote sdo_record_007.txt!
Wrote sdo_record_008.txt!
Wrote sdo_record_013.txt!
Wrote sdo_record_016.txt!
Wrote sdo_record_017.txt!
Wrote sdo_record_018.txt!
Wrote sdo_record_019.txt!
Wrote sdo_record_020.txt!
Wrote sdo_record_021.txt!
Wrote sdo_record_022.txt!
Wrote sdo_record_023.txt!
Wrote sdo_record_024.txt!
Wrote sdo_record_025.txt!


### CIDOC-CRM
Write the [CIDOC-CRM](https://cidoc-crm.org) records as JSON-LD files.

In [64]:
# Remove rows with an empty value in the cidoccrm_record column
df_cidoc = df[df["cidoccrm_record"].notna()]
cidoc_records_ids = list(df_cidoc["id"])

In [65]:
# Folder for the CIDOC-CRM files; created (with any missing parents) if absent
cidoc_path = f"{data_dir}cidoc_crm/"
cidoc_dir = Path(cidoc_path)
cidoc_dir.mkdir(parents=True, exist_ok=True)

In [66]:
# CIDOC-CRM records as a list, in the same row order as cidoc_records_ids
cidoc_records = df_cidoc["cidoccrm_record"].tolist()
# Show the third record as a sample
print(cidoc_records[2])

{
  "@context": {
    "E78": "http://www.cidoc-crm.org/cidoc-crm/E78_Collection",
    "P102_has_title": "http://www.cidoc-crm.org/cidoc-crm/P102_has_title",
    "P3_has_note": "http://www.cidoc-crm.org/cidoc-crm/P3_has_note",
    "P4_has_time-span": "http://www.cidoc-crm.org/cidoc-crm/P4_has_time-span",
    "P50_has_current_keeper": "http://www.cidoc-crm.org/cidoc-crm/P50_has_current_keeper",
    "E39": "http://www.cidoc-crm.org/cidoc-crm/E39_Actor"
  },
  "@type": "E78",
  "P102_has_title": "Derrick A. Bell, Jr. Papers",
  "P3_has_note": "Archival collection documenting Derrick Bell’s civil rights work, legal scholarship, and academic career from 1922–2011.",
  "P4_has_time-span": "1922–2011",
  "P50_has_current_keeper": {
    "@type": "E39",
    "P102_has_title": "New York University Archives"
  },
  "P14_carried_out_by": {
    "@type": "E39",
    "P102_has_title": "Derrick A. Bell"
  }
}



In [67]:
# Write every CIDOC-CRM record twice: first with a .json extension, then with .txt
for extension in (".json", ".txt"):
    utils.write_json(cidoc_records_ids, cidoc_records, cidoc_path, "cidoccrm_record_", extension)

Wrote cidoccrm_record_006.json!
Wrote cidoccrm_record_007.json!
Wrote cidoccrm_record_008.json!
Wrote cidoccrm_record_013.json!
Wrote cidoccrm_record_016.json!
Wrote cidoccrm_record_017.json!
Wrote cidoccrm_record_018.json!
Wrote cidoccrm_record_019.json!
Wrote cidoccrm_record_020.json!
Wrote cidoccrm_record_021.json!
Wrote cidoccrm_record_022.json!
Wrote cidoccrm_record_023.json!
Wrote cidoccrm_record_024.json!
Wrote cidoccrm_record_025.json!
Wrote cidoccrm_record_006.txt!
Wrote cidoccrm_record_007.txt!
Wrote cidoccrm_record_008.txt!
Wrote cidoccrm_record_013.txt!
Wrote cidoccrm_record_016.txt!
Wrote cidoccrm_record_017.txt!
Wrote cidoccrm_record_018.txt!
Wrote cidoccrm_record_019.txt!
Wrote cidoccrm_record_020.txt!
Wrote cidoccrm_record_021.txt!
Wrote cidoccrm_record_022.txt!
Wrote cidoccrm_record_023.txt!
Wrote cidoccrm_record_024.txt!
Wrote cidoccrm_record_025.txt!
