# Preprocessing

Preprocess the data for classification, combining the EAD-formatted and custom-formatted data into a single file.

In [2]:
# Custom filepaths and functions
import config, clf_utils

# Libraries for data, file, and model loading
import pandas as pd
from pathlib import Path

### 1. Preprocessing

Load the extracted archival metadata from the Newcastle University Special Collections catalog.

In [3]:
# Folder containing the extracted metadata CSVs (kept as a plain string with a
# trailing slash because later cells build paths by string concatenation).
extracted_dir = "data/extracted/"

# EAD-formatted export; the first CSV column is used as the DataFrame index.
ead_csv_path = f"{extracted_dir}newcastle_archival_metadata_sample.csv"
ead_df = pd.read_csv(ead_csv_path, index_col=0)

# Non-EAD (custom-formatted) export, loaded the same way.
# NOTE: a commented-out alternative previously on this line pointed at
# "bell_archival_metadata_nonead.csv" instead of the "complete_" file.
nonead_csv_path = f"{extracted_dir}complete_bell_archival_metadata_nonead.csv"
nonead_df = pd.read_csv(nonead_csv_path, index_col=0)

# Quick sanity check: (rows, columns) of each source table.
print(ead_df.shape, nonead_df.shape)

(16683, 9) (11990, 10)


Transform the data so all the text that will be classified is in a single column.

In [4]:
# One (source table, text columns, identifier column) entry per catalogue format.
# The text columns are the fields whose contents will be classified.
transform_specs = (
    (ead_df, ["unittitle", "bioghist", "scopecontent", "processinfo"], "unitid"),
    (nonead_df, ["title", "description", "extent-and-medium"], "node_id"),
)

# Run the shared helper on each format in turn (EAD first, then non-EAD).
# NOTE(review): judging by the displayed output, the helper yields one row per
# (record, field) with eadid / rowid / field / doc columns -- confirm in clf_utils.
transformed_frames = []
for source_df, text_cols, id_col in transform_specs:
    transformed_frames.append(
        clf_utils.transformForClassification(source_df, text_cols, id_col)
    )
ead_doc_df, nonead_doc_df = transformed_frames

# Stack both formats and order the rows by collection, record, field and text.
doc_df = pd.concat(transformed_frames).sort_values(["eadid", "rowid", "field", "doc"])
doc_df.head()

Unnamed: 0,eadid,rowid,field,doc
26920,BP,BP,processinfo,This EAD description created by Ruth Sheret 27...
26520,BP,BP,scopecontent,The papers of the distinguished public servant...
25720,BP,BP,unittitle,Plowden (Lady Bridget) Archive
26521,BP,BP/01,scopecontent,"Formerly the 'Working Ladies Guild', the organ..."
25721,BP,BP/01,unittitle,Chairman (Managing Committee) of the Mary Fiel...


Create a unique identifier for each description:

In [4]:
# Assign each description a unique integer id based on its position in the
# sorted table (0, 1, 2, ...), stored as the first column, `description_id`.
#
# The cell is written to be safe to re-run: any `description_id` column left
# over from a previous execution is discarded first, so running the cell twice
# cannot produce two `description_id` columns.
doc_df = (
    doc_df
    .drop(columns=["description_id"], errors="ignore")  # no-op on the first run
    .reset_index(drop=True)          # discard the old, per-source row labels
    .rename_axis("description_id")   # name the fresh 0..n-1 index ...
    .reset_index()                   # ... and turn it into a leading column
)
doc_df.head()

Unnamed: 0,description_id,eadid,rowid,field,doc
0,0,BP,BP,processinfo,This EAD description created by Ruth Sheret 27...
1,1,BP,BP,scopecontent,The papers of the distinguished public servant...
2,2,BP,BP,unittitle,Plowden (Lady Bridget) Archive
3,3,BP,BP/01,scopecontent,"Formerly the 'Working Ladies Guild', the organ..."
4,4,BP,BP/01,unittitle,Chairman (Managing Committee) of the Mary Fiel...


In [8]:
# Report the (rows, columns) of the combined description table before saving.
rows_and_cols = doc_df.shape
print(rows_and_cols)

(57947, 5)


Save the descriptions with the unique identifier column.

In [9]:
# Write the combined descriptions next to the extracted inputs. The DataFrame
# index is written too (pandas default), so the CSV starts with an unnamed
# index column followed by `description_id`.
clf_descs_path = f"{extracted_dir}nusc_archival_descs_for_clf.csv"
doc_df.to_csv(clf_descs_path)