# Preprocessing

Preprocess the data for classification, combining the EAD-formatted and custom-formatted data into a single file.

In [None]:
# Custom filepaths and functions
import config, ext_utils

# Libraries for data, file, and model loading
import pandas as pd

### 1. Preprocessing

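Load the extracted archival metadata from the Newcastle University Special Collections catalog: one file of EAD-formatted descriptions and one file of custom-formatted (non-EAD) Bell descriptions.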

In [14]:
# Directory holding the extracted metadata CSVs. Kept as a plain string with a
# trailing slash because file paths are built from it by concatenation.
extracted_dir = "data/extracted/"

# EAD-formatted descriptions.
# Alternative input noted by the author: "newcastle_archival_metadata_sample.csv"
ead_f = "nusc_ead_all_fonds.csv"
ead_df = pd.read_csv(f"{extracted_dir}{ead_f}", index_col=0)

# Non-EAD descriptions.
# Alternative input noted by the author: "bell_archival_metadata_nonead.csv"
nonead_df = pd.read_csv(
    f"{extracted_dir}complete_bell_archival_metadata_nonead.csv",
    index_col=0,
)

# Report (rows, columns) for each source as a quick load check.
print(ead_df.shape, nonead_df.shape)

(37943, 9) (11990, 10)


In [15]:
# List the distinct collection identifiers present in the EAD data.
ead_df["eadid"].unique()

array(['WBC', 'LAY', 'SWAN', 'G', 'RBD', 'DB', 'LV', 'NS', 'DAG', 'LE',
       'Bradshaw-Berwick', 'Crawhall', 'ILL', "Benefactor's Library",
       'Fletcher', 'Hev', 'K', 'BAI', 'St Bees', 'Maurice Bell Coll.',
       'HL', 'MISC.MSS', '20th C. Coll', '21st C. Coll.', '17th C. Coll',
       '18th C. Coll', '19th C. Coll', 'Inc.', 'PI', 'RB', 'Sandes', 'HD',
       'Friends', 'ECG', 'Bradshaw', 'Layard', 'JJ', 'Bell-White',
       'Clarke Misc.', 'NHG', 'TH', 'Burman Alnwick', 'Clarke', 'JTB',
       'Clarke Med.', 'CPT', 'AUWH', 'NI', 'JWD', 'CHE', 'BBHB', 'TS',
       'Heslop', 'SWL', 'NCN', 'PD', 'PDCOLL', 'Meade', 'FW', 'Ure Coll.',
       'Burnett', 'Chorley', 'PB', 'MAP', 'SH', 'Walmsley',
       'AC Poetry Coll.', 'Bloodaxe', 'BXB', 'FLP', 'IRONP Coll.', 'GG',
       'JC', 'MT', 'Eagle Press', 'W', 'EWL', 'Robinson', 'THH', 'CG',
       'M', 'THODG', 'Med. Coll.', 'NRI', 'Pyb', 'LD', 'FP', 'Ent. Coll.',
       'MAPS', 'Ritchie', 'Wallis', 'HH', 'MM', 'MS', 'THP', 'JCII', 'JG',


Transform the data so all the text that will be classified is in a single column.

In [16]:
# EAD data: text fields to consolidate.
# NOTE(review): the third argument ("unitid") looks like the name of the
# record-identifier column -- confirm against ext_utils.consolidateText.
text_cols = ["unittitle", "bioghist", "scopecontent", "processinfo"]
ead_doc_df = ext_utils.consolidateText(ead_df, text_cols, "unitid")

# Non-EAD data uses different field names and a different identifier ("node_id").
text_cols = ["title", "description", "extent-and-medium"]
nonead_doc_df = ext_utils.consolidateText(nonead_df, text_cols, "node_id")

# Stack both sources, then order the rows. Sorting returns a new frame rather
# than mutating the concatenated one in place.
doc_df = pd.concat((ead_doc_df, nonead_doc_df)).sort_values(
    by=["eadid", "rowid", "field", "doc"]
)
doc_df.head()

Unnamed: 0,eadid,rowid,field,doc
6621,17th C. Coll,17th C. Coll,bioghist,Formed in 1963 after an amalgamation of instit...
6622,17th C. Coll,17th C. Coll,scopecontent,The 17th Century Collection is a small but exp...
6620,17th C. Coll,17th C. Coll,unittitle,17th Century Collection
6625,18th C. Coll,18th C. Coll,bioghist,Formed in 1963 after an amalgamation of instit...
6626,18th C. Coll,18th C. Coll,scopecontent,The 18th Century Collection contains approxima...


Create a unique identifier for each description:

In [17]:
# Give every description row an integer identifier (0..n-1) that follows the
# current row order, stored in a leading "description_id" column.
#
# The cell is written to be safe to re-run:
#   * any "description_id" left over from a previous run is discarded first, so
#     a second execution cannot produce two columns with the same name;
#   * the old index labels are dropped with reset_index(drop=True), which works
#     whether or not the index has a name.
doc_df = (
    doc_df
    .drop(columns=["description_id"], errors="ignore")
    .reset_index(drop=True)          # fresh positional index 0..n-1
    .rename_axis("description_id")   # name the index so it becomes the id column
    .reset_index()                   # move it into the first column
)

# The identifier must be unique for it to be usable as a key downstream.
assert doc_df["description_id"].is_unique
doc_df.head()

Unnamed: 0,description_id,eadid,rowid,field,doc
0,0,17th C. Coll,17th C. Coll,bioghist,Formed in 1963 after an amalgamation of instit...
1,1,17th C. Coll,17th C. Coll,scopecontent,The 17th Century Collection is a small but exp...
2,2,17th C. Coll,17th C. Coll,unittitle,17th Century Collection
3,3,18th C. Coll,18th C. Coll,bioghist,Formed in 1963 after an amalgamation of instit...
4,4,18th C. Coll,18th C. Coll,scopecontent,The 18th Century Collection contains approxima...


In [18]:
# Sanity check: (row count, column count) of the consolidated descriptions.
print((len(doc_df.index), len(doc_df.columns)))

(99312, 5)


Save the descriptions with the unique identifier column.

In [19]:
# Output file containing both EAD and non-EAD (i.e., Bell) descriptions.
# Alternative name noted by the author for the classification input:
#   "nusc_archival_descs_for_clf.csv"
f = "nusc_archival_descs_consolidated.csv"
doc_df.to_csv(f"{extracted_dir}{f}")