# Coding Preparation

Create subsets of the unclassified and classified NUSC archival catalog metadata descriptions, one per collection, to share with each collection's designated expert for manual coding and classification evaluation.

In [1]:
# Libraries for data analysis
import pandas as pd

# Custom variables
import config

# For writing data and creating directories
from pathlib import Path

In [2]:
# Input tables. index_col=0 uses each file's first column as the DataFrame
# index; the LC-OSC predictions are read with pandas' default integer index.
unclf_csv = "data/extracted/nusc_archival_descs_for_clf.csv"
baseline_osc_csv = "data/classified/baseline_osc_predictions.csv"
lc_osc_csv = "data/classified/lc-osc_predictions.csv"

unclf_df = pd.read_csv(unclf_csv, index_col=0)
clf_osc_df = pd.read_csv(baseline_osc_csv, index_col=0)
clf_lc_osc_df = pd.read_csv(lc_osc_csv)

In [3]:
# Output directories for the per-collection review files. Both are kept as
# strings with a trailing slash because file names are appended to them later.
unclf_dir = "data/extracted/for_review/unclassified/"
clf_dir = "data/extracted/for_review/classified/"

# Create each directory (and any missing parents); existing ones are left as-is.
for review_dir in (unclf_dir, clf_dir):
    Path(review_dir).mkdir(parents=True, exist_ok=True)

In [4]:
# Preview the first five rows of the LC-OSC predictions.
# (Swap in unclf_df or clf_osc_df to inspect the other two inputs.)
clf_lc_osc_df.head(n=5)

Unnamed: 0,description_id,token_id,eadid,rowid,linguistic_prediction,field,doc,os_predictions
0,0,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...",BP,BP,['O'],processinfo,This EAD description created by Ruth Sheret 27...,()
1,1,"[16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 2...",BP,BP,"['Gendered-Pronoun', 'Gendered-Role', 'General...",scopecontent,The papers of the distinguished public servant...,"('Omission',)"
2,2,"[267, 268, 269, 270, 271, 272]",BP,BP,['Gendered-Role'],unittitle,Plowden (Lady Bridget) Archive,()
3,3,"[273, 274, 275, 276, 277, 278, 279, 280, 281, ...",BP,BP/01,"['Gendered-Pronoun', 'Gendered-Role', 'General...",scopecontent,"Formerly the 'Working Ladies Guild', the organ...","('Omission',)"
4,4,"[352, 353, 354, 355, 342, 343, 344, 345, 346, ...",BP,BP/01,['Generalization'],unittitle,Chairman (Managing Committee) of the Mary Fiel...,()


In [5]:
# Select which review set this run exports. Change REVIEW_MODE rather than
# commenting/uncommenting four separate assignments, so the reviewer column,
# source table, file suffix and output directory can never get out of sync.
REVIEW_MODE = "classified"  # "unclassified" or "classified"

_REVIEW_SETTINGS = {
    # mode: (reviewer column name, source DataFrame, file suffix, output directory)
    "unclassified": ("gender_bias?", unclf_df, "_unclassified.csv", unclf_dir),
    "classified": ("agree?", clf_osc_df, "_classified.csv", clf_dir),
}

# Fail early on a typo instead of surfacing as a KeyError with no guidance.
if REVIEW_MODE not in _REVIEW_SETTINGS:
    raise ValueError(
        f"REVIEW_MODE must be one of {sorted(_REVIEW_SETTINGS)}, got {REVIEW_MODE!r}"
    )

# col_name:    empty column added for the reviewer to fill in
# df:          table that the following cells extend and split per collection
# file_suffix: appended to each collection id to form the output file name
# file_dir:    directory the per-collection files are written to
col_name, df, file_suffix, file_dir = _REVIEW_SETTINGS[REVIEW_MODE]

In [6]:
# Label the existing "prediction" column as the first of two prediction sets.
# rename() ignores labels that are absent, so this is a no-op without that column.
df = df.rename({"prediction": "prediction1"}, axis="columns")

In [7]:
# Show (rows, columns) of both tables, one per line, before joining them.
print(df.shape, clf_lc_osc_df.shape, sep="\n")

(34225, 6)
(34225, 8)


In [8]:
# Columns that both tables share and that are used to match rows.
index_cols = ["description_id", "eadid", "rowid", "field", "doc"]

# Keep only the key columns plus the LC-OSC predictions, keyed by those columns.
lc_osc_preds = clf_lc_osc_df[index_cols + ["os_predictions"]].set_index(index_cols)

# Left join (DataFrame.join's default) onto df, then label the new column as
# the second prediction set.
df = df.join(lc_osc_preds, on=index_cols).rename(
    columns={"os_predictions": "prediction2"}
)
print(df.shape)
df.head()

(34225, 7)


Unnamed: 0,description_id,eadid,rowid,field,doc,prediction1,prediction2
0,0,BP,BP,processinfo,This EAD description created by Ruth Sheret 27...,(),()
1,1,BP,BP,scopecontent,The papers of the distinguished public servant...,(),"('Omission',)"
2,2,BP,BP,unittitle,Plowden (Lady Bridget) Archive,(),()
3,3,BP,BP/01,scopecontent,"Formerly the 'Working Ladies Guild', the organ...",(),"('Omission',)"
4,4,BP,BP/01,unittitle,Chairman (Managing Committee) of the Mary Fiel...,(),()


In [9]:
# Add an empty column (named by col_name) for the reviewer to fill in.
# assign() returns a new frame and appends the column at the end when it is
# new, so re-running this cell is safe; the previous in-place df.insert()
# raised ValueError on a second run because the column already existed.
df = df.assign(**{col_name: ""})
df.tail()

Unnamed: 0,description_id,eadid,rowid,field,doc,prediction1,prediction2,agree?
34220,34220,WCT,WCT/8,unittitle,Diaries,(),(),
34221,34221,WCT,WCT/9,scopecontent,Personal diaries and notebooks.,(),(),
34222,34222,WCT,WCT/9,unittitle,Diaries and notebooks of Lady Pauline,(),(),
34223,34223,WCT,WCT/9/4,scopecontent,Microfilm reels of Pauline's diaries and sketc...,(),(),
34224,34224,WCT,WCT/9/4,unittitle,Microfilm copies of Lady Pauline's Diaries,(),(),


In [10]:
# Distinct values of the "eadid" column, in order of first appearance.
eadids = df["eadid"].unique().tolist()
print(eadids)

['BP', 'BXB', 'CHE', 'CPT', 'GB', 'HL', 'OBR', 'SH', 'SW', 'THS', 'WCT']


In [11]:
# Name of the column whose values are used to split df into per-collection subsets.
eadid_col: str = "eadid"

In [12]:
# Write one CSV per collection id: <file_dir><eadid><file_suffix>.
for eadid in eadids:
    # Rows whose eadid_col value equals the current collection id.
    is_in_collection = df[eadid_col].eq(eadid)
    subdf = df[is_in_collection]

    # file_dir already ends with "/", so plain concatenation yields the path.
    out_path = "".join([file_dir, eadid, file_suffix])
    subdf.to_csv(out_path)