# Analysis: Model Output from Experiment 1

Load libraries:

In [27]:
import utils
import pandas as pd
import numpy as np
import re
from pathlib import Path

## Description Stereotype and Omission Labels

Load the dataset of predictions from the final model of Experiment 1, the document classifier assigning `Stereotype` and `Omission` labels:

In [3]:
# Path prefix for the Experiment 1 five-fold cross-validation outputs.
# NOTE(review): there is no trailing "/" here, so the concatenations below yield
# ".../5fold/outputaggregated_..." rather than a file inside an "output/" folder.
# Confirm this matches the file names on disk before changing it.
output_dir = "../data/token_clf_data/experiment1/5fold/output"
# f = output_dir+"aggregated_final_validate_predictions_docclf_sgd-svm_so_ALLDATA.csv"  # From 60-20-20 split
# Predictions from 5-fold cross validation; the first CSV column is used as the index.
f = f"{output_dir}aggregated_final_predictions_docclf_sgd-svm_so.csv"
df = pd.read_csv(f, index_col=0)
df.head()

Unnamed: 0,description_id,start_offset,end_offset,field,description,manual_label,fold,doc_ling_pred,doc_pers_o_pred,sgd-svm_label
6,3027,627,1162,Biographical / Historical,Thomas Young was probably born in 1725. By the...,['Stereotype'],split4,"['Generalization', 'Gendered-Pronoun']","['Occupation', 'Masculine', 'Unknown']","('Stereotype',)"
7,3397,8095,8334,Biographical / Historical,Andrew Tait worked on Paramecium in Beale's la...,['Omission'],split4,['Gendered-Pronoun'],"['Occupation', 'Masculine']","('Omission',)"
10,4736,9951,10026,Biographical / Historical,Delivered by Thomson to teachers in Darlington.,['Omission'],split4,[],"['Occupation', 'Masculine', 'Unknown']","('Omission',)"
14,4712,4199,4485,Biographical / Historical,"This was gifted by Thomson to his secretary, M...",['Omission'],split4,['Gendered-Pronoun'],"['Unknown', 'Masculine', 'Feminine', 'Occupati...","('Omission',)"
22,15684,845,1179,Biographical / Historical,Catherine Robina Borland was responsible for t...,[''],split4,"['Generalization', 'Gendered-Pronoun']",['Masculine'],"('',)"


In [4]:
pred_col = "sgd-svm_label"   # predicted labels
exp_col = "manual_label"     # expected (manually annotated) labels
# Pass every label column through the project helper, in the same order as before:
# predicted labels, expected labels, then the two upstream prediction columns.
for list_col in (pred_col, exp_col, "doc_ling_pred", "doc_pers_o_pred"):
    df = utils.getColumnValuesAsLists(df, list_col)

In [5]:
# Pull the predicted labels out as a plain Python list and preview the first five.
preds = df[pred_col].tolist()
preds[:5]

[['Stereotype'], ['Omission'], ['Omission'], ['Omission'], ['']]

In [6]:
# Keep only the columns needed for the prediction table, then give every
# predicted label its own row.
pred_df = (
    df[["description_id", "start_offset", "end_offset", "field", "description", "sgd-svm_label"]]
    .explode([pred_col])
)
# An empty-string label means no label was predicted; spell that out as "None".
pred_df[pred_col] = pred_df[pred_col].replace("", "None")
pred_df.head()

Unnamed: 0,description_id,start_offset,end_offset,field,description,sgd-svm_label
6,3027,627,1162,Biographical / Historical,Thomas Young was probably born in 1725. By the...,Stereotype
7,3397,8095,8334,Biographical / Historical,Andrew Tait worked on Paramecium in Beale's la...,Omission
10,4736,9951,10026,Biographical / Historical,Delivered by Thomson to teachers in Darlington.,Omission
14,4712,4199,4485,Biographical / Historical,"This was gifted by Thomson to his secretary, M...",Omission
22,15684,845,1179,Biographical / Historical,Catherine Robina Borland was responsible for t...,


Join the EADID (fonds, or collection, identifier) to the data:

In [7]:
ann_df = pd.read_csv("../data/crc_metadata/annot_descs.csv", usecols=["description_id", "file"])
# The EADID is the part of each file name that precedes the first underscore.
eadid_col = [filename.split("_")[0] for filename in ann_df["file"]]
# Swap the raw file name column for the derived EADID column.
ann_df = ann_df.drop(columns=["file"])
ann_df.insert(1, "eadid", eadid_col)
ann_df.head()

Unnamed: 0,description_id,eadid
0,0,AA5
1,1,AA5
2,2,AA5
3,3,AA5
4,4,AA6


In [8]:
# The shapes printed before and after the join show whether it only added the
# eadid column or also changed the number of rows.
print(pred_df.shape)
eadid_by_description = ann_df.set_index("description_id")
pred_df = pred_df.join(eadid_by_description, on="description_id")
print(pred_df.shape)
pred_df.head()

(28376, 6)
(28376, 7)


Unnamed: 0,description_id,start_offset,end_offset,field,description,sgd-svm_label,eadid
6,3027,627,1162,Biographical / Historical,Thomas Young was probably born in 1725. By the...,Stereotype,Coll-1253
7,3397,8095,8334,Biographical / Historical,Andrew Tait worked on Paramecium in Beale's la...,Omission,Coll-1255
10,4736,9951,10026,Biographical / Historical,Delivered by Thomson to teachers in Darlington.,Omission,Coll-1310
14,4712,4199,4485,Biographical / Historical,"This was gifted by Thomson to his secretary, M...",Omission,Coll-1310
22,15684,845,1179,Biographical / Historical,Catherine Robina Borland was responsible for t...,,Coll-1453


Save the data as JSON:

In [9]:
# Drop the offset columns, replace the index (which repeats values after the
# explode) with a fresh 0..n-1 index, and give the label column a generic name.
pred_df = (
    pred_df[["eadid", "description_id", "field", "description", "sgd-svm_label"]]
    .reset_index(drop=True)
    .rename(columns={"sgd-svm_label": "prediction"})
)
pred_df.head()

Unnamed: 0,eadid,description_id,field,description,prediction
0,Coll-1253,3027,Biographical / Historical,Thomas Young was probably born in 1725. By the...,Stereotype
1,Coll-1255,3397,Biographical / Historical,Andrew Tait worked on Paramecium in Beale's la...,Omission
2,Coll-1310,4736,Biographical / Historical,Delivered by Thomson to teachers in Darlington.,Omission
3,Coll-1310,4712,Biographical / Historical,"This was gifted by Thomson to his secretary, M...",Omission
4,Coll-1453,15684,Biographical / Historical,Catherine Robina Borland was responsible for t...,


In [10]:
# Serialise one JSON object per row and preview the first 500 characters.
json_data = pred_df.to_json(orient="records")
json_data[:500]

'[{"eadid":"Coll-1253","description_id":3027,"field":"Biographical \\/ Historical","description":"Thomas Young was probably born in 1725. By the summer of 1751, Young was practicing surgery in Edinburgh. In 1756 he was appointed Professor of Midwifery at Edinburgh University - only the third holder of that Chair, after Robert Smith and Joseph Gibson, the first holder. Young was the first Professor of Midwifery at Edinburgh to actually lecture on the subject of obstetrics. He also had an associatio'

In [12]:
# Write the JSON string to disk. The handle gets its own name so that the
# notebook-level variable `f` (the predictions CSV path) is not overwritten,
# and the `with` block closes the file on exit, so no explicit close is needed.
with open(output_dir+"so_doc_clf_preds.json", "w") as json_file:
    json_file.write(json_data)
print("File written!")

File written!


Count the descriptions (documents) assigned each predicted label:

In [14]:
pred_col = "prediction"
# Count how many descriptions received each predicted label.
label_counts = pred_df[pred_col].value_counts()
# Name the row explicitly instead of renaming a column called `pred_col`:
# pandas >= 2.0 names the value_counts() result "count" and puts the column name
# on the index, so the old rename was silently skipped there. Clearing the axis
# name keeps the table identical across pandas versions.
label_df = label_counts.rename_axis(None).to_frame(name="Total Descriptions").T
label_df

Unnamed: 0,None,Omission,Stereotype
Total Descriptions,24302,2779,1295


In [26]:
def getLabelCountsPerField(df, pred_col, field):
    """Count the rows per predicted label, optionally restricted to one field.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with a "field" column and the label column named by `pred_col`.
    pred_col : str
        Name of the column holding the predicted labels.
    field : str or None
        Value of the "field" column to restrict the count to. If None, all
        rows are counted and the result row is labelled "Total Descriptions".

    Returns
    -------
    pandas.DataFrame
        A one-row frame whose index is the field name (or "Total Descriptions")
        and whose columns are the labels found in `pred_col`, holding counts.
    """
    if field is None:
        row_label = "Total Descriptions"
        subdf = df
    else:
        row_label = field
        subdf = df.loc[df["field"] == field]
    counts = subdf[pred_col].value_counts()
    # Name the row explicitly rather than renaming a column called `pred_col`:
    # pandas >= 2.0 names the value_counts() result "count" and moves the column
    # name onto the index, so the rename no longer matched. Clearing the axis
    # name keeps the output identical across pandas versions.
    return counts.rename_axis(None).to_frame(name=row_label).T

In [27]:
# Start with the overall totals, then add one row of label counts per metadata field.
count_frames = [getLabelCountsPerField(pred_df, pred_col, None)]
fields = pred_df.field.unique()
for field in fields:
    count_frames.append(getLabelCountsPerField(pred_df, pred_col, field))
# Stack all rows at once; labels absent from a field come out as NaN.
label_df = pd.concat(count_frames)
# Move the row labels into a regular "Field" column.
label_df = label_df.reset_index().rename(columns={"index": "Field"})
label_df

Unnamed: 0,Field,None,Omission,Stereotype
0,Total Descriptions,24302,2779.0,1295.0
1,Biographical / Historical,316,312.0,118.0
2,Title,14049,746.0,343.0
3,Scope and Contents,9634,1721.0,834.0
4,Processing Information,303,,


Calculate the proportions:

In [28]:
# For each row, divide every label count by that row's total (the first column,
# "Field", is skipped). Missing counts are ignored in the total and stay NaN here.
df_percentages = []
for _, label_counts_row in label_df.iloc[:, 1:].iterrows():
    row_total = label_counts_row.sum()
    df_percentages.append([label_count / row_total for label_count in label_counts_row])
# df_percentages

In [29]:
# Build the proportions table with the same rows and label columns as label_df.
# The label names are taken from label_df itself rather than hard-coded by
# position: the column order comes from value_counts() (most frequent first),
# so a fixed {0: "None", 1: "Omission", 2: "Stereotype"} mapping would mislabel
# the proportions whenever the frequency order differs.
label_cols = list(label_df.columns[1:])
label_df2 = pd.DataFrame(df_percentages, columns=label_cols, index=label_df.index)
label_df2.insert(0, "Field", label_df["Field"])
# A label that never occurs in a field has a NaN proportion; report it as 0.
label_df2 = label_df2.fillna(0)
label_df2

Unnamed: 0,Field,None,Omission,Stereotype
0,Total Descriptions,0.856428,0.097935,0.045637
1,Biographical / Historical,0.423592,0.418231,0.158177
2,Title,0.928062,0.04928,0.022658
3,Scope and Contents,0.790385,0.141193,0.068422
4,Processing Information,1.0,0.0,0.0


In [30]:
# Tag each table with the metric it holds (the scalar is broadcast to every row),
# then stack counts on top of proportions and fill the remaining gaps with 0.
label_df.insert(1, "Metric", "Count")
label_df2.insert(1, "Metric", "Proportion")
label_df = pd.concat([label_df, label_df2]).fillna(0)
label_df

Unnamed: 0,Field,Metric,None,Omission,Stereotype
0,Total Descriptions,Count,24302.0,2779.0,1295.0
1,Biographical / Historical,Count,316.0,312.0,118.0
2,Title,Count,14049.0,746.0,343.0
3,Scope and Contents,Count,9634.0,1721.0,834.0
4,Processing Information,Count,303.0,0.0,0.0
0,Total Descriptions,Proportion,0.856428,0.097935,0.045637
1,Biographical / Historical,Proportion,0.423592,0.418231,0.158177
2,Title,Proportion,0.928062,0.04928,0.022658
3,Scope and Contents,Proportion,0.790385,0.141193,0.068422
4,Processing Information,Proportion,1.0,0.0,0.0


In [31]:
# Regroup the stacked table by field so that each field's "Count" and "Proportion"
# rows sit next to each other. The identity apply with group_keys=True yields a
# (Field, original row number) MultiIndex, after which the now-redundant "Field"
# column is dropped.
# NOTE(review): this relies on groupby.apply passing the grouping column through to
# the lambda; newer pandas releases deprecate that, so confirm against the installed version.
label_df = label_df.groupby(by="Field", group_keys=True).apply(lambda x: x).drop(columns=["Field"])
label_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Metric,None,Omission,Stereotype
Field,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Biographical / Historical,1,Count,316.0,312.0,118.0
Biographical / Historical,1,Proportion,0.423592,0.418231,0.158177
Processing Information,4,Count,303.0,0.0,0.0
Processing Information,4,Proportion,1.0,0.0,0.0
Scope and Contents,3,Count,9634.0,1721.0,834.0
Scope and Contents,3,Proportion,0.790385,0.141193,0.068422
Title,2,Count,14049.0,746.0,343.0
Title,2,Proportion,0.928062,0.04928,0.022658
Total Descriptions,0,Count,24302.0,2779.0,1295.0
Total Descriptions,0,Proportion,0.856428,0.097935,0.045637


Save the label counts and proportions per field as a CSV file:

In [32]:
analysis_dir = "../data/token_clf_data/experiment1/5fold/analysis/"
# Create the analysis folder (and any missing parents) if it does not exist yet.
analysis_path = Path(analysis_dir)
analysis_path.mkdir(parents=True, exist_ok=True)
# Write the per-field counts and proportions of predicted labels.
label_df.to_csv(analysis_path / "doc_counts_proportions_pred_label.csv")

Update the document classifier output data with the EADID column:

In [33]:
# Rebuild the description_id -> eadid lookup from the annotation metadata.
ann_df = pd.read_csv("../data/crc_metadata/annot_descs.csv", usecols=["description_id", "file"])
# The EADID is the part of each file name that precedes the first underscore.
eadid_col = [filename.split("_")[0] for filename in ann_df["file"]]
ann_df = ann_df.drop(columns=["file"])
ann_df.insert(1, "eadid", eadid_col)
# The classifier output is written back to the CSV it was loaded from, so after one
# full run the loaded data already has an "eadid" column and a plain join would fail
# on the overlapping name. Drop any existing column first so the cell can be re-run.
df = df.drop(columns=["eadid"], errors="ignore")
df = df.join(ann_df.set_index("description_id"), on="description_id")
df.head()

Unnamed: 0,description_id,start_offset,end_offset,field,description,manual_label,fold,doc_ling_pred,doc_pers_o_pred,sgd-svm_label,eadid
6,3027,627,1162,Biographical / Historical,Thomas Young was probably born in 1725. By the...,[Stereotype],split4,"[Generalization, Gendered-Pronoun]","[Occupation, Masculine, Unknown]",[Stereotype],Coll-1253
7,3397,8095,8334,Biographical / Historical,Andrew Tait worked on Paramecium in Beale's la...,[Omission],split4,[Gendered-Pronoun],"[Occupation, Masculine]",[Omission],Coll-1255
10,4736,9951,10026,Biographical / Historical,Delivered by Thomson to teachers in Darlington.,[Omission],split4,[],"[Occupation, Masculine, Unknown]",[Omission],Coll-1310
14,4712,4199,4485,Biographical / Historical,"This was gifted by Thomson to his secretary, M...",[Omission],split4,[Gendered-Pronoun],"[Unknown, Masculine, Feminine, Occupation]",[Omission],Coll-1310
22,15684,845,1179,Biographical / Historical,Catherine Robina Borland was responsible for t...,[],split4,"[Generalization, Gendered-Pronoun]",[Masculine],[],Coll-1453


In [35]:
# Set the output path explicitly instead of relying on `f` still holding the
# path from the load cell: the name is short and easy to rebind in between
# (for example to a file handle), which would make this write fail.
# This overwrites the predictions CSV that was loaded at the top of the notebook.
f = output_dir+"aggregated_final_predictions_docclf_sgd-svm_so.csv"
df.to_csv(f)