# Analysis: Model Output from Experiment 1

Load libraries:

In [1]:
import utils
import pandas as pd
import numpy as np
import re
from pathlib import Path

Load the dataset of predictions from the final model of Experiment 1, the document classifier assigning `Stereotype` and `Omission` labels:

In [41]:
# Directory holding the Experiment 1 output files. It stays a plain string
# ending in "/" because file names are appended to it by concatenation.
output_dir = "../data/token_clf_data/experiment1/output/"

# Predictions file; its first column becomes the DataFrame index.
f = f"{output_dir}aggregated_final_validate_predictions_docclf_sgd-svm_so_ALLDATA.csv"
df = pd.read_csv(filepath_or_buffer=f, index_col=0)

In [3]:
# Column with the classifier's labels and column with the manual labels
pred_col = "sgd-svm_label"
exp_col = "manual_label"

# Pass both label columns through the project helper, predictions first.
# NOTE(review): judging by its name, the helper turns each cell of the column
# into a list of labels - confirm against utils.getColumnValuesAsLists.
for label_col in (pred_col, exp_col):
    df = utils.getColumnValuesAsLists(df, label_col)

In [4]:
# Preview the first few entries of the prediction column
preds = df[pred_col].tolist()
preds[:5]

[['None'], ['None'], ['None'], ['Omission', 'Stereotype'], ['Omission']]

In [5]:
# Keep the description metadata plus the predicted labels. The label column is
# referenced through pred_col everywhere so the cell follows a change of model
# column instead of silently mixing a hard-coded name with the variable.
pred_df = df[
    ["description_id", "start_offset", "end_offset", "field", "description", pred_col]
]
# One row per (description, predicted label) pair; exploding repeats the
# original index value for descriptions that carry several labels.
pred_df = pred_df.explode(pred_col)
# Spell out an empty prediction as the explicit "None" label
pred_df[pred_col] = pred_df[pred_col].replace(to_replace="", value="None")
pred_df.head()

Unnamed: 0,description_id,start_offset,end_offset,field,description,sgd-svm_label
4699,4699,1853,2066,Biographical / Historical,"Labelled Apparently some chapters, amounting t...",
8942,8942,384,540,Biographical / Historical,James Aikman of Perth signed his name to a vol...,
5440,5440,5692,5850,Biographical / Historical,This piece was published in 'Milk Production i...,
3474,3474,3608,8549,Biographical / Historical,Margaret Winifred Bartholomew was born on 21 A...,Omission
3474,3474,3608,8549,Biographical / Historical,Margaret Winifred Bartholomew was born on 21 A...,Stereotype


Join the EADID (fonds, or collection, identifier) to the data:

In [48]:
# Read only the description identifier and the name of the file it came from
ann_df = pd.read_csv(
    "../data/crc_metadata/annot_descs.csv",
    usecols=["description_id", "file"],
)
# The EADID is the part of the file name before the first underscore;
# pop() removes the "file" column while handing back its values.
eadid_col = [filename.split("_")[0] for filename in ann_df.pop("file")]
ann_df.insert(1, "eadid", eadid_col)
ann_df.head()

Unnamed: 0,description_id,eadid
0,0,AA5
1,1,AA5
2,2,AA5
3,3,AA5
4,4,AA6


In [49]:
# Print the shape before and after the join as a sanity check: the join
# should add the "eadid" column without changing the number of rows.
print(pred_df.shape)
eadid_by_description = ann_df.set_index("description_id")
pred_df = pred_df.join(eadid_by_description, on="description_id")
print(pred_df.shape)
pred_df.head()

(28494, 6)
(28494, 7)


Unnamed: 0,description_id,start_offset,end_offset,field,description,sgd-svm_label,eadid
4699,4699,1853,2066,Biographical / Historical,"Labelled Apparently some chapters, amounting t...",,Coll-1310
8942,8942,384,540,Biographical / Historical,James Aikman of Perth signed his name to a vol...,,Coll-1427
5440,5440,5692,5850,Biographical / Historical,This piece was published in 'Milk Production i...,,Coll-1320
3474,3474,3608,8549,Biographical / Historical,Margaret Winifred Bartholomew was born on 21 A...,Omission,Coll-1260
3474,3474,3608,8549,Biographical / Historical,Margaret Winifred Bartholomew was born on 21 A...,Stereotype,Coll-1260


Save the data as JSON:

In [60]:
# Remove offset columns and create an index without duplicate values
# (exploding the label lists repeated the index of multi-label descriptions).
# reset_index(drop=True) discards the old index directly, whatever its name.
pred_df = pred_df[["eadid", "description_id", "field", "description", pred_col]].reset_index(drop=True)
pred_df = pred_df.rename(columns={pred_col: "prediction"})
# Keep pred_col pointing at the renamed column: the cells further down select
# pred_df[pred_col] and would raise a KeyError on a top-to-bottom run otherwise.
pred_col = "prediction"
pred_df.head()

Unnamed: 0,eadid,description_id,field,description,prediction
0,Coll-1310,4699,Biographical / Historical,"Labelled Apparently some chapters, amounting t...",
1,Coll-1427,8942,Biographical / Historical,James Aikman of Perth signed his name to a vol...,
2,Coll-1320,5440,Biographical / Historical,This piece was published in 'Milk Production i...,
3,Coll-1260,3474,Biographical / Historical,Margaret Winifred Bartholomew was born on 21 A...,Omission
4,Coll-1260,3474,Biographical / Historical,Margaret Winifred Bartholomew was born on 21 A...,Stereotype


In [61]:
# Serialise the table as a JSON array holding one object per row,
# then preview the beginning of the resulting string.
json_data = pred_df.to_json(orient="records")
json_data[:500]

'[{"eadid":"Coll-1310","description_id":4699,"field":"Biographical \\/ Historical","description":"Labelled Apparently some chapters, amounting to about 160 printed pages, of a general book on education projected by my Father around 1930. KEEP, Essays on Teaching, HT 1963, by Hector.","prediction":"None"},{"eadid":"Coll-1427","description_id":8942,"field":"Biographical \\/ Historical","description":"James Aikman of Perth signed his name to a volume - a manuscript music book - containing music for 74'

In [62]:
# The with-statement closes the file on exit, so no explicit close() is needed.
# A dedicated handle name keeps `f` (the predictions CSV path) intact.
json_path = output_dir + "so_doc_clf_preds.json"
with open(json_path, "w", encoding="utf-8") as json_file:
    json_file.write(json_data)
print("File written!")

File written!


Count how often each label was predicted across documents (descriptions):

In [6]:
# Count each predicted label and lay the counts out as a single row.
# The row name is set on the Series itself because the name pandas gives the
# value_counts() result depends on the version (the column name before 2.0,
# "count" from 2.0 on), which makes a rename keyed on pred_col unreliable.
label_df = (
    pred_df[pred_col]
    .value_counts()
    .rename_axis(None)  # drop the axis name newer pandas attaches to the labels
    .rename("Total Descriptions")
    .to_frame()
    .T
)
label_df

Unnamed: 0,None,Omission,Stereotype
Total Descriptions,24346,2809,1339


In [7]:
def getLabelCountsPerField(df, pred_col, field):
    """Count how often each label occurs in ``df[pred_col]``, as a one-row table.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with the label column ``pred_col`` and, when ``field`` is given,
        a ``"field"`` column to filter on.
    pred_col : str
        Name of the column holding one label per row.
    field : str or None
        Value of the ``"field"`` column to restrict the count to, or ``None``
        to count over every row.

    Returns
    -------
    pandas.DataFrame
        A single row named ``"Total Descriptions"`` (when ``field`` is
        ``None``) or named after ``field``, with one column per label that
        occurs in the selected rows. Labels that do not occur get no column.
    """
    if field is None:
        row_name = "Total Descriptions"
        labels = df[pred_col]
    else:
        row_name = field
        labels = df.loc[df["field"] == field, pred_col]
    # Name the counts explicitly: the name pandas assigns to the result of
    # value_counts() differs between versions (column name before 2.0,
    # "count" from 2.0 on), so renaming by pred_col would silently do nothing.
    counts = labels.value_counts().rename_axis(None).rename(row_name)
    return counts.to_frame().T

In [27]:
# One row of label counts for the whole dataset, followed by one row per field.
# The rows are collected in a list and concatenated once rather than growing
# the frame inside the loop.
fields = pred_df["field"].unique()
count_rows = [getLabelCountsPerField(pred_df, pred_col, None)]
count_rows += [getLabelCountsPerField(pred_df, pred_col, field_name) for field_name in fields]
label_df = (
    pd.concat(count_rows)  # labels missing from a field become NaN
    .reset_index()
    .rename(columns={"index": "Field"})
)
label_df

Unnamed: 0,Field,None,Omission,Stereotype
0,Total Descriptions,24346,2809.0,1339.0
1,Biographical / Historical,312,348.0,149.0
2,Title,14037,734.0,353.0
3,Scope and Contents,9694,1727.0,837.0
4,Processing Information,303,,


Calculate the percentages:

In [28]:
# Share of each label within its row, in percent, as a list of rows.
# Column 0 is "Field"; every remaining column holds the counts of one label.
label_counts = label_df.iloc[:, 1:]
# Row totals are computed once; NaN counts (label absent from a field) are
# skipped by sum() and stay NaN in the resulting percentages.
row_totals = label_counts.sum(axis=1)
df_percentages = (label_counts.div(row_totals, axis=0) * 100).values.tolist()

In [29]:
# Percentage table with the same layout as the count table. The label names
# are taken from label_df instead of being hard-coded, so the columns stay
# correctly labelled even if the labels appear in a different order.
label_names = list(label_df.columns[1:])
label_df2 = pd.DataFrame(df_percentages, columns=label_names)
label_df2.insert(0, "Field", label_df["Field"])
label_df2

Unnamed: 0,Field,None,Omission,Stereotype
0,Total Descriptions,85.442549,9.858216,4.699235
1,Biographical / Historical,38.566131,43.016069,18.4178
2,Title,92.812748,4.853213,2.334039
3,Scope and Contents,79.083048,14.088758,6.828194
4,Processing Information,100.0,,


In [30]:
# Mark what kind of value each table holds (the scalar is broadcast to every
# row), then stack the counts on top of the percentages.
label_df.insert(1, "Metric", "Count")
label_df2.insert(1, "Metric", "Percentage")
label_df = pd.concat([label_df, label_df2])
label_df

Unnamed: 0,Field,Metric,None,Omission,Stereotype
0,Total Descriptions,Count,24346.0,2809.0,1339.0
1,Biographical / Historical,Count,312.0,348.0,149.0
2,Title,Count,14037.0,734.0,353.0
3,Scope and Contents,Count,9694.0,1727.0,837.0
4,Processing Information,Count,303.0,,
0,Total Descriptions,Percentage,85.442549,9.858216,4.699235
1,Biographical / Historical,Percentage,38.566131,43.016069,18.4178
2,Title,Percentage,92.812748,4.853213,2.334039
3,Scope and Contents,Percentage,79.083048,14.088758,6.828194
4,Processing Information,Percentage,100.0,,


In [38]:
# Put the Count and Percentage rows of each field next to each other under a
# (Field, original row number) index. Sorting and indexing explicitly gives the
# same layout as grouping by "Field" with an identity apply, without relying on
# groupby.apply behaviour that differs between pandas versions.
label_df = (
    label_df
    .sort_values(["Field", "Metric"])  # fields alphabetical, Count before Percentage
    .set_index("Field", append=True)   # index becomes (row number, Field)
    .swaplevel()                       # ... and then (Field, row number)
)
label_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Metric,None,Omission,Stereotype
Field,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Biographical / Historical,1,Count,312.0,348.0,149.0
Biographical / Historical,1,Percentage,38.566131,43.016069,18.4178
Processing Information,4,Count,303.0,,
Processing Information,4,Percentage,100.0,,
Scope and Contents,3,Count,9694.0,1727.0,837.0
Scope and Contents,3,Percentage,79.083048,14.088758,6.828194
Title,2,Count,14037.0,734.0,353.0
Title,2,Percentage,92.812748,4.853213,2.334039
Total Descriptions,0,Count,24346.0,2809.0,1339.0
Total Descriptions,0,Percentage,85.442549,9.858216,4.699235


Save the table of label counts and percentages as CSV:

In [42]:
# Write the table, index levels included, to the experiment's output directory
label_df.to_csv(f"{output_dir}doc_counts_percentages_pred_label.csv")