# Analysis: Model Output from Experiments

Load libraries:

In [1]:
import utils
import pandas as pd
import numpy as np
import re
from pathlib import Path

## Description Stereotype and Omission Labels

Load the dataset of predictions from the final model of Experiment 1, the document classifier assigning `Stereotype` and `Omission` labels:

In [2]:
exp = 2 # 1, 3    # Experiment number

In [5]:
output_dir = "../data/token_clf_data/experiment{n}/5fold/output/".format(n=exp)
# f = output_dir+"aggregated_final_validate_predictions_docclf_sgd-svm_so_ALLDATA.csv"  # From 60-20-20 split
f = output_dir+"aggregated_final_predictions_docclf_sgd-svm_so.csv"                     # From 5-fold cross validation
df = pd.read_csv(f, index_col=0)
df.head()

Unnamed: 0,description_id,start_offset,end_offset,field,description,manual_label,fold,doc_ling_pred,sgd-svm_label,eadid
6,3027,627,1162,Biographical / Historical,Thomas Young was probably born in 1725. By the...,['Stereotype'],split4,"['Generalization', 'Gendered-Pronoun']",['Stereotype'],Coll-1253
7,3397,8095,8334,Biographical / Historical,Andrew Tait worked on Paramecium in Beale's la...,['Omission'],split4,['Gendered-Pronoun'],['Omission'],Coll-1255
10,4736,9951,10026,Biographical / Historical,Delivered by Thomson to teachers in Darlington.,['Omission'],split4,[''],['Omission'],Coll-1310
14,4712,4199,4485,Biographical / Historical,"This was gifted by Thomson to his secretary, M...",['Omission'],split4,['Gendered-Pronoun'],['Omission'],Coll-1310
22,15684,845,1179,Biographical / Historical,Catherine Robina Borland was responsible for t...,[''],split4,"['Generalization', 'Gendered-Pronoun']",[''],Coll-1453


In [6]:
if exp == 3:
    pred_col = "predicted_label" 
    exp_col = "expected_label" 
    df = utils.getColumnValuesAsLists(df, pred_col)
    df = utils.getColumnValuesAsLists(df, exp_col)
    df = utils.getColumnValuesAsLists(df, "pers_o_pred")
elif exp == 2:
    pred_col = "sgd-svm_label"
    exp_col = "manual_label"
    df = utils.getColumnValuesAsLists(df, pred_col)
    df = utils.getColumnValuesAsLists(df, exp_col)
    df = utils.getColumnValuesAsLists(df, "doc_ling_pred")
else:
    pred_col = "sgd-svm_label"
    exp_col = "manual_label"
    df = utils.getColumnValuesAsLists(df, pred_col)
    df = utils.getColumnValuesAsLists(df, exp_col)
    df = utils.getColumnValuesAsLists(df, "doc_ling_pred")
    df = utils.getColumnValuesAsLists(df, "doc_pers_o_pred")

In [7]:
preds = list(df[pred_col])
preds[:5]

[['Stereotype'], ['Omission'], ['Omission'], ['Omission'], ['']]

In [8]:
pred_df = df[["description_id", "start_offset", "end_offset", "field", "description", pred_col]]
pred_df = pred_df.explode([pred_col])
pred_df[pred_col] = pred_df[pred_col].replace(to_replace="", value="None")
pred_df.head()

Unnamed: 0,description_id,start_offset,end_offset,field,description,sgd-svm_label
6,3027,627,1162,Biographical / Historical,Thomas Young was probably born in 1725. By the...,Stereotype
7,3397,8095,8334,Biographical / Historical,Andrew Tait worked on Paramecium in Beale's la...,Omission
10,4736,9951,10026,Biographical / Historical,Delivered by Thomson to teachers in Darlington.,Omission
14,4712,4199,4485,Biographical / Historical,"This was gifted by Thomson to his secretary, M...",Omission
22,15684,845,1179,Biographical / Historical,Catherine Robina Borland was responsible for t...,


Join the EADID (fonds, or collection, identifier) to the data:

In [9]:
ann_df = pd.read_csv("../data/crc_metadata/annot_descs.csv", usecols=["description_id", "file"])
eadid_col = list(ann_df["file"])
ann_df = ann_df.drop(columns=["file"])
eadid_col = [filename.split("_")[0] for filename in eadid_col]
ann_df.insert(1, "eadid", eadid_col)
ann_df.head()

Unnamed: 0,description_id,eadid
0,0,AA5
1,1,AA5
2,2,AA5
3,3,AA5
4,4,AA6


In [10]:
print(pred_df.shape)
pred_df = pred_df.join(ann_df.set_index("description_id"), on="description_id")
print(pred_df.shape)
pred_df.head()

(28319, 6)
(28319, 7)


Unnamed: 0,description_id,start_offset,end_offset,field,description,sgd-svm_label,eadid
6,3027,627,1162,Biographical / Historical,Thomas Young was probably born in 1725. By the...,Stereotype,Coll-1253
7,3397,8095,8334,Biographical / Historical,Andrew Tait worked on Paramecium in Beale's la...,Omission,Coll-1255
10,4736,9951,10026,Biographical / Historical,Delivered by Thomson to teachers in Darlington.,Omission,Coll-1310
14,4712,4199,4485,Biographical / Historical,"This was gifted by Thomson to his secretary, M...",Omission,Coll-1310
22,15684,845,1179,Biographical / Historical,Catherine Robina Borland was responsible for t...,,Coll-1453


Save the data as JSON:

In [11]:
# Remove offset columns and create an index without duplicate values
pred_df = pred_df[["eadid", "description_id", "field", "description", pred_col]].reset_index().drop(columns=["index"])
pred_df = pred_df.rename(columns={pred_col:"prediction"})
pred_df.head()

Unnamed: 0,eadid,description_id,field,description,prediction
0,Coll-1253,3027,Biographical / Historical,Thomas Young was probably born in 1725. By the...,Stereotype
1,Coll-1255,3397,Biographical / Historical,Andrew Tait worked on Paramecium in Beale's la...,Omission
2,Coll-1310,4736,Biographical / Historical,Delivered by Thomson to teachers in Darlington.,Omission
3,Coll-1310,4712,Biographical / Historical,"This was gifted by Thomson to his secretary, M...",Omission
4,Coll-1453,15684,Biographical / Historical,Catherine Robina Borland was responsible for t...,


In [10]:
json_data = pred_df.to_json(orient="records")
json_data[0:500]

'[{"eadid":"Coll-1253","description_id":3027,"field":"Biographical \\/ Historical","description":"Thomas Young was probably born in 1725. By the summer of 1751, Young was practicing surgery in Edinburgh. In 1756 he was appointed Professor of Midwifery at Edinburgh University - only the third holder of that Chair, after Robert Smith and Joseph Gibson, the first holder. Young was the first Professor of Midwifery at Edinburgh to actually lecture on the subject of obstetrics. He also had an associatio'

In [11]:
with open(output_dir+"so_doc_clf_preds.json", "w") as f:
    f.write(json_data)
    f.close
print("File written!")

File written!


Sum the labels across documents (descriptions):

In [12]:
pred_col = "prediction"
label_df = pd.DataFrame(pred_df[pred_col].value_counts())
label_df = label_df.rename(columns={pred_col:"Total Descriptions"})
label_df = label_df.T
label_df

Unnamed: 0,None,Omission,Stereotype
Total Descriptions,24388,2578,1321


In [13]:
def getLabelCountsPerField(df, pred_col, field):
    if field == None:
        label_df = pd.DataFrame(df[pred_col].value_counts())
        label_df = label_df.rename(columns={pred_col:"Total Descriptions"})
    else:
        subdf = df.loc[df["field"] == field]
        label_df = pd.DataFrame(subdf[pred_col].value_counts())
        label_df = label_df.rename(columns={pred_col:field})
    label_df = label_df.T
    return label_df

In [14]:
label_df = getLabelCountsPerField(pred_df, pred_col, None)
fields = pred_df.field.unique()
for field in fields:
    label_df = pd.concat([label_df, getLabelCountsPerField(pred_df, pred_col, field)])
label_df = label_df.reset_index()
label_df = label_df.rename(columns={"index":"Field"})
label_df = label_df.fillna(0)
label_df

Unnamed: 0,Field,None,Omission,Stereotype
0,Total Descriptions,24388,2578.0,1321.0
1,Biographical / Historical,311,317.0,162.0
2,Title,14062,689.0,347.0
3,Scope and Contents,9712,1572.0,812.0
4,Processing Information,303,0.0,0.0


Calculate the proportions:

In [15]:
df_percentages = []
for row in range(label_df.shape[0]):
    row_percentages = []
    for col in range(1,len(label_df.columns)):
        row_percentages += [label_df.iloc[row,col]/label_df.iloc[row,1:].sum()]
    df_percentages += [row_percentages]
# df_percentages

In [16]:
label_df2 = pd.concat([label_df["Field"], pd.DataFrame(df_percentages).reset_index()], axis=1)
label_df2 = label_df2.drop(columns=["index"])
label_df2 = label_df2.rename(columns={0:"None", 1:"Omission", 2:"Stereotype"})
label_df2 = label_df2.fillna(0)
label_df2

Unnamed: 0,Field,None,Omission,Stereotype
0,Total Descriptions,0.862163,0.091137,0.0467
1,Biographical / Historical,0.393671,0.401266,0.205063
2,Title,0.931382,0.045635,0.022983
3,Scope and Contents,0.80291,0.12996,0.06713
4,Processing Information,1.0,0.0,0.0


In [17]:
label_df.insert(1, "Metric", (["Count"]*label_df.shape[0]))
label_df2.insert(1, "Metric", (["Proportion"]*label_df2.shape[0]))
label_df = pd.concat([label_df, label_df2])
label_df = label_df.fillna(0)
label_df

Unnamed: 0,Field,Metric,None,Omission,Stereotype
0,Total Descriptions,Count,24388.0,2578.0,1321.0
1,Biographical / Historical,Count,311.0,317.0,162.0
2,Title,Count,14062.0,689.0,347.0
3,Scope and Contents,Count,9712.0,1572.0,812.0
4,Processing Information,Count,303.0,0.0,0.0
0,Total Descriptions,Proportion,0.862163,0.091137,0.0467
1,Biographical / Historical,Proportion,0.393671,0.401266,0.205063
2,Title,Proportion,0.931382,0.045635,0.022983
3,Scope and Contents,Proportion,0.80291,0.12996,0.06713
4,Processing Information,Proportion,1.0,0.0,0.0


In [18]:
label_df = label_df.groupby(by="Field", group_keys=True).apply(lambda x: x).drop(columns=["Field"])
label_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Metric,None,Omission,Stereotype
Field,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Biographical / Historical,1,Count,311.0,317.0,162.0
Biographical / Historical,1,Proportion,0.393671,0.401266,0.205063
Processing Information,4,Count,303.0,0.0,0.0
Processing Information,4,Proportion,1.0,0.0,0.0
Scope and Contents,3,Count,9712.0,1572.0,812.0
Scope and Contents,3,Proportion,0.80291,0.12996,0.06713
Title,2,Count,14062.0,689.0,347.0
Title,2,Proportion,0.931382,0.045635,0.022983
Total Descriptions,0,Count,24388.0,2578.0,1321.0
Total Descriptions,0,Proportion,0.862163,0.091137,0.0467


Save the data:

In [19]:
analysis_dir = "../data/token_clf_data/experiment{n}/5fold/analysis/".format(n=exp)
Path(analysis_dir).mkdir(parents=True, exist_ok=True)  # For predictions
label_df.to_csv(analysis_dir+"doc_counts_proportions_pred_label.csv")

Update the document classifier output data with the EADID column:

In [20]:
ann_df = pd.read_csv("../data/crc_metadata/annot_descs.csv", usecols=["description_id", "file"])
eadid_col = list(ann_df["file"])
ann_df = ann_df.drop(columns=["file"])
eadid_col = [filename.split("_")[0] for filename in eadid_col]
ann_df.insert(1, "eadid", eadid_col)
df = df.join(ann_df.set_index("description_id"), on="description_id")
df.head()

Unnamed: 0,description_id,start_offset,end_offset,field,description,expected_label,fold,pers_o_pred,predicted_label,eadid
6,3027,627,1162,Biographical / Historical,Thomas Young was probably born in 1725. By the...,[Stereotype],split4,"[Occupation, Unknown, Masculine]",[Stereotype],Coll-1253
7,3397,8095,8334,Biographical / Historical,Andrew Tait worked on Paramecium in Beale's la...,[Omission],split4,"[Unknown, Occupation, Masculine]",[Omission],Coll-1255
10,4736,9951,10026,Biographical / Historical,Delivered by Thomson to teachers in Darlington.,[Omission],split4,"[Unknown, Occupation]",[Omission],Coll-1310
14,4712,4199,4485,Biographical / Historical,"This was gifted by Thomson to his secretary, M...",[Omission],split4,"[Unknown, Occupation, Feminine]",[Omission],Coll-1310
22,15684,845,1179,Biographical / Historical,Catherine Robina Borland was responsible for t...,[],split4,[Occupation],[],Coll-1453


In [21]:
f = output_dir+"aggregated_final_predictions_docclf_sgd-svm_so.csv"
df.to_csv(f)

### Join Labeled Descriptions to Additional Catalog Metadata

Examine specific labels more closely, looking at **experiment 2's classification of descriptions with Omission** (as this model cascade performed best on that label) and **experiment 3's classification of descriptions with Stereotype** (as this model cascade performed best on that label):

In [22]:
label = "Stereotype" #"Omission"
pred_col = "predicted_label" #"sgd-svm_label"

In [23]:
df_exploded = df.explode([pred_col])
df_exploded.head()

Unnamed: 0,description_id,start_offset,end_offset,field,description,expected_label,fold,pers_o_pred,predicted_label,eadid
6,3027,627,1162,Biographical / Historical,Thomas Young was probably born in 1725. By the...,[Stereotype],split4,"[Occupation, Unknown, Masculine]",Stereotype,Coll-1253
7,3397,8095,8334,Biographical / Historical,Andrew Tait worked on Paramecium in Beale's la...,[Omission],split4,"[Unknown, Occupation, Masculine]",Omission,Coll-1255
10,4736,9951,10026,Biographical / Historical,Delivered by Thomson to teachers in Darlington.,[Omission],split4,"[Unknown, Occupation]",Omission,Coll-1310
14,4712,4199,4485,Biographical / Historical,"This was gifted by Thomson to his secretary, M...",[Omission],split4,"[Unknown, Occupation, Feminine]",Omission,Coll-1310
22,15684,845,1179,Biographical / Historical,Catherine Robina Borland was responsible for t...,[],split4,[Occupation],,Coll-1453


In [24]:
subdf = df_exploded.loc[df_exploded[pred_col] == label] # Only predictions of specified label
print(subdf.shape)
subdf.head()

(1321, 10)


Unnamed: 0,description_id,start_offset,end_offset,field,description,expected_label,fold,pers_o_pred,predicted_label,eadid
6,3027,627,1162,Biographical / Historical,Thomas Young was probably born in 1725. By the...,[Stereotype],split4,"[Occupation, Unknown, Masculine]",Stereotype,Coll-1253
58,2135,490,1643,Biographical / Historical,Edward Adrian Wilson (Wilson of the Antarctic)...,[],split4,"[Unknown, Feminine, Occupation, Masculine]",Stereotype,Coll-1087
71,5533,3379,6106,Biographical / Historical,Gerald Henry Elliot was born in Edinburgh on 2...,"[Omission, Stereotype]",split4,"[Unknown, Feminine, Occupation, Masculine]",Stereotype,Coll-1357
84,1174,2183,4687,Biographical / Historical,Eric Olof Lundholm was born in 1915 in Modderf...,"[Omission, Stereotype]",split4,"[Unknown, Occupation, Masculine]",Stereotype,Coll-1055
105,27353,468,1233,Biographical / Historical,Novelist Samuel Rutherford Crockett was born i...,[],split4,"[Unknown, Occupation, Masculine]",Stereotype,Coll-1475


In [25]:
df_eadid_counts = pd.DataFrame(subdf.eadid.value_counts()).reset_index()
df_eadid_counts = df_eadid_counts.rename(columns={"index":"eadid","eadid":"desc_count"})
df_eadid_counts = df_eadid_counts.sort_values(by=["desc_count"], ascending=False)
df_eadid_counts.head()

Unnamed: 0,eadid,desc_count
0,Coll-1434,820
1,Coll-146,301
2,BAI,23
3,Coll-1057,9
4,Coll-1362,4


Join the other metadata from the catalog to the eadid (fonds, or collection) data:

In [26]:
meta_df = pd.read_csv("../data/crc_metadata/CRC_units-grouped-by-fonds_clean.csv", index_col=0)
df_joined = df_eadid_counts.join(meta_df.set_index("eadid"), on=["eadid"], how="outer")
df_joined = df_joined.rename(columns={"desc_count":"{label}_exp{n}_desc_count".format(label=label.lower(), n=exp)})
df_joined.head()

Unnamed: 0,eadid,stereotype_exp3_desc_count,omission_exp2_desc_count,unit_title,unit_identifier,unique_language,unique_date,unique_geography
0.0,Coll-1434,820.0,916.0,"['Roslin Slide Collection', 'Stack of Medical ...","['Coll-1434', 'Coll-1434/1', 'Coll-1434/2', 'C...",['English'],"['1870-01-01 - 1930-12-31', '1870-01-01 - 1930...","['Winnipeg (Manitoba', 'Tulliallan (Scotland)'..."
1.0,Coll-146,301.0,161.0,,,,,
2.0,BAI,23.0,13.0,"['Papers of Professor John Baillie, and Bailli...","['BAI', 'BAI 1', 'BAI 1/1', 'BAI 1/1/1', 'BAI ...","['English', 'Greek', 'Hebrew', 'German']","['1880-01-01 - 2003-12-31', '1900-01-01 - 1964...","['Argentina', 'Dunedin (New Zealand)', 'New Ze..."
3.0,Coll-1057,9.0,32.0,"['Papers of Alan W. Greenwood', 'Academic cert...","['Coll-1057', 'Coll-1057/1', 'Coll-1057/1/1', ...","['English', 'Russian', 'French']","['1889-01-01 - 1943-12-31', '1916-01-01 - 1931...","['Edinburgh (Scotland)', 'Wye Mills Talbot Cou..."
4.0,Coll-1362,4.0,2.0,"['Roslin Institute Offprint Collection', 'Anim...","['Coll-1362', 'Coll-1362/1', 'Coll-1362/1/1', ...",['English'],"['1947-01-01 - 2007-12-31', '1947-01-01 - 1985...","['New Zealand', 'Mexico', 'Great Britain', 'Sc..."


Note any missing data:

In [27]:
df_joined.loc[df_joined.unit_title.isna()]

Unnamed: 0,eadid,stereotype_exp3_desc_count,omission_exp2_desc_count,unit_title,unit_identifier,unique_language,unique_date,unique_geography
1.0,Coll-146,301.0,161.0,,,,,
118.0,AA5,1.0,,,,,,
97.0,Coll-1320,1.0,3.0,,,,,
156.0,Coll-1176,1.0,2.0,,,,,
132.0,Coll-1492,1.0,2.0,,,,,
80.0,Coll-1307,1.0,,,,,,
85.0,AA7,1.0,1.0,,,,,
15.0,Coll-1022,1.0,4.0,,,,,
70.0,AA6,1.0,,,,,,
,Coll-1266,,19.0,,,,,


Save the metadata with this additional column:

In [28]:
df_joined.to_csv("../data/crc_metadata/CRC_units-grouped-by-fonds_clean.csv")

Analyze the results:

In [29]:
unique_fonds = list(df_joined.eadid.unique())
print(len(unique_fonds))

848


In [31]:
# Top Stereotype collections
top_st = df_joined.sort_values(by=["stereotype_exp3_desc_count"], ascending=False).head(50)
top_st.head()

Unnamed: 0,eadid,stereotype_exp3_desc_count,omission_exp2_desc_count,unit_title,unit_identifier,unique_language,unique_date,unique_geography
0.0,Coll-1434,820.0,916.0,"['Roslin Slide Collection', 'Stack of Medical ...","['Coll-1434', 'Coll-1434/1', 'Coll-1434/2', 'C...",['English'],"['1870-01-01 - 1930-12-31', '1870-01-01 - 1930...","['Winnipeg (Manitoba', 'Tulliallan (Scotland)'..."
1.0,Coll-146,301.0,161.0,,,,,
2.0,BAI,23.0,13.0,"['Papers of Professor John Baillie, and Bailli...","['BAI', 'BAI 1', 'BAI 1/1', 'BAI 1/1/1', 'BAI ...","['English', 'Greek', 'Hebrew', 'German']","['1880-01-01 - 2003-12-31', '1900-01-01 - 1964...","['Argentina', 'Dunedin (New Zealand)', 'New Ze..."
3.0,Coll-1057,9.0,32.0,"['Papers of Alan W. Greenwood', 'Academic cert...","['Coll-1057', 'Coll-1057/1', 'Coll-1057/1/1', ...","['English', 'Russian', 'French']","['1889-01-01 - 1943-12-31', '1916-01-01 - 1931...","['Edinburgh (Scotland)', 'Wye Mills Talbot Cou..."
4.0,Coll-1362,4.0,2.0,"['Roslin Institute Offprint Collection', 'Anim...","['Coll-1362', 'Coll-1362/1', 'Coll-1362/1/1', ...",['English'],"['1947-01-01 - 2007-12-31', '1947-01-01 - 1985...","['New Zealand', 'Mexico', 'Great Britain', 'Sc..."


In [32]:
# Top Omission collections
top_om = df_joined.sort_values(by=["omission_exp2_desc_count"], ascending=False).head(50)
top_om.head()

Unnamed: 0,eadid,stereotype_exp3_desc_count,omission_exp2_desc_count,unit_title,unit_identifier,unique_language,unique_date,unique_geography
0.0,Coll-1434,820.0,916.0,"['Roslin Slide Collection', 'Stack of Medical ...","['Coll-1434', 'Coll-1434/1', 'Coll-1434/2', 'C...",['English'],"['1870-01-01 - 1930-12-31', '1870-01-01 - 1930...","['Winnipeg (Manitoba', 'Tulliallan (Scotland)'..."
9.0,Coll-1310,2.0,623.0,"['Papers of Godfrey H. Thomson', 'Biographical...","['Coll-1310', 'Coll-1310-1', 'Coll-1310/1/1', ...","['French', 'English', 'German', 'Swedish', 'Po...","['1890-01-01 - 1978-12-31', '1890-01-01 - 1978...","['St Andrews (Scotland)', 'Falkirk Scotland', ..."
115.0,Coll-1064,1.0,239.0,"['Papers of Professor Walter Ledermann', '1 (3...","['Coll-1064', 'Coll-1064/1', 'Coll-1064/2', 'C...",['English'],"['1937-01-01 - 1954-12-31', '1937-02-02 - 1938...","['Edinburgh (Scotland)', 'St Andrews (Scotland..."
1.0,Coll-146,301.0,161.0,,,,,
,Coll-1255,,143.0,"['Papers of Geoffrey Beale, founder of malaria...","['Coll-1255', 'Coll-1255/1', 'Coll-1255/2', 'C...","['English', 'Russian', 'German', 'Italian', 'F...","['1876-01-01 - 2003-12-31', '1947-01-01 - 2002...",['']


Save the top 50 rows for each label's description count:

In [33]:
top_st.to_csv("../data/crc_metadata/top50_stereotype_desc_count.csv")
top_om.to_csv("../data/crc_metadata/top50_omission_desc_count.csv")

Get and export the descriptions only from the **top 3** Omission- and Stereotype-labeled fonds:

In [15]:
exp = 3

In [16]:
output_dir = "../data/token_clf_data/experiment{n}/5fold/output/".format(n=exp)
f = output_dir+"aggregated_final_predictions_docclf_sgd-svm_so.csv"                     # From 5-fold cross validation
df = pd.read_csv(f, index_col=0)
df.head()

Unnamed: 0,description_id,start_offset,end_offset,field,description,expected_label,fold,pers_o_pred,predicted_label,eadid
6,3027,627,1162,Biographical / Historical,Thomas Young was probably born in 1725. By the...,['Stereotype'],split4,"['Occupation', 'Unknown', 'Masculine']",['Stereotype'],Coll-1253
7,3397,8095,8334,Biographical / Historical,Andrew Tait worked on Paramecium in Beale's la...,['Omission'],split4,"['Unknown', 'Occupation', 'Masculine']",['Omission'],Coll-1255
10,4736,9951,10026,Biographical / Historical,Delivered by Thomson to teachers in Darlington.,['Omission'],split4,"['Unknown', 'Occupation']",['Omission'],Coll-1310
14,4712,4199,4485,Biographical / Historical,"This was gifted by Thomson to his secretary, M...",['Omission'],split4,"['Unknown', 'Occupation', 'Feminine']",['Omission'],Coll-1310
22,15684,845,1179,Biographical / Historical,Catherine Robina Borland was responsible for t...,[''],split4,['Occupation'],[''],Coll-1453


In [17]:
top_st = pd.read_csv("../data/crc_metadata/top50_stereotype_desc_count.csv", index_col=0)
top_st = top_st.sort_values(by="stereotype_exp3_desc_count", ascending=False)
top_st_eadids = list(top_st.eadid)[:3]
df = df.loc[df.eadid.isin(top_st_eadids)]
# Only keep the rows with a label
df = utils.getColumnValuesAsLists(df, "predicted_label")
df_exploded = df.explode(["predicted_label"])
subdf = df_exploded.loc[df_exploded["predicted_label"] == "Stereotype"]
print(subdf.shape)
subdf.head()

(1144, 10)


Unnamed: 0,description_id,start_offset,end_offset,field,description,expected_label,fold,pers_o_pred,predicted_label,eadid
128,720,136,3561,Biographical / Historical,"John Baillie was born in 1886, the son of Rev ...","['Omission', 'Stereotype']",split4,"['Unknown', 'Feminine', 'Occupation', 'Masculi...",Stereotype,BAI
382,733,6804,7584,Biographical / Historical,"In the couse of his life, John Baillie acquire...","['Omission', 'Stereotype']",split4,['Unknown'],Stereotype,BAI
466,21619,1346,1430,Title,Close-up photograph of Arthur Koestler and a m...,['Stereotype'],split4,['O'],Stereotype,Coll-146
534,21359,1840,1887,Title,Photograph of a young girl in a park ::,['Stereotype'],split4,['O'],Stereotype,Coll-146
567,22001,1788,1820,Title,Mural with four women ::,['Stereotype'],split4,['O'],Stereotype,Coll-146


Save the data sample:

In [18]:
subdf.to_csv("../data/crc_metadata/stereotype_descs_from_top3_labeled_fonds.csv")

In [19]:
exp = 2

In [20]:
output_dir = "../data/token_clf_data/experiment{n}/5fold/output/".format(n=exp)
f = output_dir+"aggregated_final_predictions_docclf_sgd-svm_so.csv"                     # From 5-fold cross validation
df = pd.read_csv(f, index_col=0)
df.head()

Unnamed: 0,description_id,start_offset,end_offset,field,description,manual_label,fold,doc_ling_pred,sgd-svm_label,eadid
6,3027,627,1162,Biographical / Historical,Thomas Young was probably born in 1725. By the...,['Stereotype'],split4,"['Generalization', 'Gendered-Pronoun']",['Stereotype'],Coll-1253
7,3397,8095,8334,Biographical / Historical,Andrew Tait worked on Paramecium in Beale's la...,['Omission'],split4,['Gendered-Pronoun'],['Omission'],Coll-1255
10,4736,9951,10026,Biographical / Historical,Delivered by Thomson to teachers in Darlington.,['Omission'],split4,[''],['Omission'],Coll-1310
14,4712,4199,4485,Biographical / Historical,"This was gifted by Thomson to his secretary, M...",['Omission'],split4,['Gendered-Pronoun'],['Omission'],Coll-1310
22,15684,845,1179,Biographical / Historical,Catherine Robina Borland was responsible for t...,[''],split4,"['Generalization', 'Gendered-Pronoun']",[''],Coll-1453


In [21]:
top_om = pd.read_csv("../data/crc_metadata/top50_omission_desc_count.csv", index_col=0)
top_om = top_om.sort_values(by="omission_exp2_desc_count", ascending=False)
top_om_eadids = list(top_st.eadid)[:3]
df = df.loc[df.eadid.isin(top_om_eadids)]
# Only keep the rows with a label
df = utils.getColumnValuesAsLists(df, "sgd-svm_label")
df_exploded = df.explode(["sgd-svm_label"])
subdf = df_exploded.loc[df_exploded["sgd-svm_label"] == "Omission"]
print(subdf.shape)
subdf.head()

(1090, 10)


Unnamed: 0,description_id,start_offset,end_offset,field,description,manual_label,fold,doc_ling_pred,sgd-svm_label,eadid
128,720,136,3561,Biographical / Historical,"John Baillie was born in 1886, the son of Rev ...","['Omission', 'Stereotype']",split4,"['Gendered-Role', 'Generalization', 'Gendered-...",Omission,BAI
487,9113,378,435,Title,"Cows of the Buckhold Herd, owned by Dr. H. Watney",[''],split4,[''],Omission,Coll-1434
567,22001,1788,1820,Title,Mural with four women ::,['Stereotype'],split4,['Gendered-Role'],Omission,Coll-146
795,19093,1349,1409,Title,Kolomea Juni 1941 - Februar 1943 :: [Weinleber...,['Omission'],split4,['Gendered-Role'],Omission,Coll-146
815,11913,745,798,Title,How Mr. Rhodes Liked to Travel in the Country,['Omission'],split4,['Gendered-Role'],Omission,Coll-1434


Save the data sample:

In [22]:
subdf.to_csv("../data/crc_metadata/omission_descs_from_top3_labeled_fonds.csv")

### Analysis By Metadata Field

In [49]:
top_df = top_om
# Remove NaN
top_df = top_df.dropna()

In [50]:
cols = "unit_title", "unit_identifier", "unique_language", "unique_date", "unique_geography"
for col_name in cols:
    top_df = utils.getColumnValuesAsLists(top_df, col_name)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col_name] = col_list_values


In [58]:
titles = list(top_df.unit_title)
fonds_titles = [title_list[0] for title_list in titles]
print(fonds_titles)

['Roslin Slide Collection', 'Papers of Godfrey H. Thomson', 'Papers of Professor Walter Ledermann', 'Papers of Richard Alan Beatty', 'Collection of Ian R. Grant', 'Papers of Alan W. Greenwood', 'Papers of Marjory Kennedy-Fraser', 'Papers of Sir John Jackson', 'Papers of Professor John Baillie', 'Material relating to Albert E. G. Pilliner', 'Patrick Geddes Collection', 'Letter from Mendelssohn-Bartholdy to J. Thomson', 'George Baillie Collection of Music Manuscripts', 'Letters of Frederick Sleigh Roberts', 'Papers and other material relating to C. Wyville Thomson', 'Letters of W. S. Morrison to his brother', 'Letter', 'Material relating to Helen Millar Lowe', 'Material relating to Tours of Duty with Colonial Medical Service', 'The staggering state of the Scottish statesmen', 'Material relating to Hugh Marwick', 'Letters of Ralph Erskine', 'Letter', 'Papers of Andrew Nisbet Bogle', 'Notes of lectures given by William Saunders', 'Papers of Lewis T. Waters', 'Research writings of Dr. Ewart

In [53]:
languages = list(top_df.unique_language)
# # Flatten the list
flat_languages = []
for language_list in languages:
    if type(language_list) == list:
        for lang in language_list:
            flat_languages += [lang]
unique_languages = list(set(flat_languages))
print(len(unique_languages))
print(unique_languages)

24
['Spanish', 'English', 'Burmese', 'Hebrew', 'Dutch', 'Flemish', 'Czech', 'German', 'Scots Dialect', 'Scots dialect', 'French', 'English Gaelic', 'Gaelic Scottish Gaelic', 'Swedish', 'Latin', 'Irish Gaelic', 'Greek', 'Scottish Gaelic', 'Mostly English', 'Scots', 'Polish', 'Russian', 'Italian', 'Multiple languages']


In [57]:
geographies = list(top_df.unique_geography)
# # Flatten the list
flat_geographies = []
for geography_list in geographies:
    if type(geography_list) == list:
        for geography in geography_list:
            flat_geographies += [geography]
unique_geographies = list(set(flat_geographies))
print(len(unique_geographies))
print(unique_geographies)

426
['', 'Lancashire (England)', 'Karlsburg (Germany)', 'Manitoba Canada', 'Ranchi (India)', 'Burma -- Mandalay', 'British Columbia (Canada)', 'United States)', 'Guyana', 'Canada', 'Zanzibar (Tanzania)', 'North', 'Mez&#337;hegyes (Hungary)', 'Dartmoor (England)', 'Delhi (India)', 'Rio de la Plata River Valley (Argentina and Uruguay)', 'United States -- Connecticut', 'England -- Salford', 'Cyrenaica (Libya)', 'West', 'Bervie (Scotland)', 'Saskatchewan (Canada)', 'Shropshire (England)', 'Buenos Aires (Argentina)', 'Oxfordshire England', 'Omsk (Russia)', 'Wye Mills Talbot County', 'Buckinghamshire England', 'Scotland -- Kinross', 'Bulawayo (Zimbabwe)', 'Mount (New Zealand)', 'Sudan', 'Scotland -- Orkney', 'Cirencester West Gloucestershire England', 'Northwest Territories (Canada)', 'Winnipeg (Manitoba', 'Uganda', 'Crieff (Scotland)', 'Woburn (England)', 'Hisar (India)', 'Woking (England)', 'Ireland', 'Leeds (England)', 'Aberdeen Scotland', 'California (United States)', 'Edinburgh (Scotlan

In [59]:
top_df = top_st
# Remove NaN
top_df = top_df.dropna()

In [60]:
cols = "unit_title", "unit_identifier", "unique_language", "unique_date", "unique_geography"
for col_name in cols:
    top_df = utils.getColumnValuesAsLists(top_df, col_name)

In [61]:
titles = list(top_df.unit_title)
fonds_titles = [title_list[0] for title_list in titles]
print(fonds_titles)

['Roslin Slide Collection', 'Papers of Professor John Baillie', 'Papers of Alan W. Greenwood', 'Roslin Institute Offprint Collection', 'Papers of Marjory Kennedy-Fraser', 'Susan Binnie Anderson', 'Papers and other material relating to C. Wyville Thomson', 'Papers and artwork of Yolanda Sonnabend relating to her collaboration with C.H. Waddington', 'Collection relating to Winifred Rushforth', 'Papers of Sir Thomas Hudson Middleton', 'Papers of Godfrey H. Thomson', 'Papers of Allan Maconochie (Lord Meadowbank) - Legal material', 'Papers of Dr. J. M. Caborn', 'Collection of Ian R. Grant', 'Material relating to the winding up of the Edinburgh Association for the University Education of Women', 'George Baillie Collection of Music Manuscripts', 'Journal of a tour round the world taken by Sir John Fraser (1885-1947)', 'Letter from the University of Aberdeen to My Dear Sir', 'Research writings of Dr. Ewart Geoffrey Walsh', 'Notes of lectures given by William Saunders', 'Correspondence between 

In [62]:
languages = list(top_df.unique_language)
# # Flatten the list
flat_languages = []
for language_list in languages:
    if type(language_list) == list:
        for lang in language_list:
            flat_languages += [lang]
unique_languages = list(set(flat_languages))
print(len(unique_languages))
print(unique_languages)

23
['Spanish', 'Arabic', 'English', 'Burmese', 'Hebrew', 'Czech', 'German', 'Scots Dialect', 'Scots dialect', 'French', 'English Gaelic', 'Gaelic Scottish Gaelic', 'Swedish', 'Latin', 'Irish Gaelic', 'Scottish Gaelic', 'Greek', 'Mostly English', 'Scots', 'Russian', 'Polish', 'Italian', 'Multiple languages']


In [63]:
geographies = list(top_df.unique_geography)
# # Flatten the list
flat_geographies = []
for geography_list in geographies:
    if type(geography_list) == list:
        for geography in geography_list:
            flat_geographies += [geography]
unique_geographies = list(set(flat_geographies))
print(len(unique_geographies))
print(unique_geographies)

444
['', 'Lancashire (England)', 'Karlsburg (Germany)', 'Manitoba Canada', 'Ranchi (India)', 'Burma -- Mandalay', 'British Columbia (Canada)', 'United States)', 'Guyana', 'Canada', 'Zanzibar (Tanzania)', 'North', 'Mez&#337;hegyes (Hungary)', 'Dartmoor (England)', 'Delhi (India)', 'Rio de la Plata River Valley (Argentina and Uruguay)', 'United States -- Connecticut', 'England -- Salford', 'Cyrenaica (Libya)', 'West', 'Bervie (Scotland)', 'Saskatchewan (Canada)', 'Shropshire (England)', 'Buenos Aires (Argentina)', 'Oxfordshire England', 'Omsk (Russia)', 'Wye Mills Talbot County', 'Buckinghamshire England', 'Scotland -- Kinross', 'Bulawayo (Zimbabwe)', 'Arctic regions', 'Mount (New Zealand)', 'Sudan', 'Scotland -- Orkney', 'Cirencester West Gloucestershire England', 'Northwest Territories (Canada)', 'Winnipeg (Manitoba', 'Uganda', 'Crieff (Scotland)', 'Woburn (England)', 'Hisar (India)', 'Woking (England)', 'Ireland', 'Leeds (England)', 'Aberdeen Scotland', 'California (United States)', '