# Classification

## Baseline Omission and Stereotype Classifier

In [1]:
import config, clf_utils
import re
import random
import numpy as np
import pandas as pd
from pathlib import Path
import joblib
from joblib import load
from nltk.tokenize import word_tokenize
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.metrics import multilabel_confusion_matrix
from sklearn.metrics import precision_recall_fscore_support

[nltk_data] Error loading punkt: <urlopen error [Errno 8] nodename nor
[nltk_data]     servname provided, or not known>


### 1. Preprocessing

Load the extracted archival metadata from the Newcastle University Special Collections catalog.

In [3]:
# extracted_dir = "data/extracted/"
# doc_df = pd.read_csv(extracted_dir+"nusc_archival_descs_for_clf.csv", index_col=0)

manually_coded = config.coded_data_path1+"all_manually_coded_data.csv"
doc_df = pd.read_csv(manually_coded, index_col=0)
doc_df = doc_df.rename(columns={"gender_bias":"gender_bias_manual", "omission":"omission_manual", "stereotype":"stereotype_manual"})

doc_df.head()

Unnamed: 0_level_0,doc,gender_bias_manual,omission_manual,stereotype_manual,type,note,eadid,description_id,rowid,field
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
11452,Drafts and meeting notes relating to the creat...,n,n,n,,,CHE,11452,CHE/01/01,scopecontent
11467,CHE Supporters Review,n,n,n,,,CHE,11467,CHE/02/04,unittitle
11471,Friend Newcastle Annual Reports,n,n,n,,,CHE,11471,CHE/02/06,unittitle
11554,Collection of documents on sex education in sc...,n,n,n,,,CHE,11554,CHE/03/06/12,unittitle
11555,"Letters, newsletters and leaflets on the topic...",n,n,n,,,CHE,11555,CHE/03/06/13,unittitle


In [4]:
doc_df.tail()

Unnamed: 0_level_0,doc,gender_bias_manual,omission_manual,stereotype_manual,type,note,eadid,description_id,rowid,field
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
4861,“The Way to the Cathedral” I.,n,n,n,,,THS,32972,THS 61.1,unittitle
4862,“The Way to the Cathedral” II.,n,n,n,,,THS,32974,THS 61.2,scopecontent
4862,“The Way to the Cathedral” II.,n,n,n,,,THS,32975,THS 61.2,unittitle
4863,“Thomas Sharp – an appreciation” by Lewis Keeble.,n,n,n,,,THS,32934,THS 56.2,unittitle
4864,“Thomas Sharp – an appreciation” by Lewis Keeb...,n,n,n,,,THS,32933,THS 56.2,scopecontent


In [5]:
doc_df.shape  # All extracted data: (57947, 5) #complete Bell Archive metadata added about 25k more descriptions

(12337, 10)

There are 57,947 metadata descriptions extracted from the Newcastle University Special Collections archival catalog to classify for *Omission* and *Stereotype*.

There are 12,337 metadata descriptions that were manually coded for *Omission* and *Stereotype*.

### 2. Classification
Load the baseline Omission and Stereotype Classifier (OSC).

In [6]:
cvectorizer = joblib.load("models/transform_docs/count_vectorizer.joblib")
tfidf = joblib.load("models/transform_docs/tfidf_transformer.joblib")
mlb = joblib.load("models/transform_labels/mlb_targets_os.joblib")
clf = joblib.load("models/baseline_osc/sgd-svm_F-tfidf_T-os.joblib")

Classify the descriptions.

In [7]:
vectorized = cvectorizer.transform(doc_df["doc"])
X = tfidf.transform(vectorized)
y = clf.predict(X)

In [8]:
predictions = mlb.inverse_transform(y)

In [9]:
doc_df.insert(len(doc_df.columns), "baseline_prediction", predictions)
doc_df.head()

Unnamed: 0_level_0,doc,gender_bias_manual,omission_manual,stereotype_manual,type,note,eadid,description_id,rowid,field,baseline_prediction
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
11452,Drafts and meeting notes relating to the creat...,n,n,n,,,CHE,11452,CHE/01/01,scopecontent,()
11467,CHE Supporters Review,n,n,n,,,CHE,11467,CHE/02/04,unittitle,()
11471,Friend Newcastle Annual Reports,n,n,n,,,CHE,11471,CHE/02/06,unittitle,()
11554,Collection of documents on sex education in sc...,n,n,n,,,CHE,11554,CHE/03/06/12,unittitle,()
11555,"Letters, newsletters and leaflets on the topic...",n,n,n,,,CHE,11555,CHE/03/06/13,unittitle,()


In [10]:
doc_df.baseline_prediction.value_counts()

()                        12168
(Omission,)                 164
(Stereotype,)                 3
(Omission, Stereotype)        2
Name: baseline_prediction, dtype: int64

In [11]:
# print("Proportion of descriptions classified:", (1489+909+314)/(55235+1489+909+314))  # about 4.7%
print("Proportion of manually coded descriptions classified:", (164+3+2)/(12168+164+3+2))  # about 1.4%

Proportion of manually coded descriptions classified: 0.0136986301369863


Export the classified data.

In [12]:
# filename = "baseline_osc_predictions.csv"
# doc_df.to_csv(config.classified_data_path+filename)

filename = "manually_coded_baselineosc.csv"
doc_df.to_csv(config.coded_and_classified+filename)

***
Deduplicate the descriptions and create a new export where each row has a unique doc-eadid-prediction combination.

In [12]:
doc_df_dedup1 = doc_df.drop_duplicates(subset=["eadid", "doc", "prediction"])
doc_df_dedup2 = doc_df.drop_duplicates(subset=["doc", "prediction"])
doc_df_dedup3 = doc_df.drop_duplicates(subset=["doc"])
print(doc_df_dedup1.shape)
print(doc_df_dedup2.shape)  # as expected, same as next
print(doc_df_dedup3.shape)  # as expected, same as previous

(27235, 6)
(27209, 6)
(27209, 6)


In [13]:
dedup_filename = "baseline_osc_predictions_deduplicated.csv"
doc_df_dedup1.to_csv(config.classified_data_path+dedup_filename)

In [14]:
doc_df_dedup1.prediction.value_counts()

()                        24819
(Omission,)                1362
(Omission, Stereotype)      773
(Stereotype,)               281
Name: prediction, dtype: int64

In [15]:
print("Proportion of unique descriptions classified:", (1362+773+281)/24819) # about 10%

Proportion of unique descriptions classified: 0.09734477617953986


### 3. Manual Review
Export a selection of descriptions for manual review, including a combination of unclassified and classified descriptions, randomly selected to include, per collection equal parts descriptions that were unclassified, that were classified as *Omission*, that were classified as *Omission* and *Stereotype*, and that were classified as *Stereotype*.

In [17]:
eadid_col = "eadid"
eadids = list(doc_df_dedup1[eadid_col].unique())
print(eadids)

['BP', 'BXB', 'CHE', 'CPT', 'GB', 'HL', 'OBR', 'SH', 'SW', 'THS', 'WCT']


In [18]:
sample_path = config.classified_data_path+"sample_baseline/"
Path(sample_path).mkdir(parents=True, exist_ok=True)

In [19]:
file_suffix = "_sample.csv"
pred_col = "prediction"
sample_size = 0.05 # 5%
for eadid in eadids:
    subdf = doc_df_dedup1.loc[doc_df_dedup1[eadid_col] == eadid]
    unclf_docs = subdf.loc[subdf[pred_col] == ()] # descriptions without an Omission or Stereotype classification
    o_docs = subdf.loc[subdf[pred_col] == ('Omission',)] # descriptions with an Omission but no Stereotype classification
    s_docs = subdf.loc[subdf[pred_col] == ('Stereotype',)] # descriptions with a Stereotype but no Omission classification
    os_docs = subdf.loc[subdf[pred_col] == ('Omission', 'Stereotype')] # descriptions with an Omission and a Stereotype classification
    dfs = [unclf_docs, o_docs, s_docs, os_docs]

    all_samples = pd.DataFrame()
    for df in dfs:
        desc_id_list = list(df.description_id)
        if len(desc_id_list) > 0:
            sample_ids = random.sample(desc_id_list, int((len(desc_id_list))*sample_size))
        else:
            sample_ids = []
        sample = df.loc[df.description_id.isin(sample_ids)]
    
        all_samples = pd.concat([all_samples, sample])
    
    all_samples.sort_values(by=["description_id"])
    # print(eadid, "-", all_samples.shape)
    all_samples.to_csv(sample_path+eadid+file_suffix)
    print("Wrote",eadid+file_suffix+"!")

Wrote BP_sample.csv!
Wrote BXB_sample.csv!
Wrote CHE_sample.csv!
Wrote CPT_sample.csv!
Wrote GB_sample.csv!
Wrote HL_sample.csv!
Wrote OBR_sample.csv!
Wrote SH_sample.csv!
Wrote SW_sample.csv!
Wrote THS_sample.csv!
Wrote WCT_sample.csv!


In [20]:
sample_path = config.classified_data_path+"sample_no_preds/"
Path(sample_path).mkdir(parents=True, exist_ok=True)

In [21]:
doc_df_dedup1.head()

Unnamed: 0,description_id,eadid,rowid,field,doc,prediction
0,0,BP,BP,processinfo,This EAD description created by Ruth Sheret 27...,()
1,1,BP,BP,scopecontent,The papers of the distinguished public servant...,()
2,2,BP,BP,unittitle,Plowden (Lady Bridget) Archive,()
3,3,BP,BP/01,scopecontent,"Formerly the 'Working Ladies Guild', the organ...",()
4,4,BP,BP/01,unittitle,Chairman (Managing Committee) of the Mary Fiel...,()


In [22]:
file_suffix = "_sample_no_preds.csv"
pred_col = "prediction"
sample_size = 0.05 # 5%
for eadid in eadids:
    subdf = doc_df_dedup1.loc[doc_df_dedup1[eadid_col] == eadid]
    unclf_docs = subdf.loc[subdf[pred_col] == ()] # descriptions without an Omission or Stereotype classification
    o_docs = subdf.loc[subdf[pred_col] == ('Omission',)] # descriptions with an Omission but no Stereotype classification
    s_docs = subdf.loc[subdf[pred_col] == ('Stereotype',)] # descriptions with a Stereotype but no Omission classification
    os_docs = subdf.loc[subdf[pred_col] == ('Omission', 'Stereotype')] # descriptions with an Omission and a Stereotype classification
    dfs = [unclf_docs, o_docs, s_docs, os_docs]

    all_samples = pd.DataFrame()
    for df in dfs:
        df = df.drop(columns=[pred_col])
        desc_id_list = list(df.description_id)
        if len(desc_id_list) > 0:
            sample_ids = random.sample(desc_id_list, int((len(desc_id_list))*sample_size))
        else:
            sample_ids = []
        sample = df.loc[df.description_id.isin(sample_ids)]
    
        all_samples = pd.concat([all_samples, sample])
    
    all_samples.sort_values(by=["description_id"])
    # print(eadid, "-", all_samples.shape)
    all_samples.to_csv(sample_path+eadid+file_suffix)
    print("Wrote",eadid+file_suffix+"!")

Wrote BP_sample_no_preds.csv!
Wrote BXB_sample_no_preds.csv!
Wrote CHE_sample_no_preds.csv!
Wrote CPT_sample_no_preds.csv!
Wrote GB_sample_no_preds.csv!
Wrote HL_sample_no_preds.csv!
Wrote OBR_sample_no_preds.csv!
Wrote SH_sample_no_preds.csv!
Wrote SW_sample_no_preds.csv!
Wrote THS_sample_no_preds.csv!
Wrote WCT_sample_no_preds.csv!


#### 3.1 Analyze Description Lengths

In [23]:
unclf_docs = doc_df_dedup1.loc[doc_df_dedup1.prediction == tuple()]
unclf_docs.head()

Unnamed: 0,description_id,eadid,rowid,field,doc,prediction
0,0,BP,BP,processinfo,This EAD description created by Ruth Sheret 27...,()
1,1,BP,BP,scopecontent,The papers of the distinguished public servant...,()
2,2,BP,BP,unittitle,Plowden (Lady Bridget) Archive,()
3,3,BP,BP/01,scopecontent,"Formerly the 'Working Ladies Guild', the organ...",()
4,4,BP,BP/01,unittitle,Chairman (Managing Committee) of the Mary Fiel...,()


In [24]:
clf_docs = doc_df_dedup1.loc[doc_df_dedup1.prediction != tuple()]
clf_docs.head()

Unnamed: 0,description_id,eadid,rowid,field,doc,prediction
51,51,BP,BP/03/03/05,scopecontent,A collection of reports published by external ...,"(Omission,)"
108,108,BP,BP/08/02/03,scopecontent,Priority Area Development: Preparation for Par...,"(Omission,)"
129,129,BP,BP/08/06,scopecontent,Correspondence files kept by Lady Plowden in r...,"(Omission,)"
131,131,BP,BP/08/07,scopecontent,Notes on various speeches made by Lady Plowden...,"(Omission,)"
188,188,BP,BP/09/09/05,scopecontent,A submission from ACERT to the inquiry organis...,"(Omission,)"


In [25]:
print(unclf_docs.shape)
print(clf_docs.shape)

(24819, 6)
(2416, 6)


In [26]:
unclf_docs_list = list(unclf_docs.doc)
clf_docs_list = list(clf_docs.doc)
print(clf_docs_list[0])

A collection of reports published by external organisations. This collection is assumed to have been accumulated by Lady Plowden to inform her work with the Council - the collection was found with Lady Plowden's Council files and deal with related issues.
Note, although these publications relate specifically to Lady Plowden’s work with the Council , Lady Plowden also held an extensive library of reports and publications which covered the range of her personal and professional interests. A complete list of reports and publications held by Lady Plowden can be viewed at: INSERT LINK


In [27]:
unclf_token_counts = [len(word_tokenize(doc)) for doc in unclf_docs_list]
clf_token_counts = [len(word_tokenize(doc)) for doc in clf_docs_list]
print(unclf_token_counts[0])
print(clf_token_counts[0])

16
101


In [28]:
print("Minimum, average, standard deviation, median, and maximum token counts in UNCLASSIFIED descriptions:")
print(np.min(unclf_token_counts))
print(np.mean(unclf_token_counts))
print(np.std(unclf_token_counts))
print(np.median(unclf_token_counts))
print(np.max(unclf_token_counts))

Minimum, average, standard deviation, median, and maximum token counts in UNCLASSIFIED descriptions:
1
16.73999758249728
31.397437371032396
11.0
2478


In [29]:
print("Minimum, average, standard deviation, median, and maximum token counts in CLASSIFIED descriptions:")
print(np.min(clf_token_counts))
print(np.mean(clf_token_counts))
print(np.std(clf_token_counts))
print(np.median(clf_token_counts))
print(np.max(clf_token_counts))

Minimum, average, standard deviation, median, and maximum token counts in CLASSIFIED descriptions:
2
39.40604304635762
68.22333998794495
17.0
839


In [30]:
o_docs = clf_docs.loc[clf_docs.prediction == ('Omission',)]
print(o_docs.shape)
o_docs.head()

(1362, 6)


Unnamed: 0,description_id,eadid,rowid,field,doc,prediction
51,51,BP,BP/03/03/05,scopecontent,A collection of reports published by external ...,"(Omission,)"
108,108,BP,BP/08/02/03,scopecontent,Priority Area Development: Preparation for Par...,"(Omission,)"
129,129,BP,BP/08/06,scopecontent,Correspondence files kept by Lady Plowden in r...,"(Omission,)"
131,131,BP,BP/08/07,scopecontent,Notes on various speeches made by Lady Plowden...,"(Omission,)"
188,188,BP,BP/09/09/05,scopecontent,A submission from ACERT to the inquiry organis...,"(Omission,)"


In [31]:
os_docs = clf_docs.loc[clf_docs.prediction == ('Omission', 'Stereotype')]
print(os_docs.shape)
os_docs.head()

(773, 6)


Unnamed: 0,description_id,eadid,rowid,field,doc,prediction
655,655,BXB,BXB/1/1/AGA/1/4,scopecontent,Consists of a photocopy of both the book cover...,"(Omission, Stereotype)"
724,724,BXB,BXB/1/1/AGA/6/11,scopecontent,Consists of photocopies of a poem entitled 'Ma...,"(Omission, Stereotype)"
840,840,BXB,BXB/1/1/ALV/1,unittitle,Carrying My Wife,"(Omission, Stereotype)"
1177,1177,BXB,BXB/1/1/BOL,bioghist,Robyn Bolam (formerly Marion Lomax) was born i...,"(Omission, Stereotype)"
1197,1197,BXB,BXB/1/1/BON/1,bioghist,"Galway Kinnell was born in Providence, Rhode I...","(Omission, Stereotype)"


In [32]:
s_docs = clf_docs.loc[clf_docs.prediction == ('Stereotype',)]
print(s_docs.shape)
s_docs.head()

(281, 6)


Unnamed: 0,description_id,eadid,rowid,field,doc,prediction
374,374,BP,BP/19/02/10,unittitle,Women and Employment,"(Stereotype,)"
459,459,BP,BP/27/39,unittitle,Research File: Women and Work,"(Stereotype,)"
460,460,BP,BP/27/40,unittitle,Research File: A Woman's View of the Recession...,"(Stereotype,)"
871,871,BXB,BXB/1/1/ARM,bioghist,Simon Armitage was born in 1963 in Huddersfiel...,"(Stereotype,)"
1180,1180,BXB,BXB/1/1/BOL/1,unittitle,The Peepshow Girl,"(Stereotype,)"
