# Classification

## Baseline Omission and Stereotype Classifier

In [62]:
import config, clf_utils
import re
import random
import numpy as np
import pandas as pd
from pathlib import Path
import joblib
from joblib import load
from nltk.tokenize import word_tokenize
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.metrics import multilabel_confusion_matrix
from sklearn.metrics import precision_recall_fscore_support


### 1. Preprocessing

Load the extracted archival metadata from the Newcastle University Special Collections catalog.

In [4]:
extracted_dir = "data/extracted/"
doc_df = pd.read_csv(extracted_dir+"nusc_archival_descs_for_clf.csv", index_col=0)
doc_df.head()


Unnamed: 0,description_id,eadid,rowid,field,doc
0,0,BP,BP,processinfo,This EAD description created by Ruth Sheret 27...
1,1,BP,BP,scopecontent,The papers of the distinguished public servant...
2,2,BP,BP,unittitle,Plowden (Lady Bridget) Archive
3,3,BP,BP/01,scopecontent,"Formerly the 'Working Ladies Guild', the organ..."
4,4,BP,BP/01,unittitle,Chairman (Managing Committee) of the Mary Fiel...


In [5]:
doc_df.tail()

Unnamed: 0,description_id,eadid,rowid,field,doc
57942,57942,WCT,WCT/8,unittitle,Diaries
57943,57943,WCT,WCT/9,scopecontent,Personal diaries and notebooks.
57944,57944,WCT,WCT/9,unittitle,Diaries and notebooks of Lady Pauline
57945,57945,WCT,WCT/9/4,scopecontent,Microfilm reels of Pauline's diaries and sketc...
57946,57946,WCT,WCT/9/4,unittitle,Microfilm copies of Lady Pauline's Diaries


In [6]:
doc_df.shape  # complete Bell Archive metadata added about 25k more descriptions

(57947, 5)

There are 57,947 metadata descriptions to classify for *Omission* and *Stereotype*.

### 2. Classification
Load the baseline Omission and Stereotype Classifier (OSC).

In [7]:
cvectorizer = joblib.load("models/transform_docs/count_vectorizer.joblib")
tfidf = joblib.load("models/transform_docs/tfidf_transformer.joblib")
mlb = joblib.load("models/transform_labels/mlb_targets_os.joblib")
clf = joblib.load("models/baseline_osc/sgd-svm_F-tfidf_T-os.joblib")

Classify the descriptions.

In [8]:
vectorized = cvectorizer.transform(doc_df["doc"])
X = tfidf.transform(vectorized)
y = clf.predict(X)

In [9]:
predictions = mlb.inverse_transform(y)

In [10]:
doc_df.insert(len(doc_df.columns), "prediction", predictions)
doc_df.head()

Unnamed: 0,description_id,eadid,rowid,field,doc,prediction
0,0,BP,BP,processinfo,This EAD description created by Ruth Sheret 27...,()
1,1,BP,BP,scopecontent,The papers of the distinguished public servant...,()
2,2,BP,BP,unittitle,Plowden (Lady Bridget) Archive,()
3,3,BP,BP/01,scopecontent,"Formerly the 'Working Ladies Guild', the organ...",()
4,4,BP,BP/01,unittitle,Chairman (Managing Committee) of the Mary Fiel...,()


In [11]:
doc_df.prediction.value_counts()

()                        55235
(Omission,)                1489
(Omission, Stereotype)      909
(Stereotype,)               314
Name: prediction, dtype: int64

In [13]:
print("Proportion of descriptions classified:", (1489+909+314)/55235)  # about 5%

Proportion of descriptions classified: 0.04909930297818412


Export the classified data.

In [12]:
filename = "baseline_osc_predictions.csv"
doc_df.to_csv(config.classified_data_path+filename)

Deduplicate the descriptions and create a new export where each row has a unique doc-eadid-prediction combination.

In [14]:
doc_df_dedup1 = doc_df.drop_duplicates(subset=["eadid", "doc", "prediction"])
doc_df_dedup2 = doc_df.drop_duplicates(subset=["doc", "prediction"])
doc_df_dedup3 = doc_df.drop_duplicates(subset=["doc"])
print(doc_df_dedup1.shape)
print(doc_df_dedup2.shape)  # as expected, same as next
print(doc_df_dedup3.shape)  # as expected, same as previous

(27235, 6)
(27209, 6)
(27209, 6)


In [15]:
dedup_filename = "baseline_osc_predictions_deduplicated.csv"
doc_df_dedup1.to_csv(config.classified_data_path+dedup_filename)

In [16]:
doc_df_dedup1.prediction.value_counts()

()                        24819
(Omission,)                1362
(Omission, Stereotype)      773
(Stereotype,)               281
Name: prediction, dtype: int64

In [17]:
print("Proportion of unique descriptions classified:", (1362+773+281)/24819) # about 10%

Proportion of unique descriptions classified: 0.09734477617953986


### 3. Manual Review
Export a selection of descriptions for manual review, including a combination of unclassified and classified descriptions, randomly selection 50 descriptions that were unclassified, that were classified as *Omission*, that were classified as *Omission* and *Stereotype*, and that were classified as *Stereotype*.

To select unclassified descriptions, to annotate, select based on average length of classified descriptions???

In [21]:
unclf_docs = doc_df_dedup1.loc[doc_df_dedup1.prediction == tuple()]
unclf_docs.head()

Unnamed: 0,description_id,eadid,rowid,field,doc,prediction
0,0,BP,BP,processinfo,This EAD description created by Ruth Sheret 27...,()
1,1,BP,BP,scopecontent,The papers of the distinguished public servant...,()
2,2,BP,BP,unittitle,Plowden (Lady Bridget) Archive,()
3,3,BP,BP/01,scopecontent,"Formerly the 'Working Ladies Guild', the organ...",()
4,4,BP,BP/01,unittitle,Chairman (Managing Committee) of the Mary Fiel...,()


In [22]:
clf_docs = doc_df_dedup1.loc[doc_df_dedup1.prediction != tuple()]
clf_docs.head()

Unnamed: 0,description_id,eadid,rowid,field,doc,prediction
51,51,BP,BP/03/03/05,scopecontent,A collection of reports published by external ...,"(Omission,)"
108,108,BP,BP/08/02/03,scopecontent,Priority Area Development: Preparation for Par...,"(Omission,)"
129,129,BP,BP/08/06,scopecontent,Correspondence files kept by Lady Plowden in r...,"(Omission,)"
131,131,BP,BP/08/07,scopecontent,Notes on various speeches made by Lady Plowden...,"(Omission,)"
188,188,BP,BP/09/09/05,scopecontent,A submission from ACERT to the inquiry organis...,"(Omission,)"


In [23]:
print(unclf_docs.shape)
print(clf_docs.shape)

(24819, 6)
(2416, 6)


In [28]:
unclf_docs_list = list(unclf_docs.doc)
clf_docs_list = list(clf_docs.doc)
print(clf_docs_list[0])

A collection of reports published by external organisations. This collection is assumed to have been accumulated by Lady Plowden to inform her work with the Council - the collection was found with Lady Plowden's Council files and deal with related issues.
Note, although these publications relate specifically to Lady Plowden’s work with the Council , Lady Plowden also held an extensive library of reports and publications which covered the range of her personal and professional interests. A complete list of reports and publications held by Lady Plowden can be viewed at: INSERT LINK


In [29]:
unclf_token_counts = [len(word_tokenize(doc)) for doc in unclf_docs_list]
clf_token_counts = [len(word_tokenize(doc)) for doc in clf_docs_list]
print(unclf_token_counts[0])
print(clf_token_counts[0])

16
101


In [40]:
print("Minimum, average, standard deviation, median, and maximum token counts in UNCLASSIFIED descriptions:")
print(np.min(unclf_token_counts))
print(np.mean(unclf_token_counts))
print(np.std(unclf_token_counts))
print(np.median(unclf_token_counts))
print(np.max(unclf_token_counts))

Minimum, average, standard deviation, median, and maximum token counts in UNCLASSIFIED descriptions:
1
16.73999758249728
31.397437371032396
11.0
2478


In [39]:
print("Minimum, average, standard deviation, median, and maximum token counts in CLASSIFIED descriptions:")
print(np.min(clf_token_counts))
print(np.mean(clf_token_counts))
print(np.std(clf_token_counts))
print(np.median(clf_token_counts))
print(np.max(clf_token_counts))

Minimum, average, standard deviation, median, and maximum token counts in CLASSIFIED descriptions:
2
39.40604304635762
68.22333998794495
17.0
839


In [49]:
o_docs = clf_docs.loc[clf_docs.prediction == ('Omission',)]
print(o_docs.shape)
o_docs.head()

(1362, 6)


Unnamed: 0,description_id,eadid,rowid,field,doc,prediction
51,51,BP,BP/03/03/05,scopecontent,A collection of reports published by external ...,"(Omission,)"
108,108,BP,BP/08/02/03,scopecontent,Priority Area Development: Preparation for Par...,"(Omission,)"
129,129,BP,BP/08/06,scopecontent,Correspondence files kept by Lady Plowden in r...,"(Omission,)"
131,131,BP,BP/08/07,scopecontent,Notes on various speeches made by Lady Plowden...,"(Omission,)"
188,188,BP,BP/09/09/05,scopecontent,A submission from ACERT to the inquiry organis...,"(Omission,)"


In [50]:
os_docs = clf_docs.loc[clf_docs.prediction == ('Omission', 'Stereotype')]
print(os_docs.shape)
os_docs.head()

(773, 6)


Unnamed: 0,description_id,eadid,rowid,field,doc,prediction
655,655,BXB,BXB/1/1/AGA/1/4,scopecontent,Consists of a photocopy of both the book cover...,"(Omission, Stereotype)"
724,724,BXB,BXB/1/1/AGA/6/11,scopecontent,Consists of photocopies of a poem entitled 'Ma...,"(Omission, Stereotype)"
840,840,BXB,BXB/1/1/ALV/1,unittitle,Carrying My Wife,"(Omission, Stereotype)"
1177,1177,BXB,BXB/1/1/BOL,bioghist,Robyn Bolam (formerly Marion Lomax) was born i...,"(Omission, Stereotype)"
1197,1197,BXB,BXB/1/1/BON/1,bioghist,"Galway Kinnell was born in Providence, Rhode I...","(Omission, Stereotype)"


In [51]:
s_docs = clf_docs.loc[clf_docs.prediction == ('Stereotype',)]
print(s_docs.shape)
s_docs.head()

(281, 6)


Unnamed: 0,description_id,eadid,rowid,field,doc,prediction
374,374,BP,BP/19/02/10,unittitle,Women and Employment,"(Stereotype,)"
459,459,BP,BP/27/39,unittitle,Research File: Women and Work,"(Stereotype,)"
460,460,BP,BP/27/40,unittitle,Research File: A Woman's View of the Recession...,"(Stereotype,)"
871,871,BXB,BXB/1/1/ARM,bioghist,Simon Armitage was born in 1963 in Huddersfiel...,"(Stereotype,)"
1180,1180,BXB,BXB/1/1/BOL/1,unittitle,The Peepshow Girl,"(Stereotype,)"


In [56]:
sample_size = 50

In [58]:
unclf_sample_ids = random.sample(list(unclf_docs.description_id), sample_size)
unclf_sample = unclf_docs.loc[unclf_docs.description_id.isin(unclf_sample_ids)]
assert(unclf_sample.shape[0] == sample_size)
unclf_sample.head()

Unnamed: 0,description_id,eadid,rowid,field,doc,prediction
478,478,BP,BP/30/02/01,unittitle,Correspondence in relation to the Gertrude Bel...,()
511,511,BP,BP/30/04/11,scopecontent,Family photograph album belonging to the Bell ...,()
764,764,BXB,BXB/1/1/AGG/1/7,unittitle,Press Cuttings and publicity,()
1066,1066,BXB,BXB/1/1/BEJ/3,unittitle,A Story I Am In: Selected Poems,()
1195,1195,BXB,BXB/1/1/BON,scopecontent,Consists of letters and proofs relating to the...,()


In [59]:
o_sample_ids = random.sample(list(o_docs.description_id), sample_size)
o_sample = o_docs.loc[o_docs.description_id.isin(o_sample_ids)]
assert(o_sample.shape[0] == sample_size)
o_sample.head()

Unnamed: 0,description_id,eadid,rowid,field,doc,prediction
1171,1171,BXB,BXB/1/1/BIS,bioghist,Elizabeth Bishop is one of the greatest poets ...,"(Omission,)"
1999,1999,BXB,BXB/1/1/DOB,bioghist,"Stephen Dobyns is a leading American poet, nov...","(Omission,)"
3059,3059,BXB,BXB/1/1/GRG,bioghist,Andrew Greig is one of the leading Scottish wr...,"(Omission,)"
3924,3924,BXB,BXB/1/1/HOJ/2/1,scopecontent,Consists of a printout of selected poems with ...,"(Omission,)"
6727,6727,BXB,BXB/1/1/RUM,bioghist,Carol Rumens was born in 1944 in South London....,"(Omission,)"


In [60]:
os_sample_ids = random.sample(list(os_docs.description_id), sample_size)
os_sample = os_docs.loc[os_docs.description_id.isin(os_sample_ids)]
assert(os_sample.shape[0] == sample_size)
os_sample.head()

Unnamed: 0,description_id,eadid,rowid,field,doc,prediction
1177,1177,BXB,BXB/1/1/BOL,bioghist,Robyn Bolam (formerly Marion Lomax) was born i...,"(Omission, Stereotype)"
4737,4737,BXB,BXB/1/1/KEB/9,unittitle,The Man Made of Rain,"(Omission, Stereotype)"
26592,26592,GB,109501,description,View of Miss Whitney standing on fallen trunk ...,"(Omission, Stereotype)"
28530,28530,GB,110475,description,"Ruined Castle, Man And Horse Nearby","(Omission, Stereotype)"
28710,28710,GB,110565,description,"Palace - Central Block Interior, Great Hall 7 ...","(Omission, Stereotype)"


In [61]:
s_sample_ids = random.sample(list(s_docs.description_id), sample_size)
s_sample = s_docs.loc[s_docs.description_id.isin(s_sample_ids)]
assert(s_sample.shape[0] == sample_size)
s_sample.head()

Unnamed: 0,description_id,eadid,rowid,field,doc,prediction
871,871,BXB,BXB/1/1/ARM,bioghist,Simon Armitage was born in 1963 in Huddersfiel...,"(Stereotype,)"
10630,10630,BXB,BXB/2/3/12,scopecontent,Consists of an information sheet regarding wom...,"(Stereotype,)"
26572,26572,GB,109491,description,View of Lake Louise and reflection looking tow...,"(Stereotype,)"
26838,26838,GB,109624,description,Monastery Of Apa Jeremias - Upper Church - Tom...,"(Stereotype,)"
26886,26886,GB,109648,description,"Mosque Of Ibn Tulun At Qattai - Outer Wall, Sh...","(Stereotype,)"


In [None]:
sample_path = config.classified_data_path+"sample_baseline/"
Path(sample_path).mkdir(parents=True, exist_ok=True)

In [65]:
sample_file = "baseline_osc_predictions_unclf_sample.csv"
unclf_sample.to_csv(sample_path+sample_file)

In [66]:
sample_file = "baseline_osc_predictions_omission_sample.csv"
o_sample.to_csv(sample_path+sample_file)

In [67]:
sample_file = "baseline_osc_predictions_omission-stereotype_sample.csv"
os_sample.to_csv(sample_path+sample_file)

In [72]:
sample_file = "baseline_osc_predictions_stereotype_sample.csv"
s_sample.to_csv(sample_path+sample_file)

In [69]:
df_sample = pd.concat([unclf_sample, o_sample, os_sample, s_sample])
df_sample = df_sample.sort_values(by=["description_id"])
df_sample.head()

Unnamed: 0,description_id,eadid,rowid,field,doc,prediction
478,478,BP,BP/30/02/01,unittitle,Correspondence in relation to the Gertrude Bel...,()
511,511,BP,BP/30/04/11,scopecontent,Family photograph album belonging to the Bell ...,()
764,764,BXB,BXB/1/1/AGG/1/7,unittitle,Press Cuttings and publicity,()
871,871,BXB,BXB/1/1/ARM,bioghist,Simon Armitage was born in 1963 in Huddersfiel...,"(Stereotype,)"
1066,1066,BXB,BXB/1/1/BEJ/3,unittitle,A Story I Am In: Selected Poems,()


In [70]:
df_sample.tail()

Unnamed: 0,description_id,eadid,rowid,field,doc,prediction
55226,55226,THS,THS 5.3.1,scopecontent,The Editor reminds the author about an article...,"(Omission,)"
55457,55457,THS,THS 5.4.46,unittitle,Typescript copy of a letter to Miss Barker.,"(Omission,)"
57352,57352,WCT,WCT 215,unittitle,Letter from Mrs Hind Smith,"(Omission,)"
57675,57675,WCT,WCT 300,unittitle,Wansbeck Railway Act,()
57878,57878,WCT,WCT 94,unittitle,Letters from and about Charles Tilstone Beke t...,()


In [73]:
clf_sample_file = "baseline_osc_predictions_sample_with_preds.csv"
df_sample.to_csv(sample_path+clf_sample_file)

In [74]:
df_sample_unclf = df_sample.drop(columns=["prediction"])
unclf_sample_file = "baseline_osc_predictions_sample_no_preds.csv"
df_sample_unclf.to_csv(sample_path+unclf_sample_file)