# Classification

## Linguistic Classifier (LC) + Omission & Stereotype Classifier (OSC)

In [1]:
# Custom filepaths and functions
import config, clf_utils

# Libraries for data, file, and model loading
import pandas as pd
import random
import joblib
from joblib import load
import numpy as np
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize
from pathlib import Path

# Libraries for classification
from gensim.models import FastText
from gensim.utils import tokenize
from gensim import utils
from gensim.test.utils import get_tmpfile
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction.text import CountVectorizer
from skmultilearn.problem_transform import ClassifierChain
from sklearn.ensemble import RandomForestClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import SGDClassifier
import scipy
import sklearn.metrics
from sklearn.metrics import confusion_matrix, multilabel_confusion_matrix
from sklearn.metrics import precision_recall_fscore_support

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/lucyhavens/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/lucyhavens/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


### 1. Load the Data

In [2]:
# Alternative input (disabled): a CSV of archival descriptions read from "data/extracted/".
# extracted_dir = "data/extracted/"
# doc_df = pd.read_csv(extracted_dir+"nusc_archival_descs_for_clf.csv", index_col=0)

# Active input: the CSV named below, read from the directory configured as
# config.coded_data_path. Judging by the preview, it holds one description per row
# together with manual codes and the baseline classifier's prediction.
filename = "manually_coded_baselineosc.csv"
doc_df = pd.read_csv(config.coded_data_path+filename)

# Preview the first rows of the loaded descriptions.
doc_df.head()

Unnamed: 0,index,doc,gender_bias_manual,omission_manual,stereotype_manual,type,note,eadid,description_id,rowid,field,baseline_prediction
0,11452,Drafts and meeting notes relating to the creat...,n,n,n,,,CHE,11452,CHE/01/01,scopecontent,()
1,11467,CHE Supporters Review,n,n,n,,,CHE,11467,CHE/02/04,unittitle,()
2,11471,Friend Newcastle Annual Reports,n,n,n,,,CHE,11471,CHE/02/06,unittitle,()
3,11554,Collection of documents on sex education in sc...,n,n,n,,,CHE,11554,CHE/03/06/12,unittitle,()
4,11555,"Letters, newsletters and leaflets on the topic...",n,n,n,,,CHE,11555,CHE/03/06/13,unittitle,()


In [3]:
# Build a token-level table from the text in the "doc" column.
# NOTE(review): clf_utils.getTokenDF is defined outside this notebook; the preview
# below shows description_id, token_id and token columns, one token per row.
text_cols = ["doc"]
token_df = clf_utils.getTokenDF(doc_df, text_cols)
token_df.head()

Unnamed: 0,description_id,token_id,token
0,11452,0,Drafts
0,11452,1,and
0,11452,2,meeting
0,11452,3,notes
0,11452,4,relating


In [4]:
token_df.tail()

Unnamed: 0,description_id,token_id,token
12118,32933,645696,Thomas
12118,32933,645697,Sharp
12118,32933,645698,Memorial
12118,32933,645699,Lecture
12118,32933,645700,.


In [5]:
token_df.shape

(645701, 3)

### 2. Linguistic Classification
Classify the data using the Linguistic Classifier to create *Generalization*, *Gendered Pronoun*, and *Gendered Role* features to input into the Omission and Stereotype Classifier.

Load the models.

In [6]:
# Load the saved artifacts for the Linguistic Classifier from local files:
# the FastText embedding model, the label binarizer for the linguistic targets,
# and the trained classifier itself.
# NOTE(review): joblib.load unpickles arbitrary Python objects, so these files must
# come from a trusted source.
ft_model = FastText.load("models/embeddings/fasttextcbow100_lowercased/fasttext_cbow_100d.model")
mlb_ling = joblib.load("models/transform_labels/mlb_targets_ling.joblib")  # formerly named mlb_linlabels.joblib
ling_clf = joblib.load("models/ling_osc/cc-rf_F-fasttextcbow100_T-linglabels.joblib")

In [7]:
print("Classification targets:", mlb_ling.classes_)

Classification targets: ['Gendered-Pronoun' 'Gendered-Role' 'Generalization']


Transform the token data into an embedding matrix to input into the classifier.

In [8]:
X = clf_utils.getFeatures(token_df, embedding_model=ft_model)

Classify the tokens.

In [9]:
y = ling_clf.predict(X)

In [10]:
# Turn each row of the binary prediction matrix back into a tuple of label names,
# then keep one label per token: the first name in the tuple, or "O" when the
# tuple is empty (no label predicted for that token).
predictions = mlb_ling.inverse_transform(y)
new_preds = [next(iter(labels), "O") for labels in predictions]

In [11]:
# Attach the per-token label as the rightmost column of token_df.
# Plain column assignment is used instead of DataFrame.insert so that this cell can
# be re-run: insert raises ValueError once the column already exists, whereas
# assignment appends the column on the first run and overwrites it afterwards.
pred_col = "prediction"
token_df[pred_col] = new_preds
token_df.head()

Unnamed: 0,description_id,token_id,token,prediction
0,11452,0,Drafts,O
0,11452,1,and,O
0,11452,2,meeting,O
0,11452,3,notes,O
0,11452,4,relating,O


In [12]:
token_df[pred_col].value_counts()

O                   632030
Gendered-Pronoun     12913
Gendered-Role          649
Generalization         109
Name: prediction, dtype: int64

In [13]:
# Alternative output location (disabled): the directory configured as config.classified_data_path.
# token_df.to_csv(config.classified_data_path+"baseline_lc_predictions.csv")

# Write the token-level predictions to the directory configured as config.coded_data_path.
token_df.to_csv(config.coded_data_path+"manually_coded_baselinelc.csv")

### 3. Omission & Stereotype Classification
Classify the data using the Omission and Stereotype Classifier to assign *Omission* and *Stereotype* codes to NUSC archival metadata descriptions.

### 3.1 Preprocessing

In [51]:
# Disabled: reads a previously saved token-level prediction file back into token_df.
# NOTE(review): this looks like a restart point that avoids re-running the
# Linguistic Classifier; the file name matches the disabled export above - confirm.
# ling_preds_file = config.classified_data_path+"baseline_lc_predictions.csv" # token_df
# token_df = pd.read_csv(ling_preds_file, index_col=0)
# token_df.head()

In [14]:
# Collapse the token-level rows to one row per description so the predictions can
# later be combined with the description data.
# NOTE(review): clf_utils.implodeDataFrameUnique is defined outside this notebook;
# it is assumed to gather each description's values into a collection per column.
df_features = clf_utils.implodeDataFrameUnique(token_df[["description_id", "token_id", pred_col]], ["description_id"])


def _clean_label_list(values):
    """Return one description's labels as a sorted list.

    The placeholder label "O" is removed when the description carries more than
    one value, so "O" only survives when it is the sole entry. The incoming
    collection is copied first and is never modified.
    """
    labels = list(values)
    if len(labels) > 1 and "O" in labels:
        labels.remove("O")
    labels.sort()
    return labels


# Only the prediction column needs cleaning; token_id keeps the grouped values as they are.
df_features[pred_col] = [_clean_label_list(values) for values in df_features[pred_col]]

df_features.head()

Unnamed: 0_level_0,token_id,prediction
description_id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,"[26957, 26958, 26959, 26960, 26961, 26962, 269...",[O]
1,"[22033, 22034, 22035, 22036, 22037, 22038, 220...","[Gendered-Pronoun, Gendered-Role, Generalization]"
2,"[19977, 19978, 19979, 19980, 19981, 19982]",[Gendered-Role]
3,"[16288, 16289, 16290, 16291, 16292, 16293, 162...","[Gendered-Pronoun, Gendered-Role, Generalization]"
4,"[13184, 13171, 13172, 13173, 13174, 13175, 131...",[Generalization]


In [15]:
df_features.tail()

Unnamed: 0_level_0,token_id,prediction
description_id,Unnamed: 1_level_1,Unnamed: 2_level_1
57853,"[8102, 8103, 8104, 8105, 8106]",[O]
57857,"[8107, 8108, 8109, 8110, 8111, 8112, 8113]",[O]
57861,"[8114, 8115, 8116, 8117, 8118, 8119, 8120, 812...",[Gendered-Pronoun]
57883,"[8141, 8142, 8143, 8144, 8145, 8146, 8147, 814...",[O]
57906,"[8182, 8183, 8184, 8185, 8186, 8187, 8188, 8189]",[O]


In [16]:
df_features[pred_col].value_counts()

[O]                                                  9587
[Gendered-Pronoun]                                   1949
[Gendered-Role]                                       443
[Generalization]                                       79
[Gendered-Pronoun, Gendered-Role]                      37
[Gendered-Role, Generalization]                        12
[Gendered-Pronoun, Gendered-Role, Generalization]       7
[Gendered-Pronoun, Generalization]                      5
Name: prediction, dtype: int64

### 3.2 Feature Extraction

Join the feature data (i.e., classifier-predicted *Gendered Pronoun*, *Gendered Role*, and *Generalization* labels) to the document data (i.e., NUSC archival catalog metadata descriptions).

In [17]:
# Attach the description-level linguistic labels to the description data. The
# right join keeps every row of doc_df, so a description without a matching row
# in df_features ends up with a missing value in the label column.
df = df_features.join(doc_df.set_index("description_id"), on="description_id", how="right")

# Rename label columns for clarity
feature_col = "linguistic_prediction"
df = df.rename(columns={pred_col:feature_col})


def _normalize_ling_labels(pred):
    """Return one description's linguistic labels as a fresh list.

    * A missing value (no row in df_features for this description) becomes ["O"],
      i.e. "no linguistic label", so later per-row code always receives a list.
    * "O" is removed when other labels are present alongside it.

    The input is copied, so lists shared with df_features are left unchanged.
    """
    if not pd.api.types.is_list_like(pred):
        return ["O"]
    labels = list(pred)
    if len(labels) > 1 and "O" in labels:
        labels.remove("O")
    return labels


ling_pred = list(df[feature_col])
new_ling_pred = [_normalize_ling_labels(pred) for pred in ling_pred]

# Re-insert the cleaned labels at column position 3 (directly after "doc" in the
# preview below).
df = df.drop(columns=[feature_col])
df.insert(3, feature_col, new_ling_pred)
df.head()

Unnamed: 0_level_0,token_id,index,doc,linguistic_prediction,gender_bias_manual,omission_manual,stereotype_manual,type,note,eadid,rowid,field,baseline_prediction
description_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
11452,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...",11452,Drafts and meeting notes relating to the creat...,[O],n,n,n,,,CHE,CHE/01/01,scopecontent,()
11467,"[16, 17, 15]",11467,CHE Supporters Review,[O],n,n,n,,,CHE,CHE/02/04,unittitle,()
11471,"[18, 19, 20, 21]",11471,Friend Newcastle Annual Reports,[O],n,n,n,,,CHE,CHE/02/06,unittitle,()
11554,"[22, 23, 24, 25, 26, 27, 28, 29]",11554,Collection of documents on sex education in sc...,[O],n,n,n,,,CHE,CHE/03/06/12,unittitle,()
11555,"[32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 3...",11555,"Letters, newsletters and leaflets on the topic...",[O],n,n,n,,,CHE,CHE/03/06/13,unittitle,()


### 3.3 Classification

In [18]:
# Load the saved artifacts for the Omission & Stereotype Classifier from local files:
# the trained classifier, the label binarizer for its targets, and the fitted
# count vectorizer and TF-IDF transformer used to turn descriptions into features.
# NOTE(review): joblib.load unpickles arbitrary Python objects, so these files must
# come from a trusted source.
clf_os = joblib.load("models/ling_osc/sgd-svm_F-tfidf-ling_T-os.joblib")
mlb_os = joblib.load("models/transform_labels/mlb_targets_os.joblib")
cvectorizer = joblib.load("models/transform_docs/count_vectorizer.joblib")
tfidf_transformer = joblib.load("models/transform_docs/tfidf_transformer.joblib")

In [19]:
print(mlb_ling.classes_)
print(mlb_os.classes_)

['Gendered-Pronoun' 'Gendered-Role' 'Generalization']
['Omission' 'Stereotype']


In [20]:
# Turn the description text into TF-IDF features with the already-fitted
# transformers (transform only; nothing is re-fitted here).
docs = df["doc"]
vectorized = cvectorizer.transform(docs)  # token counts per description
tfidf_matrix = tfidf_transformer.transform(vectorized)  # counts re-weighted by TF-IDF

In [21]:
# Encode each description's linguistic labels as indicator columns, one per class in
# mlb_ling.classes_. "O" is not among those classes (see the printed classes above),
# so a description labelled only ["O"] should encode as all zeros.
# NOTE(review): scikit-learn's MultiLabelBinarizer is expected to warn about and
# ignore unknown labels such as "O" - confirm against the installed version.
features = mlb_ling.transform(df[feature_col])
# Append the label indicators to the right of the TF-IDF columns.
# Note: X is reused here and no longer refers to the token embedding matrix.
X = scipy.sparse.hstack([tfidf_matrix, features])



In [None]:
# Predict Omission/Stereotype labels for every description and decode the binary
# prediction matrix into tuples of label names (an empty tuple means no label).
y = clf_os.predict(X)
predictions = mlb_os.inverse_transform(y)

# Store the predictions as the rightmost column of df.
# Plain column assignment is used instead of DataFrame.insert so that this cell can
# be re-run: insert raises ValueError once the column already exists, whereas
# assignment appends the column on the first run and overwrites it afterwards.
pred_col_name = "lcosc_prediction"
df[pred_col_name] = predictions
df.head()

Unnamed: 0_level_0,token_id,index,doc,linguistic_prediction,gender_bias_manual,omission_manual,stereotype_manual,type,note,eadid,rowid,field,baseline_prediction,lcosc_prediction
description_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
11452,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...",11452,Drafts and meeting notes relating to the creat...,[O],n,n,n,,,CHE,CHE/01/01,scopecontent,(),()
11467,"[16, 17, 15]",11467,CHE Supporters Review,[O],n,n,n,,,CHE,CHE/02/04,unittitle,(),()
11471,"[18, 19, 20, 21]",11471,Friend Newcastle Annual Reports,[O],n,n,n,,,CHE,CHE/02/06,unittitle,(),()
11554,"[22, 23, 24, 25, 26, 27, 28, 29]",11554,Collection of documents on sex education in sc...,[O],n,n,n,,,CHE,CHE/03/06/12,unittitle,(),()
11555,"[32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 3...",11555,"Letters, newsletters and leaflets on the topic...",[O],n,n,n,,,CHE,CHE/03/06/13,unittitle,(),()


In [25]:
df[pred_col_name].value_counts()

()                        11851
(Omission,)                 262
(Omission, Stereotype)        3
(Stereotype,)                 3
Name: lcosc_prediction, dtype: int64

In [None]:
# Earlier hand-typed figure, kept for reference (counts not produced by this notebook run):
# print("Proportion of descriptions classified:", (2096+918+285)/(54648+2096+918+285)) # CHECK THIS: about 5.7% percent (1% more than found with baseline)

# Share of descriptions that received at least one label (a non-empty prediction
# tuple). It is computed from df rather than typed in from the value counts above,
# so the figure cannot drift out of sync when the data or the model changes.
has_label = df[pred_col_name] != tuple()
print("Proportion of manually coded descriptions classified:", float(has_label.mean())) # about 2.2%

Proportion of manually coded descriptions classified: 0.022114035811535607


Save the classified data.

In [31]:
# Alternative output (disabled): write to the directory configured as config.classified_data_path.
# filename = "lc-osc_predictions.csv"
# df.to_csv(config.classified_data_path+filename)

# Write the descriptions with their predictions to the directory configured as
# config.coded_data_path. Note: this reassigns `filename`, which earlier held the input file name.
filename = "manually_coded_lcosc.csv"
df.to_csv(config.coded_data_path+filename)

***
Deduplicate the descriptions and create a new export where each row has a unique doc-eadid-prediction combination.

In [50]:
# Drop repeated rows under three progressively looser definitions of "duplicate":
# same eadid + doc + prediction, same doc + prediction, and same doc only.
# NOTE(review): the cells above store this notebook's predictions in a column named
# "lcosc_prediction"; no visible cell creates "os_prediction", and the shapes printed
# below do not match the manually coded data loaded above. This section appears to
# rely on a differently prepared df - confirm before re-running.
df_dedup1 = df.drop_duplicates(subset=["eadid", "doc", "os_prediction"])
df_dedup2 = df.drop_duplicates(subset=["doc", "os_prediction"])
df_dedup3 = df.drop_duplicates(subset=["doc"])
print(df_dedup1.shape)
print(df_dedup2.shape)  # as expected, same as next
print(df_dedup3.shape)  # as expected, same as previous

(27235, 7)
(27209, 7)
(27209, 7)


In [51]:
# Write the rows that are unique per eadid + doc + prediction (df_dedup1) to the
# directory configured as config.classified_data_path.
# NOTE(review): an existing file with this name is overwritten - make sure
# df_dedup1 was built from the intended dataset before running.
dedup_filename = "lc-osc_predictions_deduplicated.csv"
df_dedup1.to_csv(config.classified_data_path+dedup_filename)

In [52]:
df_dedup1.os_prediction.value_counts()

()                        24330
(Omission,)                1869
(Omission, Stereotype)      779
(Stereotype,)               257
Name: os_prediction, dtype: int64

In [None]:
# Share of deduplicated descriptions that received at least one label (a non-empty
# prediction tuple). It is computed from df_dedup1 rather than typed in from the
# value counts above, so the figure cannot drift out of sync with the data.
has_label_dedup = df_dedup1.os_prediction != tuple()
print("Proportion of unique descriptions classified:", float(has_label_dedup.mean())) # about 10.7%

Proportion of unique descriptions classified: 0.10666421883605655


### 4. Manual Review

Export the same selection of descriptions, with their predictions, as was selected for the Baseline Omission and Stereotype Classifier.

#### 4.1 Analysis
Calculate quantities of classified and unclassified descriptions.

In [54]:
# Descriptions whose prediction is the empty tuple, i.e. no label was assigned.
is_unlabelled = df_dedup1["os_prediction"] == tuple()
unclf_docs = df_dedup1[is_unlabelled]
unclf_docs.head()

Unnamed: 0_level_0,token_id,eadid,rowid,linguistic_prediction,field,doc,os_prediction
description_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...",BP,BP,[O],processinfo,This EAD description created by Ruth Sheret 27...,()
2,"[267, 268, 269, 270, 271, 272]",BP,BP,[Gendered-Role],unittitle,Plowden (Lady Bridget) Archive,()
4,"[352, 353, 354, 355, 342, 343, 344, 345, 346, ...",BP,BP/01,[Generalization],unittitle,Chairman (Managing Committee) of the Mary Fiel...,()
5,"[384, 356, 357, 358, 359, 360, 361, 362, 363, ...",BP,BP/01/01,[O],scopecontent,Annual reports covering the years 1959 - 1995....,()
6,"[385, 386]",BP,BP/01/01,[O],unittitle,Annual Reports,()


In [55]:
# Descriptions whose prediction is anything other than the empty tuple, i.e. at
# least one label was assigned.
is_labelled = df_dedup1["os_prediction"] != tuple()
clf_docs = df_dedup1[is_labelled]
clf_docs.head()

Unnamed: 0_level_0,token_id,eadid,rowid,linguistic_prediction,field,doc,os_prediction
description_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,"[16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 2...",BP,BP,"[Gendered-Pronoun, Gendered-Role, Generalization]",scopecontent,The papers of the distinguished public servant...,"(Omission,)"
3,"[273, 274, 275, 276, 277, 278, 279, 280, 281, ...",BP,BP/01,"[Gendered-Pronoun, Gendered-Role, Generalization]",scopecontent,"Formerly the 'Working Ladies Guild', the organ...","(Omission,)"
20,"[549, 550, 551, 552, 553, 554, 555, 556, 557, ...",BP,BP/01/04,"[Gendered-Role, Generalization]",scopecontent,Correspondence in relation to Lady Plowden's p...,"(Omission,)"
22,"[640, 641, 642, 643, 644, 645, 646, 647, 648, ...",BP,BP/01/05,[Gendered-Role],scopecontent,Small file of research (not carried out by Lad...,"(Omission,)"
30,"[768, 769, 770, 771, 772, 773, 774, 775, 776, ...",BP,BP/02/03,[Gendered-Role],scopecontent,Correspondence in relation to Lady Plowden's p...,"(Omission,)"


In [56]:
print(unclf_docs.shape)
print(clf_docs.shape)

(24330, 7)
(2905, 7)


In [57]:
# Pull the description text of each subset out into a plain Python list and show
# the first classified description as a sanity check.
unclf_docs_list = unclf_docs["doc"].tolist()
clf_docs_list = clf_docs["doc"].tolist()
print(clf_docs_list[0])

The papers of the distinguished public servant Lady Plowden (1910-2000) were generously gifted to Newcastle University Library in 2003 by the Plowden family.
Held in Special Collections, Lady Plowden's papers are an extensive and rich resource reflecting her many areas of concern. Lady Plowden held a number of high-profile public roles in the spheres of education reform and television broadcasting, most notably as Chairman of the Central Advisory Council for Education (England), 1963 – 1967, Vice-Chairman of the BBC Board of Governors, 1970 – 1975, and Chairman of the Independent Broadcasting Authority (IBA), 1975 – 1980. A large and varied number of public roles followed, many of which retained this focus on primary and pre-school education reform as well as the promotion of high-quality television broadcasting. Lady Plowden's interests were wide however and these papers also reflect Lady Plowden's roles within organisations related to Romany and Traveller education and rights, adult 

In [58]:
def _token_count(text):
    """Return the number of tokens word_tokenize produces for one description."""
    return len(word_tokenize(text))


# Token count per description, kept in the same order as the document lists.
unclf_token_counts = list(map(_token_count, unclf_docs_list))
clf_token_counts = list(map(_token_count, clf_docs_list))
print(unclf_token_counts[0])
print(clf_token_counts[0])

16
251


In [60]:
# One value per line, in the order named in the heading line.
print("Minimum, average, standard deviation, median, and maximum token counts in UNCLASSIFIED descriptions:")
for summarize in (np.min, np.mean, np.std, np.median, np.max):
    print(summarize(unclf_token_counts))

Minimum, average, standard deviation, median, and maximum token counts in UNCLASSIFIED descriptions:
1
16.016933826551583
22.144144438023456
11.0
706


In [61]:
# One value per line, in the order named in the heading line.
print("Minimum, average, standard deviation, median, and maximum token counts in CLASSIFIED descriptions:")
for summarize in (np.min, np.mean, np.std, np.median, np.max):
    print(summarize(clf_token_counts))

Minimum, average, standard deviation, median, and maximum token counts in CLASSIFIED descriptions:
2
41.64647160068847
89.38022192312549
18.0
2478


In [69]:
def _description_id_as_column(frame):
    """Return `frame` with description_id available as a regular column.

    The frame is returned unchanged when description_id is already a column, so
    re-running this cell does nothing. Without this guard, a second run of
    reset_index() would add a spurious "index" column holding the old row numbers.
    """
    if "description_id" in frame.columns:
        return frame
    return frame.reset_index()


# Move description_id out of the index so the subsets below show it as a column.
clf_docs = _description_id_as_column(clf_docs)
unclf_docs = _description_id_as_column(unclf_docs)

In [70]:
# Descriptions predicted as Omission only.
omission_only = ('Omission',)
o_docs = clf_docs[clf_docs["os_prediction"] == omission_only]
print(o_docs.shape)
o_docs.head()

(1869, 8)


Unnamed: 0,description_id,token_id,eadid,rowid,linguistic_prediction,field,doc,os_prediction
0,1,"[16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 2...",BP,BP,"[Gendered-Pronoun, Gendered-Role, Generalization]",scopecontent,The papers of the distinguished public servant...,"(Omission,)"
1,3,"[273, 274, 275, 276, 277, 278, 279, 280, 281, ...",BP,BP/01,"[Gendered-Pronoun, Gendered-Role, Generalization]",scopecontent,"Formerly the 'Working Ladies Guild', the organ...","(Omission,)"
2,20,"[549, 550, 551, 552, 553, 554, 555, 556, 557, ...",BP,BP/01/04,"[Gendered-Role, Generalization]",scopecontent,Correspondence in relation to Lady Plowden's p...,"(Omission,)"
3,22,"[640, 641, 642, 643, 644, 645, 646, 647, 648, ...",BP,BP/01/05,[Gendered-Role],scopecontent,Small file of research (not carried out by Lad...,"(Omission,)"
4,30,"[768, 769, 770, 771, 772, 773, 774, 775, 776, ...",BP,BP/02/03,[Gendered-Role],scopecontent,Correspondence in relation to Lady Plowden's p...,"(Omission,)"


In [71]:
# Descriptions predicted as both Omission and Stereotype.
omission_and_stereotype = ('Omission', 'Stereotype')
os_docs = clf_docs[clf_docs["os_prediction"] == omission_and_stereotype]
print(os_docs.shape)
os_docs.head()

(779, 8)


Unnamed: 0,description_id,token_id,eadid,rowid,linguistic_prediction,field,doc,os_prediction
56,459,"[16129, 16130, 16131, 16132, 16133, 16134]",BP,BP/27/39,[Gendered-Role],unittitle,Research File: Women and Work,"(Omission, Stereotype)"
89,655,"[23424, 23425, 23426, 23427, 23428, 23429, 234...",BXB,BXB/1/1/AGA/1/4,[Gendered-Role],scopecontent,Consists of a photocopy of both the book cover...,"(Omission, Stereotype)"
90,724,"[24465, 24466, 24467, 24468, 24469, 24470, 244...",BXB,BXB/1/1/AGA/6/11,[Gendered-Role],scopecontent,Consists of photocopies of a poem entitled 'Ma...,"(Omission, Stereotype)"
96,840,"[27001, 27002, 27003]",BXB,BXB/1/1/ALV/1,[Gendered-Role],unittitle,Carrying My Wife,"(Omission, Stereotype)"
98,871,"[27235, 27236, 27237, 27238, 27239, 27240, 272...",BXB,BXB/1/1/ARM,"[Gendered-Pronoun, Gendered-Role]",bioghist,Simon Armitage was born in 1963 in Huddersfiel...,"(Omission, Stereotype)"


In [72]:
# Descriptions predicted as Stereotype only.
stereotype_only = ('Stereotype',)
s_docs = clf_docs[clf_docs["os_prediction"] == stereotype_only]
print(s_docs.shape)
s_docs.head()

(257, 8)


Unnamed: 0,description_id,token_id,eadid,rowid,linguistic_prediction,field,doc,os_prediction
45,374,"[10017, 10018, 10019]",BP,BP/19/02/10,[Gendered-Role],unittitle,Women and Employment,"(Stereotype,)"
57,460,"[16135, 16136, 16137, 16138, 16139, 16140, 161...",BP,BP/27/40,[O],unittitle,Research File: A Woman's View of the Recession...,"(Stereotype,)"
108,1180,"[33428, 33429, 33430]",BXB,BXB/1/1/BOL/1,[O],unittitle,The Peepshow Girl,"(Stereotype,)"
172,3912,"[79353, 79354, 79355, 79356, 79357, 79358, 79359]",BXB,BXB/1/1/HOJ/1,[O],unittitle,The Brief History of a Disreputable Woman,"(Stereotype,)"
188,4714,"[92263, 92264, 92265, 92266, 92267]",BXB,BXB/1/1/KEB/6,[Gendered-Role],unittitle,Euripides' The Trojan Women,"(Stereotype,)"
