# Multilabel Document Classification
## Experiment 2, Model 2
### Classification of descriptions with Contextual labels: *Stereotype*, *Omission*

In [1]:
import config

# For data analysis
import pandas as pd
import numpy as np
import os, re

# For creating directories
from pathlib import Path

# For word embeddings
from gensim.models import FastText #, Word2Vec
from gensim.utils import tokenize
from gensim import utils
from gensim.test.utils import get_tmpfile

# For preprocessing
import nltk
from nltk.tokenize import word_tokenize

# For classification
import scipy
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
import sklearn.metrics
from sklearn.metrics import confusion_matrix, multilabel_confusion_matrix
from sklearn.metrics import precision_recall_fscore_support

# For saving model
import joblib
from joblib import dump,load

### 1. Data Preprocessing

Load description data:

In [2]:
doc_data = config.exp_data_path+"document_5fold.csv"
df_docs = pd.read_csv(doc_data, index_col=0, converters={"label": lambda x: x.strip("[]").replace("'","").split(", ")})
assert type(df_docs["label"][0]) == list, "The converters should ensure the 'label' column's values are loaded as lists."
df_docs = df_docs.sort_values(by="description_id")
df_docs.head()

Unnamed: 0,description_id,start_offset,end_offset,field,description,subset,label,fold
5163,1,17,76,Title,Papers of The Very Rev Prof James Whyte (1920-...,train,[Stereotype],split2
11541,2,77,633,Scope and Contents,"Sermons and addresses, 1948-1996; lectures, 19...",train,[],split3
277,3,634,1725,Biographical / Historical,Professor James Aitken White was a leading Sco...,train,[Stereotype],split4
19301,5,17,60,Title,Papers of Rev Tom Allan (1916-1965),dev,[],split1
21645,6,61,560,Scope and Contents,"Sermons and addresses, 1947-1963; essays and l...",dev,[],split3


Load data from multilabel token classification of **Linguistic** labels:

In [3]:
ling_clf_output = config.data_path+"token_clf_data/experiment1/5fold/output/cc-{a}_linglabels_baseline_fastText{d}_strict_evaluation.csv".format(a="rf",d="100")
df_features = pd.read_csv(ling_clf_output, index_col=0)
df_features.head()

Unnamed: 0,description_id,sentence_id,token_id,token,token_offsets,pos,tag,field,fold,expected_label,predicted_label,_merge
0,0,0,0,Identifier,"(0, 10)",NN,O,Identifier,split4,O,O,true negative
1,0,0,1,:,"(10, 11)",:,O,Identifier,split4,O,O,true negative
2,0,0,2,AA5,"(12, 15)",NN,O,Identifier,split4,O,O,true negative
3,1,1,3,Title,"(17, 22)",NN,O,Title,split2,O,O,true negative
4,1,1,4,:,"(22, 23)",:,O,Title,split2,O,O,true negative


In [4]:
def implodeDataFrame(df, cols_to_groupby):
    cols_to_agg = list(df.columns)
    for col in cols_to_groupby:
        cols_to_agg.remove(col)
    agg_dict = dict.fromkeys(cols_to_agg, lambda x: list(set(x)))
    return df.groupby(cols_to_groupby).agg(agg_dict).reset_index().set_index(cols_to_groupby)

In [5]:
# Group the data by description so it can be combined with the description data
df_features = implodeDataFrame(df_features[["description_id", "sentence_id", "token_id", "predicted_label"]], ["description_id"])

# Deduplicate lists in sentence_id, token_id, and predicted_label columns
cols = ["sentence_id", "token_id", "predicted_label"]
for col in cols:
    old_col = list(df_features[col])
    # dedup_col = [list(set(values)) for values in old_col]
    new_col = []
    if col == "predicted_label":
        for values in old_col:
            if (len(values) > 1) and ("O" in values):
                values.remove("O")
            new_col += [list(values)]
    else:
        new_col = old_col
    col_i = list(df_features.columns).index(col)
    df_features = df_features.drop(columns=[col])
    df_features.insert(col_i, col, new_col)

df_features.head()

Unnamed: 0_level_0,sentence_id,token_id,predicted_label
description_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,[0],"[0, 1, 2]",[O]
1,[1],"[3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]",[O]
2,[2],"[16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 2...",[O]
3,"[3, 4, 5, 6, 7, 8, 9, 10]","[109, 110, 111, 112, 113, 114, 115, 116, 117, ...",[Gendered-Pronoun]
4,[11],"[308, 309, 310]",[O]


### 2. Feature Extraction

Join the feature data (i.e. classifier-predicted **Linguistic** labels) to the document data (i.e. catalog metadata descriptions):

In [6]:
df = df_features.join(df_docs.set_index("description_id"), on="description_id", how="right")

# Rename label columns for clarity
df = df.rename(columns={"predicted_label":"linguistic_pred", "label":"exp_label"})

df.head()

Unnamed: 0_level_0,sentence_id,token_id,linguistic_pred,start_offset,end_offset,field,description,subset,exp_label,fold
description_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,[1],"[3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]",[O],17,76,Title,Papers of The Very Rev Prof James Whyte (1920-...,train,[Stereotype],split2
2,[2],"[16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 2...",[O],77,633,Scope and Contents,"Sermons and addresses, 1948-1996; lectures, 19...",train,[],split3
3,"[3, 4, 5, 6, 7, 8, 9, 10]","[109, 110, 111, 112, 113, 114, 115, 116, 117, ...",[Gendered-Pronoun],634,1725,Biographical / Historical,Professor James Aitken White was a leading Sco...,train,[Stereotype],split4
5,[12],"[320, 311, 312, 313, 314, 315, 316, 317, 318, ...",[O],17,60,Title,Papers of Rev Tom Allan (1916-1965),dev,[],split1
6,"[13, 14]","[321, 322, 323, 324, 325, 326, 327, 328, 329, ...",[O],61,560,Scope and Contents,"Sermons and addresses, 1947-1963; essays and l...",dev,[],split3


In [7]:
split_col = "fold"
splits = df[split_col].unique()
splits.sort()
train0, test0 = list(splits[:4]), splits[4]
train1, test1 = list(splits[1:]), splits[0]
train2, test2 = list(splits[2:])+[splits[0]], splits[1]
train3, test3 = list(splits[3:])+list(splits[:2]), splits[2]
train4, test4 = [splits[4]]+list(splits[:3]), splits[3]
runs = [(train0, test0), (train1, test1), (train2, test2), (train3, test3), (train4, test4)]

In [8]:
train_splits, test_split = runs[-1][0], runs[-1][1]
train_df = df.loc[df[split_col].isin(train_splits)]
test_df = df.loc[df[split_col] == test_split]
assert df.shape[0] == train_df.shape[0] + test_df.shape[0]

In [9]:
# Binarize the features
mlb_ling = joblib.load(config.models_path+"multilabel_token/mlb_linglabels.joblib")
feat_col = "linguistic_pred"
train_feat = mlb_ling.transform(train_df[feat_col])
test_feat = mlb_ling.transform(test_df[feat_col])
print(mlb_ling.classes_)

# Binarize targets
target_col = "exp_label"
mlb_so = MultiLabelBinarizer()
mlb_so = mlb_so.fit([['Omission', 'Stereotype']])
y_train = mlb_so.transform(train_df[target_col])
y_test = mlb_so.transform(test_df[target_col])
print(mlb_so.classes_)

# Vectorize the documents (descriptions)
cvectorizer = CountVectorizer()
tfidf = TfidfTransformer()
doc_col = "description"
train_docs = cvectorizer.fit_transform(train_df[doc_col])
test_docs = cvectorizer.transform(test_df[doc_col])
train_docs = tfidf.fit_transform(train_docs)
test_docs = tfidf.transform(test_docs)

# Concatenate the features and documents
X_train = scipy.sparse.hstack([train_docs, train_feat])
X_test = scipy.sparse.hstack([test_docs, test_feat])



['Gendered-Pronoun' 'Gendered-Role' 'Generalization']
['Omission' 'Stereotype']


### 3. Classifier Training

In [10]:
clf = OneVsRestClassifier(SGDClassifier(loss="hinge"))  # Use Support Vector Machines (SVM) loss function
clf.fit(X_train, y_train)

### 4. Prediction

In [11]:
y_pred = clf.predict(X_test)

In [13]:
# Format the predicted tags as lists to match the format of the expected tags
pred_labels = mlb_so.inverse_transform(y_pred)
new_preds = []
for labels in pred_labels:
    new_preds += [list(labels)]
print(new_preds[100:200])

[[], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], ['Stereotype'], [], ['Stereotype'], [], [], [], [], [], [], [], [], [], [], [], [], [], [], ['Stereotype'], [], [], [], [], [], [], [], [], [], [], [], [], [], [], ['Stereotype'], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], []]


Export the data with predicted labels:

In [14]:
test_df.insert(len(df.columns), "predicted", new_preds)
test_df.head()

Unnamed: 0_level_0,sentence_id,token_id,linguistic_pred,start_offset,end_offset,field,description,subset,exp_label,fold,predicted
description_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2,[2],"[16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 2...",[O],77,633,Scope and Contents,"Sermons and addresses, 1948-1996; lectures, 19...",train,[],split3,[]
6,"[13, 14]","[321, 322, 323, 324, 325, 326, 327, 328, 329, ...",[O],61,560,Scope and Contents,"Sermons and addresses, 1947-1963; essays and l...",dev,[],split3,[]
14,[48],"[1216, 1217, 1218, 1219, 1220, 1215]",[O],85,115,Title,Papers of John Baillie,train,[],split3,[]
15,[49],"[1221, 1222, 1223, 1224, 1225]",[O],116,143,Title,Notes and notebooks,train,[],split3,[]
16,[50],"[1226, 1227, 1228, 1229, 1230, 1231, 1232, 123...",[O],144,210,Title,Poems chiefly from lesser authors selected by ...,dev,[Omission],split3,[]


In [15]:
test_df["predicted"].value_counts()

[]                        4845
[Omission]                 372
[Omission, Stereotype]     161
[Stereotype]                84
Name: predicted, dtype: int64

In [17]:
matrix = multilabel_confusion_matrix(y_test, y_pred, labels=clf.classes_)

tn = matrix[:, 0, 0]  # True negatives
fn = matrix[:, 1, 0]  # False negatives
tp = matrix[:, 1, 1]  # True positives
fp = matrix[:, 0, 1]  # False positives
class_names = list(mlb_so.classes_)

[precision, recall, f_1, suport] = precision_recall_fscore_support(
    y_test, y_pred, beta=1.0, zero_division=0, labels=clf.classes_
)

df = pd.DataFrame({
    "labels":class_names, "true_neg":tn, "false_neg":fn, "true_pos":tp, "false_pos":fp,
    "precision":precision, "recall":recall, "f_1":f_1
})
df

Unnamed: 0,labels,true_neg,false_neg,true_pos,false_pos,precision,recall,f_1
0,Omission,4568,361,452,81,0.84803,0.555966,0.67162
1,Stereotype,5142,75,227,18,0.926531,0.751656,0.829982


Export the models:

In [18]:
model_dir = config.models_path+"multilabel_document/"
Path(model_dir).mkdir(parents=True, exist_ok=True)

# Save classifier
filename = model_dir+"sgd-svm_F-tfidf-ling_T-so.joblib"  # include features (F) and targets (T) in model's file name
dump(clf, filename)

# Save multilabel binarizer for Stereotype and Omission labels
filename = model_dir+"mlb_so.joblib"
dump(mlb_so, filename)

# Save the Count Vectorizer
filename = model_dir+"count_vectorizer.joblib"
dump(cvectorizer, filename)

# Save the Term Frequency Inverse Document Frequency (TFIDF) transformer
filename = model_dir+"tfidf_transformer.joblib"
dump(tfidf, filename)

['models/multilabel_document/tfidf_transformer.joblib']

### 6. Classification of External Data

In [19]:
bt = pd.read_csv("../data/congruence_engine/BT_sample.csv", low_memory=False)
bt = bt.reset_index()
bt = bt.rename(columns={"index":"record_id"})
bt.head()

Unnamed: 0,record_id,RecordType,AltRefNo,RefNo,Title,Original Date Field,Description,RelatedNameCode,Authority Type,Authority Name
0,0,Photograph,TCD 263/MUS 1079,BTA/5 PH/7/MUS 1079,Wedding Greetings (General),,Image of telegram.,,,
1,1,Photograph,TCD 263/MUS 1142,BTA/5 PH/7/MUS 1142,Baby (Blanket and tag design),,Image of telegram.,,,
2,2,Photograph,TCD 263/MUS 1075,BTA/5 PH/7/MUS 1075,Wedding (General Greeting),,Image of telegram.,,,
3,3,Photograph,TCD 263/MUS 977,BTA/5 PH/7/MUS 977,Ordinary,,Designer/manufacturer: Perry,,,
4,4,Photograph,TCD 263/MUS 637,BTA/5 PH/7/MUS 637,Two girls sending telegraph message,c1910,,,,


Load the models:

In [20]:
ft_model = FastText.load(config.fasttext_path+"fasttext_cbow_100d.model")

In [21]:
# Linguistic classification
mlb_ling = joblib.load(config.models_path+"multilabel_token/mlb_linglabels.joblib")
print(mlb_ling.classes_)
trained_ling_clf = joblib.load(config.models_path+"multilabel_token/cc-rf_F-fasttext100_T-linglabels.joblib")

['Gendered-Pronoun' 'Gendered-Role' 'Generalization']


In [22]:
# Omission & Stereotype classification
mlb_so = joblib.load(config.models_path+"multilabel_document/mlb_targets_so.joblib")
print(mlb_so.classes_)
trained_so_clf = joblib.load(config.models_path+"multilabel_document/sgd-svm_F-tfidf-ling_T-so.joblib")
cvectorizer = joblib.load(config.models_path+"multilabel_document/count_vectorizer.joblib")
tfidf = joblib.load(config.models_path+"multilabel_document/tfidf_transformer.joblib")

['Omission' 'Stereotype']


Preprocess the data:

In [23]:
text_col = "Title"
bt_title = list(bt[text_col])
token_id_col, token_col = [], []
last_id = 0
for row in bt_title:
    tokens = word_tokenize(str(row))
    # Exclude any rows without a title
    if tokens != []:
        token_ids = list(range(last_id, len(tokens)+last_id))
        last_id = last_id+len(tokens)
        token_id_col += [token_ids]
        token_col += [tokens]
bt_tokenized = pd.DataFrame({"record_id": list(bt["record_id"]), "token_id": token_id_col, "token": token_col})
bt_tokenized = bt_tokenized.explode(["token_id", "token"])
bt_tokenized.head()

Unnamed: 0,record_id,token_id,token
0,0,0,Wedding
0,0,1,Greetings
0,0,2,(
0,0,3,General
0,0,4,)


Create features for Linguistic classification:

In [24]:
# Zip the features
feature_data = list(zip(bt_tokenized["token_id"], bt_tokenized["token"]))
# Make FastText feature matrix
feature_list = [ft_model.wv[token.lower()] for token_id,token in feature_data]
X = np.array(feature_list)

Run the Linguistic classifier:

In [25]:
y_pred = trained_ling_clf.predict(X)
pred_col = "predicted_linguistic"

In [26]:
pred_labels = mlb_ling.inverse_transform(y_pred)
bt_tokenized.insert(len(bt_tokenized.columns), pred_col, pred_labels)

In [27]:
bt_tokenized_imploded = implodeDataFrame(bt_tokenized, ["record_id"])
bt_tokenized_imploded.head()

Unnamed: 0_level_0,token_id,token,predicted_linguistic
record_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,"[0, 1, 2, 3, 4]","[Greetings, Wedding, (, ), General]",[()]
1,"[5, 6, 7, 8, 9, 10, 11]","[tag, (, Baby, ), Blanket, design, and]",[()]
2,"[12, 13, 14, 15, 16]","[Greeting, Wedding, (, ), General]",[()]
3,[17],[Ordinary],[()]
4,"[18, 19, 20, 21, 22]","[girls, Two, telegraph, sending, message]","[(Generalization,), ()]"


In [28]:
pred_col = list(bt_tokenized_imploded["predicted_linguistic"])
new_pred_col = []
for values in pred_col:
    preds = []
    if (values != [tuple()]) and (values != []):
        for t in values:
            if len(t) > 0:
                label = str(t).strip("(',)")
                if label not in preds:
                    preds += [label]
    new_pred_col += [preds]
bt_tokenized_imploded = bt_tokenized_imploded.drop(columns=["predicted_linguistic"])
bt_tokenized_imploded.insert(len(bt_tokenized_imploded.columns), "predicted_linguistic", new_pred_col)
bt_tokenized_imploded.head()

Unnamed: 0_level_0,token_id,token,predicted_linguistic
record_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,"[0, 1, 2, 3, 4]","[Greetings, Wedding, (, ), General]",[]
1,"[5, 6, 7, 8, 9, 10, 11]","[tag, (, Baby, ), Blanket, design, and]",[]
2,"[12, 13, 14, 15, 16]","[Greeting, Wedding, (, ), General]",[]
3,[17],[Ordinary],[]
4,"[18, 19, 20, 21, 22]","[girls, Two, telegraph, sending, message]",[Generalization]


In [29]:
bt_sub = bt[["record_id", "Title"]]
bt_clf = bt_tokenized_imploded.join(bt_sub.set_index("record_id"), on="record_id", how="outer")
bt_clf.head()

Unnamed: 0_level_0,token_id,token,predicted_linguistic,Title
record_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,"[0, 1, 2, 3, 4]","[Greetings, Wedding, (, ), General]",[],Wedding Greetings (General)
1,"[5, 6, 7, 8, 9, 10, 11]","[tag, (, Baby, ), Blanket, design, and]",[],Baby (Blanket and tag design)
2,"[12, 13, 14, 15, 16]","[Greeting, Wedding, (, ), General]",[],Wedding (General Greeting)
3,[17],[Ordinary],[],Ordinary
4,"[18, 19, 20, 21, 22]","[girls, Two, telegraph, sending, message]",[Generalization],Two girls sending telegraph message


Run the Omission and Stereotype classifier:

In [30]:
features = mlb_ling.transform(bt_clf["predicted_linguistic"])

In [31]:
doc_col = "Title"
bt_clf = bt_clf.fillna("")
vectorized = cvectorizer.transform(bt_clf[doc_col])
docs = tfidf.transform(vectorized)

In [32]:
X = scipy.sparse.hstack([docs, features])

In [33]:
y_pred = trained_so_clf.predict(X)

In [34]:
pred_labels = mlb_so.inverse_transform(y_pred)
print(pred_labels[:100])

[(), (), (), (), (), ('Omission', 'Stereotype'), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), ('Stereotype',), (), (), (), (), (), (), (), (), (), (), (), (), (), (), ()]


In [35]:
new_pred_col = []
for values in pred_labels:
    preds = []
    if (values != [tuple()]) and (values != []):
        for t in values:
            if len(t) > 0:
                label = str(t).strip("(',)")
                if label not in preds:
                    preds += [label]
    new_pred_col += [preds]
print(new_pred_col[:100])
pred_col_name = "predicted_so"
bt_clf.insert(len(bt_clf.columns), pred_col_name, new_pred_col)
bt_clf.head()

[[], [], [], [], [], ['Omission', 'Stereotype'], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], ['Stereotype'], [], [], [], [], [], [], [], [], [], [], [], [], [], [], []]


Unnamed: 0_level_0,token_id,token,predicted_linguistic,Title,predicted_so
record_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,"[0, 1, 2, 3, 4]","[Greetings, Wedding, (, ), General]",[],Wedding Greetings (General),[]
1,"[5, 6, 7, 8, 9, 10, 11]","[tag, (, Baby, ), Blanket, design, and]",[],Baby (Blanket and tag design),[]
2,"[12, 13, 14, 15, 16]","[Greeting, Wedding, (, ), General]",[],Wedding (General Greeting),[]
3,[17],[Ordinary],[],Ordinary,[]
4,"[18, 19, 20, 21, 22]","[girls, Two, telegraph, sending, message]",[Generalization],Two girls sending telegraph message,[]


In [36]:
bt_clf[pred_col_name].value_counts()

[]                        69382
[Omission, Stereotype]        1
[Stereotype]                  1
Name: predicted_so, dtype: int64