# Multilabel Token Classification
## Experiments 1 and 2, Model 1
## Classification of Linguistic labels: *Gendered Pronoun*, *Gendered Role*, *Generalization*

In [1]:
import config

# For data analysis
import pandas as pd
import numpy as np
import os, re

# For creating directories
from pathlib import Path

# For word embeddings
from gensim.models import FastText #, Word2Vec
from gensim.utils import tokenize
from gensim import utils
from gensim.test.utils import get_tmpfile

# For preprocessing
import nltk
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
# nltk.download('punkt')
from nltk.corpus import PlaintextCorpusReader
# nltk.download('averaged_perceptron_tagger')
from nltk.tag import pos_tag

# For multilabel token classification
import sklearn.metrics
from sklearn.preprocessing import MultiLabelBinarizer
from skmultilearn.problem_transform import ClassifierChain
from sklearn.ensemble import RandomForestClassifier

# For saving model
import joblib
from joblib import dump,load

### 1. Create Word Embeddings

Train custom word embeddings on metadata descriptions from the University of Edinburgh Heritage Collections' Archives catalog.

* Data file: `descriptions_by_fonds`
* Date of harvesting: October 2020
* Harvesting and transformation code: [annot-prep/PreparationForAnnotation.ipynb](https://github.com/thegoose20/annot-prep/blob/main/PreparationForAnnotation.ipynb)

References:
* https://radimrehurek.com/gensim/models/fasttext.html
* https://radimrehurek.com/gensim/auto_examples/tutorials/run_fasttext.html#sphx-glr-auto-examples-tutorials-run-fasttext-py

In [2]:
dir_path = "../data/descriptions_by_fonds/"
file_list = os.listdir(dir_path)
print(len(file_list))

1079


In [3]:
class CorpusIterator:
    """Restartable iterable over the corpus: yields one list of lowercase tokens per line.

    Every call to ``__iter__`` re-lists the directory named by the module-level
    ``dir_path`` and re-reads each file in it, so a fresh pass over the whole
    corpus starts each time the object is iterated.
    """

    def __iter__(self):
        # os.listdir() returns entries in arbitrary order; sort them so every pass
        # over the corpus (and every re-run of the notebook) sees the same sequence.
        for fonds_f in sorted(os.listdir(dir_path)):
            # Check the extension itself: a substring test would also accept names
            # such as "notes.txt.bak".
            assert fonds_f.endswith(".txt"), "All files should be Plaintext."
            file_path = os.path.join(dir_path, fonds_f)
            with utils.open(file_path, 'r', encoding='utf-8') as f:
                for line in f:
                    # Lowercase the line before tokenizing
                    yield list(tokenize(line.lower()))

Define the hyperparameters for the unsupervised training of the fastText model:

In [4]:
# Hyperparameters for the unsupervised fastText training run.
# Specify training architecture (default = "cbow" for Continuous Bag of Words)
training_arch = "cbow"  # alternative: "skipgram"
# Specify the learning rate (default = 0.025)
alpha = 0.025
# Specify the training objective (default = "ns")
# losses = ["ns", "hs", "softmax"]
# loss = losses[0]
# Specify the number of negative words to sample for 'ns' training objective (default = 5)
negative = 5
# Specify the threshold for downsampling higher-frequency words (default = 0.001)
sample = 0.001
# Specify the word embeddings' dimensions
vector_dimensions = 100 # alternatives: 50, 300
# Specify the context window (default is 5)
context_window = 5
# Specify the number of epochs (default is 5)
epochs = 5
# Specify the threshold of word occurrences (ignore words that occur less than specified number of times; default = 5)
min_count = 5
# Specify the minimum and maximum length of character ngrams (defaults are 3 and 6);
# note that min_n is set to 2 here, i.e. below the stated default of 3
min_n = 2
max_n = 6  # if 0, no character n-grams (subword vectors) will be used
# Specify the number of buckets for hashing ngrams (default = 2000000)
bucket = 2000000
# Sort vocabulary by descending frequency (default = 1)
sorted_vocab = 1
# Specify the number of threads to use (default = 12)
# threads = 12

In [5]:
# Build the (untrained) fastText model from the hyperparameters defined above.
# `sg` selects the training algorithm (1 = skip-gram, otherwise CBOW). It is derived
# from `training_arch` so the trained model always matches the architecture that is
# written into the saved model's file name.
embedding_model = FastText(
    sg=int(training_arch == "skipgram"),
    alpha=alpha, negative=negative, sample=sample,
    vector_size=vector_dimensions, window=context_window,
    epochs=epochs, min_count=min_count, min_n=min_n,
    max_n=max_n, bucket=bucket, sorted_vocab=sorted_vocab
)

In [6]:
embedding_model.build_vocab(corpus_iterable=CorpusIterator())
total_examples = embedding_model.corpus_count

In [7]:
embedding_model.train(corpus_iterable=CorpusIterator(), total_examples=total_examples, epochs=epochs)

(7320895, 10119275)

Save the model:

In [8]:
file_name = "fasttext_{a}_{d}d.model".format(a=training_arch, d=vector_dimensions)
print(file_name)

fasttext_cbow_100d.model


In [9]:
# Create the output directory first so that saving also works when it does not exist yet
embeddings_dir = Path("models/embeddings/custom_fasttext/")
embeddings_dir.mkdir(parents=True, exist_ok=True)
embedding_model.save(str(embeddings_dir / file_name))

### 2. Data Preprocessing

In [10]:
token_data = config.exp_data_path+"token_5fold.csv"

In [11]:
df = pd.read_csv(token_data, index_col=0)
df.head()

Unnamed: 0,description_id,sentence_id,ann_id,token_id,token,token_offsets,pos,tag,field,fold
0,0,0,99999,0,Identifier,"(0, 10)",NN,O,Identifier,split4
1,0,0,99999,1,:,"(10, 11)",:,O,Identifier,split4
2,0,0,99999,2,AA5,"(12, 15)",NN,O,Identifier,split4
3,1,1,99999,3,Title,"(17, 22)",NN,O,Title,split2
4,1,1,99999,4,:,"(22, 23)",:,O,Title,split2


In [12]:
ling_tags = ["B-Generalization", "I-Generalization", "B-Gendered-Role", "I-Gendered-Role", "B-Gendered-Pronoun", "I-Gendered-Pronoun"]

In [13]:
labels_to_consider = ling_tags
col = "tag"

In [14]:
def implodeDataFrame(df, cols_to_groupby):
    """Collapse rows that share the same values in ``cols_to_groupby``.

    Every column not used for grouping is aggregated into a Python list per group.
    The result is indexed by ``cols_to_groupby``.
    """
    remaining_cols = list(df.columns)
    for key_col in cols_to_groupby:
        remaining_cols.remove(key_col)

    def collect(series):
        return series.tolist()

    aggregations = {name: collect for name in remaining_cols}
    grouped = df.groupby(cols_to_groupby).agg(aggregations)
    flat = grouped.reset_index()
    return flat.set_index(cols_to_groupby)

def preprocessTokenData(df, col, label_list):
    """Reduce token-level annotation rows to one row per token with a list of labels.

    Args:
        df: DataFrame with one row per token-annotation pair. It must contain ``col``
            plus the columns "description_id", "sentence_id", "token_id", "token",
            "pos", "field", "token_offsets" and "fold" used for grouping below.
        col: name of the column holding the tags.
        label_list: tags to keep; every other value in ``col`` is replaced with "O".

    Returns:
        DataFrame with one row per group of the eight grouping columns. ``col`` holds
        a sorted list of unique labels (first two characters of each kept tag removed),
        containing "O" only when the token has no other label. Every remaining
        non-grouping column holds a list of the grouped values.
    """
    initial_shape = df.shape
    # Change any tags not in label_list to "O"
    df_l = df.loc[df[col].isin(label_list)]
    df_o = df.loc[~df[col].isin(label_list)]
    # Rebuild the tag column of the non-matching rows so that it contains only "O"
    df_o = df_o.drop(columns=[col])
    df_o.insert(len(df_o.columns), col, (["O"]*(df_o.shape[0])))
    df = pd.concat([df_l, df_o])
    df = df.sort_values(by="token_id")
    # Sanity check: splitting and recombining must not add or lose rows or columns
    assert initial_shape == df.shape, "The DataFrame should have the same number of rows and columns after changing select column values."
    df = df.drop_duplicates()

    # Replace tags with labels, removing "B-" and "I-" from the start of the tags
    old_col = df[col]
    new_col = [tag[2:] if tag != "O" else tag for tag in old_col]
    df = df.drop(columns=[col])
    # Re-insert the label column so that two columns follow it
    df.insert((len(df.columns)-2), col, new_col)
    
    # Group by token, so there's one row per token and lists of tags for each token
    df = implodeDataFrame(df, [
        "description_id", "sentence_id", "token_id", "token", "pos", "field", "token_offsets", "fold"
    ])
    df = df.reset_index()
    
    # Deduplicate tag lists and remove any "O" tags from lists with other values
    old_col = list(df[col])
    dedup_col = [list(set(value_list)) for value_list in old_col]
    assert len(old_col) == len(dedup_col), "The column should have the same number of rows."
    new_col = []
    for col_list in dedup_col:
        if ("O" in col_list) and (len(col_list) > 1):
            col_list.remove("O")
        # Sort so the same set of labels always yields the same list
        col_list.sort()
        new_col += [col_list]
    assert len(new_col) == len(old_col), "The column should have the same number of rows."
    df = df.drop(columns=[col])
    # Same placement as above: two columns follow the label column
    df.insert((len(df.columns)-2), col, new_col)
    
    return df  #.explode([col])  # one tag-token pair per row, tokens can repeat across rows

In [15]:
df = preprocessTokenData(df, col, labels_to_consider)
df = df.sort_values(by="token_id")
df.head()

Unnamed: 0,description_id,sentence_id,token_id,token,pos,field,token_offsets,tag,fold,ann_id
0,0,0,0,Identifier,NN,Identifier,"(0, 10)",[O],split4,[99999]
1,0,0,1,:,:,Identifier,"(10, 11)",[O],split4,[99999]
2,0,0,2,AA5,NN,Identifier,"(12, 15)",[O],split4,[99999]
3,1,1,3,Title,NN,Title,"(17, 22)",[O],split2,[99999]
4,1,1,4,:,:,Title,"(22, 23)",[O],split2,[99999]


In [16]:
df[col].value_counts()

[O]                                   744728
[Gendered-Pronoun]                      3624
[Gendered-Role]                         3151
[Generalization]                        1808
[Gendered-Pronoun, Generalization]       107
[Gendered-Role, Generalization]          103
Name: tag, dtype: int64

### 3. Feature Extraction

In [17]:
ft_model = FastText.load(config.fasttext_path+"fasttext_cbow_100d.model")
def getFeatures(df, embedding_model=None, feature_cols=["token_id", "token"]):
    """Build the feature matrix for a DataFrame of tokens: one embedding per row.

    Args:
        df: DataFrame containing the token column named by ``feature_cols[1]``.
        embedding_model: object exposing a ``wv`` lookup (e.g. a loaded FastText model).
            When omitted, the module-level ``ft_model`` is looked up at call time, so
            reassigning ``ft_model`` takes effect without redefining this function.
        feature_cols: ``[id column, token column]``; only the token column is read.
            The default list is never mutated.

    Returns:
        numpy array with one row per token, holding the vector of the lowercased token.
    """
    if embedding_model is None:
        # Resolve the default late: a default bound in the signature would stay
        # frozen to whichever model was loaded when the function was defined.
        embedding_model = ft_model
    tokens = df[feature_cols[1]]
    return np.array([embedding_model.wv[token.lower()] for token in tokens])

Define the five splits of the data. Four of them (`split0`–`split3`) are combined iteratively into training and devtest sets for cross-validation, and the fifth (`split4`) is held out as the blind test set:

In [18]:
split_col = "fold"
splits = df[split_col].unique()
splits.sort()
print(splits)

['split0' 'split1' 'split2' 'split3' 'split4']


In [19]:
train0, devtest0 = list(splits[:3]), splits[3]
train1, devtest1 = list(splits[1:4]), splits[0]
train2, devtest2 = list(splits[2:4])+[splits[0]], splits[1]
train3, devtest3 = [splits[3]]+list(splits[:2]), splits[2]
runs = [(train0, devtest0), (train1, devtest1), (train2, devtest2), (train3, devtest3)]
test = splits[4]
print(runs)
print(test)

[(['split0', 'split1', 'split2'], 'split3'), (['split1', 'split2', 'split3'], 'split0'), (['split2', 'split3', 'split0'], 'split1'), (['split3', 'split0', 'split1'], 'split2')]
split4


In [20]:
train_splits, devtest_split = runs[-1][0], runs[-1][1]
df_train = df.loc[df[split_col].isin(train_splits)]
df_devtest = df.loc[df[split_col] == devtest_split]
df_test = df.loc[df[split_col] == test]
assert df.shape[0] == df_train.shape[0] + df_devtest.shape[0] + df_test.shape[0]

In [21]:
X_train = getFeatures(df_train)
X_devtest = getFeatures(df_devtest)
X_test = getFeatures(df_test)

In [22]:
mlb = MultiLabelBinarizer()
mlb.fit([["Gendered-Pronoun", "Gendered-Role", "Generalization"]])
y_train = mlb.transform(df_train[col])
y_devtest = mlb.transform(df_devtest[col])
y_test = mlb.transform(df_test[col])
print(mlb.classes_)
# print((df_test[col])[:10])
# print(y_test[:10])

['Gendered-Pronoun' 'Gendered-Role' 'Generalization']




### 4. Classifier Training
#### 4.1 100-Dimension Embeddings

In [23]:
clf = ClassifierChain(
    classifier = RandomForestClassifier(random_state=22),
)
clf.fit(X_train, y_train)

### 5. Prediction

In [24]:
y_pred = clf.predict(X_devtest)

Format the predicted labels and add them to the devtest data:

In [25]:
# Format the predicted tags as lists to match the format of the expected tags
pred_labels = mlb.inverse_transform(y_pred)
new_preds = []
for labels in pred_labels:
    if len(labels) == 0:
        new_preds += [["O"]]
    else:
        new_preds += [list(labels)]
print(new_preds[:5])

[['O'], ['O'], ['O'], ['O'], ['O']]


In [26]:
# Attach the predictions as a new last column. assign() returns a new frame instead of
# mutating the slice taken from `df`, and it overwrites an existing "predicted" column,
# so this cell can be re-run (insert() raises ValueError once the column exists).
df_devtest = df_devtest.assign(predicted=new_preds)
df_devtest.head()

Unnamed: 0,description_id,sentence_id,token_id,token,pos,field,token_offsets,tag,fold,ann_id,predicted
3,1,1,3,Title,NN,Title,"(17, 22)",[O],split2,[99999],[O]
4,1,1,4,:,:,Title,"(22, 23)",[O],split2,[99999],[O]
5,1,1,5,Papers,NNS,Title,"(24, 30)",[O],split2,[99999],[O]
6,1,1,6,of,IN,Title,"(31, 33)",[O],split2,[99999],[O]
7,1,1,7,The,DT,Title,"(34, 37)",[O],split2,"[14384, 24275, 52952]",[O]


In [27]:
df_devtest.predicted.value_counts()

[O]                                151743
[Gendered-Pronoun]                    980
[Gendered-Role]                       660
[Generalization]                       84
[Gendered-Role, Generalization]         2
Name: predicted, dtype: int64

In [28]:
exp_df = df_devtest.drop(columns=["predicted", "ann_id"])
exp_df = exp_df.explode(col)
pred_df = df_devtest.drop(columns=["tag", "ann_id"])
pred_df = pred_df.explode("predicted")

In [29]:
pred_col = "predicted"
exp_col = col
no_tag_value = "O"
left_on_cols = ["description_id", "sentence_id", "token_id", "token", "token_offsets", "pos", "field", "fold", exp_col]
right_on_cols = ["description_id", "sentence_id", "token_id", "token", "token_offsets", "pos", "field", "fold", pred_col]

# Add the predicted tags to the DataFrame with expected tags
exp_pred_df = pd.merge(
    left=exp_df,
    right=pred_df,
    how="outer",
    left_on=left_on_cols,
    right_on=right_on_cols,
    suffixes=["", "_pred"],
    indicator=True
)

# Replace any NaN values with "O" to indicate no expected/predicted tag
exp_pred_df[exp_col] = exp_pred_df[exp_col].fillna(no_tag_value)
exp_pred_df[pred_col] = exp_pred_df[pred_col].fillna(no_tag_value)

# Find true negatives: rows where both the expected and the predicted tag are "O"
is_true_negative = (exp_pred_df[exp_col] == no_tag_value) & (exp_pred_df[pred_col] == no_tag_value)
sub_exp_pred_df = exp_pred_df.loc[is_true_negative].assign(_merge="true negative")
# Record false negatives, false positives, and true positives based on the merge values.
# Only the "_merge" column is relabelled: DataFrame.replace() would also rewrite any other
# cell equal to "left_only", "right_only" or "both" (for example the token "both").
merge_to_agreement = {"left_only": "false negative", "right_only": "false positive", "both": "true positive"}
sub_exp_pred_df2 = exp_pred_df.loc[~is_true_negative].copy()
sub_exp_pred_df2["_merge"] = sub_exp_pred_df2["_merge"].astype(str).map(merge_to_agreement)
# Combine the DataFrames to include all agreement types and sort the DataFrame
eval_df = pd.concat([sub_exp_pred_df, sub_exp_pred_df2])
eval_df = eval_df.sort_index()
eval_df.head()

Unnamed: 0,description_id,sentence_id,token_id,token,pos,field,token_offsets,tag,fold,predicted,_merge
0,1,1,3,Title,NN,Title,"(17, 22)",O,split2,O,true negative
1,1,1,4,:,:,Title,"(22, 23)",O,split2,O,true negative
2,1,1,5,Papers,NNS,Title,"(24, 30)",O,split2,O,true negative
3,1,1,6,of,IN,Title,"(31, 33)",O,split2,O,true negative
4,1,1,7,The,DT,Title,"(34, 37)",O,split2,O,true negative


In [30]:
eval_df._merge.value_counts()

true negative     152139
true positive       1322
false negative       623
false positive       406
Name: _merge, dtype: int64

In [31]:
labels = list(eval_df.tag.unique())
labels.sort()
labels.remove("O")
print(labels)

def precisionRecallF1(tp_count, fp_count, fn_count):
    """Return ``(precision, recall, f1)`` computed from raw agreement counts.

    Each score lies between 0 (worst) and 1 (best). A score whose denominator
    would be zero is reported as 0 instead of raising.
    """
    predicted_positives = tp_count + fp_count
    expected_positives = tp_count + fn_count
    # Precision: share of predicted positives that are correct
    precision = tp_count / predicted_positives if predicted_positives != 0 else 0
    # Recall: share of expected positives that were found
    recall = tp_count / expected_positives if expected_positives != 0 else 0
    # F1: harmonic mean of precision and recall
    score_sum = precision + recall
    f_1 = (2*precision*recall) / score_sum if score_sum != 0 else 0
    return precision, recall, f_1

['Gendered-Pronoun', 'Gendered-Role', 'Generalization']


In [32]:
# Collect one record per label and build the DataFrame once. Growing a frame with
# concat() inside the loop, starting from an empty frame, turns the counts into floats
# and leaves every row with index 0.
score_columns = ["label", "false negative", "false positive", "true positive", "precision", "recall", "f1"]
score_records = []
for label in labels:
    # Rows where the label is expected and/or predicted, each distinct row kept once
    agmt_df = eval_df.loc[(eval_df[exp_col] == label) | (eval_df[pred_col] == label)].drop_duplicates()
    tp = agmt_df.loc[agmt_df._merge == "true positive"].shape[0]
    fp = agmt_df.loc[agmt_df._merge == "false positive"].shape[0]
    fn = agmt_df.loc[agmt_df._merge == "false negative"].shape[0]
    prec, rec, f1 = precisionRecallF1(tp, fp, fn)
    score_records.append({
        "label": label, "false negative": fn, "false positive": fp,
        "true positive": tp, "precision": prec, "recall": rec, "f1": f1
    })
agmt_scores = pd.DataFrame(score_records, columns=score_columns)
agmt_scores

Unnamed: 0,label,false negative,false positive,true positive,precision,recall,f1
0,Gendered-Pronoun,14.0,201.0,779.0,0.794898,0.982346,0.878737
0,Gendered-Role,241.0,175.0,487.0,0.73565,0.668956,0.700719
0,Generalization,368.0,30.0,56.0,0.651163,0.132075,0.219608


#### 4.2 300-Dimension Word Embeddings
Try training the model using higher-dimension word embeddings and evaluate its performance on the devtest data.

In [None]:
vector_dimensions = 300

# `sg` selects the training algorithm (1 = skip-gram, otherwise CBOW). It is derived
# from `training_arch` so the trained model always matches the architecture that is
# written into the saved model's file name.
embedding_model = FastText(
    sg=int(training_arch == "skipgram"),
    alpha=alpha, negative=negative, sample=sample,
    vector_size=vector_dimensions, window=context_window,
    epochs=epochs, min_count=min_count, min_n=min_n,
    max_n=max_n, bucket=bucket, sorted_vocab=sorted_vocab
)

embedding_model.build_vocab(corpus_iterable=CorpusIterator())
total_examples = embedding_model.corpus_count

embedding_model.train(corpus_iterable=CorpusIterator(), total_examples=total_examples, epochs=epochs)

file_name = "fasttext_{a}_{d}d.model".format(a=training_arch, d=vector_dimensions)
print(file_name)

# Create the output directory first so that saving also works when it does not exist yet
embeddings_dir = Path("models/embeddings/custom_fasttext/")
embeddings_dir.mkdir(parents=True, exist_ok=True)
embedding_model.save(str(embeddings_dir / file_name))

fasttext_cbow_300d.model


Feature extraction:

In [34]:
# Load the 300-dimension embeddings by name. Building the file name from the mutable
# global `vector_dimensions` loads the 100-dimension model whenever the 300d training
# cell above has not been run in the current session.
ft_model = FastText.load(config.fasttext_path+"fasttext_cbow_300d.model")
def getFeatures(df, embedding_model=None, feature_cols=["token_id", "token"]):
    """Build the feature matrix for a DataFrame of tokens: one embedding per row.

    Args:
        df: DataFrame containing the token column named by ``feature_cols[1]``.
        embedding_model: object exposing a ``wv`` lookup (e.g. a loaded FastText model).
            When omitted, the module-level ``ft_model`` is looked up at call time, so
            reassigning ``ft_model`` takes effect without redefining this function.
        feature_cols: ``[id column, token column]``; only the token column is read.
            The default list is never mutated.

    Returns:
        numpy array with one row per token, holding the vector of the lowercased token.
    """
    if embedding_model is None:
        # Resolve the default late: a default bound in the signature would stay
        # frozen to whichever model was loaded when the function was defined.
        embedding_model = ft_model
    tokens = df[feature_cols[1]]
    return np.array([embedding_model.wv[token.lower()] for token in tokens])

In [35]:
train_splits, devtest_split = runs[-1][0], runs[-1][1]
df_train = df.loc[df[split_col].isin(train_splits)]
df_devtest = df.loc[df[split_col] == devtest_split]
# df_test = df.loc[df[split_col] == test]
assert df.shape[0] == df_train.shape[0] + df_devtest.shape[0] + df_test.shape[0]

In [36]:
X_train = getFeatures(df_train)
X_devtest = getFeatures(df_devtest)
# X_test = getFeatures(df_test)

In [None]:
mlb = MultiLabelBinarizer()
mlb.fit([["Gendered-Pronoun", "Gendered-Role", "Generalization"]])
y_train = mlb.transform(df_train[col])
y_devtest = mlb.transform(df_devtest[col])
print(mlb.classes_)
# print((df_test[col])[:10])
# print(y_test[:10])

['Gendered-Pronoun' 'Gendered-Role' 'Generalization']




In [38]:
clf_300 = ClassifierChain(
    classifier = RandomForestClassifier(random_state=22),
)
clf_300.fit(X_train, y_train)

In [39]:
y_pred = clf_300.predict(X_devtest)

Format the predicted labels and add them to the devtest data:

In [40]:
# Format the predicted tags as lists to match the format of the expected tags
pred_labels = mlb.inverse_transform(y_pred)
new_preds = []
for labels in pred_labels:
    if len(labels) == 0:
        new_preds += [["O"]]
    else:
        new_preds += [list(labels)]
print(new_preds[:5])

[['O'], ['O'], ['O'], ['O'], ['O']]


In [41]:
# Attach the predictions as a new last column. assign() returns a new frame instead of
# mutating the slice taken from `df`, and it overwrites an existing "predicted_300"
# column, so this cell can be re-run (insert() raises ValueError once the column exists).
df_devtest = df_devtest.assign(predicted_300=new_preds)
df_devtest.head()

Unnamed: 0,description_id,sentence_id,token_id,token,pos,field,token_offsets,tag,fold,ann_id,predicted_300
3,1,1,3,Title,NN,Title,"(17, 22)",[O],split2,[99999],[O]
4,1,1,4,:,:,Title,"(22, 23)",[O],split2,[99999],[O]
5,1,1,5,Papers,NNS,Title,"(24, 30)",[O],split2,[99999],[O]
6,1,1,6,of,IN,Title,"(31, 33)",[O],split2,[99999],[O]
7,1,1,7,The,DT,Title,"(34, 37)",[O],split2,"[14384, 24275, 52952]",[O]


In [42]:
df_devtest.predicted_300.value_counts()

[O]                                151743
[Gendered-Pronoun]                    980
[Gendered-Role]                       660
[Generalization]                       84
[Gendered-Role, Generalization]         2
Name: predicted_300, dtype: int64

In [43]:
exp_df = df_devtest.drop(columns=["predicted_300", "ann_id"])
exp_df = exp_df.explode(col)
pred_df = df_devtest.drop(columns=["tag", "ann_id"])
pred_df = pred_df.explode("predicted_300")

In [44]:
pred_col = "predicted_300"
exp_col = col
no_tag_value = "O"
left_on_cols = ["description_id", "sentence_id", "token_id", "token", "token_offsets", "pos", "field", "fold", exp_col]
right_on_cols = ["description_id", "sentence_id", "token_id", "token", "token_offsets", "pos", "field", "fold", pred_col]

# Add the predicted tags to the DataFrame with expected tags
exp_pred_df = pd.merge(
    left=exp_df,
    right=pred_df,
    how="outer",
    left_on=left_on_cols,
    right_on=right_on_cols,
    suffixes=["", "_pred300"],
    indicator=True
)

# Replace any NaN values with "O" to indicate no expected/predicted tag
exp_pred_df[exp_col] = exp_pred_df[exp_col].fillna(no_tag_value)
exp_pred_df[pred_col] = exp_pred_df[pred_col].fillna(no_tag_value)

# Find true negatives: rows where both the expected and the predicted tag are "O"
is_true_negative = (exp_pred_df[exp_col] == no_tag_value) & (exp_pred_df[pred_col] == no_tag_value)
sub_exp_pred_df = exp_pred_df.loc[is_true_negative].assign(_merge="true negative")
# Record false negatives, false positives, and true positives based on the merge values.
# Only the "_merge" column is relabelled: DataFrame.replace() would also rewrite any other
# cell equal to "left_only", "right_only" or "both" (for example the token "both").
merge_to_agreement = {"left_only": "false negative", "right_only": "false positive", "both": "true positive"}
sub_exp_pred_df2 = exp_pred_df.loc[~is_true_negative].copy()
sub_exp_pred_df2["_merge"] = sub_exp_pred_df2["_merge"].astype(str).map(merge_to_agreement)
# Combine the DataFrames to include all agreement types and sort the DataFrame
eval_df = pd.concat([sub_exp_pred_df, sub_exp_pred_df2])
eval_df = eval_df.sort_index()
eval_df.head()

Unnamed: 0,description_id,sentence_id,token_id,token,pos,field,token_offsets,tag,fold,predicted_300,_merge
0,1,1,3,Title,NN,Title,"(17, 22)",O,split2,O,true negative
1,1,1,4,:,:,Title,"(22, 23)",O,split2,O,true negative
2,1,1,5,Papers,NNS,Title,"(24, 30)",O,split2,O,true negative
3,1,1,6,of,IN,Title,"(31, 33)",O,split2,O,true negative
4,1,1,7,The,DT,Title,"(34, 37)",O,split2,O,true negative


In [45]:
eval_df._merge.value_counts()

true negative     152139
true positive       1322
false negative       623
false positive       406
Name: _merge, dtype: int64

In [46]:
labels = list(eval_df.tag.unique())
labels.sort()
labels.remove("O")
print(labels)

def precisionRecallF1(tp_count, fp_count, fn_count):
    """Return ``(precision, recall, f1)`` computed from raw agreement counts.

    Each score lies between 0 (worst) and 1 (best). A score whose denominator
    would be zero is reported as 0 instead of raising.
    """
    predicted_positives = tp_count + fp_count
    expected_positives = tp_count + fn_count
    # Precision: share of predicted positives that are correct
    precision = tp_count / predicted_positives if predicted_positives != 0 else 0
    # Recall: share of expected positives that were found
    recall = tp_count / expected_positives if expected_positives != 0 else 0
    # F1: harmonic mean of precision and recall
    score_sum = precision + recall
    f_1 = (2*precision*recall) / score_sum if score_sum != 0 else 0
    return precision, recall, f_1

['Gendered-Pronoun', 'Gendered-Role', 'Generalization']


In [47]:
# Collect one record per label and build the DataFrame once. Growing a frame with
# concat() inside the loop, starting from an empty frame, turns the counts into floats
# and leaves every row with index 0.
score_columns = ["label", "false negative", "false positive", "true positive", "precision", "recall", "f1"]
score_records = []
for label in labels:
    # Rows where the label is expected and/or predicted, each distinct row kept once
    agmt_df_300 = eval_df.loc[(eval_df[exp_col] == label) | (eval_df[pred_col] == label)].drop_duplicates()
    tp = agmt_df_300.loc[agmt_df_300._merge == "true positive"].shape[0]
    fp = agmt_df_300.loc[agmt_df_300._merge == "false positive"].shape[0]
    fn = agmt_df_300.loc[agmt_df_300._merge == "false negative"].shape[0]
    prec, rec, f1 = precisionRecallF1(tp, fp, fn)
    score_records.append({
        "label": label, "false negative": fn, "false positive": fp,
        "true positive": tp, "precision": prec, "recall": rec, "f1": f1
    })
agmt_scores_300 = pd.DataFrame(score_records, columns=score_columns)
agmt_scores_300

Unnamed: 0,label,false negative,false positive,true positive,precision,recall,f1
0,Gendered-Pronoun,14.0,201.0,779.0,0.794898,0.982346,0.878737
0,Gendered-Role,241.0,175.0,487.0,0.73565,0.668956,0.700719
0,Generalization,368.0,30.0,56.0,0.651163,0.132075,0.219608


In [None]:
# Model with 100 Dimensions, 3 Labels
#             label	false negative	false positive	true positive	precision	recall	    f1
# 0	    Gendered-Pronoun	24.0	       178.0	   735.0	    0.805038	0.968379	0.879187
# 0	    Gendered-Role	    242.0	       169.0	   451.0	    0.727419	0.650794	0.686976
# 0    	Generalization	    318.0	        35.0	    67.0	    0.656863	0.174026	0.275154

#### 4.3 Compare Models
Run the two models classifying with 3 labels on the blind test data to see which performs better.

4.3.1 100-Dimension Model

In [75]:
ft_model = FastText.load("models/embeddings/custom_fasttext/fasttext_cbow_100d.model")
X_test = getFeatures(df_test, ft_model)
y_test = mlb.transform(df_test[col])



In [76]:
y_pred = clf.predict(X_test)

Format the predicted labels and add them to the test data:

In [77]:
# Format the predicted tags as lists to match the format of the expected tags
pred_labels = mlb.inverse_transform(y_pred)
new_preds = []
for labels in pred_labels:
    if len(labels) == 0:
        new_preds += [["O"]]
    else:
        new_preds += [list(labels)]
print(new_preds[:5])

[['O'], ['O'], ['O'], ['Gendered-Pronoun'], ['O']]


In [80]:
# Remove prediction columns left over from an earlier run of the cells below.
# errors="ignore" keeps this cell from raising KeyError on a fresh kernel, where
# df_test does not have these columns yet.
df_test = df_test.drop(columns=["predicted", "predicted_300"], errors="ignore")

In [81]:
# Attach the predictions as a new last column. assign() returns a new frame instead of
# mutating the slice taken from `df`, and it overwrites an existing "predicted" column,
# so this cell can be re-run (insert() raises ValueError once the column exists).
df_test = df_test.assign(predicted=new_preds)
df_test.head()

Unnamed: 0,description_id,sentence_id,token_id,token,pos,field,token_offsets,tag,fold,ann_id,predicted
0,0,0,0,Identifier,NN,Identifier,"(0, 10)",[O],split4,[99999],[O]
1,0,0,1,:,:,Identifier,"(10, 11)",[O],split4,[99999],[O]
2,0,0,2,AA5,NN,Identifier,"(12, 15)",[O],split4,[99999],[O]
134,3,4,134,He,PRP,Biographical / Historical,"(789, 791)",[Gendered-Pronoun],split4,[14377],[Gendered-Pronoun]
135,3,4,135,was,VBD,Biographical / Historical,"(792, 795)",[O],split4,[99999],[O]


In [82]:
df_test.predicted.value_counts()

[O]                   147310
[Gendered-Pronoun]       965
[Gendered-Role]          604
[Generalization]         101
Name: predicted, dtype: int64

In [83]:
exp_df = df_test.drop(columns=["predicted", "ann_id"])
exp_df = exp_df.explode(col)
pred_df = df_test.drop(columns=["tag", "ann_id"])
pred_df = pred_df.explode("predicted")

In [84]:
pred_col = "predicted"
exp_col = col
no_tag_value = "O"
left_on_cols = ["description_id", "sentence_id", "token_id", "token", "token_offsets", "pos", "field", "fold", exp_col]
right_on_cols = ["description_id", "sentence_id", "token_id", "token", "token_offsets", "pos", "field", "fold", pred_col]

# Add the predicted tags to the DataFrame with expected tags
exp_pred_df = pd.merge(
    left=exp_df,
    right=pred_df,
    how="outer",
    left_on=left_on_cols,
    right_on=right_on_cols,
    suffixes=["", "_pred"],
    indicator=True
)

# Replace any NaN values with "O" to indicate no expected/predicted tag
exp_pred_df[exp_col] = exp_pred_df[exp_col].fillna(no_tag_value)
exp_pred_df[pred_col] = exp_pred_df[pred_col].fillna(no_tag_value)

# Find true negatives: rows where both the expected and the predicted tag are "O"
is_true_negative = (exp_pred_df[exp_col] == no_tag_value) & (exp_pred_df[pred_col] == no_tag_value)
sub_exp_pred_df = exp_pred_df.loc[is_true_negative].assign(_merge="true negative")
# Record false negatives, false positives, and true positives based on the merge values.
# Only the "_merge" column is relabelled: DataFrame.replace() would also rewrite any other
# cell equal to "left_only", "right_only" or "both" (for example the token "both").
merge_to_agreement = {"left_only": "false negative", "right_only": "false positive", "both": "true positive"}
sub_exp_pred_df2 = exp_pred_df.loc[~is_true_negative].copy()
sub_exp_pred_df2["_merge"] = sub_exp_pred_df2["_merge"].astype(str).map(merge_to_agreement)
# Combine the DataFrames to include all agreement types and sort the DataFrame
eval_df = pd.concat([sub_exp_pred_df, sub_exp_pred_df2])
eval_df = eval_df.sort_index()
eval_df.head()

Unnamed: 0,description_id,sentence_id,token_id,token,pos,field,token_offsets,tag,fold,predicted,_merge
0,0,0,0,Identifier,NN,Identifier,"(0, 10)",O,split4,O,true negative
1,0,0,1,:,:,Identifier,"(10, 11)",O,split4,O,true negative
2,0,0,2,AA5,NN,Identifier,"(12, 15)",O,split4,O,true negative
3,3,4,134,He,PRP,Biographical / Historical,"(789, 791)",Gendered-Pronoun,split4,Gendered-Pronoun,true positive
4,3,4,135,was,VBD,Biographical / Historical,"(792, 795)",O,split4,O,true negative


In [85]:
eval_df._merge.value_counts()

true negative     147756
true positive       1214
false negative       585
false positive       456
Name: _merge, dtype: int64

In [86]:
labels = list(eval_df.tag.unique())
labels.sort()
labels.remove("O")
print(labels)

def precisionRecallF1(tp_count, fp_count, fn_count):
    """Return ``(precision, recall, f1)`` computed from raw agreement counts.

    Each score lies between 0 (worst) and 1 (best). A score whose denominator
    would be zero is reported as 0 instead of raising.
    """
    predicted_positives = tp_count + fp_count
    expected_positives = tp_count + fn_count
    # Precision: share of predicted positives that are correct
    precision = tp_count / predicted_positives if predicted_positives != 0 else 0
    # Recall: share of expected positives that were found
    recall = tp_count / expected_positives if expected_positives != 0 else 0
    # F1: harmonic mean of precision and recall
    score_sum = precision + recall
    f_1 = (2*precision*recall) / score_sum if score_sum != 0 else 0
    return precision, recall, f_1

['Gendered-Pronoun', 'Gendered-Role', 'Generalization']


In [None]:
# Collect one record per label and build the DataFrame once. Growing a frame with
# concat() inside the loop, starting from an empty frame, turns the counts into floats
# and leaves every row with index 0.
score_columns = ["label", "false negative", "false positive", "true positive", "precision", "recall", "f1"]
score_records = []
for label in labels:
    # Rows where the label is expected and/or predicted, each distinct row kept once
    agmt_df = eval_df.loc[(eval_df[exp_col] == label) | (eval_df[pred_col] == label)].drop_duplicates()
    tp = agmt_df.loc[agmt_df._merge == "true positive"].shape[0]
    fp = agmt_df.loc[agmt_df._merge == "false positive"].shape[0]
    fn = agmt_df.loc[agmt_df._merge == "false negative"].shape[0]
    prec, rec, f1 = precisionRecallF1(tp, fp, fn)
    score_records.append({
        "label": label, "false negative": fn, "false positive": fp,
        "true positive": tp, "precision": prec, "recall": rec, "f1": f1
    })
agmt_scores = pd.DataFrame(score_records, columns=score_columns)
agmt_scores

Unnamed: 0,label,false negative,false positive,true positive,precision,recall,f1
0,Gendered-Pronoun,17.0,220.0,745.0,0.772021,0.97769,0.862768
0,Gendered-Role,202.0,199.0,405.0,0.67053,0.667216,0.668869
0,Generalization,366.0,37.0,64.0,0.633663,0.148837,0.241055


4.3.2 300-Dimension Model

In [88]:
ft_model = FastText.load("models/embeddings/custom_fasttext/fasttext_cbow_300d.model")
X_test = getFeatures(df_test, ft_model)
y_test = mlb.transform(df_test[col])



In [89]:
y_pred = clf_300.predict(X_test)

Format the predicted labels and add them to the test data:

In [90]:
# Format the predicted tags as lists to match the format of the expected tags
pred_labels = mlb.inverse_transform(y_pred)
new_preds = []
for labels in pred_labels:
    if len(labels) == 0:
        new_preds += [["O"]]
    else:
        new_preds += [list(labels)]
print(new_preds[:5])

[['O'], ['O'], ['O'], ['Gendered-Pronoun'], ['O']]


In [91]:
# Attach the predictions as a new last column. assign() returns a new frame instead of
# mutating the slice taken from `df`, and it overwrites an existing "predicted_300"
# column, so this cell can be re-run (insert() raises ValueError once the column exists).
df_test = df_test.assign(predicted_300=new_preds)
df_test.head()

Unnamed: 0,description_id,sentence_id,token_id,token,pos,field,token_offsets,tag,fold,ann_id,predicted_300,predicted
0,0,0,0,Identifier,NN,Identifier,"(0, 10)",[O],split4,[99999],[O],[O]
1,0,0,1,:,:,Identifier,"(10, 11)",[O],split4,[99999],[O],[O]
2,0,0,2,AA5,NN,Identifier,"(12, 15)",[O],split4,[99999],[O],[O]
134,3,4,134,He,PRP,Biographical / Historical,"(789, 791)",[Gendered-Pronoun],split4,[14377],[Gendered-Pronoun],[Gendered-Pronoun]
135,3,4,135,was,VBD,Biographical / Historical,"(792, 795)",[O],split4,[99999],[O],[O]


In [92]:
df_test.predicted_300.value_counts()

[O]                   147310
[Gendered-Pronoun]       965
[Gendered-Role]          603
[Generalization]         102
Name: predicted_300, dtype: int64

In [93]:
exp_df = df_test.drop(columns=["predicted", "predicted_300", "ann_id"])
exp_df = exp_df.explode(col)
pred_df = df_test.drop(columns=["predicted", "tag", "ann_id"])
pred_df = pred_df.explode("predicted_300")

In [94]:
pred_col = "predicted_300"
exp_col = col
no_tag_value = "O"
left_on_cols = ["description_id", "sentence_id", "token_id", "token", "token_offsets", "pos", "field", "fold", exp_col]
right_on_cols = ["description_id", "sentence_id", "token_id", "token", "token_offsets", "pos", "field", "fold", pred_col]

# Add the predicted tags to the DataFrame with expected tags
exp_pred_df = pd.merge(
    left=exp_df,
    right=pred_df,
    how="outer",
    left_on=left_on_cols,
    right_on=right_on_cols,
    suffixes=["", "_pred300"],
    indicator=True
)

# Replace any NaN values with "O" to indicate no expected/predicted tag
exp_pred_df[exp_col] = exp_pred_df[exp_col].fillna(no_tag_value)
exp_pred_df[pred_col] = exp_pred_df[pred_col].fillna(no_tag_value)

# Find true negatives: rows where both the expected and the predicted tag are "O"
is_true_negative = (exp_pred_df[exp_col] == no_tag_value) & (exp_pred_df[pred_col] == no_tag_value)
sub_exp_pred_df = exp_pred_df.loc[is_true_negative].assign(_merge="true negative")
# Record false negatives, false positives, and true positives based on the merge values.
# Only the "_merge" column is relabelled: DataFrame.replace() would also rewrite any other
# cell equal to "left_only", "right_only" or "both" (for example the token "both").
merge_to_agreement = {"left_only": "false negative", "right_only": "false positive", "both": "true positive"}
sub_exp_pred_df2 = exp_pred_df.loc[~is_true_negative].copy()
sub_exp_pred_df2["_merge"] = sub_exp_pred_df2["_merge"].astype(str).map(merge_to_agreement)
# Combine the DataFrames to include all agreement types and sort the DataFrame
eval_df = pd.concat([sub_exp_pred_df, sub_exp_pred_df2])
eval_df = eval_df.sort_index()
eval_df.head()

Unnamed: 0,description_id,sentence_id,token_id,token,pos,field,token_offsets,tag,fold,predicted_300,_merge
0,0,0,0,Identifier,NN,Identifier,"(0, 10)",O,split4,O,true negative
1,0,0,1,:,:,Identifier,"(10, 11)",O,split4,O,true negative
2,0,0,2,AA5,NN,Identifier,"(12, 15)",O,split4,O,true negative
3,3,4,134,He,PRP,Biographical / Historical,"(789, 791)",Gendered-Pronoun,split4,Gendered-Pronoun,true positive
4,3,4,135,was,VBD,Biographical / Historical,"(792, 795)",O,split4,O,true negative


In [95]:
eval_df._merge.value_counts()

true negative     147756
true positive       1214
false negative       585
false positive       456
Name: _merge, dtype: int64

In [96]:
labels = list(eval_df.tag.unique())
labels.sort()
labels.remove("O")
print(labels)

def precisionRecallF1(tp_count, fp_count, fn_count):
    """Compute precision, recall and F1 from true/false positive and false negative counts.

    Each score ranges from 0 (worst) to 1 (best). Whenever a score's denominator
    is zero, that score is reported as 0 instead of raising a division error.

    Parameters:
        tp_count: number of true positives
        fp_count: number of false positives
        fn_count: number of false negatives

    Returns:
        A (precision, recall, f_1) tuple.
    """
    predicted_positives = tp_count + fp_count
    actual_positives = tp_count + fn_count

    # Precision: share of predicted positives that are correct
    precision = (tp_count / predicted_positives) if predicted_positives != 0 else 0
    # Recall: share of actual positives that were found
    recall = (tp_count / actual_positives) if actual_positives != 0 else 0

    # F1: harmonic mean of precision and recall
    score_sum = precision + recall
    f_1 = ((2 * precision * recall) / score_sum) if score_sum != 0 else 0

    return precision, recall, f_1

['Gendered-Pronoun', 'Gendered-Role', 'Generalization']


In [97]:
# Per-label agreement counts and precision/recall/F1 scores.
# Rows are collected as records and the DataFrame is built once, so the counts keep their
# integer dtype and each label gets its own index value (growing a frame with concat from
# an empty seed frame upcasts the counts to float and repeats index 0 on every row).
score_columns = [
    "label", "false negative", "false positive",
    "true positive", "precision", "recall", "f1"
]
score_rows = []
for label in labels:
    # A row concerns this label if it is the expected tag or the predicted tag;
    # the boolean mask picks each such row exactly once, so no de-duplication is needed
    involves_label = (eval_df[exp_col] == label) | (eval_df[pred_col] == label)
    agmt_df = eval_df.loc[involves_label]
    agmt_counts = agmt_df["_merge"].value_counts()
    tp = int(agmt_counts.get("true positive", 0))
    fp = int(agmt_counts.get("false positive", 0))
    fn = int(agmt_counts.get("false negative", 0))
    prec, rec, f1 = precisionRecallF1(tp, fp, fn)
    score_rows.append({
        "label": label, "false negative": fn, "false positive": fp,
        "true positive": tp, "precision": prec, "recall": rec, "f1": f1
    })
# Passing `columns` keeps the expected schema even when `labels` is empty
agmt_scores_300 = pd.DataFrame(score_rows, columns=score_columns)
agmt_scores_300

Unnamed: 0,label,false negative,false positive,true positive,precision,recall,f1
0,Gendered-Pronoun,17.0,220.0,745.0,0.772021,0.97769,0.862768
0,Gendered-Role,202.0,198.0,405.0,0.671642,0.667216,0.669421
0,Generalization,366.0,38.0,64.0,0.627451,0.148837,0.240602


Export the highest-performing models:

In [None]:
# The export code below is commented out, so running this cell saves nothing.
# NOTE(review): the output recorded for this cell points to 'models/multilabel_linguistic/',
# which does not match the model_dir below ('models/multilabel_token/'), and the classifier
# file name hard-codes d="100" -- confirm the directory and embedding size before re-enabling.

# model_dir = "models/multilabel_token/"
# Path(model_dir).mkdir(parents=True, exist_ok=True)

# # Save classifier
# filename = model_dir+"cc-{a}_F-fasttext{d}_T-linglabels.joblib".format(a="rf", d="100")  # include features (F) and targets (T) in model's file name
# dump(clf, filename)

# # Save multilabel binarizer
# filename = model_dir+"mlb_linglabels.joblib"
# dump(mlb, filename)

['models/multilabel_linguistic/mlb_linglabels.joblib']