# Multilabel Token Classification
## Experiments 1 and 2, Model 1
## Classification of Linguistic labels: *Gendered Pronoun*, *Gendered Role*, *Generalization*

In [1]:
import config
import my_utils

# For data analysis
import pandas as pd
import numpy as np
import os, re

# For creating directories
from pathlib import Path

# For word embeddings
from gensim.models import FastText #, Word2Vec
from gensim.utils import tokenize
from gensim import utils
from gensim.test.utils import get_tmpfile

# For preprocessing
import nltk
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
# nltk.download('punkt')
from nltk.corpus import PlaintextCorpusReader
# nltk.download('averaged_perceptron_tagger')
from nltk.tag import pos_tag

# For multilabel token classification
import sklearn.metrics
from sklearn.preprocessing import MultiLabelBinarizer
from skmultilearn.problem_transform import ClassifierChain
from sklearn.ensemble import RandomForestClassifier

# For saving model
import joblib
from joblib import dump,load

### 1. Create Word Embeddings

Train custom word embeddings on metadata descriptions from the University of Edinburgh Heritage Collections' Archives catalog.

* Data file: `descriptions_by_fonds`
* Date of harvesting: October 2020
* Harvesting and transformation code: [annot-prep/PreparationForAnnotation.ipynb](https://github.com/thegoose20/annot-prep/blob/main/PreparationForAnnotation.ipynb)

References:
* https://radimrehurek.com/gensim/models/fasttext.html
* https://radimrehurek.com/gensim/auto_examples/tutorials/run_fasttext.html#sphx-glr-auto-examples-tutorials-run-fasttext-py

In [2]:
dir_path = "../data/descriptions_by_fonds/"
file_list = os.listdir(dir_path)
print(len(file_list))

1079


In [3]:
class CorpusIterator:
    def __iter__(self):
        file_list = os.listdir(dir_path)
        for fonds_f in file_list:
            assert ".txt" in fonds_f, "All files should be Plaintext." 
            file_path = dir_path+fonds_f
            with utils.open(file_path, 'r', encoding='utf-8') as f:
                for line in f:
                    # Lowercase the tokens
                    yield list(tokenize(line.lower()))   #list(tokenize(line))

Define the hyperparameters for the unsupervised training of the fastText model:

In [4]:
# Specify training architecture (default = "cbow" for Continuous Bag of Words)
training_arch = "cbow"  #"skipgram"  
if training_arch == "skipgram":
    sg = 1
else:
    sg = 0
# Specify the learning rate (default = 0.025)
alpha = 0.025
# Specify the training objective (default = "ns")
# losses = ["ns", "hs", "softmax"]
# loss = losses[0]
# Specify the number of negative words to sample for 'ns' training objective (default = 5)
negative = 5
# Specify the threshold for downsampling higher-frequency words (default = 0.001)
sample = 0.001
# Specify the word embeddings' dimensions
vector_dimensions = 100 #50 #300
# Specify the context window (default is 5) 
context_window = 5
# Specify the number of epochs (default is 5)
epochs = 5
# Specify the threshold of word occurrences (ignore words that occur less than specified number of times; default = 5)
min_count = 5
# Specify the minimum and maximum length of character ngrams (defaults are 3 and 6)
min_n = 2
max_n = 6  # if 0, no character n-grams (subword vectors) will be used
# Specify the number of buckets for hashing ngrams (default = 2000000) 
bucket = 2000000
# Sort vocabulary by descending frequency (default = 1)
sorted_vocab = 1
# Specify the number of threads to use (default = 12)
# threads = 12

In [5]:
embedding_model = FastText(
    alpha=alpha, sg=sg, negative=negative, sample=sample,
    vector_size=vector_dimensions, window=context_window, 
    epochs=epochs, min_count=min_count, min_n=min_n, 
    max_n=max_n, bucket=bucket, sorted_vocab=sorted_vocab
)

In [6]:
embedding_model.build_vocab(corpus_iterable=CorpusIterator())
total_examples = embedding_model.corpus_count

In [7]:
embedding_model.train(corpus_iterable=CorpusIterator(), total_examples=total_examples, epochs=epochs)

(7321026, 10119275)

Save the model:

In [8]:
file_name = "fasttext_{a}_{d}d.model".format(a=training_arch, d=vector_dimensions)
print(file_name)

fasttext_cbow_100d.model


In [9]:
embedding_model.save("models/embeddings/custom_fasttext/"+file_name)

### 2. Data Preprocessing

In [5]:
token_data = config.exp_data_path+"token_5fold.csv"

In [6]:
df = pd.read_csv(token_data, index_col=0)
df.head()

Unnamed: 0,description_id,sentence_id,ann_id,token_id,token,token_offsets,pos,tag,field,fold
0,0,0,99999,0,Identifier,"(0, 10)",NN,O,Identifier,split4
1,0,0,99999,1,:,"(10, 11)",:,O,Identifier,split4
2,0,0,99999,2,AA5,"(12, 15)",NN,O,Identifier,split4
3,1,1,99999,3,Title,"(17, 22)",NN,O,Title,split2
4,1,1,99999,4,:,"(22, 23)",:,O,Title,split2


In [7]:
ling_tags = ["B-Generalization", "I-Generalization", "B-Gendered-Role", "I-Gendered-Role", "B-Gendered-Pronoun", "I-Gendered-Pronoun"]

In [8]:
labels_to_consider = ling_tags
col = "tag"

In [9]:
df = my_utils.preprocessTokenData(df, col, labels_to_consider)
df = df.sort_values(by="token_id")
df.head()

Unnamed: 0,description_id,sentence_id,token_id,token,pos,field,token_offsets,tag,fold,ann_id
0,0,0,0,Identifier,NN,Identifier,"(0, 10)",[O],split4,[99999]
1,0,0,1,:,:,Identifier,"(10, 11)",[O],split4,[99999]
2,0,0,2,AA5,NN,Identifier,"(12, 15)",[O],split4,[99999]
3,1,1,3,Title,NN,Title,"(17, 22)",[O],split2,[99999]
4,1,1,4,:,:,Title,"(22, 23)",[O],split2,[99999]


In [10]:
df[col].value_counts()

[O]                                   744728
[Gendered-Pronoun]                      3624
[Gendered-Role]                         3151
[Generalization]                        1808
[Gendered-Pronoun, Generalization]       107
[Gendered-Role, Generalization]          103
Name: tag, dtype: int64

### 3. Feature Extraction

In [11]:
ft_model = FastText.load(config.fasttext_path+"fasttext_{a}_{d}d.model".format(a=training_arch, d=vector_dimensions))
print("Loading FastText model with", training_arch, "architecture and", vector_dimensions, "dimensions.")

Loading FastText model with cbow architecture and 100 dimensions.


Define the five splits of the data to combine iteratively into training and test sets using five-fold cross-validation:

In [12]:
split_col = "fold"
splits = df[split_col].unique()
splits.sort()
print(splits)

['split0' 'split1' 'split2' 'split3' 'split4']


In [13]:
train0, devtest0 = list(splits[:3]), splits[3]
train1, devtest1 = list(splits[1:4]), splits[0]
train2, devtest2 = list(splits[2:4])+[splits[0]], splits[1]
train3, devtest3 = [splits[3]]+list(splits[:2]), splits[2]
runs = [(train0, devtest0), (train1, devtest1), (train2, devtest2), (train3, devtest3)]
test = splits[4]
print(runs)
print(test)

[(['split0', 'split1', 'split2'], 'split3'), (['split1', 'split2', 'split3'], 'split0'), (['split2', 'split3', 'split0'], 'split1'), (['split3', 'split0', 'split1'], 'split2')]
split4


In [19]:
mlb = MultiLabelBinarizer()
mlb.fit([["Gendered-Pronoun", "Gendered-Role", "Generalization"]])

### 4. 4-Fold Cross-Validation: Classifier Training and Development
#### 4.1 100-Dimension Embeddings

In [20]:
final_df_devtest = pd.DataFrame()
for run in runs:
    # Select 3 subsets of data as the training set and 1 subset of data as the devtest set
    train_splits, devtest_split = run[0], run[1]
    df_train = df.loc[df[split_col].isin(train_splits)]
    df_devtest = df.loc[df[split_col] == devtest_split]
    # df_test = df.loc[df[split_col] == test]
    # assert df.shape[0] == df_train.shape[0] + df_devtest.shape[0] + df_test.shape[0]

    # Extract features
    X_train = my_utils.getFeatures(df_train, ft_model)
    X_devtest = my_utils.getFeatures(df_devtest, ft_model)

    # Binarize the targets (a.k.a. the values in the DataFrame's 'tag' column)
    y_train = mlb.transform(df_train[col])
    y_devtest = mlb.transform(df_devtest[col])
    
    # Train a classifier
    clf = ClassifierChain(
        classifier = RandomForestClassifier(random_state=22),
    )
    clf.fit(X_train, y_train)

    # Classify the devtest data with the trained classifier
    y_pred = clf.predict(X_devtest)

    # Format the predicted tags as lists to match the format of the expected tags
    pred_labels = mlb.inverse_transform(y_pred)
    new_preds = []
    for labels in pred_labels:
        if len(labels) == 0:
            new_preds += [["O"]]
        else:
            new_preds += [list(labels)]
    
    # Add the predictions to the devtest DataFrame
    df_devtest.insert(len(df.columns), "predicted", new_preds)

    # Merge any previous devtest DataFrames with the latest devtest DataFrame
    final_df_devtest = pd.concat([final_df_devtest, df_devtest])

assert final_df_devtest.shape[0] < df.shape[0]
final_df_devtest.predicted.value_counts()




[O]                                598082
[Gendered-Pronoun]                   3677
[Gendered-Role]                      2425
[Generalization]                      353
[Gendered-Role, Generalization]         4
Name: predicted, dtype: int64

Determine the classifier's performance on the devtest sets:

In [21]:
exp_df = final_df_devtest.drop(columns=["predicted", "ann_id"])
exp_df = exp_df.explode(col)
pred_df = final_df_devtest.drop(columns=["tag", "ann_id"])
pred_df = pred_df.explode("predicted")

In [22]:
pred_col = "predicted"
exp_col = col
no_tag_value = "O"
left_on_cols = ["description_id", "sentence_id", "token_id", "token", "token_offsets", "pos", "field", "fold", exp_col]
right_on_cols = ["description_id", "sentence_id", "token_id", "token", "token_offsets", "pos", "field", "fold", pred_col]

eval_df = my_utils.getTpTnFpFn(exp_df, pred_df, pred_col, exp_col, no_tag_value, left_on_cols, right_on_cols)

In [23]:
eval_df._merge.value_counts()

true negative     599617
true positive       4889
false negative      2315
false positive      1574
Name: _merge, dtype: int64

In [24]:
labels = list(eval_df.tag.unique())
labels.sort()
labels.remove("O")
print(labels)

['Gendered-Pronoun', 'Gendered-Role', 'Generalization']


In [25]:
agmt_scores = my_utils.getPerformanceScores(eval_df, exp_col, pred_col, labels)

In [26]:
agmt_scores

Unnamed: 0,label,false negative,false positive,true positive,precision,recall,f1
0,Gendered-Pronoun,61.0,769.0,2908.0,0.790862,0.979454,0.875113
0,Gendered-Role,888.0,670.0,1759.0,0.724166,0.664526,0.693065
0,Generalization,1366.0,135.0,222.0,0.621849,0.139798,0.228278


Save the devtest predictions:

In [None]:
pred_dir = "data/multilabel_token_predictions/ccrf_ftcbow100lower/"
Path(pred_dir).mkdir(parents=True, exist_ok=True)
eval_df.to_csv(pred_dir+"ccrf_ftcbow100_devtest_predictions.csv")

Evaluate the classifier on the test set:

In [28]:
df_test = df.loc[df[split_col] == test]
X_test = my_utils.getFeatures(df_test, ft_model)
y_test = mlb.transform(df_test[col])
y_pred = clf.predict(X_test)

# Format the predicted tags as lists to match the format of the expected tags
pred_labels = mlb.inverse_transform(y_pred)
new_preds = []
for labels in pred_labels:
    if len(labels) == 0:
        new_preds += [["O"]]
    else:
        new_preds += [list(labels)]

# Add the predictions to the test DataFrame
df_test.insert(len(df.columns), "predicted", new_preds)



In [29]:
exp_df = df_test.drop(columns=["predicted", "ann_id"])
exp_df = exp_df.explode(col)
pred_df = df_test.drop(columns=["tag", "ann_id"])
pred_df = pred_df.explode("predicted")

In [None]:
pred_col = "predicted"
exp_col = col
no_tag_value = "O"
left_on_cols = ["description_id", "sentence_id", "token_id", "token", "token_offsets", "pos", "field", "fold", exp_col]
right_on_cols = ["description_id", "sentence_id", "token_id", "token", "token_offsets", "pos", "field", "fold", pred_col]

test_eval_df = my_utils.getTpTnFpFn(exp_df, pred_df, pred_col, exp_col, no_tag_value, left_on_cols, right_on_cols)
test_eval_df.head()

Unnamed: 0,description_id,sentence_id,token_id,token,pos,field,token_offsets,tag,fold,predicted,_merge
0,0,0,0,Identifier,NN,Identifier,"(0, 10)",O,split4,O,true negative
1,0,0,1,:,:,Identifier,"(10, 11)",O,split4,O,true negative
2,0,0,2,AA5,NN,Identifier,"(12, 15)",O,split4,O,true negative
3,3,4,134,He,PRP,Biographical / Historical,"(789, 791)",Gendered-Pronoun,split4,Gendered-Pronoun,true positive
4,3,4,135,was,VBD,Biographical / Historical,"(792, 795)",O,split4,O,true negative


In [39]:
labels = list(eval_df.tag.unique())
labels.sort()
labels.remove("O")
print(labels)

['Gendered-Pronoun', 'Gendered-Role', 'Generalization']


In [40]:
test_agmt_scores = my_utils.getPerformanceScores(test_eval_df, exp_col, pred_col, labels)
test_agmt_scores

Unnamed: 0,label,false negative,false positive,true positive,precision,recall,f1
0,Gendered-Pronoun,17.0,220.0,745.0,0.772021,0.97769,0.862768
0,Gendered-Role,202.0,198.0,405.0,0.671642,0.667216,0.669421
0,Generalization,366.0,38.0,64.0,0.627451,0.148837,0.240602


#### Error analysis
A review of the *Gendered Pronoun* and *Gendered Role* false positives shows that the model's predictions are actually correct; the human coders mistakenly missed the tokens that the models classified with these labels.

Save the test set predictions:

In [41]:
test_eval_df.to_csv(pred_dir+"ccrf_ftcbow100_test_predictions.csv")

#### 4.2 300-Dimension Word Embeddings
Try training the model using higher-dimension word embeddings and evaluate its performance on the devtest data.

In [42]:
vector_dimensions = 300

embedding_model = FastText(
    alpha=alpha, sg=sg, negative=negative, sample=sample,
    vector_size=vector_dimensions, window=context_window, 
    epochs=epochs, min_count=min_count, min_n=min_n, 
    max_n=max_n, bucket=bucket, sorted_vocab=sorted_vocab
)

embedding_model.build_vocab(corpus_iterable=CorpusIterator())
total_examples = embedding_model.corpus_count

embedding_model.train(corpus_iterable=CorpusIterator(), total_examples=total_examples, epochs=epochs)

file_name = "fasttext_{a}_{d}d.model".format(a=training_arch, d=vector_dimensions)
print("Trained more embeddings and saved as:", file_name)

embedding_model.save("models/embeddings/custom_fasttext/"+file_name)

Trained more embeddings and saved as: fasttext_skipgram_300d.model


In [43]:
ft_model = embedding_model #FastText.load(config.fasttext_path+f"fasttext_{training_arch}_{vector_dimensions}d.model")

In [44]:
final_df_devtest = pd.DataFrame()
for run in runs:
    # Select 3 subsets of data as the training set and 1 subset of data as the devtest set
    train_splits, devtest_split = run[0], run[1]
    df_train = df.loc[df[split_col].isin(train_splits)]
    df_devtest = df.loc[df[split_col] == devtest_split]
    df_test = df.loc[df[split_col] == test]
    assert df.shape[0] == df_train.shape[0] + df_devtest.shape[0] + df_test.shape[0]

    # Extract features
    X_train = my_utils.getFeatures(df_train, ft_model)
    X_devtest = my_utils.getFeatures(df_devtest, ft_model)

    # Binarize the targets (a.k.a. the values in the DataFrame's 'tag' column)
    y_train = mlb.transform(df_train[col])
    y_devtest = mlb.transform(df_devtest[col])
    
    # Train a classifier
    clf_300 = ClassifierChain(
        classifier = RandomForestClassifier(random_state=22),
    )
    clf_300.fit(X_train, y_train)

    # Classify the devtest data with the trained classifier
    y_pred = clf_300.predict(X_devtest)

    # Format the predicted tags as lists to match the format of the expected tags
    pred_labels = mlb.inverse_transform(y_pred)
    new_preds = []
    for labels in pred_labels:
        if len(labels) == 0:
            new_preds += [["O"]]
        else:
            new_preds += [list(labels)]
    
    # Add the predictions to the devtest DataFrame
    df_devtest.insert(len(df.columns), "predicted", new_preds)

    # Merge any previous devtest DataFrames with the latest devtest DataFrame
    final_df_devtest = pd.concat([final_df_devtest, df_devtest])

assert final_df_devtest.shape[0] < df.shape[0]
final_df_devtest.predicted.value_counts()




[O]                                598080
[Gendered-Pronoun]                   3677
[Gendered-Role]                      2427
[Generalization]                      353
[Gendered-Role, Generalization]         4
Name: predicted, dtype: int64

Determine the classifier performance on the devtest subset of data:

In [None]:
exp_df = final_df_devtest.drop(columns=["predicted", "ann_id"])
exp_df = exp_df.explode(col)
pred_df = final_df_devtest.drop(columns=["tag", "ann_id"])
pred_df = pred_df.explode("predicted")

In [None]:
pred_col = "predicted"
exp_col = col
no_tag_value = "O"
left_on_cols = ["description_id", "sentence_id", "token_id", "token", "token_offsets", "pos", "field", "fold", exp_col]
right_on_cols = ["description_id", "sentence_id", "token_id", "token", "token_offsets", "pos", "field", "fold", pred_col]

eval_df = my_utils.getTpTnFpFn(exp_df, pred_df, pred_col, exp_col, no_tag_value, left_on_cols, right_on_cols)

Unnamed: 0,description_id,sentence_id,token_id,token,pos,field,token_offsets,tag,fold,predicted,_merge
0,1,1,3,Title,NN,Title,"(17, 22)",O,split2,O,true negative
1,1,1,4,:,:,Title,"(22, 23)",O,split2,O,true negative
2,1,1,5,Papers,NNS,Title,"(24, 30)",O,split2,O,true negative
3,1,1,6,of,IN,Title,"(31, 33)",O,split2,O,true negative
4,1,1,7,The,DT,Title,"(34, 37)",O,split2,O,true negative


In [49]:
eval_df._merge.value_counts()

true negative     152138
true positive       1323
false negative       622
false positive       406
Name: _merge, dtype: int64

In [50]:
labels = list(eval_df.tag.unique())
labels.sort()
labels.remove("O")
print(labels)

['Gendered-Pronoun', 'Gendered-Role', 'Generalization']


In [None]:
agmt_scores_300 = my_utils.getPerformanceScores(eval_df, exp_col, pred_col, labels)

Unnamed: 0,label,false negative,false positive,true positive,precision,recall,f1
0,Gendered-Pronoun,14.0,201.0,779.0,0.794898,0.982346,0.878737
0,Gendered-Role,240.0,175.0,488.0,0.736048,0.67033,0.701653
0,Generalization,368.0,30.0,56.0,0.651163,0.132075,0.219608


#### 4.3 Compare Models
Run the two models classifying with 3 labels on the blind test data to see which performs best.

In [52]:
df_test = df.loc[df[split_col] == test]
y_test = mlb.transform(df_test[col])



4.3.1 100-Dimension Model

In [53]:
training_arch, dimensions = "skipgram", 100
ft_model = FastText.load("models/embeddings/custom_fasttext/fasttext_{a}_{d}d.model".format(a=training_arch, d=dimensions))
X_test = my_utils.getFeatures(df_test, ft_model)

In [54]:
y_pred = clf.predict(X_test)

Export the data with the predicted labels:

In [55]:
# Format the predicted tags as lists to match the format of the expected tags
pred_labels = mlb.inverse_transform(y_pred)
new_preds = []
for labels in pred_labels:
    if len(labels) == 0:
        new_preds += [["O"]]
    else:
        new_preds += [list(labels)]
print(new_preds[:5])

[['O'], ['O'], ['O'], ['Gendered-Pronoun'], ['O']]


In [58]:
df_test.insert(len(df.columns), "predicted", new_preds)
df_test.head()

Unnamed: 0,description_id,sentence_id,token_id,token,pos,field,token_offsets,tag,fold,ann_id,predicted
0,0,0,0,Identifier,NN,Identifier,"(0, 10)",[O],split4,[99999],[O]
1,0,0,1,:,:,Identifier,"(10, 11)",[O],split4,[99999],[O]
2,0,0,2,AA5,NN,Identifier,"(12, 15)",[O],split4,[99999],[O]
134,3,4,134,He,PRP,Biographical / Historical,"(789, 791)",[Gendered-Pronoun],split4,[14377],[Gendered-Pronoun]
135,3,4,135,was,VBD,Biographical / Historical,"(792, 795)",[O],split4,[99999],[O]


In [59]:
df_test.predicted.value_counts()

[O]                   147311
[Gendered-Pronoun]       965
[Gendered-Role]          603
[Generalization]         101
Name: predicted, dtype: int64

In [60]:
exp_df = df_test.drop(columns=["predicted", "ann_id"])
exp_df = exp_df.explode(col)
pred_df = df_test.drop(columns=["tag", "ann_id"])
pred_df = pred_df.explode("predicted")

In [None]:
pred_col = "predicted"
exp_col = col
no_tag_value = "O"
left_on_cols = ["description_id", "sentence_id", "token_id", "token", "token_offsets", "pos", "field", "fold", exp_col]
right_on_cols = ["description_id", "sentence_id", "token_id", "token", "token_offsets", "pos", "field", "fold", pred_col]

eval_df = my_utils.getTpTnFpFn(exp_df, pred_df, pred_col, exp_col, no_tag_value, left_on_cols, right_on_cols)

Unnamed: 0,description_id,sentence_id,token_id,token,pos,field,token_offsets,tag,fold,predicted,_merge
0,0,0,0,Identifier,NN,Identifier,"(0, 10)",O,split4,O,true negative
1,0,0,1,:,:,Identifier,"(10, 11)",O,split4,O,true negative
2,0,0,2,AA5,NN,Identifier,"(12, 15)",O,split4,O,true negative
3,3,4,134,He,PRP,Biographical / Historical,"(789, 791)",Gendered-Pronoun,split4,Gendered-Pronoun,true positive
4,3,4,135,was,VBD,Biographical / Historical,"(792, 795)",O,split4,O,true negative


In [62]:
eval_df._merge.value_counts()

true negative     147756
true positive       1214
false negative       585
false positive       455
Name: _merge, dtype: int64

In [63]:
labels = list(eval_df.tag.unique())
labels.sort()
labels.remove("O")
print(labels)

['Gendered-Pronoun', 'Gendered-Role', 'Generalization']


In [None]:
agmt_scores = my_utils.getPerformanceScores(eval_df, exp_col, pred_col, labels)

Unnamed: 0,label,false negative,false positive,true positive,precision,recall,f1
0,Gendered-Pronoun,17.0,220.0,745.0,0.772021,0.97769,0.862768
0,Gendered-Role,202.0,198.0,405.0,0.671642,0.667216,0.669421
0,Generalization,366.0,37.0,64.0,0.633663,0.148837,0.241055


4.3.2 300-Dimension Model

In [65]:
training_arch, dimensions = "skipgram", 300
ft_model = FastText.load("models/embeddings/custom_fasttext/fasttext_{a}_{d}d.model".format(a=training_arch, d=dimensions))
X_test = my_utils.getFeatures(df_test, ft_model)

In [66]:
y_pred = clf_300.predict(X_test)

Export the data with the predicted labels:

In [67]:
# Format the predicted tags as lists to match the format of the expected tags
pred_labels = mlb.inverse_transform(y_pred)
new_preds = []
for labels in pred_labels:
    if len(labels) == 0:
        new_preds += [["O"]]
    else:
        new_preds += [list(labels)]
print(new_preds[:5])

[['O'], ['O'], ['O'], ['Gendered-Pronoun'], ['O']]


In [None]:
df_test = df_test.drop(columns=["predicted"])
df_test.insert(len(df.columns), "predicted", new_preds)
df_test.head()

Unnamed: 0,description_id,sentence_id,token_id,token,pos,field,token_offsets,tag,fold,ann_id,predicted
0,0,0,0,Identifier,NN,Identifier,"(0, 10)",[O],split4,[99999],[O]
1,0,0,1,:,:,Identifier,"(10, 11)",[O],split4,[99999],[O]
2,0,0,2,AA5,NN,Identifier,"(12, 15)",[O],split4,[99999],[O]
134,3,4,134,He,PRP,Biographical / Historical,"(789, 791)",[Gendered-Pronoun],split4,[14377],[Gendered-Pronoun]
135,3,4,135,was,VBD,Biographical / Historical,"(792, 795)",[O],split4,[99999],[O]


In [74]:
df_test.predicted.value_counts()

[O]                   147310
[Gendered-Pronoun]       965
[Gendered-Role]          604
[Generalization]         101
Name: predicted, dtype: int64

In [75]:
exp_df = df_test.drop(columns=["predicted", "ann_id"])
exp_df = exp_df.explode(col)
pred_df = df_test.drop(columns=["tag", "ann_id"])
pred_df = pred_df.explode("predicted")

In [None]:
pred_col = "predicted"
exp_col = col
no_tag_value = "O"
left_on_cols = ["description_id", "sentence_id", "token_id", "token", "token_offsets", "pos", "field", "fold", exp_col]
right_on_cols = ["description_id", "sentence_id", "token_id", "token", "token_offsets", "pos", "field", "fold", pred_col]


eval_df = my_utils.getTpTnFpFn(exp_df, pred_df, pred_col, exp_col, no_tag_value, left_on_cols, right_on_cols)

Unnamed: 0,description_id,sentence_id,token_id,token,pos,field,token_offsets,tag,fold,predicted,_merge
0,0,0,0,Identifier,NN,Identifier,"(0, 10)",O,split4,O,true negative
1,0,0,1,:,:,Identifier,"(10, 11)",O,split4,O,true negative
2,0,0,2,AA5,NN,Identifier,"(12, 15)",O,split4,O,true negative
3,3,4,134,He,PRP,Biographical / Historical,"(789, 791)",Gendered-Pronoun,split4,Gendered-Pronoun,true positive
4,3,4,135,was,VBD,Biographical / Historical,"(792, 795)",O,split4,O,true negative


In [None]:
eval_df._merge.value_counts()

true negative     147755
true positive       1215
false negative       584
false positive       455
Name: _merge, dtype: int64

In [78]:
labels = list(eval_df.tag.unique())
labels.sort()
labels.remove("O")
print(labels)

['Gendered-Pronoun', 'Gendered-Role', 'Generalization']


In [None]:
agmt_scores_300 = my_utils.getPerformanceScores(eval_df, exp_col, pred_col, labels)

Unnamed: 0,label,false negative,false positive,true positive,precision,recall,f1
0,Gendered-Pronoun,17.0,220.0,745.0,0.772021,0.97769,0.862768
0,Gendered-Role,201.0,198.0,406.0,0.672185,0.668863,0.67052
0,Generalization,366.0,37.0,64.0,0.633663,0.148837,0.241055


The classifiers perform very similarly, with the 100-dimension skip-gram and CBOW embedding models slightly better than the 300-dimension skip-gram and CBOW models.

### 5. Export Model(s)

In [42]:
model_dir = "models/multilabel_token/"
Path(model_dir).mkdir(parents=True, exist_ok=True)

# Save classifier
filename = model_dir+"cc-{alg}_F-fasttext{a}{d}_T-linglabels.joblib".format(alg="rf", a=training_arch, d=vector_dimensions)  # include features (F) and targets (T) in model's file name
dump(clf, filename)

# Save multilabel binarizer
filename = model_dir+"mlb_targets_ling.joblib" #"mlb_linglabels.joblib"
dump(mlb, filename)

['models/multilabel_token/mlb_targets_ling.joblib']

### 6. Classify All Data
With the highest-performing model setup, train and test a classifier using a modified form of cross-validation to get predicted classifications for the entire dataset.

In [14]:
train0, test0 = list(splits[:4]), splits[4]
train1, test1 = list(splits[1:]), splits[0]
train2, test2 = list(splits[2:])+[splits[0]], splits[1]
train3, test3 = list(splits[3:])+list(splits[:2]), splits[2]
train4, test4 = [splits[4]]+list(splits[:3]), splits[3]
runs = [(train0, test0), (train1, test1), (train2, test2), (train3, test3), (train4, test4)]
for run in runs:
    print(run)

(['split0', 'split1', 'split2', 'split3'], 'split4')
(['split1', 'split2', 'split3', 'split4'], 'split0')
(['split2', 'split3', 'split4', 'split0'], 'split1')
(['split3', 'split4', 'split0', 'split1'], 'split2')
(['split4', 'split0', 'split1', 'split2'], 'split3')


In [15]:
mlb = joblib.load('models/multilabel_token/mlb_targets_ling.joblib')
mlb.classes_

array(['Gendered-Pronoun', 'Gendered-Role', 'Generalization'],
      dtype=object)

In [16]:
ft_model =  FastText.load("models/embeddings/custom_fasttext/fasttext_{a}_{d}d.model".format(a="cbow", d="100"))

In [17]:
final_df_test = pd.DataFrame()
for run in runs:
    # Select 3 subsets of data as the training set and 1 subset of data as the devtest set
    train_splits, test_split = run[0], run[1]
    df_train = df.loc[df[split_col].isin(train_splits)]
    df_test = df.loc[df[split_col] == test_split]
    assert df.shape[0] == df_train.shape[0] + df_test.shape[0]

    # Extract features
    X_train = my_utils.getFeatures(df_train, ft_model)
    X_test = my_utils.getFeatures(df_test, ft_model)

    # Binarize the targets (a.k.a. the values in the DataFrame's 'tag' column)
    y_train = mlb.transform(df_train[col])
    y_test = mlb.transform(df_test[col])
    
    # Train a classifier
    clf = ClassifierChain(
        classifier = RandomForestClassifier(random_state=22),
    )
    clf.fit(X_train, y_train)

    # Classify the test data with the trained classifier
    y_pred = clf.predict(X_test)

    # Format the predicted tags as lists to match the format of the expected tags
    pred_labels = mlb.inverse_transform(y_pred)
    new_preds = []
    for labels in pred_labels:
        if len(labels) == 0:
            new_preds += [["O"]]
        else:
            new_preds += [list(labels)]
    
    # Add the predictions to the test DataFrame
    df_test.insert(len(df.columns), "predicted", new_preds)

    # Merge any previous test DataFrames with the latest test DataFrame
    final_df_test = pd.concat([final_df_test, df_test])

final_df_test.predicted.value_counts()




[O]                                745356
[Gendered-Pronoun]                   4645
[Gendered-Role]                      3057
[Generalization]                      461
[Gendered-Role, Generalization]         2
Name: predicted, dtype: int64

Determine the classifier's performance on the devtest sets:

In [18]:
exp_df = final_df_test.drop(columns=["predicted", "ann_id"])
exp_df = exp_df.explode(col)
pred_df = final_df_test.drop(columns=["tag", "ann_id"])
pred_df = pred_df.explode("predicted")

In [19]:
pred_col = "predicted"
exp_col = col
no_tag_value = "O"
left_on_cols = ["description_id", "sentence_id", "token_id", "token", "token_offsets", "pos", "field", "fold", exp_col]
right_on_cols = ["description_id", "sentence_id", "token_id", "token", "token_offsets", "pos", "field", "fold", pred_col]

eval_df = my_utils.getTpTnFpFn(exp_df, pred_df, pred_col, exp_col, no_tag_value, left_on_cols, right_on_cols)

In [20]:
eval_df._merge.value_counts()

true negative     747328
true positive       6147
false negative      2856
false positive      2020
Name: _merge, dtype: int64

In [21]:
labels = list(eval_df.tag.unique())
labels.sort()
labels.remove("O")
print(labels)

['Gendered-Pronoun', 'Gendered-Role', 'Generalization']


In [22]:
agmt_scores = my_utils.getPerformanceScores(eval_df, exp_col, pred_col, labels)

In [24]:
agmt_scores

Unnamed: 0,label,false negative,false positive,true positive,precision,recall,f1
0,Gendered-Pronoun,77.0,991.0,3654.0,0.786652,0.979362,0.872493
0,Gendered-Role,1057.0,862.0,2197.0,0.718209,0.675169,0.696024
0,Generalization,1722.0,167.0,296.0,0.639309,0.14668,0.238613


Save the data:

In [23]:
pred_dir = "data/multilabel_token_predictions/ccrf_ftcbow100lower/"
eval_df.to_csv(pred_dir+"ccrf_ftcbow100_alldata_predictions.csv")