# Baseline Gender Biased Token Classifiers

* Supervised learning
    * Train, Validate, and (Blind) Test Data: under directory `../data/token_clf_data/model_input/`
* Multilabel classification
    * 3 categories of labels:
        1. Person Name: Unknown, Non-binary, Feminine, Masculine
        2. Linguistic: Generalization, Gendered Pronoun, Gendered Role
        3. Contextual: Empowering, Occupation, Omission, Stereotype

***

**Table of Contents**

[0.](#0) Preprocessing

[1.](#1) Model Training and Evaluation

[2.](#2) Error Analysis

***

Load necessary libraries:

In [1]:
# For custom functions and variables
import utils, config

# For data analysis
import pandas as pd
import numpy as np
import os, re

# For creating directories
from pathlib import Path

# For visualization
import matplotlib.pyplot as plt

# For preprocessing
from nltk.stem import WordNetLemmatizer
# import spacy
from scipy import spatial

# For embeddings
from gensim.models import FastText
from gensim import utils as gensim_utils
from gensim.test.utils import get_tmpfile

# For classifcation
import sklearn.metrics as metrics
from sklearn.preprocessing import LabelEncoder, MultiLabelBinarizer
from sklearn.model_selection import GridSearchCV
from skmultilearn.ensemble import LabelSpacePartitioningClassifier, MajorityVotingClassifier
from sklearn.multiclass import OneVsRestClassifier
from skmultilearn.cluster import FixedLabelSpaceClusterer
from skmultilearn.problem_transform import ClassifierChain
from skmultilearn.adapt import MLTSVM
# Base estimators to try?
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.neural_network import MLPClassifier

# For loading multilabel data in Attribute Relation File Format
# from skmultilearn.dataset import load_from_arff, save_to_arff  # FIRST MUST INSTALL ARFF

# from sklearn.metrics import classification_report
# from sklearn.pipeline import Pipeline, FeatureUnion
# from sklearn.metrics import confusion_matrix, multilabel_confusion_matrix, plot_confusion_matrix, ConfusionMatrixDisplay
# from sklearn.metrics import precision_recall_fscore_support, f1_score

<a id="0"></a>
## 0. Preprocessing

Load the train and validation (dev) data:

In [2]:
tokc_path = "../data/token_clf_data/"
df_train = pd.read_csv(tokc_path+"model_input/token_train.csv", index_col=0)   #(config.tokc_path+"model_input/token_train.csv", index_col=0)
df_dev = pd.read_csv(tokc_path+"model_input/token_validate.csv", index_col=0)  #(config.tokc_path+"model_input/token_validate.csv", index_col=0)
print(df_train.shape, df_dev.shape)
df_train.head()

(467564, 10) (157740, 10)


Unnamed: 0,description_id,sentence_id,ann_id,token_id,token,token_offsets,pos,tag,field,subset
3,1,1,99999,3,Title,"(17, 22)",NN,O,Title,train
4,1,1,99999,4,:,"(22, 23)",:,O,Title,train
5,1,1,99999,5,Papers,"(24, 30)",NNS,O,Title,train
6,1,1,99999,6,of,"(31, 33)",IN,O,Title,train
7,1,1,14384,7,The,"(34, 37)",DT,B-Unknown,Title,train


In [3]:
df_train = df_train.drop(columns=["ann_id"])
df_train = df_train.drop_duplicates()
df_dev = df_dev.drop(columns=["ann_id"])
df_dev = df_dev.drop_duplicates()
print(df_train.shape, df_dev.shape)

(463441, 9) (156146, 9)


Remove Non-binary labels as these were mistaken labels identified early on that were meant to be excluded, and because only one token has this label, it prevents the data from being input into the models with cross-validation.

In [4]:
df_train = df_train.loc[df_train.tag != "B-Nonbinary"]
df_train = df_train.loc[df_train.tag != "I-Nonbinary"]

In [5]:
df_train.shape

(463439, 9)

Group the data by token, so there is one row per token rather than one row per token-tag pair:

In [6]:
subdf_train = df_train.drop(columns=["description_id", "field", "subset", "token_offsets"])
subdf_dev = df_dev.drop(columns=["description_id", "field", "subset", "token_offsets"])
df_train_imploded = utils.implodeDataFrame(subdf_train, ["sentence_id", "token_id", "token", "pos"])
df_train_imploded = df_train_imploded.reset_index()
df_dev_imploded = utils.implodeDataFrame(subdf_dev, ["sentence_id", "token_id", "token", "pos"])
df_dev_imploded = df_dev_imploded.reset_index()
df_dev_imploded.head()

Unnamed: 0,sentence_id,token_id,token,pos,tag
0,5,154,After,IN,[O]
1,5,155,his,PRP$,[B-Gendered-Pronoun]
2,5,156,ordination,NN,[O]
3,5,157,he,PRP,[B-Gendered-Pronoun]
4,5,158,spent,VBD,[O]


***
#### Optional Preprocessing Steps

Group the data by sentence, creating one row for a sentence per token-tag combination (so if a token has 3 different tags, the sentence with that token will have three rows):

In [176]:
# train_sentences = utils.implodeDataFrame(df_train[["sentence_id", "token_id", "token", "tag"]], ["sentence_id", "token_id", "token"])
# train_sentences = train_sentences.reset_index()
# train_sentences = utils.implodeDataFrame(train_sentences, ["sentence_id"])
# train_sentences = train_sentences.rename(columns={"token":"sentence"})
# train_sentences = train_sentences.reset_index()
# train_sentences.head()

Unnamed: 0,sentence_id,token_id,sentence,tag
0,1,"[3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]","[Title, :, Papers, of, The, Very, Rev, Prof, J...","[[O], [O], [O], [O], [B-Unknown, B-Masculine, ..."
1,2,"[16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 2...","[Scope, and, Contents, :, Sermons, and, addres...","[[O], [O], [O], [O], [O], [O], [O], [O], [O], ..."
2,3,"[109, 110, 111, 112, 113, 114, 115, 116, 117, ...","[Biographical, /, Historical, :, Professor, Ja...","[[O], [O], [O], [O], [B-Masculine], [I-Masculi..."
3,7,"[216, 217, 218, 219, 220, 221, 222, 223, 224, ...","[His, primary, interests, were, in, liturgy, a...","[[B-Gendered-Pronoun], [O], [O], [O], [O], [O]..."
4,8,"[233, 234, 235, 236, 237, 238, 239, 240, 241, ...","[James, Whyte, was, called, upon, to, preach, ...","[[B-Masculine], [I-Masculine], [O], [O], [O], ..."


In [165]:
# token_tag_imploded = utils.implodeDataFrame(df_train[["sentence_id", "token_id", "token", "tag"]], ["sentence_id", "token_id", "token"]).reset_index()
# token_tag_imploded = token_tag_imploded.set_index("token_id")
# # token_tag_imploded.head()
# token_to_tags = token_tag_imploded.to_dict(orient="index")  # token IDs are keys

In [166]:
# token_to_tags[10]

{'sentence_id': 1,
 'token': 'Prof',
 'tag': ['I-Stereotype', 'I-Masculine', 'I-Unknown']}

In [168]:
# x = [[1], [1, 2, 3], [2, 4]]
# x_lengths = [len(item) for item in x]
# max_i = x_lengths.index(max(x_lengths))
# print(x[max_i])

[1, 2, 3]


In [180]:
# subdf_train = df_train.drop(columns=["tag"])
# subdf_train_imploded = utils.implodeDataFrame(subdf_train, ["sentence_id"]).reset_index()
# sent_to_token = subdf_train_imploded.to_dict(orient="index")  # sentence IDs are keys

# sent_id_col, token_id_col, sentence_col, tag_col = [], [], [], []
# # for sentence_id,values in sent_to_token.items():
# sent_id = 1
# values = sent_to_token[1]
# token_ids = values["token_id"]
# for token_id in token_ids:
#     tags = token_to_tags[token_id]["tag"]
#     for i in range(len(tags)):
#         sent_id_col += [sent_id]

#         token_id_col += [token_ids]

#         s = values["sentence"]
#         sentence_col += [sentence]

#         tag = tags[i]
#         s_tags = [token_to_tags[t_id]["tag"][0] if t_id != token_id else tag]
#         tag_col += [s_tags]

# print(sent_id_col)
# print(tag_col)

# # def sentencePerTokenTagCombo(df):
# #     tag_col = list(df.tag)
# #     sentence_col = list(df.sentence)
# #     sid_col = list(df.sentence_id)
# #     for i,tags in enumerate(tag_col):
# #         for j,tag_list in enumerate(tags):
# #             if len(tag_list) > 1:
# #                 sid_to_repeat = sid_col[i]
# #                 sentence_to_repeat = sentence_col[i]
# #                 multi_tagged_token = sentence_to_repeat[j]

KeyError: 'sentence'

In [140]:
# dev_sentences = utils.implodeDataFrame(df_dev[["sentence_id", "token_id", "token", "tag"]], ["sentence_id", "token_id", "token"])
# dev_sentences = dev_sentences.reset_index()
# dev_sentences = utils.implodeDataFrame(dev_sentences, ["sentence_id"])
# dev_sentences = dev_sentences.rename(columns={"token":"sentence"})
# dev_sentences = dev_sentences.reset_index()
# dev_sentences.head()

Unnamed: 0,sentence_id,token_id,sentence,tag
0,5,"[154, 155, 156, 157, 158, 159, 160, 161, 162, ...","[After, his, ordination, he, spent, three, yea...","[[O], [B-Gendered-Pronoun], [O], [B-Gendered-P..."
1,11,"[308, 309, 310]","[Identifier, :, AA6]","[[O], [O], [O]]"
2,13,"[321, 322, 323, 324, 325, 326, 327, 328, 329, ...","[Scope, and, Contents, :, Sermons, and, addres...","[[O], [O], [O], [O], [O], [O], [O], [O], [O], ..."
3,18,"[498, 499, 500, 501, 502, 503, 504, 505, 506, ...","[In, 1941, Tom, Allan, married, Jane, Moore, a...","[[O], [O], [B-Masculine, B-Stereotype], [I-Mas..."
4,24,"[649, 650, 651, 652, 653, 654, 655, 656, 657, ...","[In, 1955, Rev, Tom, Allan, accepted, a, call,...","[[O], [O], [B-Masculine], [I-Masculine], [I-Ma..."


If not classifying all labels at once, consider only the rows with tags for the select subset of labels:

In [5]:
# label_subset = ["B-Stereotype", "I-Stereotype", "B-Omission", "I-Omission", "B-Occupation", "I-Occupation"]
# label_subset = ["B-Unknown", "I-Unknown", "B-Feminine", "I-Feminine", "B-Masculine", "I-Masculine", "B-Nonbinary", "I-Nonbinary"]
# label_subset = ["B-Generalization", "I-Generalization", "B-Gendered-Role", "I-Gendered-Role", "B-Gendered-Pronoun", "I-Gendered-Pronoun"]
# df_train = df_train.loc[df_train.tag.isin(label_subset)]
# df_dev = df_dev.loc[df_dev.tag.isin(label_subset)]
# print(df_train.shape, df_dev.shape)

Optionally, lemmatize the tokens (a form of normalization, a.k.a. standardization):

In [6]:
lmtzr = WordNetLemmatizer()

In [7]:
tokens_train = list(df_train.token)
lemmas_train = [lmtzr.lemmatize(token) for token in tokens_train]
tokens_dev = list(df_dev.token)
lemmas_dev = [lmtzr.lemmatize(token) for token in tokens_dev]

In [8]:
df_train.insert((list(df_train.columns).index("token")+1), "lemma", lemmas_train)
df_dev.insert((list(df_dev.columns).index("token")+1), "lemma", lemmas_dev)

In [9]:
# df_train.tail()
df_dev.head()

Unnamed: 0,description_id,sentence_id,ann_id,token_id,token,lemma,token_offsets,pos,tag,field,subset
172,3,5,99999,154,After,After,"(907, 912)",IN,O,Biographical / Historical,dev
173,3,5,14379,155,his,his,"(913, 916)",PRP$,B-Gendered-Pronoun,Biographical / Historical,dev
174,3,5,99999,156,ordination,ordination,"(917, 927)",NN,O,Biographical / Historical,dev
175,3,5,14380,157,he,he,"(928, 930)",PRP,B-Gendered-Pronoun,Biographical / Historical,dev
176,3,5,99999,158,spent,spent,"(931, 936)",VBD,O,Biographical / Historical,dev


***

Associate word embeddings to the tokens:

In [7]:
file_name = tokc_path+"fasttext100_lowercased.model"  #get_tmpfile(config.tokc_path+"fasttext100_lowercased.model")
embedding_model = FastText.load(file_name)

In [8]:
vocabulary = list(df_train.token.unique())
vocabulary_lowercased = [token.lower() for token in vocabulary]
vocabulary_lowercased = list(set(vocabulary_lowercased))
print("Vocabulary size:", len(vocabulary))
print("Lowercased vocabulary size:", len(vocabulary_lowercased))

Vocabulary size: 35968
Lowercased vocabulary size: 31335


Vectorize and binarize the data:

In [9]:
mlb = MultiLabelBinarizer()

In [10]:
target_col = "tag"
feature_cols = ["token_id", "token"]  # try sentence_id?
train_data = df_train_imploded  #.head(int((df_train_imploded.shape[0]/2)))
dev_data = df_dev_imploded      #.head(int((df_dev_imploded.shape[0]/2)))

Extract features:

In [11]:
# Get a vector representation of a token from a fastText word embedding model
def extractEmbedding(token, fasttext_model=embedding_model):
    if token.isalpha():
        token = token.lower()
    embedding = fasttext_model.wv[token]
    return embedding

def makeFeatureMatrix(token_data):
    feature_list = [extractEmbedding(token) for token_id,token in token_data]
    return np.array(feature_list)

In [12]:
train_tokens = list(zip(train_data[feature_cols[0]], train_data[feature_cols[1]]))
dev_tokens = list(zip(dev_data[feature_cols[0]], dev_data[feature_cols[1]]))

In [13]:
X_train = makeFeatureMatrix(train_tokens)
X_dev = makeFeatureMatrix(dev_tokens)
print(X_train.shape, X_dev.shape)  # number_of_samples, number_of_features

(452086, 100) (152455, 100)


Binarize targets:

In [14]:
y_train_labels = train_data[target_col]
y_train = mlb.fit_transform(y_train_labels)
y_dev_labels = dev_data[target_col]
y_dev = mlb.transform(y_dev_labels)
print(y_train.shape, y_dev.shape)  # number_of_samples, number_of_labels

(452086, 19) (152455, 19)


In [15]:
for labels in y_train:
    if sum(labels) > 1:
        print("Multilabelled tokens exist, as expected.")
        break

Multilabelled tokens exist, as expected.


<a id="1"></a>

## 1. Baseline Multilabel Classifier

For baseline models, use only the tokens' embeddings as features.

### Majority Voting Classifier

#### Train & Predict

In [21]:
# clf = MLTSVM(c_k = 2**-1)
# clf.fit(X_train, y_train)  # full dataset needs 1.47 TiB of data

In [None]:
# predictions = clf.predict(X_test)

In [67]:
# ## Use Grid Search to optimize parameters
# parameters = {
#     'max_depth': [depth for depth in range(3,6)],
#     'class_weight': ['balanced', 'balanced_subsample', None],
#     'max_features': ['sqrt', 'log2', 10]
#              }
# score = 'f1_macro'

# clf = GridSearchCV(RandomForestClassifier(random_state=22,), parameters, scoring=score)
# clf.fit(X_train, y_train)

# print (clf.best_params_, clf.best_score_)  #{'class_weight': None, 'max_depth': 3, 'max_features': 'sqrt'} 0.07822504011047318}

  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(tru

{'class_weight': None, 'max_depth': 3, 'max_features': 'sqrt'} 0.07822504011047318


In [18]:
clf = MajorityVotingClassifier(
    clusterer = FixedLabelSpaceClusterer(clusters = [[1,2,3], [0, 2, 5], [4, 5]]),  # what's a good way to decide on the clusters?
    classifier = ClassifierChain(classifier=RandomForestClassifier(max_depth=3, class_weight=None, max_features='sqrt', random_state=22))
)
clf.fit(X_train, y_train)

In [19]:
predictions = clf.predict(X_dev)

In [21]:
clf_mvc_rf = MajorityVotingClassifier(
    clusterer = FixedLabelSpaceClusterer(clusters = [[1,2,3], [0, 2, 5], [4, 5]]),  # what's a good way to decide on the clusters?
    classifier = ClassifierChain(classifier=RandomForestClassifier(random_state=22))
)
clf_mvc_rf.fit(X_train, y_train)

In [22]:
predictions = clf_mvc_rf.predict(X_dev)

In [23]:
# Majority Voting + RF
print("Precision - weighted:", metrics.precision_score(y_dev, predictions, average="weighted", zero_division=0))
print("Precision - macro:", metrics.precision_score(y_dev, predictions, average="macro", zero_division=0))  # macro = mean of all labels' score
print()
print("Recall - weighted:", metrics.recall_score(y_dev, predictions, average="weighted", zero_division=0))
print("Recall - macro:", metrics.recall_score(y_dev, predictions, average="macro", zero_division=0))
print()
print("F1 Score - weighted:", metrics.f1_score(y_dev, predictions, average="weighted", zero_division=0))
print("F1 Score - macro:", metrics.f1_score(y_dev, predictions, average="macro", zero_division=0))
print()
print("Accuracy - normalized:", metrics.accuracy_score(y_dev, predictions, normalize=True))  # fraction of correctly classified samples
print("Accuracy - unnormalized:", metrics.accuracy_score(y_dev, predictions, normalize=False))  # number of correctly classified samples

Precision - weighted: 0.015679590603897366
Precision - macro: 0.22096954431353613

Recall - weighted: 0.014409591023785431
Recall - macro: 0.19350795123474884

F1 Score - weighted: 0.014782658612655562
F1 Score - macro: 0.201702022278094

Accuracy - normalized: 0.00846807254599718
Accuracy - unnormalized: 1291


#### Algorithm Experiments

In [54]:
# Majority Voting + Extra Trees with max. depth of 4 and balanced classes
print("Precision - weighted:", metrics.precision_score(y_dev, predictions, average="weighted", zero_division=0))
print("Precision - macro:", metrics.precision_score(y_dev, predictions, average="macro", zero_division=0))  # macro = mean of all labels' score
print()
print("Recall - weighted:", metrics.recall_score(y_dev, predictions, average="weighted", zero_division=0))
print("Recall - macro:", metrics.recall_score(y_dev, predictions, average="macro", zero_division=0))
print()
print("F1 Score - weighted:", metrics.f1_score(y_dev, predictions, average="weighted", zero_division=0))
print("F1 Score - macro:", metrics.f1_score(y_dev, predictions, average="macro", zero_division=0))
print()
print("Accuracy - normalized:", metrics.accuracy_score(y_dev, predictions, normalize=True))  # fraction of correctly classified samples
print("Accuracy - unnormalized:", metrics.accuracy_score(y_dev, predictions, normalize=False))  # number of correctly classified samples

Precision - weighted: 0.004975145358542142
Precision - macro: 0.05673776476853289

Recall - weighted: 0.019385703123999332
Recall - macro: 0.270422193131941

F1 Score - weighted: 0.006264680707162905
F1 Score - macro: 0.07270225712081037

Accuracy - normalized: 0.0028467416614738777
Accuracy - unnormalized: 434


In [50]:
# Majority Voting + Extra Trees with max. depth of 3
print("Precision - weighted:", metrics.precision_score(y_dev, predictions, average="weighted", zero_division=0))
print("Precision - macro:", metrics.precision_score(y_dev, predictions, average="macro", zero_division=0))  # macro = mean of all labels' score
print()
print("Recall - weighted:", metrics.recall_score(y_dev, predictions, average="weighted", zero_division=0))
print("Recall - macro:", metrics.recall_score(y_dev, predictions, average="macro", zero_division=0))
print()
print("F1 Score - weighted:", metrics.f1_score(y_dev, predictions, average="weighted", zero_division=0))
print("F1 Score - macro:", metrics.f1_score(y_dev, predictions, average="macro", zero_division=0))
print()
print("Accuracy - normalized:", metrics.accuracy_score(y_dev, predictions, normalize=True))  # fraction of correctly classified samples
print("Accuracy - unnormalized:", metrics.accuracy_score(y_dev, predictions, normalize=False))  # number of correctly classified samples

Precision - weighted: 0.004173136841655642
Precision - macro: 0.04609639397829385

Recall - weighted: 0.002529683757508998
Recall - macro: 0.027942840973401245

F1 Score - weighted: 0.0031499325783877733
F1 Score - macro: 0.03479409821625192

Accuracy - normalized: 0.0024203863435112
Accuracy - unnormalized: 369


In [41]:
# Majority Voting + LR with OvR and balanced class weights
print("Precision - weighted:", metrics.precision_score(y_dev, predictions, average="weighted", zero_division=0))
print("Precision - macro:", metrics.precision_score(y_dev, predictions, average="macro", zero_division=0))  # macro = mean of all labels' score
print()
print("Recall - weighted:", metrics.recall_score(y_dev, predictions, average="weighted", zero_division=0))
print("Recall - macro:", metrics.recall_score(y_dev, predictions, average="macro", zero_division=0))
print()
print("F1 Score - weighted:", metrics.f1_score(y_dev, predictions, average="weighted", zero_division=0))
print("F1 Score - macro:", metrics.f1_score(y_dev, predictions, average="macro", zero_division=0))
print()
print("Accuracy - normalized:", metrics.accuracy_score(y_dev, predictions, normalize=True))  # fraction of correctly classified samples
print("Accuracy - unnormalized:", metrics.accuracy_score(y_dev, predictions, normalize=False))  # number of correctly classified samples

Precision - weighted: 0.004736068132697176
Precision - macro: 0.0538548492552831

Recall - weighted: 0.01968029920715228
Recall - macro: 0.27861743847249637

F1 Score - weighted: 0.00606212683599175
F1 Score - macro: 0.06995432567570094

Accuracy - normalized: 0.0018825227116198223
Accuracy - unnormalized: 287


In [36]:
# Majority Voting + LR with OvR
print("Precision - weighted:", metrics.precision_score(y_dev, predictions, average="weighted", zero_division=0))
print("Precision - macro:", metrics.precision_score(y_dev, predictions, average="macro", zero_division=0))  # macro = mean of all labels' score
print()
print("Recall - weighted:", metrics.recall_score(y_dev, predictions, average="weighted", zero_division=0))
print("Recall - macro:", metrics.recall_score(y_dev, predictions, average="macro", zero_division=0))
print()
print("F1 Score - weighted:", metrics.f1_score(y_dev, predictions, average="weighted", zero_division=0))
print("F1 Score - macro:", metrics.f1_score(y_dev, predictions, average="macro", zero_division=0))
print()
print("Accuracy - normalized:", metrics.accuracy_score(y_dev, predictions, normalize=True))  # fraction of correctly classified samples
print("Accuracy - unnormalized:", metrics.accuracy_score(y_dev, predictions, normalize=False))  # number of correctly classified samples

Precision - weighted: 0.014542711068512293
Precision - macro: 0.18511478280829652

Recall - weighted: 0.00968324516798381
Recall - macro: 0.12519454215074025

F1 Score - weighted: 0.010533287235813869
F1 Score - macro: 0.13646371728939513

Accuracy - normalized: 0.005588534321603096
Accuracy - unnormalized: 852


In [47]:
# Majority Voting + Ridge Classifier
print("Precision - weighted:", metrics.precision_score(y_dev, predictions, average="weighted", zero_division=0))
print("Precision - macro:", metrics.precision_score(y_dev, predictions, average="macro", zero_division=0))  # macro = mean of all labels' score
print()
print("Recall - weighted:", metrics.recall_score(y_dev, predictions, average="weighted", zero_division=0))
print("Recall - macro:", metrics.recall_score(y_dev, predictions, average="macro", zero_division=0))
print()
print("F1 Score - weighted:", metrics.f1_score(y_dev, predictions, average="weighted", zero_division=0))
print("F1 Score - macro:", metrics.f1_score(y_dev, predictions, average="macro", zero_division=0))
print()
print("Accuracy - normalized:", metrics.accuracy_score(y_dev, predictions, normalize=True))  # fraction of correctly classified samples
print("Accuracy - unnormalized:", metrics.accuracy_score(y_dev, predictions, normalize=False))  # number of correctly classified samples

Precision - weighted: 0.003926223662819222
Precision - macro: 0.04336899547641273

Recall - weighted: 0.0038681746570517336
Recall - macro: 0.04272778720996038

F1 Score - weighted: 0.0038969829991150843
F1 Score - macro: 0.043046003634679114

Accuracy - normalized: 0.0036338591715588207
Accuracy - unnormalized: 554


In [44]:
# Majority Voting + Ridge Classifier with balanced classes
print("Precision - weighted:", metrics.precision_score(y_dev, predictions, average="weighted", zero_division=0))
print("Precision - macro:", metrics.precision_score(y_dev, predictions, average="macro", zero_division=0))  # macro = mean of all labels' score
print()
print("Recall - weighted:", metrics.recall_score(y_dev, predictions, average="weighted", zero_division=0))
print("Recall - macro:", metrics.recall_score(y_dev, predictions, average="macro", zero_division=0))
print()
print("F1 Score - weighted:", metrics.f1_score(y_dev, predictions, average="weighted", zero_division=0))
print("F1 Score - macro:", metrics.f1_score(y_dev, predictions, average="macro", zero_division=0))
print()
print("Accuracy - normalized:", metrics.accuracy_score(y_dev, predictions, normalize=True))  # fraction of correctly classified samples
print("Accuracy - unnormalized:", metrics.accuracy_score(y_dev, predictions, normalize=False))  # number of correctly classified samples

Precision - weighted: 0.004697233991329188
Precision - macro: 0.05287934345155035

Recall - weighted: 0.01952659690289857
Recall - macro: 0.27542877986736086

F1 Score - weighted: 0.005827413355677853
F1 Score - macro: 0.06636114769517683

Accuracy - normalized: 0.0014233708307369388
Accuracy - unnormalized: 217


In [30]:
# Majority Voting + RF
print("Precision - weighted:", metrics.precision_score(y_dev, predictions, average="weighted", zero_division=0))
print("Precision - macro:", metrics.precision_score(y_dev, predictions, average="macro", zero_division=0))  # macro = mean of all labels' score
print()
print("Recall - weighted:", metrics.recall_score(y_dev, predictions, average="weighted", zero_division=0))
print("Recall - macro:", metrics.recall_score(y_dev, predictions, average="macro", zero_division=0))
print()
print("F1 Score - weighted:", metrics.f1_score(y_dev, predictions, average="weighted", zero_division=0))
print("F1 Score - macro:", metrics.f1_score(y_dev, predictions, average="macro", zero_division=0))
print()
print("Accuracy - normalized:", metrics.accuracy_score(y_dev, predictions, normalize=True))  # fraction of correctly classified samples
print("Accuracy - unnormalized:", metrics.accuracy_score(y_dev, predictions, normalize=False))  # number of correctly classified samples

Precision - weighted: 0.01577857101823814
Precision - macro: 0.22404798759314012

Recall - weighted: 0.014313527083626862
Recall - macro: 0.19202597968383292

F1 Score - weighted: 0.01474363832208297
F1 Score - macro: 0.20132410267490758

Accuracy - normalized: 0.008422157357908891
Accuracy - unnormalized: 1284


#### Evaluate

In [20]:
print("Precision - weighted:", metrics.precision_score(y_dev, predictions, average="weighted", zero_division=0))
print("Precision - macro:", metrics.precision_score(y_dev, predictions, average="macro", zero_division=0))  # macro = mean of all labels' score
print()
print("Recall - weighted:", metrics.recall_score(y_dev, predictions, average="weighted", zero_division=0))
print("Recall - macro:", metrics.recall_score(y_dev, predictions, average="macro", zero_division=0))
print()
print("F1 Score - weighted:", metrics.f1_score(y_dev, predictions, average="weighted", zero_division=0))
print("F1 Score - macro:", metrics.f1_score(y_dev, predictions, average="macro", zero_division=0))
print()
print("Accuracy - normalized:", metrics.accuracy_score(y_dev, predictions, normalize=True))  # fraction of correctly classified samples
print("Accuracy - unnormalized:", metrics.accuracy_score(y_dev, predictions, normalize=False))  # number of correctly classified samples

Precision - weighted: 0.01156642798880111
Precision - macro: 0.1253681719726464

Recall - weighted: 0.004809601270605715
Recall - macro: 0.053879138459085714

F1 Score - weighted: 0.005548354816765824
F1 Score - macro: 0.06240011578730738

Accuracy - normalized: 0.003955265488176839
Accuracy - unnormalized: 603


In [69]:
pred_labels = mlb.inverse_transform(predictions)
pred_df = dev_data.drop(columns=["tag"])
pred_df.insert(len(pred_df.columns), "predicted_tag", pred_labels)
pred_df = pred_df.explode(["predicted_tag"])
pred_df["predicted_tag"] = pred_df["predicted_tag"].fillna('O')
pred_df.head()

Unnamed: 0,sentence_id,token_id,token,pos,predicted_tag
0,5,154,After,IN,O
1,5,155,his,PRP$,B-Gendered-Pronoun
2,5,156,ordination,NN,O
3,5,157,he,PRP,B-Gendered-Pronoun
4,5,158,spent,VBD,O


In [63]:
exp_df = dev_data.explode(["tag"])
exp_df = exp_df.rename(columns={"tag":"expected_tag"})
exp_df.head()

Unnamed: 0,sentence_id,token_id,token,pos,expected_tag
0,5,154,After,IN,O
1,5,155,his,PRP$,B-Gendered-Pronoun
2,5,156,ordination,NN,O
3,5,157,he,PRP,B-Gendered-Pronoun
4,5,158,spent,VBD,O


In [85]:
exp_pred_df = pd.merge(
    left=exp_df, right=pred_df, how="outer",
    left_on=["sentence_id", "token_id", "token", "pos", "expected_tag"],
    right_on=["sentence_id", "token_id", "token", "pos", "predicted_tag"],
    suffixes=["", "_pred"],
    indicator=True
)
exp_pred_df = exp_pred_df[["sentence_id", "token_id", "token", "pos", "expected_tag", "predicted_tag", "_merge"]]
exp_pred_df.head()

Unnamed: 0,sentence_id,token_id,token,pos,expected_tag,predicted_tag,_merge
0,5,154,After,IN,O,O,both
1,5,155,his,PRP$,B-Gendered-Pronoun,B-Gendered-Pronoun,both
2,5,156,ordination,NN,O,O,both
3,5,157,he,PRP,B-Gendered-Pronoun,B-Gendered-Pronoun,both
4,5,158,spent,VBD,O,O,both


In [91]:
sub_exp_pred_df = exp_pred_df.loc[exp_pred_df.expected_tag == 'O']
sub_exp_pred_df = sub_exp_pred_df.loc[sub_exp_pred_df.predicted_tag == 'O']
sub_exp_pred_df.replace(to_replace="both", value="true negative", inplace=True)
tn_tokens = list(sub_exp_pred_df.token_id)

In [92]:
sub_exp_pred_df2 = exp_pred_df.loc[~exp_pred_df.token_id.isin(tn_tokens)]
sub_exp_pred_df2 = sub_exp_pred_df2.replace(to_replace="left_only", value="false negative")
sub_exp_pred_df2 = sub_exp_pred_df2.replace(to_replace="right_only", value="false positive")
sub_exp_pred_df2 = sub_exp_pred_df2.replace(to_replace="both", value="true positive")
sub_exp_pred_df2.head()

Unnamed: 0,sentence_id,token_id,token,pos,expected_tag,predicted_tag,_merge
1,5,155,his,PRP$,B-Gendered-Pronoun,B-Gendered-Pronoun,true positive
3,5,157,he,PRP,B-Gendered-Pronoun,B-Gendered-Pronoun,true positive
9,5,163,army,NN,B-Occupation,,false negative
10,5,164,Chaplain,NN,I-Occupation,,false negative
113,18,500,Tom,NNP,B-Masculine,,false negative


In [93]:
eval_df = pd.concat([sub_exp_pred_df,sub_exp_pred_df2])
eval_df = eval_df.sort_index()
eval_df.head()

Unnamed: 0,sentence_id,token_id,token,pos,expected_tag,predicted_tag,_merge
0,5,154,After,IN,O,O,true negative
1,5,155,his,PRP$,B-Gendered-Pronoun,B-Gendered-Pronoun,true positive
2,5,156,ordination,NN,O,O,true negative
3,5,157,he,PRP,B-Gendered-Pronoun,B-Gendered-Pronoun,true positive
4,5,158,spent,VBD,O,O,true negative


Save the data:

In [None]:
# eval_df.to_csv(config.tokc_path+"multilabel_model_output/mvc_cc-rf_baseline_predictions.csv")

In [94]:
label_tags = {
    "Unknown": ["B-Unknown", "I-Unknown"], "Feminine": ["B-Feminine", "I-Feminine"], "Masculine": ["B-Masculine", "I-Masculine"],
    "Gendered Pronoun": ["B-Gendered-Pronoun", "I-Gendered-Pronoun"], "Gendered Role": ["B-Gendered-Role", "I-Gendered-Role"],
    "Generalization": ["B-Generalization", "I-Generalization"], 
    "Stereotype": ["B-Stereotype", "I-Stereotype"], "Omission": ["B-Omission", "I-Omission"], "Occupation": ["B-Occupation", "I-Occupation"]
             }

In [59]:
# STOPPED HERE - CALCULATE TOTAL AGREEMENTS PER TAG (strict agreement) AND PER LABEL (loose agreement)

['O' 'B-Gendered-Pronoun' 'B-Occupation' 'I-Occupation' 'B-Masculine'
 'B-Stereotype' 'I-Masculine' 'I-Stereotype' 'B-Unknown' 'B-Feminine'
 'I-Unknown' 'I-Feminine' 'B-Generalization' 'B-Omission' 'I-Omission'
 'B-Gendered-Role' 'I-Gendered-Role' 'I-Generalization'
 'I-Gendered-Pronoun']


In [None]:
pred_df_joined

## 1. Logistic Regression Model

#### Feature Engineering

In [118]:
# class CountWordCaps(BaseEstimator, TransformerMixin):
# """ Model that extracts a counter of capital words from text. """
#     def fit(self, X, y=None):
#         return self
#    def transform(self, texts):
#         """ transform data :texts: The texts to count capital words in :returns: list of counts for each text """
#         return [[sum(w.isupper() for w in nltk.word_tokenize(text))] for text in texts]
    
class GloveTransformer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
    
    def transform(self, sentence):
        return [[embedding_dict[t] if t in embedding_dict.keys() else np.nan for t in token] for token in sentence]
#         self.insert(0, "embedding", embeddings)
#         return self
#         return pd.DataFrame({"embedding":embeddings})

In [119]:
col_transformer = ColumnTransformer(
    [
#         ('pos_encodings', OneHotEncoder(), ['pos']),
        ('token_embeddings', GloveTransformer(), 'token'),
    ],
    remainder='passthrough'
)

In [120]:
# v = DictVectorizer(sparse=True)
mlb = MultiLabelBinarizer()
labels2numbers = LabelEncoder()

In [121]:
feature_cols = ["sentence_id", "token"] #, "pos"] #"token",  #, "lemma"]
target_col = "tag"
labels = list(np.unique(df_train.tag))
labels2numbers = LabelEncoder()
y = labels2numbers.fit_transform(labels)
label_to_no = dict(zip(labels,list(y)))
no_to_label = dict(zip(list(y),labels))
print(label_to_no)

{'B-Feminine': 0, 'B-Gendered-Pronoun': 1, 'B-Gendered-Role': 2, 'B-Generalization': 3, 'B-Masculine': 4, 'B-Occupation': 5, 'B-Omission': 6, 'B-Stereotype': 7, 'B-Unknown': 8, 'I-Feminine': 9, 'I-Gendered-Pronoun': 10, 'I-Gendered-Role': 11, 'I-Generalization': 12, 'I-Masculine': 13, 'I-Occupation': 14, 'I-Omission': 15, 'I-Stereotype': 16, 'I-Unknown': 17, 'O': 18}


In [132]:
# X_train = train_sentences["sentence"]#df_train[feature_cols]
# # X_train = col_transformer.fit_transform(X_train)
# y_train = list(train_sentences["tag"])#df_train[target_col].values
y_train_numeric = [[tuple((label_to_no[label] for label in labels)) for labels in labels_list] for labels_list in y_train]
# y_train_numeric = utils.getNumericLabels(y_train, label_to_no)  # Convert the string labels to numeric labels
y_train_binarized = mlb.fit_transform(y_train_numeric)          # Convert each iterable of iterables above to a multilabel format
# print(X_train.shape, y_train.shape)
print(y_train_numeric[0])
print(y_train_binarized[0])

[(18,), (18,), (18,), (18,), (8, 4, 7), (17, 16, 13), (17, 13, 8, 16), (16, 13, 17, 17), (13, 17, 16, 17), (17, 13, 17, 16), (18,), (18,), (18,)]
[0 0 0 ... 0 1 0]


In [137]:
i = 0
print(y_train_numeric[i])
# print(y_train_binarized[i])
y_train_binarized[i][4]

[(18,), (18,), (18,), (18,), (8, 4, 7), (17, 16, 13), (17, 13, 8, 16), (16, 13, 17, 17), (13, 17, 16, 17), (17, 13, 17, 16), (18,), (18,), (18,)]


0

In [91]:
X_dev = df_dev[feature_cols]
# X_dev = col_transformer.transform(X_dev)  # No fit?
y_dev = df_dev[target_col].values
y_dev_numeric = utils.getNumericLabels(y_dev, label_to_no)  # Make numeric
y_dev_binarized = mlb.transform(y_dev_numeric)              # Binarize
print(y_dev.shape, y_dev_binarized.shape)

(157740,) (157740, 19)


In [92]:
# assert X_dev.shape[1] == X_train.shape[1], "The train and dev data must have the same number of columns."

#### Train the Model

In [93]:
log_reg = OneVsRestClassifier(LogisticRegression(solver="liblinear", multi_class="ovr", random_state=22))

In [102]:
pipeline = Pipeline([
    ("col_transformer", col_transformer),
    ('imputation', SimpleImputer(missing_values=np.nan, strategy='mean')),
    ("classifier", log_reg)
])

In [103]:
clf = pipeline.fit(X_train, y_train_binarized)

ValueError: The output of the 'token_embeddings' transformer should be 2D (scipy matrix, array, or pandas DataFrame).

#### Predict

In [109]:
predicted_dev = clf.predict(X_dev)
print(predicted_dev[0])

ValueError: Found unknown categories ['……', '..', '{'] in column 0 during transform

#### Evaluate Model Performance

In [18]:
original_labels = mlb.classes_
dev_matrix = multilabel_confusion_matrix(y_dev_binarized, predicted_dev, labels=mlb.classes_)
df_dev_perf = utils.getPerformanceMetrics(y_dev_binarized, predicted_dev, dev_matrix, mlb.classes_, original_labels, no_to_label)
df_dev_perf

Unnamed: 0,labels,true_neg,false_neg,true_pos,false_pos,precision,recall,f_1
0,B-Feminine,157417,323,0,0,0.0,0.0,0.0
1,B-Gendered-Pronoun,156996,744,0,0,0.0,0.0,0.0
2,B-Gendered-Role,157150,590,0,0,0.0,0.0,0.0
3,B-Generalization,157495,245,0,0,0.0,0.0,0.0
4,B-Masculine,156716,1024,0,0,0.0,0.0,0.0
5,B-Occupation,157085,655,0,0,0.0,0.0,0.0
6,B-Omission,156658,1082,0,0,0.0,0.0,0.0
7,B-Stereotype,157481,259,0,0,0.0,0.0,0.0
8,B-Unknown,155680,2060,0,0,0.0,0.0,0.0
9,I-Feminine,156894,846,0,0,0.0,0.0,0.0


In [19]:
print("Dev Accuracy (all labels) on `token` col:", np.mean(predicted_dev == y_dev_binarized))

Dev Accuracy (all labels) on `token` col: 0.9890666186195806


Try using cross-validation (stratified k fold, where k=3) with Logistic Regression:

In [20]:
# k = 3 # number of folds

In [21]:
# log_reg_cv = OneVsRestClassifier(LogisticRegressionCV(
#     solver="liblinear", multi_class="ovr", cv=k, scoring="f1", random_state=22)  #max_iter=500, --> default is 100 iterations
#                                 )
# clf1 = log_reg_cv.fit(X_train, y_train_binarized)
# pred1_dev = clf1.predict(X_dev)
# # print("Dev Accuracy (all labels) on `lemma` col:", np.mean(pred1_dev == y_dev))  # 90%
# # print("Dev Accuracy (all labels) on `token` col:", np.mean(pred1_dev == y_dev))  # 90%
# # from sklearn import metrics
# # metrics.SCORERS.keys()
# # print("Accuracy:", clf1.score(pred1_dev, y_dev_binarized))

In [22]:
# original_labels = mlb.classes_
# dev_matrix1 = multilabel_confusion_matrix(y_dev_binarized, pred1_dev, labels=mlb.classes_)
# df_dev_perf1 = utils.getPerformanceMetrics(y_dev_binarized, pred1_dev, dev_matrix1, mlb.classes_, original_labels, no_to_label)
# df_dev_perf1

**QUESTION:** Are scores averaged across the 3 folds?

<a id="1.1"></a>
## 1.1. With Word Embeddings

In [88]:
from sklearn.base import BaseEstimator, TransformerMixin
# # Model that assigns a GloVe embedding to a token.
# class GloveVectorizer(BaseEstimator, TransformerMixin):
#     def fit(self, X, y=None):
#         return self
    
#     # Transform tokens from the input data to their corresponding GloVe embedding
#     def transform(self, tokens):
#         return [words_to_vectors[token] for token in tokens]
class EmbeddingTransformer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
    
    def transform(self, embedding_dict=words_to_vectors):
        tokens = list(self.token)
        embeddings = np.array([])
        for token in tokens:
            embedding = embedding_dict[token]
            embeddings += [embedding]
        return embeddings

# embedding_transformer = FunctionTransformer(embedding_extractor)

class PosTransformer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
    
    def transform(self):
        # STOPPED HERE
        # use multilabel binarizer?
        # REFERENCE: https://stackoverflow.com/questions/56774862/how-to-add-a-feature-using-a-pipeline-and-featureunion
        
        return #np.array(self.pos)

# pos_transformer = FunctionTransformer(pos_extractor)

In [82]:
log_reg = OneVsRestClassifier(LogisticRegression(solver="liblinear", multi_class="ovr", random_state=22))

In [90]:
feature_union = FeatureUnion([("embeddings", EmbeddingTransformer), ("pos", PosTransformer)])

In [91]:
pipe = Pipeline([("features", feature_union), ("classifier", log_reg)])

In [100]:
pipe.fit(X_train, y_train_binarized)

AttributeError: fit not found

In [75]:
col_transformer = ColumnTransformer(["vectorizer", GloveVectorizer(), 'token'])

In [None]:
features = FeatureUnion(["vectorizer"])

In [78]:
# vectorizer = Pipeline(
#     ["col_selector", FunctionTransformer(lambda df: df["token"])],
#     ["vectorizer", GloveVectorizer()]
# )
pipe = Pipeline(
    ["transformer", col_transformer],
    ["estimator", log_reg]
)

TypeError: 'ColumnTransformer' object is not iterable

In [55]:
col_transformer = ColumnTransformer(["vectorizer", vectorizer, 'token'])

In [58]:
pipe = Pipeline(
    ["transformer", col_transformer],
    ["clf", log_reg]
)

TypeError: 'ColumnTransformer' object is not iterable