# Gender Biased Document Classification

With aggregated and perspectivist (individual annotator's) data

In [1]:
import utils
from pathlib import Path
import numpy as np
import pandas as pd
# from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import train_test_split
# from sklearn.preprocessing import FunctionTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# For Aggregated Data
# data_dir = "./clf_data/"
# Path(data_dir).mkdir(parents=True, exist_ok=True)

# For Perspectivist Data
data_dir2 = "./clf_data/perspectivist/"
Path(data_dir2).mkdir(parents=True, exist_ok=True)

<a id="i"></a>
## Splitting Data into Train/Validation/Test Sets

Load and prepare the data for classification:

In [3]:
# df = pd.read_csv("../annot-post/data/aggregated_with_eadid_descid_cols.csv", index_col=0)
# df.head()
all_anns = pd.read_csv("../annot-post/data/all_annotators.csv", index_col=0)
df = all_anns.rename(columns={"field2":"description"})
df.head()

Unnamed: 0,annotator,file,entity,label,start,end,text,category,note,eadid,field,id,description
76483,Annotator 4,AA5_00100.ann,T7,Stereotype,34,63,The Very Rev Prof James Whyte,Contextual,form of address characteristic of male homosoc...,AA5,Title,0,Papers of The Very Rev Prof James Whyte (1920-...
42137,Annotator 1,AA5_00100.ann,T7,Masculine,34,63,The Very Rev Prof James Whyte,Person-Name,,AA5,Title,1,Papers of The Very Rev Prof James Whyte (1920-...
14854,Annotator 0,AA5_00100.ann,T7,Unknown,34,63,The Very Rev Prof James Whyte,Person-Name,,AA5,Title,2,Papers of The Very Rev Prof James Whyte (1920-...
48352,Annotator 2,AA5_00100.ann,T10,Unknown,43,63,Rev Prof James Whyte,Person-Name,,AA5,Title,3,Papers of The Very Rev Prof James Whyte (1920-...
14855,Annotator 0,AA5_00100.ann,T12,Masculine,661,689,Professor James Aitken White,Person-Name,,AA5,Biographical / Historical,4,Professor James Aitken White was a leading Sco...


In [4]:
descs = pd.read_csv("../annot-post/data/all_descriptions.csv", index_col=0)
descs = descs.drop(columns=["field", "eadid"])
descs.head()

Unnamed: 0,description,desc_id
0,Professor James Aitken White was a leading Sco...,0
1,Papers of The Very Rev Prof James Whyte (1920-...,1
2,Rev Thomas Allan was born on 16 August 1916 in...,2
3,Papers of Rev Tom Allan (1916-1965)\n\n,3
4,Alec Cheyne was born on 1 June 1924 in Errol i...,4


Join the two DataFrames by `desc_id` (description ID):

In [5]:
# clf_data = df.join(descs.set_index('desc_id'), on='desc_id')
clf_data = df.join(descs.set_index('description'), on='description')
clf_data.head()

Unnamed: 0,annotator,file,entity,label,start,end,text,category,note,eadid,field,id,description,desc_id
76483,Annotator 4,AA5_00100.ann,T7,Stereotype,34,63,The Very Rev Prof James Whyte,Contextual,form of address characteristic of male homosoc...,AA5,Title,0,Papers of The Very Rev Prof James Whyte (1920-...,1
42137,Annotator 1,AA5_00100.ann,T7,Masculine,34,63,The Very Rev Prof James Whyte,Person-Name,,AA5,Title,1,Papers of The Very Rev Prof James Whyte (1920-...,1
14854,Annotator 0,AA5_00100.ann,T7,Unknown,34,63,The Very Rev Prof James Whyte,Person-Name,,AA5,Title,2,Papers of The Very Rev Prof James Whyte (1920-...,1
48352,Annotator 2,AA5_00100.ann,T10,Unknown,43,63,Rev Prof James Whyte,Person-Name,,AA5,Title,3,Papers of The Very Rev Prof James Whyte (1920-...,1
14855,Annotator 0,AA5_00100.ann,T12,Masculine,661,689,Professor James Aitken White,Person-Name,,AA5,Biographical / Historical,4,Professor James Aitken White was a leading Sco...,0


The DataFrame above has a row for every label, so for descriptions with multiple labels, there are multiple rows of data.

In [6]:
clf_data.description.isnull().values.any()

False

For every description, get the labels (non-repeating) they were annotated with:

In [7]:
agg_cols = ["description", "field", "desc_id"]
anns_cols = ["description", "field", "desc_id", "annotator", "eadid"]

df_grouped = clf_data.groupby(anns_cols).agg(
    {"label": lambda label_name: set(label_name)}  #",".join(label_name)}
    ).reset_index()
df_grouped.sort_values(by="desc_id", inplace=True)
df_grouped.head()

Unnamed: 0,description,field,desc_id,annotator,eadid,label
26980,Professor James Aitken White was a leading Sco...,Biographical / Historical,0,Annotator 0,AA5,"{Masculine, Occupation, Gendered-Pronoun}"
26984,Professor James Aitken White was a leading Sco...,Biographical / Historical,0,Annotator 4,AA5,"{Stereotype, Occupation}"
26983,Professor James Aitken White was a leading Sco...,Biographical / Historical,0,Annotator 3,AA5,{Occupation}
26982,Professor James Aitken White was a leading Sco...,Biographical / Historical,0,Annotator 2,AA5,"{Masculine, Gendered-Pronoun}"
26981,Professor James Aitken White was a leading Sco...,Biographical / Historical,0,Annotator 1,AA5,"{Masculine, Gendered-Pronoun}"


Make sure the sequences of labels in each row appear correctly:

In [8]:
valid_label_names = clf_data.label.unique()
print(valid_label_names)

['Stereotype' 'Masculine' 'Unknown' 'Occupation' 'Gendered-Pronoun'
 'Omission' 'Feminine' 'Generalization' 'Gendered-Role' 'Empowering']


In [9]:
label_col = list(df_grouped.label)
invalid = []
for label_set in label_col:
    label_list = list(label_set)
    for label_name in label_list:
        if not label_name in valid_label_names:
            invalid += [label_name]
assert len(invalid) == 0, "Label names must be valid"

Looks good!

For the aggregated data, add a column for the EADIDs (collection, or fonds, identifier) of the descriptions:

In [12]:
descs = pd.read_csv("../annot-post/data/all_descriptions.csv", index_col=0)
eadids = descs.drop(columns=["description","field"])
eadids.head()

Unnamed: 0,eadid,desc_id
0,AA5,0
1,AA5,1
2,AA6,2
3,AA6,3
4,AA7,4


In [13]:
key = "desc_id"
df_joined = df_grouped.join(eadids.set_index(key), on=key)
df_joined.head()

Unnamed: 0,description,field,desc_id,label,eadid
8054,Professor James Aitken White was a leading Sco...,Biographical / Historical,0,"{Occupation, Stereotype, Masculine, Gendered-P...",AA5
5560,Papers of The Very Rev Prof James Whyte (1920-...,Title,1,"{Stereotype, Masculine, Unknown}",AA5
8321,Rev Thomas Allan was born on 16 August 1916 in...,Biographical / Historical,2,"{Gendered-Pronoun, Stereotype, Unknown, Mascul...",AA6
5550,Papers of Rev Tom Allan (1916-1965)\n\n,Title,3,"{Masculine, Unknown}",AA6
633,Alec Cheyne was born on 1 June 1924 in Errol i...,Biographical / Historical,4,"{Gendered-Pronoun, Stereotype, Unknown, Mascul...",AA7


Write (or read) the resulting DataFrame as a file:

In [12]:
# df_joined.to_csv("clf_data/desc_field_descid_label_eadid.csv")
df_grouped.to_csv("clf_data/perspectivist/desc_field_descid_annot_eadid_label.csv")

In [16]:
# df_joined = pd.read_csv("clf_data/desc_field_descid_label_eadid.csv", index_col=0)
# df_joined["label"] = df_joined["label"].apply(eval)

Create subsets of the DataFrame for each field type and, for the perspectivist data, for each annotator:

In [13]:
annotators = df_grouped.annotator.unique()
annotators.sort()  # ['Annotator 0' 'Annotator 1' 'Annotator 2' 'Annotator 3' 'Annotator 4']
df0 = df_grouped[df_grouped.annotator == annotators[0]]
df1 = df_grouped[df_grouped.annotator == annotators[1]]
df2 = df_grouped[df_grouped.annotator == annotators[2]]
df3 = df_grouped[df_grouped.annotator == annotators[3]]
df4 = df_grouped[df_grouped.annotator == annotators[4]]
# df3.tail()

In [14]:
# For Perspectivist Data
df_list = [df0, df1, df2, df3, df4]
for dfa in df_list:
    label_col = list(dfa.label)
    invalid = []
    for label_set in label_col:
        label_list = list(label_set)
        for label_name in label_list:
            if not label_name in valid_label_names:
                invalid += [label_name]
    assert len(invalid) == 0, "Label names must be valid"

In [19]:
# For Aggregated Data
# train, validate, test = utils.getShuffledSplitData(df_joined)



# For Perspectivist Data

# ************************************************************************************************
# INPUT:  DataFrame, fraction of DF to shuffle, and random_state of shuffle
#         Note 1 - fraction defaults to 1 to shuffle the entire DataFrame; 
#                 provide a value <1 to return that fraction of the DataFrame shuffled
#         Note 2 -random_state_value defaults to 7 for reproducibility
# OUTPUT: DataFrame with its rows shuffled
def shuffleDataFrame(df, fraction=1, random_state_value=7):
    return df.sample(frac=fraction, random_state=random_state_value)


# INPUT:  A shuffled DataFrame for a particular metadata field
# OUTPUT: The number of rows from the DataFrame to assign to train, validate (dev), 
#         and (blind) test sets of data f
def getTrainValTestSizes(df):
    indeces = list(df.index)
    
    train = indeces[ : int(df.shape[0]*0.6) ]
    validate = indeces[ int(df.shape[0]*0.6) : (int(df.shape[0]*0.6) + round(df.shape[0]*0.2)) ]
    test = indeces[ (int(df.shape[0]*0.6) + round(df.shape[0]*0.2)) : ]

    return len(train), len(validate), len(test)


# Add a column to the input DataFrame that assigns each row to train, dev, and test
# using the three input sizes
def assignSubsets(df, train_size, validate_size, test_size):
    subset_col = ["train"]*train_size + ["dev"]*validate_size + ["test"]*test_size
    df.insert(len(df.columns)-1, "subset", subset_col)
    return df


# Concatenate the rows assigned to each subset to create one DataFrame each for 
# training, validation, and testing: 
def concatBySubset(df_list, subset):
    df_all = pd.DataFrame()
    for df in df_list:
        df_subset = df.loc[df["subset"] == subset]
        df_all = pd.concat([df_all, df_subset], axis=0)
    return df_all

metadata_fields = ['Biographical / Historical', 'Title', 'Scope and Contents', 'Processing Information']
def getShuffledSplitData(df, field_names=metadata_fields):
    df_bh = df.loc[df.field == field_names[0]]
    df_t = df.loc[df.field == field_names[1]]
    df_sc = df.loc[df.field == field_names[2]]
    df_pi = df.loc[df.field == field_names[3]]
    
    # Shuffle the DataFrames for each metadata field type
    df_bh_shuffled = utils.shuffleDataFrame(df_bh)
    df_t_shuffled = utils.shuffleDataFrame(df_t)
    df_sc_shuffled = utils.shuffleDataFrame(df_sc)
    df_pi_shuffled = utils.shuffleDataFrame(df_pi)
    
    # Get the indeces of rows to assign to train, dev, and test
    train_bh, validate_bh, test_bh = utils.getTrainValTestSizes(df_bh_shuffled)
    assert train_bh+validate_bh+test_bh == df_bh_shuffled.shape[0]
    train_t, validate_t, test_t = utils.getTrainValTestSizes(df_t_shuffled)
    assert train_t+validate_t+test_t == df_t_shuffled.shape[0]
    train_sc, validate_sc, test_sc = utils.getTrainValTestSizes(df_sc_shuffled)
    assert train_sc+validate_sc+test_sc == df_sc_shuffled.shape[0]
    train_pi, validate_pi, test_pi = utils.getTrainValTestSizes(df_pi_shuffled)
    assert train_pi+validate_pi+test_pi == df_pi_shuffled.shape[0]
    
    df_bh = utils.assignSubsets(df_bh_shuffled, train_bh, validate_bh, test_bh)
    df_t = utils.assignSubsets(df_t_shuffled, train_t, validate_t, test_t)
    df_sc = utils.assignSubsets(df_sc_shuffled, train_sc, validate_sc, test_sc)
    df_pi = utils.assignSubsets(df_pi_shuffled, train_pi, validate_pi, test_pi)
    dfs = [df_bh, df_t, df_sc, df_pi]
    
    # Concatenate the rows assigned to each subset to create one DataFrame each for training, validation, and testing: 
    train = utils.concatBySubset(dfs, "train")
    assert train.subset.unique()[0] == "train"

    validate = utils.concatBySubset(dfs, "dev")
    assert validate.subset.unique()[0] == "dev"

    test = utils.concatBySubset(dfs, "test")
    assert test.subset.unique()[0] == "test"

    return train, validate, test
# ************************************************************************************************

train0, validate0, test0 = getShuffledSplitData(df0)
train1, validate1, test1 = getShuffledSplitData(df1)
train2, validate2, test2 = getShuffledSplitData(df2)
train3, validate3, test3 = getShuffledSplitData(df3)
train4, validate4, test4 = getShuffledSplitData(df4)

In [20]:
splits = [train1, validate1, test1] # [train, validate, test]
for split in splits:
    print(split.shape[0])

6023
2009
2010


In [27]:
perspectivist_data = [[train0, validate0, test0], [train1, validate1, test1], [train2, validate2, test2], 
                      [train3, validate3, test3], [train4, validate4, test4]]

In [28]:
# Check that the label names are valid
invalid = []
for annotator_splits in perspectivist_data:
    for df_split in annotator_splits:
        label_col = list(df_split.label)
        for label_set in label_col:
            label_list = list(label_set)
            for label_name in label_list:
                if not label_name in valid_label_names:
                    invalid += [label_name]
        assert len(invalid) == 0

In [29]:
train, validate, test = train2, validate2, test2
print(train.shape[0]/(train.shape[0]+validate.shape[0]+test.shape[0]))
print(validate.shape[0]/(train.shape[0]+validate.shape[0]+test.shape[0]))
print(test.shape[0]/(train.shape[0]+validate.shape[0]+test.shape[0]))

0.5997371495327103
0.1999123831775701
0.2003504672897196


Great!  We've shuffle the DataFrames and then added a column to each that assigns every row to a subset.  For each DataFrame: 
* 60% of the rows are for `training`
* 20% of the rows are for `validation`
* 20% of the rows are for `test`

Lastly, we can write the corresponding labels and descriptions to files for creating classification models.

#### Write the data to files
The files will separate labels by `\n` (a newline) and descriptions by `\n|\n` (a pipe character surrounded by newlines)

**Aggregated Data**

In [None]:
# dir_name = data_dir

In [27]:
# utils.writeDocs(list(train.description), "train_docs.txt", dir_name)
# utils.writeLabels(list(train.label), "train_labels.txt", dir_name)

Your documents file has been written!
Your labels file has been written!


In [28]:
# utils.writeDocs(list(validate.description), "validate_docs.txt", dir_name)
# utils.writeLabels(list(validate.label), "validate_labels.txt", dir_name)

Your documents file has been written!
Your labels file has been written!


In [29]:
# utils.writeDocs(list(test.description), "blindtest_docs.txt", dir_name)
# utils.writeLabels(list(test.label), "blindtest_labels.txt", dir_name)

Your documents file has been written!
Your labels file has been written!


***

**Perspectivist Data**

In [30]:
dir_name = data_dir2
print(dir_name)

./clf_data/perspectivist/


In [31]:
for i in range(len(perspectivist_data)):
    train = perspectivist_data[i][0]
    validate = perspectivist_data[i][1]
    test = perspectivist_data[i][2]
    utils.writeDocs(list(train.description), "train{}_docs.txt".format(i), dir_name)
    utils.writeLabels(list(train.label), "train{}_labels.txt".format(i), dir_name)
    utils.writeDocs(list(validate.description), "validate{}_docs.txt".format(i), dir_name)
    utils.writeLabels(list(validate.label), "validate{}_labels.txt".format(i), dir_name)
    utils.writeDocs(list(test.description), "blindtest{}_docs.txt".format(i), dir_name)
    utils.writeLabels(list(test.label), "blindtest{}_labels.txt".format(i), dir_name)

Your documents file has been written!
Your labels file has been written!
Your documents file has been written!
Your labels file has been written!
Your documents file has been written!
Your labels file has been written!
Your documents file has been written!
Your labels file has been written!
Your documents file has been written!
Your labels file has been written!
Your documents file has been written!
Your labels file has been written!
Your documents file has been written!
Your labels file has been written!
Your documents file has been written!
Your labels file has been written!
Your documents file has been written!
Your labels file has been written!
Your documents file has been written!
Your labels file has been written!
Your documents file has been written!
Your labels file has been written!
Your documents file has been written!
Your labels file has been written!
Your documents file has been written!
Your labels file has been written!
Your documents file has been written!
Your labels f