# Gender Biased Document Classification

With aggregated and perspectivist (individual annotator's) data

In [33]:
import utils
from pathlib import Path
import numpy as np
import pandas as pd
# from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import train_test_split
# from sklearn.preprocessing import FunctionTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

In [79]:
data_dir = "./clf_data/"
Path(data_dir).mkdir(parents=True, exist_ok=True)
data_dir2 = "./clf_data/perspectivist/"
Path(data_dir).mkdir(parents=True, exist_ok=True)

<a id="i"></a>
## Splitting Data into Train/Validation/Test Sets

Load and prepare the data for classification:

In [51]:
# df = pd.read_csv("../annot-post/data/aggregated_with_eadid_descid_cols.csv", index_col=0)
# df.head()
all_anns = pd.read_csv("../annot-post/data/all_annotators.csv", index_col=0)
df = all_anns.rename(columns={"field2":"description"})
df.head()

Unnamed: 0,annotator,file,entity,label,start,end,text,category,note,eadid,field,id,description
76483,Annotator 4,AA5_00100.ann,T7,Stereotype,34,63,The Very Rev Prof James Whyte,Contextual,form of address characteristic of male homosoc...,AA5,Title,0,Papers of The Very Rev Prof James Whyte (1920-...
42137,Annotator 1,AA5_00100.ann,T7,Masculine,34,63,The Very Rev Prof James Whyte,Person-Name,,AA5,Title,1,Papers of The Very Rev Prof James Whyte (1920-...
14854,Annotator 0,AA5_00100.ann,T7,Unknown,34,63,The Very Rev Prof James Whyte,Person-Name,,AA5,Title,2,Papers of The Very Rev Prof James Whyte (1920-...
48352,Annotator 2,AA5_00100.ann,T10,Unknown,43,63,Rev Prof James Whyte,Person-Name,,AA5,Title,3,Papers of The Very Rev Prof James Whyte (1920-...
14855,Annotator 0,AA5_00100.ann,T12,Masculine,661,689,Professor James Aitken White,Person-Name,,AA5,Biographical / Historical,4,Professor James Aitken White was a leading Sco...


In [52]:
descs = pd.read_csv("../annot-post/data/all_descriptions.csv", index_col=0)
descs = descs.drop(columns=["field", "eadid"])
descs.head()

Unnamed: 0,description,desc_id
0,Professor James Aitken White was a leading Sco...,0
1,Papers of The Very Rev Prof James Whyte (1920-...,1
2,Rev Thomas Allan was born on 16 August 1916 in...,2
3,Papers of Rev Tom Allan (1916-1965)\n\n,3
4,Alec Cheyne was born on 1 June 1924 in Errol i...,4


Join the two DataFrames by `desc_id` (description ID):

In [53]:
# clf_data = df.join(descs.set_index('desc_id'), on='desc_id')
clf_data = df.join(descs.set_index('description'), on='description')
clf_data.head()

Unnamed: 0,annotator,file,entity,label,start,end,text,category,note,eadid,field,id,description,desc_id
76483,Annotator 4,AA5_00100.ann,T7,Stereotype,34,63,The Very Rev Prof James Whyte,Contextual,form of address characteristic of male homosoc...,AA5,Title,0,Papers of The Very Rev Prof James Whyte (1920-...,1
42137,Annotator 1,AA5_00100.ann,T7,Masculine,34,63,The Very Rev Prof James Whyte,Person-Name,,AA5,Title,1,Papers of The Very Rev Prof James Whyte (1920-...,1
14854,Annotator 0,AA5_00100.ann,T7,Unknown,34,63,The Very Rev Prof James Whyte,Person-Name,,AA5,Title,2,Papers of The Very Rev Prof James Whyte (1920-...,1
48352,Annotator 2,AA5_00100.ann,T10,Unknown,43,63,Rev Prof James Whyte,Person-Name,,AA5,Title,3,Papers of The Very Rev Prof James Whyte (1920-...,1
14855,Annotator 0,AA5_00100.ann,T12,Masculine,661,689,Professor James Aitken White,Person-Name,,AA5,Biographical / Historical,4,Professor James Aitken White was a leading Sco...,0


The DataFrame above has a row for every label, so for descriptions with multiple labels, there are multiple rows of data.

In [54]:
clf_data.description.isnull().values.any()

False

For every description, get the labels (non-repeating) they were annotated with:

In [57]:
agg_cols = ["description", "field", "desc_id"]
anns_cols = ["description", "field", "desc_id", "annotator", "eadid"]

df_grouped = clf_data.groupby(anns_cols).agg(
    {"label": lambda label_name: set(label_name)}  #",".join(label_name)}
    ).reset_index()
df_grouped.sort_values(by="desc_id", inplace=True)
df_grouped.head()

Unnamed: 0,description,field,desc_id,annotator,eadid,label
26980,Professor James Aitken White was a leading Sco...,Biographical / Historical,0,Annotator 0,AA5,"{Occupation, Masculine, Gendered-Pronoun}"
26984,Professor James Aitken White was a leading Sco...,Biographical / Historical,0,Annotator 4,AA5,"{Stereotype, Occupation}"
26983,Professor James Aitken White was a leading Sco...,Biographical / Historical,0,Annotator 3,AA5,{Occupation}
26982,Professor James Aitken White was a leading Sco...,Biographical / Historical,0,Annotator 2,AA5,"{Gendered-Pronoun, Masculine}"
26981,Professor James Aitken White was a leading Sco...,Biographical / Historical,0,Annotator 1,AA5,"{Gendered-Pronoun, Masculine}"


Make sure the sequences of labels in each row appear correctly:

In [49]:
valid_label_names = clf_data.label.unique()
print(valid_label_names)

['Stereotype' 'Masculine' 'Unknown' 'Occupation' 'Gendered-Pronoun'
 'Omission' 'Feminine' 'Generalization' 'Gendered-Role' 'Empowering']


In [50]:
label_col = list(df_grouped.label)
invalid = []
for label_set in label_col:
    label_list = list(label_set)
    for label_name in label_list:
        if not label_name in valid_label_names:
            invalid += [label_name]
assert len(invalid) == 0, "Label names must be valid"

Looks good!

For the aggregated data, add a column for the EADIDs (collection, or fonds, identifier) of the descriptions:

In [12]:
descs = pd.read_csv("../annot-post/data/all_descriptions.csv", index_col=0)
eadids = descs.drop(columns=["description","field"])
eadids.head()

Unnamed: 0,eadid,desc_id
0,AA5,0
1,AA5,1
2,AA6,2
3,AA6,3
4,AA7,4


In [13]:
key = "desc_id"
df_joined = df_grouped.join(eadids.set_index(key), on=key)
df_joined.head()

Unnamed: 0,description,field,desc_id,label,eadid
8054,Professor James Aitken White was a leading Sco...,Biographical / Historical,0,"{Occupation, Stereotype, Masculine, Gendered-P...",AA5
5560,Papers of The Very Rev Prof James Whyte (1920-...,Title,1,"{Stereotype, Masculine, Unknown}",AA5
8321,Rev Thomas Allan was born on 16 August 1916 in...,Biographical / Historical,2,"{Gendered-Pronoun, Stereotype, Unknown, Mascul...",AA6
5550,Papers of Rev Tom Allan (1916-1965)\n\n,Title,3,"{Masculine, Unknown}",AA6
633,Alec Cheyne was born on 1 June 1924 in Errol i...,Biographical / Historical,4,"{Gendered-Pronoun, Stereotype, Unknown, Mascul...",AA7


Write (or read) the resulting DataFrame as a file:

In [58]:
# df_joined.to_csv("clf_data/desc_field_descid_label_eadid.csv")
df_joined.to_csv("clf_data/perspectivist/desc_field_descid_annot_eadid_label.csv")

In [16]:
# df_joined = pd.read_csv("clf_data/desc_field_descid_label_eadid.csv", index_col=0)
# df_joined["label"] = df_joined["label"].apply(eval)

Create subsets of the DataFrame for each field type and, for the perspectivist data, for each annotator:

In [68]:
annotators = df_grouped.annotator.unique()
annotators.sort()  # ['Annotator 0' 'Annotator 1' 'Annotator 2' 'Annotator 3' 'Annotator 4']
df0 = df_grouped[df_grouped.annotator == annotators[0]]
df1 = df_grouped[df_grouped.annotator == annotators[1]]
df2 = df_grouped[df_grouped.annotator == annotators[2]]
df3 = df_grouped[df_grouped.annotator == annotators[3]]
df4 = df_grouped[df_grouped.annotator == annotators[4]]
# df3.tail()

In [64]:
field_names = df_joined.field.unique()
print(field_names)

['Biographical / Historical' 'Title' 'Scope and Contents'
 'Processing Information']


In [None]:
# For Aggregated Data
train, validate, test = utils.getShuffledSplitData(df_joined)

In [67]:
# For Perspectivist Data
train0, validate0, test0 = utils.getShuffledSplitData(df0)
train1, validate1, test1 = utils.getShuffledSplitData(df1)
train2, validate2, test2 = utils.getShuffledSplitData(df2)
train3, validate3, test3 = utils.getShuffledSplitData(df3)
train4, validate4, test4 = utils.getShuffledSplitData(df4)

In [69]:
splits = [train1, validate1, test1] # [train, validate, test]
for split in splits:
    print(split.shape[0])

6023
2009
2010


In [71]:
# Check that the label names are valid
invalid = []
for df_split in splits:
    label_col = list(df_split.label)
    for label_set in label_col:
        label_list = list(label_set)
        for label_name in label_list:
            if not label_name in valid_label_names:
                invalid += [label_name]
    assert len(invalid) == 0

In [72]:
print(train.shape[0]/(train.shape[0]+validate.shape[0]+test.shape[0]))
print(validate.shape[0]/(train.shape[0]+validate.shape[0]+test.shape[0]))
print(test.shape[0]/(train.shape[0]+validate.shape[0]+test.shape[0]))

0.5997445721583653
0.20008514261387825
0.20017028522775648


Great!  We've shuffle the DataFrames and then added a column to each that assigns every row to a subset.  For each DataFrame: 
* 60% of the rows are for `training`
* 20% of the rows are for `validation`
* 20% of the rows are for `test`

Lastly, we can write the corresponding labels and descriptions to files for creating classification models.

#### Write the data to files
The files will separate labels by `\n` (a newline) and descriptions by `\n|\n` (a pipe character surrounded by newlines)

In [81]:
perspectivist_data = [[train0, validate0, test0], [train1, validate1, test1], [train2, validate2, test2], 
                      [train3, validate3, test3], [train4, validate4, test4]]

In [82]:
dir_name = data_dir2
print(dir_name)

./clf_data/perspectivist/


In [85]:
for i in range(len(perspectivist_data)):
    train = perspectivist_data[i][0]
    validate = perspectivist_data[i][1]
    test = perspectivist_data[i][2]
    utils.writeDocs(list(train.description), "train{}_docs.txt".format(i), dir_name)
    utils.writeLabels(list(train.label), "train{}_labels.txt".format(i), dir_name)
    utils.writeDocs(list(validate.description), "validate{}_docs.txt".format(i), dir_name)
    utils.writeLabels(list(validate.label), "validate{}_labels.txt".format(i), dir_name)
    utils.writeDocs(list(test.description), "blindtest{}_docs.txt".format(i), dir_name)
    utils.writeLabels(list(test.label), "blindtest{}_labels.txt".format(i), dir_name)

Your documents file has been written!
Your labels file has been written!
Your documents file has been written!
Your labels file has been written!
Your documents file has been written!
Your labels file has been written!
Your documents file has been written!
Your labels file has been written!
Your documents file has been written!
Your labels file has been written!


**Aggregated Data**

In [None]:
dir_name = data_dir

In [27]:
utils.writeDocs(list(train.description), "train_docs.txt", dir_name)
utils.writeLabels(list(train.label), "train_labels.txt", dir_name)

Your documents file has been written!
Your labels file has been written!


In [28]:
utils.writeDocs(list(validate.description), "validate_docs.txt", dir_name)
utils.writeLabels(list(validate.label), "validate_labels.txt", dir_name)

Your documents file has been written!
Your labels file has been written!


In [29]:
utils.writeDocs(list(test.description), "blindtest_docs.txt", dir_name)
utils.writeLabels(list(test.label), "blindtest_labels.txt", dir_name)

Your documents file has been written!
Your labels file has been written!
