# Gender Biased Document Classification

## Splitting Data into Train/Validation/Test Sets

In [1]:
import ast
from pathlib import Path

import numpy as np
import pandas as pd
# from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import train_test_split
# from sklearn.preprocessing import FunctionTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

Load and prepare the data for classification:

In [8]:
# Annotation-level data: one row per annotated text span
annotations_path = "../annot-post/data/aggregated_with_eadid_descid_cols.csv"
df = pd.read_csv(annotations_path, index_col=0)
df.head()

Unnamed: 0,file,offsets,text,label,category,eadid,field,id,desc_id
9,AA5_00100.ann,"(1032, 1043)",James Whyte,Masculine,Person-Name,AA5,Biographical / Historical,0,0
16,AA5_00100.ann,"(1129, 1177)",chair of practical theology and Christian ethics,Occupation,Contextual,AA5,Biographical / Historical,1,0
4,AA5_00100.ann,"(1217, 1219)",he,Gendered-Pronoun,Linguistic,AA5,Biographical / Historical,2,0
5,AA5_00100.ann,"(1241, 1244)",His,Gendered-Pronoun,Linguistic,AA5,Biographical / Historical,3,0
6,AA5_00100.ann,"(1315, 1317)",he,Gendered-Pronoun,Linguistic,AA5,Biographical / Historical,4,0


In [9]:
# Description-level data; the eadid and field columns are dropped before the join below
descs = (
    pd.read_csv("../annot-post/data/descriptions.csv")
    .drop(columns=["eadid", "field"])
)
descs.head()

Unnamed: 0,description,desc_id
0,Professor James Aitken White was a leading Sco...,0
1,Papers of The Very Rev Prof James Whyte (1920-...,1
2,Rev Thomas Allan was born on 16 August 1916 in...,2
3,Papers of Rev Tom Allan (1916-1965)\n\n,3
4,Alec Cheyne was born on 1 June 1924 in Errol i...,4


In [10]:
# df = pd.read_csv("annot_data/aggregated_with_eadid_descid_desc_cols.csv", index_col=0)
# df.head()

In [11]:
# df[["field2", "description"]] = df["description"].str.split(":", n=1, expand=True)
# df.drop("field2", axis=1, inplace=True)
# df.head()
# df.drop(["file", "offsets", "text", "category", "eadid"], axis=1, inplace=True)
# descs = list(df.description)
# descs = [d.strip() for d in descs]
# df.description = descs
# df.to_csv("data/aggregated_data_for_classifier.csv")

Join the two DataFrames by `desc_id` (description ID):

In [12]:
# Attach each annotation's description text by looking it up through desc_id
descs_by_id = descs.set_index("desc_id")
clf_data = df.join(descs_by_id, on="desc_id")
clf_data.head()

Unnamed: 0,file,offsets,text,label,category,eadid,field,id,desc_id,description
9,AA5_00100.ann,"(1032, 1043)",James Whyte,Masculine,Person-Name,AA5,Biographical / Historical,0,0,Professor James Aitken White was a leading Sco...
16,AA5_00100.ann,"(1129, 1177)",chair of practical theology and Christian ethics,Occupation,Contextual,AA5,Biographical / Historical,1,0,Professor James Aitken White was a leading Sco...
4,AA5_00100.ann,"(1217, 1219)",he,Gendered-Pronoun,Linguistic,AA5,Biographical / Historical,2,0,Professor James Aitken White was a leading Sco...
5,AA5_00100.ann,"(1241, 1244)",His,Gendered-Pronoun,Linguistic,AA5,Biographical / Historical,3,0,Professor James Aitken White was a leading Sco...
6,AA5_00100.ann,"(1315, 1317)",he,Gendered-Pronoun,Linguistic,AA5,Biographical / Historical,4,0,Professor James Aitken White was a leading Sco...


The DataFrame above has a row for every label, so for descriptions with multiple labels, there are multiple rows of data.

In [13]:
# True if any annotation row ended up without a description after the join
clf_data["description"].isnull().values.any()

False

For every description, get the labels (non-repeating) they were annotated with:

In [14]:
# Collapse to one row per description, gathering its distinct labels into a set,
# then order the rows by description ID
df_grouped = (
    clf_data
    .groupby(["description", "field", "desc_id"])
    .agg({"label": lambda names: set(names)})
    .reset_index()
    .sort_values(by="desc_id")
)
df_grouped.head()

Unnamed: 0,description,field,desc_id,label
8054,Professor James Aitken White was a leading Sco...,Biographical / Historical,0,"{Stereotype, Masculine, Occupation, Gendered-P..."
5560,Papers of The Very Rev Prof James Whyte (1920-...,Title,1,"{Masculine, Stereotype, Unknown}"
8321,Rev Thomas Allan was born on 16 August 1916 in...,Biographical / Historical,2,"{Stereotype, Generalization, Feminine, Omissio..."
5550,Papers of Rev Tom Allan (1916-1965)\n\n,Title,3,"{Masculine, Unknown}"
633,Alec Cheyne was born on 1 June 1924 in Errol i...,Biographical / Historical,4,"{Generalization, Stereotype, Feminine, Omissio..."


Make sure the sequences of labels in each row appear correctly:

In [15]:
# Every distinct label name that occurs in the annotation-level data
valid_label_names = clf_data["label"].unique()
print(valid_label_names)

['Masculine' 'Occupation' 'Gendered-Pronoun' 'Stereotype' 'Unknown'
 'Feminine' 'Generalization' 'Omission' 'Gendered-Role' 'Nonbinary']


In [15]:
# Collect any label in the grouped data that is not one of the known label names;
# an empty list means every label set survived the grouping intact
label_col = list(df_grouped.label)
invalid = [
    label_name
    for label_set in label_col
    for label_name in label_set
    if label_name not in valid_label_names
]
print(invalid)

[]


Looks good!

Add a column for the EADIDs (collection, or fonds, identifier) of the descriptions:

In [16]:
# Reload the descriptions file, this time keeping the eadid column for the join below
descs = pd.read_csv("../annot-post/data/descriptions.csv")
columns_to_drop = ["description", "field"]
eadids = descs.drop(columns=columns_to_drop)
eadids.head()

Unnamed: 0,eadid,desc_id
0,AA5,0
1,AA5,1
2,AA6,2
3,AA6,3
4,AA7,4


In [17]:
# Look up each description's EADID through the shared description ID
key = "desc_id"
eadids_by_key = eadids.set_index(key)
df_joined = df_grouped.join(eadids_by_key, on=key)
df_joined.head()

Unnamed: 0,description,field,desc_id,label,eadid
8054,Professor James Aitken White was a leading Sco...,Biographical / Historical,0,"{Occupation, Stereotype, Gendered-Pronoun, Mas...",AA5
5560,Papers of The Very Rev Prof James Whyte (1920-...,Title,1,"{Stereotype, Unknown, Masculine}",AA5
8321,Rev Thomas Allan was born on 16 August 1916 in...,Biographical / Historical,2,"{Gendered-Pronoun, Stereotype, Generalization,...",AA6
5550,Papers of Rev Tom Allan (1916-1965)\n\n,Title,3,"{Unknown, Masculine}",AA6
633,Alec Cheyne was born on 1 June 1924 in Errol i...,Biographical / Historical,4,"{Gendered-Pronoun, Stereotype, Generalization,...",AA7


Write the resulting DataFrame to a file (or, if it has already been written, read it back in):

In [18]:
# df_joined.to_csv("clf_data/desc_field_descid_label_eadid.csv")

In [16]:
df_joined = pd.read_csv("clf_data/desc_field_descid_label_eadid.csv", index_col=0)
# The label sets are stored in the CSV as their string form (e.g. "{'Masculine', 'Unknown'}").
# ast.literal_eval parses those literals back into sets without executing arbitrary
# code, unlike eval, which would run anything found in the file.
df_joined["label"] = df_joined["label"].apply(ast.literal_eval)

Create subsets of the DataFrame for each field type:

In [17]:
# The distinct metadata field types, in order of first appearance
field_names = df_joined["field"].unique()
print(field_names)

['Biographical / Historical' 'Title' 'Scope and Contents'
 'Processing Information']


In [18]:
# One DataFrame per metadata field type; the positions index into field_names,
# so they rely on the order printed above
field_col = df_joined["field"]
df_bh = df_joined.loc[field_col == field_names[0]]   # Biographical / Historical
df_t = df_joined.loc[field_col == field_names[1]]    # Title
df_sc = df_joined.loc[field_col == field_names[2]]   # Scope and Contents
df_pi = df_joined.loc[field_col == field_names[3]]   # Processing Information
df_pi.head()

Unnamed: 0,description,field,desc_id,label,eadid
833,Archivist's NoteNone Grant Buttars 28 April 2...,Processing Information,393,"{Occupation, Unknown}",BAI
821,Archivist's NoteNone Grant Buttars 22 May 200...,Processing Information,394,"{Occupation, Unknown}",BAI
810,Archivist's NoteNone Grant Buttars 15 May 200...,Processing Information,395,"{Occupation, Unknown}",BAI
790,Archivist's NoteNone Grant Buttars 03 June 20...,Processing Information,396,"{Occupation, Unknown}",BAI
811,Archivist's NoteNone Grant Buttars 16 April 2...,Processing Information,397,"{Occupation, Unknown}",BAI


Shuffle each DataFrame and then add a column to each that assigns every row to a subset.  For each DataFrame, assign: 
* 60% of the rows to `training`
* 20% of the rows to `validation`
* 20% of the rows to `test`

In [19]:
def shuffleDataFrame(df, fraction=1, random_state_value=7):
    """Return the rows of a DataFrame in shuffled order.

    INPUT:  df                 - the DataFrame to shuffle
            fraction           - fraction of the rows to return; the default of 1
                                 shuffles the entire DataFrame, a value <1 returns
                                 that fraction of the rows, shuffled
            random_state_value - seed for the shuffle; defaults to 7 for reproducibility
    OUTPUT: a DataFrame with its rows shuffled
    """
    sampling_options = {"frac": fraction, "random_state": random_state_value}
    shuffled = df.sample(**sampling_options)
    return shuffled

In [20]:
# Shuffle the rows of each per-field DataFrame (same default seed for all four)
df_bh_shuffled, df_t_shuffled, df_sc_shuffled, df_pi_shuffled = (
    shuffleDataFrame(field_df) for field_df in (df_bh, df_t, df_sc, df_pi)
)
df_t_shuffled.head()

Unnamed: 0,description,field,desc_id,label,eadid
4605,Letter to 'My dear little Rosie' from 'Your af...,Title,11843,{Omission},Coll-1490
3746,"Letter :: Bland, John H.\n\n",Title,7808,{Unknown},Coll-146
8194,"Reprint and cutting of Thomson's piece, Measur...",Title,2539,"{Masculine, Omission, Unknown}",Coll-1310
4500,"Letter from A. Campbell to Lady Campbell, Barc...",Title,6372,"{Feminine, Omission, Gendered-Role, Unknown}",Coll-1437
4389,"Letter :: Webster, D C\n\n",Title,8190,{Unknown},Coll-146


In [21]:
# Re-check the label sets after shuffling. `invalid` accumulates across the four
# DataFrames and is printed after each one, so four empty lists means all is well.
df_shuffled_list = [df_bh_shuffled, df_t_shuffled, df_sc_shuffled, df_pi_shuffled]
invalid = []
for df_shuffled in df_shuffled_list:
    label_col = list(df_shuffled.label)
    invalid.extend(
        label_name
        for label_set in label_col
        for label_name in label_set
        if label_name not in valid_label_names
    )
    print(invalid)

[]
[]
[]
[]


All the label sequences are still looking good!

In [22]:
def getTrainValTestSizes(df, train_frac=0.6, validate_frac=0.2):
    """Compute how many rows to assign to the train, validation and test subsets.

    INPUT:  df            - the DataFrame (or any sized collection) to split
            train_frac    - fraction of rows for training (default 0.6)
            validate_frac - fraction of rows for validation (default 0.2)
    OUTPUT: a tuple (train_size, validate_size, test_size) that always sums to len(df);
            the test subset receives whatever remains after the other two

    Raises ValueError if a fraction is negative or the two fractions sum to more than 1.
    """
    if train_frac < 0 or validate_frac < 0 or train_frac + validate_frac > 1:
        raise ValueError(
            "train_frac and validate_frac must be non-negative and sum to at most 1"
        )

    n_rows = len(df)
    # The training size is truncated and the validation size is rounded
    n_train = int(n_rows * train_frac)
    # Never hand out more validation rows than are left after training
    n_validate = min(round(n_rows * validate_frac), n_rows - n_train)
    n_test = n_rows - n_train - n_validate

    return n_train, n_validate, n_test

In [23]:
# Get the number of rows to assign to train, dev, and test for each field type,
# checking that the three sizes account for every row
train_bh, validate_bh, test_bh = getTrainValTestSizes(df_bh_shuffled)
assert sum((train_bh, validate_bh, test_bh)) == len(df_bh_shuffled)

train_t, validate_t, test_t = getTrainValTestSizes(df_t_shuffled)
assert sum((train_t, validate_t, test_t)) == len(df_t_shuffled)

train_sc, validate_sc, test_sc = getTrainValTestSizes(df_sc_shuffled)
assert sum((train_sc, validate_sc, test_sc)) == len(df_sc_shuffled)

train_pi, validate_pi, test_pi = getTrainValTestSizes(df_pi_shuffled)
assert sum((train_pi, validate_pi, test_pi)) == len(df_pi_shuffled)

In [24]:
def assignSubsets(df, train_size, validate_size, test_size):
    """Return a copy of a DataFrame with a "subset" column assigning rows to splits.

    The first train_size rows are labelled "train", the next validate_size rows
    "dev", and the final test_size rows "test". The new column is inserted just
    before the last existing column.

    The input DataFrame is left untouched, so the calling cell can be re-run
    without an "already exists" error from inserting the column twice.

    Raises ValueError if the three sizes do not add up to the number of rows.
    """
    total_size = train_size + validate_size + test_size
    if total_size != len(df):
        raise ValueError(
            "Subset sizes sum to {} but the DataFrame has {} rows".format(total_size, len(df))
        )

    subset_col = ["train"]*train_size + ["dev"]*validate_size + ["test"]*test_size
    df_assigned = df.copy()
    df_assigned.insert(len(df_assigned.columns)-1, "subset", subset_col)
    return df_assigned

In [25]:
# Label the rows of each shuffled per-field DataFrame as train/dev/test
df_bh = assignSubsets(
    df_bh_shuffled, train_size=train_bh, validate_size=validate_bh, test_size=test_bh
)
df_t = assignSubsets(
    df_t_shuffled, train_size=train_t, validate_size=validate_t, test_size=test_t
)
df_sc = assignSubsets(
    df_sc_shuffled, train_size=train_sc, validate_size=validate_sc, test_size=test_sc
)
df_pi = assignSubsets(
    df_pi_shuffled, train_size=train_pi, validate_size=validate_pi, test_size=test_pi
)

Concatenate the rows assigned to each subset to create one DataFrame each for training, validation, and testing: 

In [26]:
def concatBySubset(df_list, subset):
    """Stack the rows assigned to one subset from several DataFrames.

    INPUT:  df_list - DataFrames that each have a "subset" column
            subset  - the subset value to select (e.g. "train", "dev" or "test")
    OUTPUT: one DataFrame holding the matching rows of every input, in input
            order and with their original index labels; an empty DataFrame if
            df_list is empty
    """
    subset_frames = [df.loc[df["subset"] == subset] for df in df_list]
    if not subset_frames:
        return pd.DataFrame()
    # A single concat avoids re-copying the accumulated rows on every iteration
    # and avoids seeding the result with an empty DataFrame
    return pd.concat(subset_frames, axis=0)

In [27]:
# Pool the per-field DataFrames into one DataFrame per subset,
# spot-checking the subset value found in each result
dfs = [df_bh, df_t, df_sc, df_pi]

train = concatBySubset(dfs, "train")
assert train["subset"].unique()[0] == "train"

validate = concatBySubset(dfs, "dev")
assert validate["subset"].unique()[0] == "dev"

test = concatBySubset(dfs, "test")
assert test["subset"].unique()[0] == "test"


In [28]:
# Number of descriptions in each subset
splits = [train, validate, test]
for split in splits:
    print(len(split.index))

7044
2350
2351


In [29]:
# Final check of the label sets in the three subsets. `invalid` accumulates across
# the subsets and is printed after each one, so three empty lists means all is well.
df_split_list = [train, validate, test]
invalid = []
for df_split in df_split_list:
    label_col = list(df_split.label)
    invalid.extend(
        label_name
        for label_set in label_col
        for label_name in label_set
        if label_name not in valid_label_names
    )
    print(invalid)

[]
[]
[]


In [34]:
# Share of all descriptions that landed in train, validation and test
total_rows = train.shape[0] + validate.shape[0] + test.shape[0]
for split in (train, validate, test):
    print(split.shape[0] / total_rows)

0.5997445721583653
0.20008514261387825
0.20017028522775648


Great!  Now that we've split the data up into three subsets of 60%, 20% and 20%, we can write the corresponding labels and descriptions to files for creating classification models.

#### Write the data to files
Each labels file has one line per description, holding that description's labels separated by commas; each documents file separates descriptions with `\n|\n` (a pipe character surrounded by newlines).

In [56]:
def writeDocs(docs, filename, directory="clf_data/"):
    """Write documents to a text file, separated by a line holding only a pipe.

    INPUT:  docs      - the document strings to write, in order
            filename  - name of the file to create inside `directory`
            directory - folder to write into (with or without a trailing slash)
    OUTPUT: none; the file is written and a confirmation message is printed

    Leading and trailing whitespace is stripped from every document. The file is
    overwritten rather than appended to, so re-running the notebook cannot
    duplicate its contents.
    """
    filepath = Path(directory) / filename
    stripped_docs = [doc.strip() for doc in docs]
    # The context manager closes the file even if a write fails; the encoding is
    # pinned so the output does not depend on the platform default
    with open(filepath, "w", encoding="utf-8") as f:
        f.write("\n|\n".join(stripped_docs))
    print("Your documents file has been written!")
    
def writeLabels(labels, filename, directory="clf_data/"):
    """Write one line of comma-separated label names per label set.

    INPUT:  labels    - the label sets to write, in order (one per document)
            filename  - name of the file to create inside `directory`
            directory - folder to write into (with or without a trailing slash)
    OUTPUT: none; the file is written and a confirmation message is printed

    Label names within a line are sorted, so the output is identical from run
    to run, and an empty label set yields an empty line. The file is overwritten
    rather than appended to, so re-running the notebook cannot duplicate its
    contents.
    """
    filepath = Path(directory) / filename
    label_lines = [
        ", ".join(sorted(str(label_name) for label_name in label_set))
        for label_set in labels
    ]
    # The context manager closes the file even if a write fails; the encoding is
    # pinned so the output does not depend on the platform default
    with open(filepath, "w", encoding="utf-8") as f:
        f.write("\n".join(label_lines))
    print("Your labels file has been written!")

In [None]:
# Make sure the output directory exists before any files are written into it.
# Path comes from pathlib, imported in the notebook's import cell.
data_dir = "./clf_data/"
Path(data_dir).mkdir(parents=True, exist_ok=True)

In [57]:
# Training subset: descriptions and their label sets, in matching row order
train_docs = list(train["description"])
train_labels = list(train["label"])
writeDocs(train_docs, "train_docs.txt")
writeLabels(train_labels, "train_labels.txt")

Your documents file has been written!
Your labels file has been written!


In [58]:
# Validation subset: descriptions and their label sets, in matching row order
validate_docs = list(validate["description"])
validate_labels = list(validate["label"])
writeDocs(validate_docs, "validate_docs.txt")
writeLabels(validate_labels, "validate_labels.txt")

Your documents file has been written!
Your labels file has been written!


In [59]:
# Held-out test subset: descriptions and their label sets, in matching row order
test_docs = list(test["description"])
test_labels = list(test["label"])
writeDocs(test_docs, "blindtest_docs.txt")
writeLabels(test_labels, "blindtest_labels.txt")

Your documents file has been written!
Your labels file has been written!
