# Gender Biased Document Classification

## Splitting Data into Train/Validation/Test Sets

In [1]:
import numpy as np
import pandas as pd
# from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import train_test_split
# from sklearn.preprocessing import FunctionTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

Load and prepare the data for classification:

In [48]:
# df = pd.read_csv("annot_data/aggregated_with_eadid_descid_desc_cols.csv", index_col=0)
# df.head()

Unnamed: 0,file,offsets,text,label,category,eadid,description,field,id,desc_id
9,AA5_00100.ann,"(1032, 1043)",James Whyte,Masculine,Person-Name,AA5,Biographical / Historical:\nProfessor James Ai...,Biographical / Historical,0,0
16,AA5_00100.ann,"(1129, 1177)",chair of practical theology and Christian ethics,Occupation,Contextual,AA5,Biographical / Historical:\nProfessor James Ai...,Biographical / Historical,1,0
4,AA5_00100.ann,"(1217, 1219)",he,Gendered-Pronoun,Linguistic,AA5,Biographical / Historical:\nProfessor James Ai...,Biographical / Historical,2,0
5,AA5_00100.ann,"(1241, 1244)",His,Gendered-Pronoun,Linguistic,AA5,Biographical / Historical:\nProfessor James Ai...,Biographical / Historical,3,0
6,AA5_00100.ann,"(1315, 1317)",he,Gendered-Pronoun,Linguistic,AA5,Biographical / Historical:\nProfessor James Ai...,Biographical / Historical,4,0


In [49]:
# df[["field2", "description"]] = df["description"].str.split(":", n=1, expand=True)
# df.drop("field2", axis=1, inplace=True)
# df.head()
# df.drop(["file", "offsets", "text", "category", "eadid"], axis=1, inplace=True)
# descs = list(df.description)
# descs = [d.strip() for d in descs]
# df.description = descs
# df.to_csv("data/aggregated_data_for_classifier.csv")

In [2]:
# Load the per-annotation table; the first CSV column holds the saved row index.
df = pd.read_csv(
    "data/aggregated_data_for_classifier.csv",
    index_col=0,
)
df.head()

Unnamed: 0,label,description,field,id,desc_id
9,Masculine,Professor James Aitken White was a leading Sco...,Biographical / Historical,0,0
16,Occupation,Professor James Aitken White was a leading Sco...,Biographical / Historical,1,0
4,Gendered-Pronoun,Professor James Aitken White was a leading Sco...,Biographical / Historical,2,0
5,Gendered-Pronoun,Professor James Aitken White was a leading Sco...,Biographical / Historical,3,0
6,Gendered-Pronoun,Professor James Aitken White was a leading Sco...,Biographical / Historical,4,0


In [3]:
# Distinct annotation label names present in the data.
df["label"].unique()

array(['Masculine', 'Occupation', 'Gendered-Pronoun', 'Stereotype',
       'Unknown', 'Feminine', 'Generalization', 'Omission',
       'Gendered-Role'], dtype=object)

The DataFrame above has a row for every label, so for descriptions with multiple labels, there are multiple rows of data.

In [4]:
# True if any row is missing its description text.
df["description"].isna().values.any()

False

For every description, collect the set of distinct labels it was annotated with:

In [26]:
def _unique_labels(label_names):
    """Collapse the label values of one group into a set (duplicates removed)."""
    return set(label_names)


# One row per (description, field, desc_id), carrying the set of labels that
# description received; rows are then ordered by desc_id.
df_grouped = (
    df.groupby(["description", "field", "desc_id"])
    .agg({"label": _unique_labels})
    .reset_index()
    .sort_values(by="desc_id")
)
df_grouped.head()

Unnamed: 0,description,field,desc_id,label
8054,Professor James Aitken White was a leading Sco...,Biographical / Historical,0,"{Masculine, Gendered-Pronoun, Occupation, Ster..."
5560,Papers of The Very Rev Prof James Whyte (1920-...,Title,1,"{Masculine, Stereotype, Unknown}"
8325,Rev Thomas Allan was born on 16 August 1916 in...,Biographical / Historical,2,"{Generalization, Omission, Masculine, Feminine..."
5550,Papers of Rev Tom Allan (1916-1965),Title,3,"{Masculine, Unknown}"
633,Alec Cheyne was born on 1 June 1924 in Errol i...,Biographical / Historical,4,"{Generalization, Omission, Masculine, Feminine..."


Add a column for the EADIDs (collection, or fonds, identifier) of the descriptions:

In [22]:
# Read the descriptions table and keep only the columns that remain after
# removing the description text and field name (the identifier columns).
descs = pd.read_csv("data/descriptions.csv", index_col=0)
eadids = descs.drop(["description", "field"], axis=1)
eadids.head()

Unnamed: 0,eadid,desc_id
9,AA5,0
17,AA5,1
39,AA6,2
47,AA6,3
70,AA7,4


In [27]:
# Look up each description's EADID by desc_id. DataFrame.join keeps
# df_grouped's own row index, which is what gets written to CSV later.
key = "desc_id"
eadid_lookup = eadids.set_index(key)
df_joined = df_grouped.join(eadid_lookup, on=key)
df_joined.head()

Unnamed: 0,description,field,desc_id,label,eadid
8054,Professor James Aitken White was a leading Sco...,Biographical / Historical,0,"{Masculine, Gendered-Pronoun, Occupation, Ster...",AA5
5560,Papers of The Very Rev Prof James Whyte (1920-...,Title,1,"{Masculine, Stereotype, Unknown}",AA5
8325,Rev Thomas Allan was born on 16 August 1916 in...,Biographical / Historical,2,"{Generalization, Omission, Masculine, Feminine...",AA6
5550,Papers of Rev Tom Allan (1916-1965),Title,3,"{Masculine, Unknown}",AA6
633,Alec Cheyne was born on 1 June 1924 in Errol i...,Biographical / Historical,4,"{Generalization, Omission, Masculine, Feminine...",AA7


Write the resulting DataFrame to a file:

In [29]:
# Persist the joined table, including its row index as the first column.
# NOTE(review): the label column holds Python sets, so each one is presumably
# written as its string form - confirm any reader parses it back.
df_joined.to_csv(path_or_buf="data/desc_field_descid_label_eadid.csv")

### OPTION 1:

In [None]:
# ---------------------------------
# Uncomment if need to reload data
# ---------------------------------
# The file was written with its row index as the first column, so pass
# index_col=0 to avoid getting an extra "Unnamed: 0" column on reload.
# NOTE(review): after a reload the label column presumably contains strings
# rather than Python sets - confirm and parse them if sets are needed.
# df_joined = pd.read_csv("data/desc_field_descid_label_eadid.csv", index_col=0)

Create subsets of the DataFrame for each field type:

In [37]:
# Distinct metadata field types, in the order they first appear in df_joined.
field_names = df_joined["field"].unique()
print(field_names)


['Biographical / Historical' 'Title' 'Scope and Contents'
 'Processing Information']


In [38]:
# Build one subset per metadata field type.
# Select by explicit field name rather than by position in `field_names`:
# unique() returns values in order of first appearance, so a positional lookup
# would silently bind e.g. df_bh to a different field if the row order changed.
df_bh = df_joined.loc[df_joined.field == "Biographical / Historical"]
df_t = df_joined.loc[df_joined.field == "Title"]
df_sc = df_joined.loc[df_joined.field == "Scope and Contents"]
df_pi = df_joined.loc[df_joined.field == "Processing Information"]
df_pi.head()

Unnamed: 0,description,field,desc_id,label,eadid
833,Archivist's NoteNone Grant Buttars 28 April 2003,Processing Information,393,"{Unknown, Occupation}",BAI
821,Archivist's NoteNone Grant Buttars 22 May 2003,Processing Information,394,"{Unknown, Occupation}",BAI
810,Archivist's NoteNone Grant Buttars 15 May 2003,Processing Information,395,"{Unknown, Occupation}",BAI
790,Archivist's NoteNone Grant Buttars 03 June 2003,Processing Information,396,"{Unknown, Occupation}",BAI
811,Archivist's NoteNone Grant Buttars 16 April 2003,Processing Information,397,"{Unknown, Occupation}",BAI


Shuffle each DataFrame and then add a column to each that assigns every row to a subset.  For each DataFrame, assign: 
* 60% of the rows to `training`
* 20% of the rows to `validation`
* 20% of the rows to `test`

Shuffle the DataFrame:

In [31]:
# frac=1 samples every row, i.e. a full reshuffle of df_joined; the fixed
# random_state makes the shuffled order repeatable.
df_shuffled = df_joined.sample(random_state=7, frac=1)
df_shuffled.head()

Unnamed: 0,description,field,desc_id,label,eadid
4735,Letters to Pilliner from Godfrey H Thomson and...,Title,6389,"{Omission, Unknown}",Coll-1443
10268,The 'passport' letters written in Arabic are n...,Scope and Contents,2021,{Generalization},Coll-1258
3326,Includes lectures delivered by Thomson largely...,Scope and Contents,3008,"{Omission, Masculine, Unknown}",Coll-1310
11634,arias by various composers in full score,Scope and Contents,1057,{Occupation},Coll-1061
382,"3 (40), Thomson to Ledermann",Title,1162,"{Omission, Masculine, Unknown}",Coll-1064


In [36]:
# Hold out 20% of the documents (and their labels) as the test set.
# NOTE(review): `docs` and `labels` are not assigned in any cell above this
# one, so on a fresh kernel this cell depends on a later cell having been run.
X_train, X_test, y_train, y_test = train_test_split(
    docs,
    labels,
    test_size=0.2,
    shuffle=True,
    random_state=0,
)

Shuffle and split the train dataset into train and validation sets:

In [37]:
# Carve the validation set out of the training portion.
# test_size is a fraction of the data passed in, which here is the 80% left
# after the test split: 0.25 * 0.80 = 0.20 of all rows, giving the stated
# 60% train / 20% validation / 20% test (0.2 here would give 64/16/20).
# NOTE: this rebinds X_train/y_train, so re-running the cell without re-running
# the train/test split first shrinks the training set again.
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.25, random_state=0, shuffle=True
)

### OPTION 2: 

Build one list for the documents and a parallel list for their labels.

Format the data as two parallel lists of strings: each input is one description, and each target is a single string containing that description's label names separated by commas.  For example:

* Input (document): `"Professor James Aitken White was a leading Scottish Theologian."`
* Target (document's labels): `"Stereotype,Masculine,Gendered-Pronoun,Occupation"`

In [19]:
# Split the description -> labels mapping into two parallel lists.
# NOTE(review): `dl_dict` is not built in any visible cell - confirm where it
# is defined before relying on a fresh-kernel run.
docs = [*dl_dict.keys()]      # input documents (descriptions)
labels = [*dl_dict.values()]  # target labels

Shuffle and split data into train and test sets: 

*Note: the split is not stratified, because the least populated class has too few members to stratify.*

In [20]:
# Hold out 20% of the documents (and their labels) as the test set; the fixed
# random_state keeps the shuffle repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    docs,
    labels,
    test_size=0.2,
    shuffle=True,
    random_state=0,
)

Shuffle and split the train dataset into train and validation sets:

In [21]:
# Carve the validation set out of the training portion.
# test_size is a fraction of the data passed in, which here is the 80% left
# after the test split: 0.25 * 0.80 = 0.20 of all rows, matching the
# notebook's stated 60% train / 20% validation / 20% test (0.2 here would give
# 64/16/20).
# NOTE: this rebinds X_train/y_train, so re-running the cell without re-running
# the train/test split first shrinks the training set again.
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.25, random_state=0, shuffle=True
)

### Write the data to files:

In [25]:
# directory: clf_data for option 1 / clf_data2 for option2
train_file = "clf_data2/train_docs.txt"
for i,doc in enumerate(X_train):
    f_train = open(train_file, "a")
    f_train.write(doc+"\n|\n")
f_train.close()

In [26]:
train_file = "clf_data2/train_labels.txt"
# Open once and close via `with` (the previous version leaked one handle per
# row and failed with NameError on an empty split). Mode "w" keeps re-runs from
# appending duplicate rows; UTF-8 is explicit for platform independence.
with open(train_file, "w", encoding="utf-8") as f_train:
    # The loop variable is deliberately not called `labels`, so the notebook's
    # global `labels` list is no longer overwritten with a single string.
    for label_line in y_train:
        f_train.write(label_line + "\n")

In [27]:
val_file = "clf_data2/validation_docs.txt"
# Open once and close via `with` (the previous version leaked one handle per
# document and failed with NameError on an empty split). Mode "w" keeps re-runs
# from appending duplicates; UTF-8 is explicit for platform independence.
with open(val_file, "w", encoding="utf-8") as f_val:
    for doc in X_val:
        # Each document is followed by a line containing only "|" as a separator.
        f_val.write(doc + "\n|\n")

In [28]:
val_file = "clf_data2/validation_labels.txt"
# Open once and close via `with` (the previous version leaked one handle per
# row and failed with NameError on an empty split). Mode "w" keeps re-runs from
# appending duplicate rows; UTF-8 is explicit for platform independence.
with open(val_file, "w", encoding="utf-8") as f_val:
    # The loop variable is deliberately not called `labels`, so the notebook's
    # global `labels` list is no longer overwritten with a single string.
    for label_line in y_val:
        f_val.write(label_line + "\n")

In [29]:
# NOTE(review): this file name uses singular "doc" while the train/validation
# files use "docs" - left unchanged; confirm what downstream readers expect.
test_file = "clf_data2/blindtest_doc.txt"
# Open once and close via `with` (the previous version leaked one handle per
# document and failed with NameError on an empty split). Mode "w" keeps re-runs
# from appending duplicates; UTF-8 is explicit for platform independence.
with open(test_file, "w", encoding="utf-8") as f_test:
    for doc in X_test:
        # Each document is followed by a line containing only "|" as a separator.
        f_test.write(doc + "\n|\n")

In [30]:
test_file = "clf_data2/blindtest_labels.txt"
# Open once and close via `with` (the previous version leaked one handle per
# row and failed with NameError on an empty split). Mode "w" keeps re-runs from
# appending duplicate rows; UTF-8 is explicit for platform independence.
with open(test_file, "w", encoding="utf-8") as f_test:
    # The loop variable is deliberately not called `labels`, so the notebook's
    # global `labels` list is no longer overwritten with a single string.
    for label_line in y_test:
        f_test.write(label_line + "\n")

In [11]:
# vectorizer = TfidfVectorizer(lowercase=False)               # defaults input="content", encoding="utf8"

In [12]:
# X = vectorizer.fit_transform(docs)   # archival metadata descriptions (one description = one document)

In [14]:
# y = vectorizer.fit_transform(labels)         # corresponding gender biased language labels for each description
# y = np.array(labels)

In [21]:
# sss = StratifiedShuffleSplit(n_splits=5, test_size=0.1, random_state=0) # set random_state for reproducibility
# fold = 1
# for train,test in sss.split(X, y):
#     train_file = "clf_data/fold"+str(fold)+"/train.txt"
#     f_train = open(train_file, "w")
#     f_train.write(train)
#     f_train.close()
    
#     test_file = "clf_data/fold"+str(fold)+"test.txt"
#     f_test = open(test_file, "w")
#     f_test.write(test)
#     f_test.close()
    
#     fold += 1

In [None]:
# # INPUT: DataFrame of annotated descriptions with columns for description, field, and label
# # OUTPUT: 3D array with columns for the description, metadata field name, and label
# def desc_field_label_extractor(df):
#     descs = list(df.description)
#     fields = list(df.field)
#     labels = list(df.label)
#     features = np.empty(shape=(len(descs), 3), dtype=object)
#     for i,desc in enumerate(descs):    
#         features[i, 0] = desc
#         features[i, 1] = fields[i]
#         features[i, 2] = labels[i]
#     return features


# desc_field_label_transformer = FunctionTransformer(desc_field_label_extractor)