# Gender Biased Document Classification

In [1]:
import os

import numpy as np
import pandas as pd
# from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import train_test_split
# from sklearn.preprocessing import FunctionTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

Load and prepare the data for classification:

In [2]:
# Load the aggregated annotation export; the first CSV column becomes the index.
df = pd.read_csv(
    filepath_or_buffer="aggregated_with_eadid_descid_desc_cols.csv",
    index_col=0,
)

In [3]:
# Each raw description starts with a "<prefix>:" part (presumably the metadata
# field name -- TODO confirm).  Split once, at the first colon only, so colons
# inside the description text itself are preserved.
# NOTE: this cell is not safe to re-run on the same `df`: a second run would
# split at the next colon inside the text and the drop below would raise a
# KeyError because the columns are already gone.
df[["field2", "description"]] = df["description"].str.split(pat=":", n=1, expand=True)

# Discard the split-off prefix together with the columns the classifier does not use.
df = df.drop(columns=["field2", "file", "offsets", "text", "category", "eadid"])

# Trim the whitespace left around the text by the split.
descs = [text.strip() for text in df["description"]]
df["description"] = descs

# Persist the cleaned table for the following cells.
df.to_csv("aggregated_data_for_classifier.csv")

In [6]:
# Reload the cleaned table written above so later cells start from the file on disk.
df = pd.read_csv(
    filepath_or_buffer="aggregated_data_for_classifier.csv",
    index_col=0,
)
df.head(n=5)

Unnamed: 0,label,description,field,id,desc_id
9,Masculine,Professor James Aitken White was a leading Sco...,Biographical / Historical,0,0
16,Occupation,Professor James Aitken White was a leading Sco...,Biographical / Historical,1,0
4,Gendered-Pronoun,Professor James Aitken White was a leading Sco...,Biographical / Historical,2,0
5,Gendered-Pronoun,Professor James Aitken White was a leading Sco...,Biographical / Historical,3,0
6,Gendered-Pronoun,Professor James Aitken White was a leading Sco...,Biographical / Historical,4,0


In [7]:
# Distinct label names, in order of first appearance.
df["label"].unique()

array(['Masculine', 'Occupation', 'Gendered-Pronoun', 'Stereotype',
       'Unknown', 'Feminine', 'Generalization', 'Omission',
       'Gendered-Role', 'Nonbinary'], dtype=object)

The DataFrame above has a row for every label, so for descriptions with multiple labels, there are multiple rows of data.

In [8]:
# Sanity check: True if any row is missing its description text.
df["description"].isna().to_numpy().any()

False

For every description, get the distinct labels it was annotated with:

In [9]:
# label_list = df.label.unique()
# label_list.sort()
# label_dict = dict.fromkeys(label_list, 0)
# print(label_dict)

In [10]:
# Map every unique description to one string holding the distinct labels it was
# annotated with, sorted alphabetically and joined with commas.
#
# A single groupby pass replaces filtering the whole DataFrame once per
# description, which was quadratic in the number of rows.  sort=False keeps the
# descriptions in order of first appearance, so the key order (and therefore
# the seeded train/test split built from it later) is the same as before.
# Rows with a missing description would be skipped by groupby; the null check
# in the cell above shows there are none.
dl_dict = (
    df.groupby("description", sort=False)["label"]
    .agg(lambda labels_for_desc: ",".join(sorted(set(labels_for_desc))))
    .to_dict()
)

# Spot-check one known description.
print(dl_dict["Professor James Aitken White was a leading Scottish Theologian and Moderator of the General Assembly of the Church of Scotland. He was educated at Daniel Stewart's College and the University of Edinburgh where he studied philosophy and divinity. After his ordination he spent three years as an army Chaplain and then in 1948 was inducted to Dunollie Road Church in Oban. James Whyte moved to Mayfield North Church in Edinburgh in 1954 and in 1958 was appointed to the chair of practical theology and Christian ethics at the University of St Andrew's where he remained until 1987. His primary interests were in liturgy and ecclesiastical architecture and he also lectured on pastoral care.\nJames Whyte was called upon to preach at the memorial service for the victims of the Lockerbie disaster on 4th January 1989. The service was relayed around the world and was widely cited in the press having had a great impact. The full text of this sermon was published in Laughter and Tears: Thoughts on Faith in the Face of Suffering (Edinburgh, St Andrew's Press, 1993)."])

Gendered-Pronoun,Masculine,Occupation,Stereotype


In [12]:
# One row per description.  The existing "field" column is dropped and rebuilt
# from the text that precedes the first colon of each description.
desc_df = pd.read_csv("descriptions.csv", index_col=0).drop(columns="field")

# Split once only, so colons inside the description text are left alone.
field_and_text = desc_df["description"].str.split(pat=":", n=1, expand=True)
desc_df[["field", "description"]] = field_and_text

# Trim the whitespace left around the text by the split.
desc_df["description"] = [text.strip() for text in desc_df["description"]]
desc_df.head(n=5)

Unnamed: 0,eadid,description,desc_id,field
9,AA5,Professor James Aitken White was a leading Sco...,0,Biographical / Historical
17,AA5,Papers of The Very Rev Prof James Whyte (1920-...,1,Title
39,AA6,Rev Thomas Allan was born on 16 August 1916 in...,2,Biographical / Historical
47,AA6,Papers of Rev Tom Allan (1916-1965),3,Title
70,AA7,Alec Cheyne was born on 1 June 1924 in Errol i...,4,Biographical / Historical


In [13]:
# Look up the joined label string for every description, in row order.
desc_list = list(desc_df.description)
desc_labels = [dl_dict[desc] for desc in desc_list]

# Every row of desc_df must have received exactly one label string.
assert desc_df.shape[0] == len(desc_labels), "There should be string of labels for every description."

In [16]:
# Attach the joined label strings as a new "label" column.
desc_df = desc_df.assign(label=desc_labels)
desc_df.head(n=5)

Unnamed: 0,eadid,description,desc_id,field,label
9,AA5,Professor James Aitken White was a leading Sco...,0,Biographical / Historical,"Gendered-Pronoun,Masculine,Occupation,Stereotype"
17,AA5,Papers of The Very Rev Prof James Whyte (1920-...,1,Title,"Masculine,Stereotype,Unknown"
39,AA6,Rev Thomas Allan was born on 16 August 1916 in...,2,Biographical / Historical,"Feminine,Gendered-Pronoun,Generalization,Mascu..."
47,AA6,Papers of Rev Tom Allan (1916-1965),3,Title,"Masculine,Unknown"
70,AA7,Alec Cheyne was born on 1 June 1924 in Errol i...,4,Biographical / Historical,"Feminine,Gendered-Pronoun,Gendered-Role,Genera..."


### OPTION 1: didn't work...

DataFrame for documents and array for labels

In [35]:
# Inputs: every column except the target.  Targets: the label strings as a NumPy array.
docs = desc_df.drop(columns="label")
labels = np.array(desc_df["label"].tolist())

In [36]:
# Preview the input frame.
docs.head(n=5)

Unnamed: 0,eadid,description,desc_id,field
9,AA5,Professor James Aitken White was a leading Sco...,0,Biographical / Historical
17,AA5,Papers of The Very Rev Prof James Whyte (1920-...,1,Title
39,AA6,Rev Thomas Allan was born on 16 August 1916 in...,2,Biographical / Historical
47,AA6,Papers of Rev Tom Allan (1916-1965),3,Title
70,AA7,Alec Cheyne was born on 1 June 1924 in Errol i...,4,Biographical / Historical


Shuffle and split data into train and test sets: 

*Note: the split is not stratified, because the least populated class has too few members to stratify on.*

In [18]:
# Hold out 20% of the data as the test set; the fixed seed makes the shuffle reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    docs,
    labels,
    test_size=0.2,
    shuffle=True,
    random_state=0,
)

Shuffle and split the train dataset into train and validation sets:

In [19]:
# Carve 20% of the remaining training data off as the validation set
# (0.2 * 0.8 = 16% of all data), leaving 64% for training.
X_train, X_val, y_train, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.2,
    shuffle=True,
    random_state=0,
)

### OPTION 2: 

array for documents and array for labels

Format the data as two parallel lists of strings: each input is a single description, and each target is a single string containing that description's comma-separated label names.  For example:

* Input (document): `"Professor James Aitken White was a leading Scottish Theologian."`
* Target (document's labels): `"Gendered-Pronoun,Masculine,Occupation,Stereotype"`

In [26]:
# Parallel lists: docs[i] is a description and labels[i] is its joined label string.
docs = list(dl_dict)               # input documents (descriptions)
labels = [*dl_dict.values()]       # target labels

Shuffle and split data into train and test sets: 

*Note: the split is not stratified, because the least populated class has too few members to stratify on.*

In [27]:
# Hold out 20% of the documents as the blind test set; the fixed seed makes the shuffle reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    docs,
    labels,
    test_size=0.2,
    shuffle=True,
    random_state=0,
)

Shuffle and split the train dataset into train and validation sets:

In [28]:
# Carve 20% of the remaining training documents off as the validation set
# (0.2 * 0.8 = 16% of all documents), leaving 64% for training.
X_train, X_val, y_train, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.2,
    shuffle=True,
    random_state=0,
)

### Write the data to files:

In [29]:
datadir = "clf_data2" #"clf_data"

# Create the output folders up front so that this and the following cells
# also work on a fresh checkout where the directories do not exist yet.
for split_name in ("train", "validation", "blindtest"):
    os.makedirs(os.path.join(datadir, split_name), exist_ok=True)

# Write each training document to its own file: <datadir>/train/doc<i>.txt
for i, doc in enumerate(X_train):
    train_file = datadir+"/train/doc"+str(i)+".txt"
    # `with` closes the file even if the write fails; explicit UTF-8 keeps the
    # output identical across platforms instead of using the locale default.
    with open(train_file, "w", encoding="utf-8") as f_train:
        f_train.write(doc)

In [30]:
# Write each training target to its own file: <datadir>/train/labels<i>.txt
# The loop variable is deliberately NOT called `labels`: that name holds the
# full list of targets used by the split cells, and overwriting it with a
# single string would break those cells when they are re-run.
for i, label_str in enumerate(y_train):
    train_file = datadir+"/train/labels"+str(i)+".txt"
    # `with` closes the file even if the write fails; explicit UTF-8 keeps the
    # output identical across platforms instead of using the locale default.
    with open(train_file, "w", encoding="utf-8") as f_train:
        f_train.write(label_str)

In [31]:
# Write each validation document to its own file: <datadir>/validation/doc<i>.txt
for i, doc in enumerate(X_val):
    val_file = datadir+"/validation/doc"+str(i)+".txt"
    # `with` closes the file even if the write fails; explicit UTF-8 keeps the
    # output identical across platforms instead of using the locale default.
    with open(val_file, "w", encoding="utf-8") as f_val:
        f_val.write(doc)

In [32]:
# Write each validation target to its own file: <datadir>/validation/labels<i>.txt
# The loop variable is deliberately NOT called `labels`: that name holds the
# full list of targets used by the split cells, and overwriting it with a
# single string would break those cells when they are re-run.
for i, label_str in enumerate(y_val):
    val_file = datadir+"/validation/labels"+str(i)+".txt"
    # `with` closes the file even if the write fails; explicit UTF-8 keeps the
    # output identical across platforms instead of using the locale default.
    with open(val_file, "w", encoding="utf-8") as f_val:
        f_val.write(label_str)

In [33]:
# Write each held-out test document to its own file: <datadir>/blindtest/doc<i>.txt
for i, doc in enumerate(X_test):
    test_file = datadir+"/blindtest/doc"+str(i)+".txt"
    # `with` closes the file even if the write fails; explicit UTF-8 keeps the
    # output identical across platforms instead of using the locale default.
    with open(test_file, "w", encoding="utf-8") as f_test:
        f_test.write(doc)

In [34]:
# Write each held-out test target to its own file: <datadir>/blindtest/labels<i>.txt
# The loop variable is deliberately NOT called `labels`: that name holds the
# full list of targets used by the split cells, and overwriting it with a
# single string would break those cells when they are re-run.
for i, label_str in enumerate(y_test):
    test_file = datadir+"/blindtest/labels"+str(i)+".txt"
    # `with` closes the file even if the write fails; explicit UTF-8 keeps the
    # output identical across platforms instead of using the locale default.
    with open(test_file, "w", encoding="utf-8") as f_test:
        f_test.write(label_str)

In [11]:
# vectorizer = TfidfVectorizer(lowercase=False)               # defaults input="content", encoding="utf8"

In [12]:
# X = vectorizer.fit_transform(docs)   # archival metadata descriptions (one description = one document)

In [14]:
# y = vectorizer.fit_transform(labels)         # corresponding gender biased language labels for each description
# y = np.array(labels)

In [21]:
# sss = StratifiedShuffleSplit(n_splits=5, test_size=0.1, random_state=0) # set random_state for reproducibility
# fold = 1
# for train,test in sss.split(X, y):
#     train_file = "clf_data/fold"+str(fold)+"/train.txt"
#     f_train = open(train_file, "w")
#     f_train.write(train)
#     f_train.close()
    
#     test_file = "clf_data/fold"+str(fold)+"test.txt"
#     f_test = open(test_file, "w")
#     f_test.write(test)
#     f_test.close()
    
#     fold += 1

In [None]:
# # INPUT: DataFrame of annotated descriptions with columns for description, field, and label
# # OUTPUT: 3D array with columns for the description, metadata field name, and label
# def desc_field_label_extractor(df):
#     descs = list(df.description)
#     fields = list(df.field)
#     labels = list(df.label)
#     features = np.empty(shape=(len(descs), 3), dtype=object)
#     for i,desc in enumerate(descs):    
#         features[i, 0] = desc
#         features[i, 1] = fields[i]
#         features[i, 2] = labels[i]
#     return features


# desc_field_label_transformer = FunctionTransformer(desc_field_label_extractor)