# Analysis: Label and Tag Distributions

Calculate the total number of labels and B-/I- tags across the data splits for the experiments (split 60/20/20 for training, validation, and test) and cascades (split 20/20/20/20/20 for 5-fold cross-validation).

In [1]:
import config
import pandas as pd

In [2]:
def getTagTotals(df, split):
    """Count how often each annotated tag occurs in one data split.

    Rows tagged "O" and rows carrying the mistakenly annotated
    "B-Nonbinary"/"I-Nonbinary" tags are left out of the counts.

    Parameters
    ----------
    df : pandas.DataFrame
        Data with a ``tag`` column.
    split : str
        Name of the split; the count column is named ``"<split>_total"``.

    Returns
    -------
    pandas.DataFrame
        Columns ``tag``, ``label`` and ``<split>_total``, sorted by label and
        then by tag. The index holds each tag's position when the tags are
        ordered by ascending count. When no countable tag is present, an
        empty frame with the same three columns is returned.
    """
    # Reference: https://towardsdatascience.com/named-entity-recognition-and-classification-with-scikit-learn-f05372f07ba2
    total_col = "{}_total".format(split)
    excluded_tags = ["O", "B-Nonbinary", "I-Nonbinary"]  # unannotated / mistaken annotation
    subdf = df.loc[~df.tag.isin(excluded_tags)]
    df_tag_totals = subdf.groupby("tag").size().reset_index(name=total_col)
    # Order by count and renumber, so the index records the count-based position
    df_tag_totals = df_tag_totals.sort_values(by=total_col).reset_index(drop=True)
    # The label is everything after the first hyphen ("B-Gendered-Role" -> "Gendered-Role");
    # going through .str[1] (instead of expand=True) also works when there are no rows
    labels = df_tag_totals.tag.str.split("-", n=1).str[1]
    df_tag_totals.insert(1, "label", labels)
    return df_tag_totals.sort_values(by=["label", "tag"])

def implodeDataFrame(df, cols_to_groupby):
    """Group ``df`` by ``cols_to_groupby`` and collect every other column into lists.

    Returns a DataFrame indexed by the group-by columns in which each
    remaining column holds, per group, the list of that group's values.
    """
    def _to_list(values):
        return values.tolist()

    remaining_cols = df.columns.tolist()
    for key_col in cols_to_groupby:
        remaining_cols.remove(key_col)
    aggregations = {col_name: _to_list for col_name in remaining_cols}
    grouped = df.groupby(cols_to_groupby).agg(aggregations)
    return grouped.reset_index().set_index(cols_to_groupby)

def getLabelColFromTagCol(df, col):
    """Derive a list of labels for every row from that row's list of tags.

    Each tag other than "O" loses its first two characters (the "B-"/"I-"
    prefix). Per row the labels are de-duplicated, "O" is dropped whenever
    at least one other label is present, and the result is sorted
    alphabetically. Returns one list of labels per row of ``df[col]``.
    """
    tag_lists = list(df[col])
    label_lists = []
    for tag_list in tag_lists:
        row_labels = {tag if tag == "O" else tag[2:] for tag in tag_list}
        # "O" only survives when it is the sole value of the row
        if len(row_labels) > 1:
            row_labels.discard("O")
        label_lists.append(sorted(row_labels))
    assert len(label_lists) == len(tag_lists)
    return label_lists

In [13]:
# def getDescTotals(df, split):
#     subdf = df[["description_id", "tag"]].loc[df.tag.isin(["B-Omission","I-Omission","B-Stereotype","I-Stereotype"])]
#     subdf = subdf.drop_duplicates()
#     subdf[["tag"]] = subdf[["tag"]].replace(["B-Omission","I-Omission"],"Omission")
#     subdf[["tag"]] = subdf[["tag"]].replace(["B-Stereotype","I-Stereotype"],"Stereotype")
#     subdf = df[["description_id", "tag"]].loc[df.tag.isin(["Omission","I-Omission","B-Stereotype","I-Stereotype"])]
#     subdf = subdf.drop_duplicates()
#     subdf = subdf.rename(columns={"tag":"label"})
#     df_totals = subdf.groupby('label').size().reset_index(name="{}_total".format(split))
#     df_totals = df_totals.sort_values(by=["label"])
#     return df_totals

## Experiments (60/20/20)

### Tokens

#### Tags

In [3]:
# Token-level model input: one CSV file per split
dir_path = config.tokc_path + "model_input/"
train_file, validate_file, test_file = (
    dir_path + file_name
    for file_name in ("token_train.csv", "token_validate.csv", "token_test.csv")
)

In [4]:
# Read the three splits, using the first CSV column as the index
df_train, df_validate, df_test = (
    pd.read_csv(split_file, index_col=0)
    for split_file in (train_file, validate_file, test_file)
)

In [5]:
# Shape of every split, followed by the row count over all three splits
for split_frame in (df_train, df_validate, df_test):
    print(split_frame.shape)
print("Total rows:", sum(len(split_frame) for split_frame in (df_train, df_validate, df_test)))

(467564, 10)
(157740, 10)
(153966, 10)
Total rows: 779270


In [13]:
# Tag counts for each split; the split name becomes the prefix of the count column
train_totals, dev_totals, test_totals = (
    getTagTotals(split_df, split_name)
    for split_df, split_name in ((df_train, "train"), (df_validate, "dev"), (df_test, "test"))
)

In [19]:
# Combine the per-split tag totals into one table with a count column per split.
# Rows are matched on (tag, label); the outer joins keep tags that are missing
# from one of the splits.
multiindex = ["tag", "label"]
joined = train_totals.join(dev_totals.set_index(multiindex), on=multiindex, how="outer")
joined_again = joined.join(test_totals.set_index(multiindex), on=multiindex, how="outer")
joined_again

Unnamed: 0,tag,label,train_total,dev_total,test_total
5,B-Feminine,Feminine,840,323,298
8,I-Feminine,Feminine,1827,846,696
10,B-Gendered-Pronoun,Gendered-Pronoun,2186,744,748
0,I-Gendered-Pronoun,Gendered-Pronoun,24,16,14
6,B-Gendered-Role,Gendered-Role,1674,590,517
1,I-Gendered-Role,Gendered-Role,361,134,116
4,B-Generalization,Generalization,780,245,258
2,I-Generalization,Generalization,449,146,183
13,B-Masculine,Masculine,3390,1024,1096
14,I-Masculine,Masculine,4693,1378,1366


Write the tag totals per split to a CSV file:

In [21]:
# Save the joined train/dev/test tag totals next to the token model input
joined_again.to_csv(config.tokc_path+"model_input/token_tag_totals_by_split.csv")

#### Labels

In [34]:
# Token-level model input: one CSV file per split
dir_path = config.tokc_path + "model_input/"
train_file, validate_file, test_file = (
    dir_path + file_name
    for file_name in ("token_train.csv", "token_validate.csv", "token_test.csv")
)

In [35]:
# Read the three splits, using the first CSV column as the index
df_train, df_validate, df_test = (
    pd.read_csv(split_file, index_col=0)
    for split_file in (train_file, validate_file, test_file)
)

In [36]:
# Shape of every split, followed by the row count over all three splits
for split_frame in (df_train, df_validate, df_test):
    print(split_frame.shape)
print("Total rows:", sum(len(split_frame) for split_frame in (df_train, df_validate, df_test)))

(467564, 10)
(157740, 10)
(153966, 10)
Total rows: 779270


In [37]:
# Pool the three splits, then drop unannotated rows ("O") and the mistaken
# "Nonbinary" annotations in a single filter
df = pd.concat([df_train, df_validate, df_test])
df = df.loc[~df.tag.isin(["O", "B-Nonbinary", "I-Nonbinary"])]
df.shape

(80136, 10)

In [38]:
# Collapse the rows of each token into a single row whose remaining columns hold lists
df = df.drop(columns=["description_id", "sentence_id", "ann_id", "field", "token_offsets", "pos"])
df_imploded = implodeDataFrame(df, ["token_id", "token", "subset"]).reset_index()
df_imploded.head()

Unnamed: 0,token_id,token,subset,tag
0,7,The,train,"[B-Unknown, B-Masculine, B-Stereotype]"
1,8,Very,train,"[I-Unknown, I-Stereotype, I-Masculine]"
2,9,Rev,train,"[I-Unknown, I-Masculine, B-Unknown, I-Stereotype]"
3,10,Prof,train,"[I-Stereotype, I-Masculine, I-Unknown, I-Unknown]"
4,11,James,train,"[I-Masculine, I-Unknown, I-Stereotype, I-Unknown]"


In [39]:
# Turn each token's list of tags into a sorted list of unique labels and
# append it as the last column.
# NOTE: insert() raises if "label" already exists, so this cell cannot be re-run
# without first rebuilding df_imploded.
label_col = getLabelColFromTagCol(df_imploded, "tag")
df_imploded.insert(len(df_imploded.columns), "label", label_col)
df_imploded.head()

Unnamed: 0,token_id,token,subset,tag,label
0,7,The,train,"[B-Unknown, B-Masculine, B-Stereotype]","[Masculine, Stereotype, Unknown]"
1,8,Very,train,"[I-Unknown, I-Stereotype, I-Masculine]","[Masculine, Stereotype, Unknown]"
2,9,Rev,train,"[I-Unknown, I-Masculine, B-Unknown, I-Stereotype]","[Masculine, Stereotype, Unknown]"
3,10,Prof,train,"[I-Stereotype, I-Masculine, I-Unknown, I-Unknown]","[Masculine, Stereotype, Unknown]"
4,11,James,train,"[I-Masculine, I-Unknown, I-Stereotype, I-Unknown]","[Masculine, Stereotype, Unknown]"


In [40]:
# One row per (token, label) pair
token_labels = df_imploded[["token_id","subset","label"]].explode("label")
token_labels.head()

Unnamed: 0,token_id,subset,label
0,7,train,Masculine
0,7,train,Stereotype
0,7,train,Unknown
1,8,train,Masculine
1,8,train,Stereotype


In [48]:
# Number of token-label pairs per label in the "train" subset
subset = "train"
train_labels = (
    token_labels.loc[token_labels.subset == subset]
    .groupby(["subset","label"])
    .size()
    .reset_index(name="{}_total".format(subset))
    .drop(columns=["subset"])
)

In [49]:
# Number of token-label pairs per label in the "dev" subset
subset = "dev"
validate_labels = (
    token_labels.loc[token_labels.subset == subset]
    .groupby(["subset","label"])
    .size()
    .reset_index(name="{}_total".format(subset))
    .drop(columns=["subset"])
)

In [50]:
# Number of token-label pairs per label in the "test" subset
subset = "test"
test_labels = (
    token_labels.loc[token_labels.subset == subset]
    .groupby(["subset","label"])
    .size()
    .reset_index(name="{}_total".format(subset))
    .drop(columns=["subset"])
)

In [56]:
# Combine the per-subset label totals into one table with a count column per
# subset; the outer joins keep labels that are missing from one of the subsets.
joined = train_labels.join(validate_labels.set_index("label"), on="label", how="outer")
joined_again = joined.join(test_labels.set_index("label"), on="label", how="outer")
joined_again

Unnamed: 0,label,train_total,dev_total,test_total
0,Feminine,2331,944,861
1,Gendered-Pronoun,2210,759,762
2,Gendered-Role,1954,693,607
3,Generalization,1203,385,430
4,Masculine,7268,2195,2276
5,Occupation,3981,1435,1039
6,Omission,7683,2455,2461
7,Stereotype,2884,962,940
8,Unknown,14751,4919,4737


Write the label totals per split to a CSV file:

In [58]:
# Save the joined train/dev/test label totals next to the token model input
joined_again.to_csv(config.tokc_path+"model_input/token_label_totals_by_split.csv")

### Descriptions

In [14]:
# Description-level model input: one CSV file per split
dir_path = config.docc_path+"model_input/so_model_input/splits_as_csv/"
train_file, validate_file, test_file = (
    dir_path + file_name
    for file_name in (
        "aggregated_final_train.csv",
        "aggregated_final_validate.csv",
        "aggregated_final_test.csv",
    )
)

In [16]:
# Read the three splits, using the first CSV column as the index
df_train, df_validate, df_test = (
    pd.read_csv(split_file, index_col=0)
    for split_file in (train_file, validate_file, test_file)
)

In [17]:
# Shape of every description split
for split_frame in (df_train, df_validate, df_test):
    print(split_frame.shape)

(16541, 7)
(5514, 7)
(5515, 7)


In [23]:
# Pool the three splits and keep only the descriptions that have a label value
df = pd.concat([df_train,df_validate,df_test]).loc[lambda pooled: pooled.label.notna()]
df.head()

Unnamed: 0,description_id,start_offset,end_offset,field,description,subset,label
4699,4699,1853,2066,Biographical / Historical,"Labelled Apparently some chapters, amounting t...",train,{'Omission'}
3474,3474,3608,8549,Biographical / Historical,Margaret Winifred Bartholomew was born on 21 A...,train,"{'Omission', 'Stereotype'}"
4769,4769,2378,2576,Biographical / Historical,Blacker and Thomson became close friends throu...,train,{'Omission'}
3027,3027,627,1162,Biographical / Historical,Thomas Young was probably born in 1725. By the...,train,{'Stereotype'}
3397,3397,8095,8334,Biographical / Historical,Andrew Tait worked on Paramecium in Beale's la...,train,{'Omission'}


In [24]:
# Keep only the columns needed to count labels per subset
subdf = df[["description_id","label","subset"]]

In [25]:
def getColumnValuesAsLists(df, col_name):
    """Convert a column of stringified label collections into lists of labels.

    Each cell of ``df[col_name]`` is expected to hold the text form of a
    Python list or set of strings, e.g. ``"{'Omission', 'Stereotype'}"`` or
    ``"['Omission']"``. Cells are parsed with ``ast.literal_eval`` so that
    single- and double-quoted items are both handled; items of a set are
    returned in sorted order so the result does not depend on the order in
    which the set happened to be written out.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``col_name``; it is not modified.
    col_name : str
        Name of the column holding the stringified collections.

    Returns
    -------
    pandas.DataFrame
        A new frame in which ``col_name`` (at its original position) holds
        one list per row.
    """
    import ast  # local import: keeps this cell independent of the import cell

    def _parse_labels(value):
        """Turn one stringified collection into a list of its items."""
        try:
            parsed = ast.literal_eval(value)
        except (ValueError, SyntaxError):
            # Not a Python literal: fall back to the previous behavior of
            # stripping the two outer characters on each side and splitting
            return value[2:-2].split("', '")
        if isinstance(parsed, (set, frozenset)):
            return sorted(parsed)
        if isinstance(parsed, (list, tuple)):
            return list(parsed)
        return [parsed]

    col_list_values = [_parse_labels(value) for value in df[col_name]]
    col_i = list(df.columns).index(col_name)
    df = df.drop(columns=[col_name])
    df.insert(col_i, col_name, col_list_values)
    return df

In [26]:
# Labels that are counted at description level
desc_labels = ["Omission", "Stereotype"]

In [27]:
# Parse the label strings into lists, give every label its own row, then keep
# each wanted (description, label, subset) combination once
subdf = getColumnValuesAsLists(subdf, "label").explode("label")
subdf = subdf.loc[subdf.label.isin(desc_labels)].drop_duplicates()

In [28]:
# Preview the exploded (description, label, subset) rows
subdf.head(2)

Unnamed: 0,description_id,label,subset
1,1,Stereotype,train
3,3,Stereotype,train


In [30]:
# Build one transposed count table per label and stack them.
# The "subset" row of each per-label table is kept, so it repeats once per label.
desc_df = pd.DataFrame()
for label in desc_labels:
    label_counts = subdf.loc[subdf.label == label].groupby(['subset','label']).size()
    label_df = label_counts.reset_index(name=label).T.drop("label")
    desc_df = pd.concat([desc_df,label_df])
desc_df

Unnamed: 0,0,1,2
subset,dev,test,train
Omission,804,828,2400
subset,dev,test,train
Stereotype,315,329,957


Write the description label totals per split to a CSV file:

In [31]:
# Save the description label totals into dir_path.
# NOTE(review): dir_path is whatever was assigned last; in notebook order that is
# the description splits directory - confirm when running cells out of order.
desc_df.to_csv(dir_path+"desc_label_totals_by_split.csv")

## Cascades (20/20/20/20/20)

### Tokens

#### Tags

In [6]:
# Tokens: load the token data that carries a cross-validation "fold" column
df = pd.read_csv(config.tokc_path+"experiment_input/token_5fold.csv", index_col=0)
df.shape # Looks good

(779270, 10)

In [7]:
# Preview the first rows of the 5-fold token data
df.head(2)

Unnamed: 0,description_id,sentence_id,ann_id,token_id,token,token_offsets,pos,tag,field,fold
0,0,0,99999,0,Identifier,"(0, 10)",NN,O,Identifier,split4
1,0,0,99999,1,:,"(10, 11)",:,O,Identifier,split4


In [156]:
# Tags to count, as a B-/I- pair per label in a fixed label order
tags = [
    prefix + label
    for label in (
        "Feminine", "Masculine", "Unknown",
        "Occupation", "Omission", "Stereotype",
        "Generalization", "Gendered-Pronoun", "Gendered-Role",
    )
    for prefix in ("B-", "I-")
]

In [157]:
# Build one transposed per-fold count table per tag and stack them
tag_df = pd.DataFrame()
for tag in tags:
    tag_counts = df.loc[df.tag == tag].groupby(['fold','tag']).size()
    subdf = tag_counts.reset_index(name=tag).T.drop("tag")
    tag_df = pd.concat([tag_df,subdf])

In [158]:
# Remove the "fold" rows that each per-tag table contributed, leaving one row per tag.
# NOTE: drop() raises once the "fold" rows are gone, so this cell cannot be re-run
# without rebuilding tag_df first.
tag_df = tag_df.drop("fold")
tag_df

Unnamed: 0,0,1,2,3,4
B-Feminine,264,302,274,323,298
I-Feminine,582,661,584,846,696
B-Masculine,1037,1098,1255,1024,1096
I-Masculine,1371,1512,1810,1378,1366
B-Unknown,1996,2143,2094,2060,2024
I-Unknown,3179,3510,3521,3506,3235
B-Occupation,587,590,650,655,474
I-Occupation,683,688,785,781,565
B-Omission,1082,1091,1157,1082,1093
I-Omission,1783,1631,1884,1728,1674


In [83]:
# Save the per-fold tag totals next to the 5-fold experiment input
tag_df.to_csv(config.tokc_path+"experiment_input/tag_totals_per_split.csv")

#### Labels

In [64]:
df = pd.read_csv(config.tokc_path+"experiment_input/token_5fold.csv", index_col=0)
df = df[["token_id","fold","tag"]]
# Keep annotated tokens only, leaving out the mistaken "Nonbinary" annotations
df = df.loc[~df.tag.isin(["O", "B-Nonbinary", "I-Nonbinary"])]
# A tag's label is the tag without its two-character "B-"/"I-" prefix
tags_col = df.tag.tolist()
labels = [tag[2:] for tag in tags_col]
# Swap the tag column for the label column
df = df.drop(columns=["tag"])
df.insert(len(df.columns), "label", labels)
df.head()

Unnamed: 0,token_id,fold,label
7,7,split2,Unknown
8,7,split2,Masculine
9,7,split2,Stereotype
10,8,split2,Unknown
11,8,split2,Stereotype


In [68]:
# Count each (token_id, fold, label) combination once; the shapes before and
# after show how many repeated rows were removed
print(df.shape)
df = df.drop_duplicates()
print(df.shape)

(80136, 3)
(73125, 3)


In [69]:
# Distinct labels, in order of first appearance
labels = df.label.unique().tolist()
print(labels)

['Unknown', 'Masculine', 'Stereotype', 'Occupation', 'Gendered-Pronoun', 'Omission', 'Feminine', 'Generalization', 'Gendered-Role']


In [70]:
# Build one transposed per-fold count table per label, stack them, then remove
# the "fold" rows so that one row per label remains
label_df = pd.DataFrame()
for label in labels:
    label_counts = df.loc[df.label == label].groupby(['fold','label']).size()
    subdf = label_counts.reset_index(name=label).T.drop("label")
    label_df = pd.concat([label_df,subdf])
label_df = label_df.drop("fold")
label_df

Unnamed: 0,0,1,2,3,4
Unknown,4707,5006,5038,4919,4737
Masculine,2184,2356,2728,2195,2276
Stereotype,994,938,952,962,940
Occupation,1270,1278,1433,1435,1039
Gendered-Pronoun,728,689,793,759,762
Omission,2525,2456,2702,2455,2461
Feminine,706,860,765,944,861
Generalization,373,406,424,385,430
Gendered-Role,588,638,728,693,607


In [71]:
# Save the per-fold label totals next to the 5-fold experiment input
label_df.to_csv(config.tokc_path+"experiment_input/label_totals_per_split.csv")

#### Annotations

In [78]:
# Distinct (ann_id, tag) pairs, skipping the rows whose ann_id is 99999
df_by_ann = (
    pd.read_csv(config.tokc_path+"experiment_input/token_5fold.csv", index_col=0)[["ann_id","tag"]]
    .loc[lambda tokens: tokens.ann_id != 99999]
    .drop_duplicates()
)
# One row per annotation, holding the list of its tags
df_by_ann = implodeDataFrame(df_by_ann, ["ann_id",])
tags_col = list(df_by_ann.tag)
# The annotation's label is taken from its first tag, minus the "B-"/"I-" prefix
labels = [tags[0][2:] for tags in tags_col]
df_by_ann.insert(len(df_by_ann.columns), "label", labels)
df_by_ann = (
    df_by_ann.loc[df_by_ann.label != "Nonbinary"]  # mistaken annotation
    .drop(columns=["tag"])
    .sort_values(by="ann_id")
    .reset_index()
    .drop_duplicates()
)
df_by_ann.head()

Unnamed: 0,ann_id,label
0,0,Gendered-Role
1,1,Gendered-Role
2,2,Gendered-Role
3,3,Gendered-Role
4,4,Gendered-Role


In [79]:
# Distinct annotation labels, in order of first appearance
labels = df_by_ann.label.unique().tolist()
print(labels)

['Gendered-Role', 'Feminine', 'Unknown', 'Masculine', 'Gendered-Pronoun', 'Stereotype', 'Omission', 'Occupation', 'Generalization']


In [83]:
# Number of annotations per label (df_by_ann holds one row per ann_id)
ann_df = df_by_ann.groupby(['label']).size().reset_index(name="total")
ann_df

Unnamed: 0,label,total
0,Feminine,1655
1,Gendered-Pronoun,3682
2,Gendered-Role,2785
3,Generalization,1293
4,Masculine,5586
5,Occupation,2958
6,Omission,5597
7,Stereotype,1279
8,Unknown,10511


In [84]:
# Save the annotation totals.
# NOTE(review): ann_df is grouped by label only, so these totals cover all folds
# together even though the file name says "per_split".
ann_df.to_csv(config.tokc_path+"experiment_input/ann_totals_per_split.csv")

### Descriptions

In [72]:
# Description labels: load the description data that carries a "fold" column
df = pd.read_csv(config.tokc_path+"experiment_input/document_5fold.csv", index_col=0)
df.head(2)

Unnamed: 0,description_id,start_offset,end_offset,field,description,subset,label,fold
0,4699,1853,2066,Biographical / Historical,"Labelled Apparently some chapters, amounting t...",train,['Omission'],split3
1,8942,384,540,Biographical / Historical,James Aikman of Perth signed his name to a vol...,train,[''],split2


In [73]:
# Keep only the columns needed to count labels per fold
subdf = df[["description_id","label","fold"]]

In [74]:
def getColumnValuesAsLists(df, col_name):
    """Convert a column of stringified label collections into lists of labels.

    Each cell of ``df[col_name]`` is expected to hold the text form of a
    Python list or set of strings, e.g. ``"['Omission', 'Stereotype']"`` or
    ``"['']"``. Cells are parsed with ``ast.literal_eval`` so that single-
    and double-quoted items are both handled; items of a set are returned
    in sorted order so the result does not depend on the order in which the
    set happened to be written out.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``col_name``; it is not modified.
    col_name : str
        Name of the column holding the stringified collections.

    Returns
    -------
    pandas.DataFrame
        A new frame in which ``col_name`` (at its original position) holds
        one list per row.
    """
    import ast  # local import: keeps this cell independent of the import cell

    def _parse_labels(value):
        """Turn one stringified collection into a list of its items."""
        try:
            parsed = ast.literal_eval(value)
        except (ValueError, SyntaxError):
            # Not a Python literal: fall back to the previous behavior of
            # stripping the two outer characters on each side and splitting
            return value[2:-2].split("', '")
        if isinstance(parsed, (set, frozenset)):
            return sorted(parsed)
        if isinstance(parsed, (list, tuple)):
            return list(parsed)
        return [parsed]

    col_list_values = [_parse_labels(value) for value in df[col_name]]
    col_i = list(df.columns).index(col_name)
    df = df.drop(columns=[col_name])
    df.insert(col_i, col_name, col_list_values)
    return df

In [75]:
# Labels that are counted at description level
desc_labels = ["Omission", "Stereotype"]

In [76]:
# Parse the label strings into lists and show how often each combination occurs
subdf = getColumnValuesAsLists(subdf, "label")
subdf.label.value_counts()

[]                        22747
[Omission]                 2964
[Omission, Stereotype]     1068
[Stereotype]                533
Name: label, dtype: int64

In [8]:
# Give every label its own row, keep the labels of interest, and count each
# (description, label, fold) combination once
subdf = (
    subdf.explode("label")
    .loc[lambda exploded: exploded.label.isin(desc_labels)]
    .drop_duplicates()
)

In [9]:
# Preview the exploded (description, label, fold) rows
subdf.head(2)

Unnamed: 0,description_id,label,fold
0,4699,Omission,split3
3,3474,Omission,split0


In [10]:
# Build one transposed per-fold count table per label, stack them, then remove
# the "fold" rows so that one row per label remains
desc_df = pd.DataFrame()
for label in desc_labels:
    label_counts = subdf.loc[subdf.label == label].groupby(['fold','label']).size()
    label_df = label_counts.reset_index(name=label).T.drop("label")
    desc_df = pd.concat([desc_df,label_df])
desc_df = desc_df.drop("fold")
desc_df

Unnamed: 0,0,1,2,3,4
Omission,798,749,834,813,838
Stereotype,341,290,325,302,343


In [125]:
# Save the per-fold description label totals next to the 5-fold experiment input
desc_df.to_csv(config.tokc_path+"experiment_input/doc_label_totals_by_split.csv")