# Analysis: Label and Tag Distributions

Calculate the total number of labels and B-/I- tags across the data splits for the experiments (split 60/20/20 for training, validation, and test) and cascades (split 20/20/20/20/20 for 5-fold cross-validation).

In [11]:
import config
import pandas as pd

In [12]:
def getTagTotals(df, split):
    subdf = df.loc[df.tag != "O"]
    subdf = subdf.loc[subdf.tag != "B-Nonbinary"]
    subdf = subdf.loc[subdf.tag != "I-Nonbinary"]
    subdf = subdf.loc[subdf.tag != "B-Omission"]
    subdf = subdf.loc[subdf.tag != "I-Omission"]
    subdf = subdf.loc[subdf.tag != "B-Stereotype"]
    subdf = subdf.loc[subdf.tag != "I-Stereotype"]
    # Reference: https://towardsdatascience.com/named-entity-recognition-and-classification-with-scikit-learn-f05372f07ba2
    df_tag_totals = subdf.groupby('tag').size().reset_index(name="{}_total".format(split))
    df_tag_totals = df_tag_totals.sort_values(by="{}_total".format(split))
    df_tag_totals = df_tag_totals.reset_index()
    df_tag_totals.drop("index", axis=1)
    tag_totals = df_tag_totals.tag.str.split("-", n=1, expand=True)
    df_tag_totals.insert(2, "label", tag_totals[[1]])
    df_tag_totals = df_tag_totals.drop(columns=["index"])
    df_tag_totals = df_tag_totals.sort_values(by=["label","tag"])
    return df_tag_totals

In [13]:
# def getDescTotals(df, split):
#     subdf = df[["description_id", "tag"]].loc[df.tag.isin(["B-Omission","I-Omission","B-Stereotype","I-Stereotype"])]
#     subdf = subdf.drop_duplicates()
#     subdf[["tag"]] = subdf[["tag"]].replace(["B-Omission","I-Omission"],"Omission")
#     subdf[["tag"]] = subdf[["tag"]].replace(["B-Stereotype","I-Stereotype"],"Stereotype")
#     subdf = df[["description_id", "tag"]].loc[df.tag.isin(["Omission","I-Omission","B-Stereotype","I-Stereotype"])]
#     subdf = subdf.drop_duplicates()
#     subdf = subdf.rename(columns={"tag":"label"})
#     df_totals = subdf.groupby('label').size().reset_index(name="{}_total".format(split))
#     df_totals = df_totals.sort_values(by=["label"])
#     return df_totals

## Experiments (60/20/20)

### Tokens

In [15]:
# Token labels
dir_path = config.tokc_path + "model_input/"
train_file = dir_path+"token_train.csv"
validate_file = dir_path+"token_validate.csv"
test_file = dir_path+"token_test.csv"

In [16]:
df_train = pd.read_csv(train_file, index_col=0)
df_validate = pd.read_csv(validate_file, index_col=0)
df_test = pd.read_csv(test_file, index_col=0)

In [17]:
print(df_train.shape)
print(df_validate.shape)
print(df_test.shape)

(467564, 10)
(157740, 10)
(153966, 10)


In [6]:
train_totals = getTagTotals(df_train, "train")
train_labels = train_totals.groupby("label").sum().reset_index()
dev_totals = getTagTotals(df_validate, "dev")
dev_labels = dev_totals.groupby("label").sum().reset_index()
test_totals = getTagTotals(df_test, "test")
test_labels = test_totals.groupby("label").sum().reset_index()

In [9]:
train_totals.insert(len(train_totals.columns),"dev_total",dev_totals.dev_total)
train_totals.insert(len(train_totals.columns),"test_total",test_totals.test_total)
train_totals = train_totals.drop(columns=["label"])
train_totals

Unnamed: 0,tag,train_total,dev_total,test_total
4,B-Feminine,840,323,298
7,I-Feminine,1827,744,565
9,B-Gendered-Pronoun,2186,846,748
0,I-Gendered-Pronoun,24,16,14
5,B-Gendered-Role,1674,590,474
1,I-Gendered-Role,361,134,116
3,B-Generalization,780,245,258
2,I-Generalization,449,146,183
10,B-Masculine,3390,1024,1096
11,I-Masculine,4693,1378,1366


In [10]:
train_labels.insert(len(train_labels.columns),"dev_total",dev_labels.dev_total)
train_labels.insert(len(train_labels.columns),"test_total",test_labels.test_total)
train_labels

Unnamed: 0,label,train_total,dev_total,test_total
0,Feminine,2667,1169,994
1,Gendered-Pronoun,2210,760,762
2,Gendered-Role,2035,724,633
3,Generalization,1229,391,441
4,Masculine,8083,2402,2462
5,Occupation,3983,1436,1039
6,Unknown,16443,5566,5259


Write the totals data to files:

In [11]:
train_totals.to_csv(config.tokc_path+"model_input/token_tag_totals_by_split.csv")
train_labels.to_csv(config.tokc_path+"model_input/token_label_totals_by_split.csv")

### Descriptions

In [14]:
# Description labels
dir_path = config.docc_path+"model_input/so_model_input/splits_as_csv/"
train_file = dir_path+"aggregated_final_train.csv"
validate_file = dir_path+"aggregated_final_validate.csv"
test_file = dir_path+"aggregated_final_test.csv"

In [16]:
df_train = pd.read_csv(train_file, index_col=0)
df_validate = pd.read_csv(validate_file, index_col=0)
df_test = pd.read_csv(test_file, index_col=0)

In [17]:
print(df_train.shape)
print(df_validate.shape)
print(df_test.shape)

(16541, 7)
(5514, 7)
(5515, 7)


In [23]:
df = pd.concat([df_train,df_validate,df_test])
df = df.loc[~df.label.isna()]
df.head()

Unnamed: 0,description_id,start_offset,end_offset,field,description,subset,label
4699,4699,1853,2066,Biographical / Historical,"Labelled Apparently some chapters, amounting t...",train,{'Omission'}
3474,3474,3608,8549,Biographical / Historical,Margaret Winifred Bartholomew was born on 21 A...,train,"{'Omission', 'Stereotype'}"
4769,4769,2378,2576,Biographical / Historical,Blacker and Thomson became close friends throu...,train,{'Omission'}
3027,3027,627,1162,Biographical / Historical,Thomas Young was probably born in 1725. By the...,train,{'Stereotype'}
3397,3397,8095,8334,Biographical / Historical,Andrew Tait worked on Paramecium in Beale's la...,train,{'Omission'}


In [24]:
subdf = df[["description_id","label","subset"]]

In [25]:
def getColumnValuesAsLists(df, col_name):
    col_values = list(df[col_name])
    col_list_values = [value[2:-2].split("', '") for value in col_values]
    col_i = list(df.columns).index(col_name)
    df = df.drop(columns=[col_name])
    df.insert(col_i, col_name, col_list_values)
    return df

In [26]:
desc_labels = ["Omission", "Stereotype"]

In [27]:
subdf = getColumnValuesAsLists(subdf, "label")
subdf = subdf.explode("label")
subdf = subdf.loc[subdf.label.isin(desc_labels)]
subdf = subdf.drop_duplicates()

In [28]:
subdf.head(2)

Unnamed: 0,description_id,label,subset
1,1,Stereotype,train
3,3,Stereotype,train


In [30]:
desc_df = pd.DataFrame()
for label in desc_labels:
    label_df = subdf.loc[subdf.label == label]
    label_df = label_df.groupby(['subset','label']).size().reset_index(name=label).T.drop("label")
    desc_df = pd.concat([desc_df,label_df])
# desc_df = desc_df.drop("subset")
desc_df

Unnamed: 0,0,1,2
subset,dev,test,train
Omission,804,828,2400
subset,dev,test,train
Stereotype,315,329,957


Write the data:

In [31]:
desc_df.to_csv(dir_path+"desc_label_totals_by_split.csv")

## Cascades (20/20/20/20/20)

### Tokens

In [154]:
# Token labels
df = pd.read_csv(config.tokc_path+"experiment_input/token_5fold.csv", index_col=0)

In [155]:
df.head(2)

Unnamed: 0,description_id,sentence_id,ann_id,token_id,token,token_offsets,pos,tag,field,fold
0,0,0,99999,0,Identifier,"(0, 10)",NN,O,Identifier,split4
1,0,0,99999,1,:,"(10, 11)",:,O,Identifier,split4


In [156]:
tags = ['B-Feminine','I-Feminine', 'B-Masculine', 
        'I-Masculine', 'B-Unknown', 'I-Unknown', 
        "B-Occupation","I-Occupation",'B-Omission', 
        'I-Omission', 'B-Stereotype', 'I-Stereotype',
        'B-Generalization', 'I-Generalization','B-Gendered-Pronoun',
        'I-Gendered-Pronoun', 'B-Gendered-Role', 'I-Gendered-Role', 
       ]

In [157]:
tag_df = pd.DataFrame()
for tag in tags:
    subdf = df.loc[df.tag == tag]
    subdf = subdf.groupby(['fold','tag']).size().reset_index(name=tag).T.drop("tag")
    tag_df = pd.concat([tag_df,subdf])
# tag_df

In [158]:
tag_df = tag_df.drop("fold")
tag_df

Unnamed: 0,0,1,2,3,4
B-Feminine,264,302,274,323,298
I-Feminine,582,661,584,846,696
B-Masculine,1037,1098,1255,1024,1096
I-Masculine,1371,1512,1810,1378,1366
B-Unknown,1996,2143,2094,2060,2024
I-Unknown,3179,3510,3521,3506,3235
B-Occupation,587,590,650,655,474
I-Occupation,683,688,785,781,565
B-Omission,1082,1091,1157,1082,1093
I-Omission,1783,1631,1884,1728,1674


In [83]:
tag_df.to_csv(config.tokc_path+"experiment_input/tag_totals_per_split.csv")

In [167]:
tag_df = tag_df.reset_index()
label_df = tag_df.rename(columns={"index":"tag"})
label_df = label_df.tag.str.split("-", n=1, expand=True)
tag_df.insert(len(tag_df.columns), "label", label_df[[1]])
tag_df = tag_df.drop(columns=["index"])

In [169]:
label_totals = tag_df.groupby("label").sum().reset_index()

In [170]:
label_totals

Unnamed: 0,label,0,1,2,3,4
0,Feminine,846,963,858,1169,994
1,Gendered-Pronoun,728,689,793,760,762
2,Gendered-Role,608,664,763,724,633
3,Generalization,385,410,434,391,441
4,Masculine,2408,2610,3065,2402,2462
5,Occupation,1270,1278,1435,1436,1039
6,Omission,2865,2722,3041,2810,2767
7,Stereotype,1056,1015,1045,1126,1001
8,Unknown,5175,5653,5615,5566,5259


In [171]:
label_totals.to_csv(config.tokc_path+"experiment_input/label_totals_per_split.csv")

### Descriptions

In [4]:
# Description labels
df = pd.read_csv(config.tokc_path+"experiment_input/document_5fold.csv", index_col=0)
df.head(2)

Unnamed: 0,description_id,start_offset,end_offset,field,description,subset,label,fold
0,4699,1853,2066,Biographical / Historical,"Labelled Apparently some chapters, amounting t...",train,['Omission'],split3
1,8942,384,540,Biographical / Historical,James Aikman of Perth signed his name to a vol...,train,[''],split2


In [5]:
subdf = df[["description_id","label","fold"]]

In [6]:
def getColumnValuesAsLists(df, col_name):
    col_values = list(df[col_name])
    col_list_values = [value[2:-2].split("', '") for value in col_values]
    col_i = list(df.columns).index(col_name)
    df = df.drop(columns=[col_name])
    df.insert(col_i, col_name, col_list_values)
    return df

In [7]:
desc_labels = ["Omission", "Stereotype"]

In [8]:
subdf = getColumnValuesAsLists(subdf, "label")
subdf = subdf.explode("label")
subdf = subdf.loc[subdf.label.isin(desc_labels)]
subdf = subdf.drop_duplicates()

In [9]:
subdf.head(2)

Unnamed: 0,description_id,label,fold
0,4699,Omission,split3
3,3474,Omission,split0


In [10]:
desc_df = pd.DataFrame()
for label in desc_labels:
    label_df = subdf.loc[subdf.label == label]
    label_df = label_df.groupby(['fold','label']).size().reset_index(name=label).T.drop("label")
    desc_df = pd.concat([desc_df,label_df])
desc_df = desc_df.drop("fold")
desc_df

Unnamed: 0,0,1,2,3,4
Omission,798,749,834,813,838
Stereotype,341,290,325,302,343


In [125]:
desc_df.to_csv(config.tokc_path+"experiment_input/doc_label_totals_by_split.csv")