# Analysis: Token BIO Tags

## Post Annotation and Aggregation

Determine which description each annotated text span occurs in and then determine which tokens are in an annotated text span.

***

**Table of Contents**

[0](#0). Load libraries

[1](#1). Load and Transform Data

[2](#2). Assign BIO Tags

***

### 0. Load libraries:

In [1]:
import utils  # import custom functions
import config # import directory path variables

from pathlib import Path

import pandas as pd
import numpy as np
import string, csv, re, os, sys

<a id="1"></a>
### 1. Load and Transform Data

**Load description and annotation data and transform the datasets to more easily associate description IDs to annotation IDs.**

In [3]:
df_tokens = pd.read_csv(config.tokc_path+"tokens_sents_descs.csv", index_col=0)
df_tokens.head()

Unnamed: 0,sentence_id,token_id,token,token_offsets,description_id
0,0,0,Identifier,"(0, 10)",0
0,0,1,:,"(10, 11)",0
0,0,2,AA5,"(12, 15)",0
1,1,3,Title,"(0, 5)",1
1,1,4,:,"(5, 6)",1


In [5]:
assert df_tokens.loc[df_tokens.token.isna()].shape[0] == 0

Transform the offsets column's string values to tuples of ints.

In [7]:
token_desc_ids = list(df_tokens.description_id)
tokens = list(df_tokens.token)
token_offsets = list(df_tokens.token_offsets)
token_offsets_clean = [offsets[1:-1].split(", ") for offsets in token_offsets]
token_offsets_tuples = [tuple((int(offsets[0]), int(offsets[1]))) for offsets in token_offsets_clean]
df_tokens = df_tokens.drop(columns=["token_offsets"])
df_tokens.insert(len(df_tokens.columns), "token_offsets", token_offsets_tuples)
df_tokens.tail()

Unnamed: 0,sentence_id,token_id,token,description_id,token_offsets
42029,27907,455465,cases,27907,"(557, 562)"
42029,27907,455466,involving,27907,"(563, 572)"
42029,27907,455467,homosexual,27907,"(573, 583)"
42029,27907,455468,offences,27907,"(584, 592)"
42029,27907,455469,.,27907,"(592, 593)"


Associate description tokens and annotated text spans' text and offsets to description IDs.

In [8]:
df_tokens_imploded = utils.implodeDataFrame(df_tokens, ["description_id"])
df_tokens_imploded.head()

Unnamed: 0_level_0,sentence_id,token_id,token,token_offsets
description_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,"[0, 0, 0]","[0, 1, 2]","[Identifier, :, AA5]","[(0, 10), (10, 11), (12, 15)]"
1,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]","[3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]","[Title, :, Papers, of, The, Very, Rev, Prof, J...","[(0, 5), (5, 6), (7, 13), (14, 16), (17, 20), ..."
2,"[2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ...","[16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 2...","[Scope, and, Contents, :, Sermons, and, addres...","[(0, 5), (6, 9), (10, 18), (18, 19), (20, 27),..."
3,"[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, ...","[109, 110, 111, 112, 113, 114, 115, 116, 117, ...","[The, full, text, of, this, sermon, was, publi...","[(943, 946), (947, 951), (952, 956), (957, 959..."
4,"[4, 4, 4]","[141, 142, 143]","[Identifier, :, AA6]","[(0, 10), (10, 11), (12, 15)]"


Load the data description data.

In [11]:
df_descs = pd.read_csv(config.crc_meta_path+"annot_descs.csv", index_col=0)
# Remove columns not needed for linking
df_descs = df_descs.drop(columns=["clean_desc", "word_count", "sent_count"])
# # Ignore rows for Identifer fields (the text of this field wasn't annotated)
# df_descs = df_descs.loc[df_descs.field != "Identifier"]

# Remove file extensions
desc_filenames = list(df_descs.file)
desc_filenames = [f[:-4] for f in desc_filenames]
df_descs.insert(1, "filename", desc_filenames)

# Make sure offsets are in one column as tuples of ints
start_offsets = list(df_descs.start_offset)
end_offsets = list(df_descs.end_offset)
offsets_strs = list(zip(list(df_descs.start_offset),list(df_descs.end_offset)))
offsets_int_tuples = utils.turnStrTuplesToIntTuples(offsets_strs)
df_descs = df_descs.drop(columns=["start_offset", "end_offset"])
df_descs.insert(3, "desc_offsets", offsets_int_tuples)

# # Remove rows with a NaN clean description (their description is in another row under the next file)
# df_descs = df_descs.loc[~df_descs.clean_desc.isna()]

df_descs.head()

Unnamed: 0,description_id,filename,description,desc_offsets,file,field
0,0,AA5_00100,Identifier: AA5,"(0, 16)",AA5_00100.txt,Identifier
1,1,AA5_00100,Title:\nPapers of The Very Rev Prof James Whyt...,"(17, 76)",AA5_00100.txt,Title
2,2,AA5_00100,"Scope and Contents:\nSermons and addresses, 19...","(77, 633)",AA5_00100.txt,Scope and Contents
3,3,AA5_00100,Biographical / Historical:\nProfessor James Ai...,"(634, 1725)",AA5_00100.txt,Biographical / Historical
4,4,AA6_00100,Identifier: AA6,"(0, 16)",AA6_00100.txt,Identifier


In [13]:
assert df_descs.loc[df_descs.description.isna()].shape[0] == 0
assert df_descs.shape[0] == df_tokens_imploded.shape[0]

Associate the imploded token data to the description data (using the `description_id` columns).

In [16]:
df_descs = df_descs.set_index("description_id")
descs_to_tokens = df_descs.join(df_tokens_imploded, on="description_id", how="left")
print(descs_to_tokens.shape)
descs_to_tokens = descs_to_tokens.drop(columns=["file"])
descs_to_tokens.head()

(27908, 9)


Unnamed: 0_level_0,filename,description,desc_offsets,field,sentence_id,token_id,token,token_offsets
description_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,AA5_00100,Identifier: AA5,"(0, 16)",Identifier,"[0, 0, 0]","[0, 1, 2]","[Identifier, :, AA5]","[(0, 10), (10, 11), (12, 15)]"
1,AA5_00100,Title:\nPapers of The Very Rev Prof James Whyt...,"(17, 76)",Title,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]","[3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]","[Title, :, Papers, of, The, Very, Rev, Prof, J...","[(0, 5), (5, 6), (7, 13), (14, 16), (17, 20), ..."
2,AA5_00100,"Scope and Contents:\nSermons and addresses, 19...","(77, 633)",Scope and Contents,"[2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ...","[16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 2...","[Scope, and, Contents, :, Sermons, and, addres...","[(0, 5), (6, 9), (10, 18), (18, 19), (20, 27),..."
3,AA5_00100,Biographical / Historical:\nProfessor James Ai...,"(634, 1725)",Biographical / Historical,"[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, ...","[109, 110, 111, 112, 113, 114, 115, 116, 117, ...","[The, full, text, of, this, sermon, was, publi...","[(943, 946), (947, 951), (952, 956), (957, 959..."
4,AA6_00100,Identifier: AA6,"(0, 16)",Identifier,"[4, 4, 4]","[141, 142, 143]","[Identifier, :, AA6]","[(0, 10), (10, 11), (12, 15)]"


Load the annotation data and associate it to the description-token data joined above.

In [21]:
df_anns = pd.read_csv(config.agg_path+"aggregated_final.csv")
# Remove unnecessary columns
df_anns = df_anns.drop(columns=["category", "associated_genders"])

# Remove file extensions
desc_filenames = list(df_anns.file)
desc_filenames = [f[:-4] for f in desc_filenames]
df_anns.insert(1, "filename", desc_filenames)
df_anns = df_anns.drop(columns=["file"])

# Make sure offsets are in one column as tuples of ints
offsets_strs = list(df_anns.ann_offsets)
offsets_int_tuples = utils.turnStrTuplesToIntTuples(offsets_strs)
df_anns = df_anns.drop(columns=["ann_offsets"])
df_anns.insert(3, "ann_offsets", offsets_int_tuples)

df_anns.head()

Unnamed: 0,agg_ann_id,filename,text,ann_offsets,label,description_id
0,0,Coll-1157_00100,knighted,"(1407, 1415)",Gendered-Role,2364
1,1,Coll-1310_02300,knighthood,"(9625, 9635)",Gendered-Role,4542
2,2,Coll-1281_00100,Prince Regent,"(2426, 2439)",Gendered-Role,3660
3,3,Coll-1310_02700,knighthood,"(9993, 10003)",Gendered-Role,4678
4,4,Coll-1310_02900,Sir,"(7192, 7195)",Gendered-Role,4732


In [23]:
ann_file_col = (df_anns_imploded.filename)
new_col = []
for file_list in ann_file_col:
    assert len(set(file_list)) == 1, "File lists should only have one unique value"
    new_col += [file_list[0]]
df_anns = df_anns.drop(columns=["filename"])
df_anns_imploded = utils.implodeDataFrame(df_anns, ["description_id"])
df_anns_imploded.insert(1, "filename", new_col)
print(df_anns_imploded.shape)
df_anns_imploded.head()

(14779, 5)


Unnamed: 0_level_0,agg_ann_id,filename,text,ann_offsets,label
description_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,"[14384, 24275, 26233, 52952]",AA5_00100,"[The Very Rev Prof James Whyte, The Very Rev P...","[(34, 63), (34, 63), (43, 63), (34, 63)]","[Unknown, Masculine, Unknown, Stereotype]"
3,"[14377, 14378, 14379, 14380, 14381, 14382, 143...",AA5_00100,"[He, he, his, he, he, His, he, Professor James...","[(789, 791), (871, 873), (913, 916), (928, 930...","[Gendered-Pronoun, Gendered-Pronoun, Gendered-..."
5,"[9531, 23084]",AA6_00100,"[Rev Tom Allan, Rev Tom Allan]","[(34, 47), (34, 47)]","[Unknown, Masculine]"
7,"[55, 9516, 9517, 9518, 9519, 9520, 9521, 9522,...",AA6_00100,"[Billy Graham, He, he, he, he, He, his, his, h...","[(1778, 1790), (677, 679), (920, 922), (1222, ...","[Masculine, Gendered-Pronoun, Gendered-Pronoun..."
9,"[14000, 24207]",AA7_00100,"[Rev Prof Alec Campbell Cheyne, Rev Prof Alec ...","[(34, 63), (34, 63)]","[Unknown, Masculine]"


In [48]:
# Join the data, keeping only the rows with annotation data (right join)
sub_descs_to_tokens = descs_to_tokens[["sentence_id", "token_id", "token_offsets"]]
descs_anns_tokens = sub_descs_to_tokens.join(df_anns_imploded, on=["description_id"], how="outer")
print(descs_anns_tokens.shape)
descs_anns_tokens.head()

(27908, 8)


Unnamed: 0_level_0,sentence_id,token_id,token_offsets,agg_ann_id,filename,text,ann_offsets,label
description_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,"[0, 0, 0]","[0, 1, 2]","[(0, 10), (10, 11), (12, 15)]",,,,,
1,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]","[3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]","[(0, 5), (5, 6), (7, 13), (14, 16), (17, 20), ...","[14384, 24275, 26233, 52952]",AA5_00100,"[The Very Rev Prof James Whyte, The Very Rev P...","[(34, 63), (34, 63), (43, 63), (34, 63)]","[Unknown, Masculine, Unknown, Stereotype]"
2,"[2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ...","[16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 2...","[(0, 5), (6, 9), (10, 18), (18, 19), (20, 27),...",,,,,
3,"[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, ...","[109, 110, 111, 112, 113, 114, 115, 116, 117, ...","[(943, 946), (947, 951), (952, 956), (957, 959...","[14377, 14378, 14379, 14380, 14381, 14382, 143...",AA5_00100,"[He, he, his, he, he, His, he, Professor James...","[(789, 791), (871, 873), (913, 916), (928, 930...","[Gendered-Pronoun, Gendered-Pronoun, Gendered-..."
4,"[4, 4, 4]","[141, 142, 143]","[(0, 10), (10, 11), (12, 15)]",,,,,


Write the data to a file, replacing `NaN` values with empty strings:

In [49]:
descs_anns_tokens = descs_anns_tokens.fillna("")
descs_anns_tokens.to_csv(config.agg_path+"descs_sents_tokens_anns.csv")

In [93]:
descs_anns_tokens.loc[descs_anns_tokens.index == 14]

Unnamed: 0_level_0,sentence_id,token_id,token_offsets,agg_ann_id,filename,text,ann_offsets,label
description_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
14,"[14, 14, 14, 14, 14, 14]","[268, 269, 270, 271, 272, 273]","[(0, 5), (5, 6), (7, 13), (14, 16), (17, 21), ...",[17476],BAI_00100,[John Baillie],"[(102, 114)]",[Unknown]


In [101]:
list(descs_anns_tokens.loc[descs_anns_tokens.index == 14].token_offsets)[0]

[(0, 5), (5, 6), (7, 13), (14, 16), (17, 21), (22, 29)]

In [104]:
df_descs.loc[df_descs.index == 14]

Unnamed: 0_level_0,filename,description,desc_offsets,file,field
description_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
14,BAI_00100,Title:\nPapers of John Baillie,"(85, 115)",BAI_00100.txt,Title


# The token offsets must be wrong!  Or the sentence offsets...?

<a id="2"></a>
### 3. Assign BIO Tags

**Compare the descriptions' tokens' offsets to the annotated text spans' offsets to determine which tokens to mark as the beginning of an annotation (`B-[LABELNAME]`), inside an annotation (`I-[LABELNAME]`), and unannotated, or outisde of an annotation (`O`).**

In [50]:
# Remove columns without IDs and offsets
subdf = descs_anns_tokens.drop(columns=["text", "filename", "label"])
print(subdf.shape)

(27908, 5)


#### 3.1 Review Tokens in Annotated Descriptions

For description IDs that do have annotations, assign their tokens in annotated text spans tags of `B` and `I` for *beginning* and *inside* of an annotation, and assign tokens outside of annotated text spans a tag of `O`.

In [53]:
# Get only the descriptions with annotations
subdf_withann = subdf.loc[subdf.agg_ann_id != ""]
print(subdf_withann.shape)
# Create a dictionary of the remaining offsets and ID data
withann_dict = subdf_withann.to_dict(orient="index")
print(withann_dict[1])

(14779, 5)
{'sentence_id': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'token_id': [3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15], 'token_offsets': [(0, 5), (5, 6), (7, 13), (14, 16), (17, 20), (21, 25), (26, 29), (30, 34), (35, 40), (41, 46), (47, 48), (48, 57), (57, 58)], 'agg_ann_id': [14384, 24275, 26233, 52952], 'ann_offsets': [(34, 63), (34, 63), (43, 63), (34, 63)]}


In [54]:
desc_ids = list(withann_dict.keys()) #[:10]
# One token can be in multiple annotations, so give each token a list of tag values
desc_to_anntokentags = dict.fromkeys(desc_ids, dict())

# THIS CODE IS BUGGY - DESC ID 14 SHOULD HAVE DATA

In [95]:
for did in desc_ids[:15]:
    # Get the description's token data
    token_ids = withann_dict[did]["token_id"]
    token_offsets = withann_dict[did]["token_offsets"]
    # Get the description's annotation data
    ann_ids = withann_dict[did]["agg_ann_id"]
    ann_offsets = withann_dict[did]["ann_offsets"]
    
    # Determine which tokens begin or are inside of annotated text spans
    tagged_token_ids, tagged_ann_ids, tags = [],[],[]
    for i in range(len(token_ids)):
        token_id = token_ids[i]
        token_offset_pair = token_offsets[i]
        for j in range(len(ann_offsets)):
            ann_offset_pair = ann_offsets[j]
            # If the token's start offset equals the annotation's start offset, give it a B
            if (token_offset_pair[0] == ann_offset_pair[0]):
                tagged_token_ids += [token_id]
                tagged_ann_ids += [ann_ids[j]]
                tags += ["B"]
            # If the token's start offset is in between the annotation's offsets, give it an I
            elif (token_offset_pair[0] > ann_offset_pair[0]) and (token_offset_pair[0] <= ann_offset_pair[1]):
                tagged_token_ids += [token_id]
                tagged_ann_ids += [ann_ids[j]]
                tags += ["I"]

    desc_to_anntokentags[did] = {"token_ids":tagged_token_ids, "ann_ids":tagged_ann_ids,"tags":tags}
    
# print(desc_to_anntokentags[1])

In [100]:
print(desc_to_anntokentags[13])
print(desc_to_anntokentags[14])
print(withann_dict[14])

{'token_ids': [263, 264, 265, 266], 'ann_ids': [17475, 17475, 17475, 17475], 'tags': ['I', 'I', 'I', 'I']}
{'token_ids': [], 'ann_ids': [], 'tags': []}
{'sentence_id': [14, 14, 14, 14, 14, 14], 'token_id': [268, 269, 270, 271, 272, 273], 'token_offsets': [(0, 5), (5, 6), (7, 13), (14, 16), (17, 21), (22, 29)], 'agg_ann_id': [17476], 'ann_offsets': [(102, 114)]}


In [79]:
df = pd.DataFrame.from_dict(desc_to_anntokentags, orient="index").reset_index()
df = df.rename(columns={"index":"description_id"})
df.head()

Unnamed: 0,description_id,token_ids,ann_ids,tags
0,1,"[11, 11, 11, 12, 12, 12, 13, 13, 13, 13, 14, 1...","[14384, 24275, 52952, 14384, 24275, 52952, 143...","[I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, ..."
1,3,"[112, 113, 114, 127, 128, 129, 112, 113, 114, ...","[41262, 41262, 41262, 14386, 14386, 14386, 412...","[I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, ..."
2,5,"[153, 153]","[9531, 23084]","[I, I]"
3,7,"[165, 166, 167, 165, 166, 167, 165, 166, 167, ...","[40588, 40588, 40588, 40588, 40588, 40588, 405...","[B, I, I, B, I, I, B, I, I, B, I, I, B, I, I, ..."
4,9,"[182, 182, 183, 183, 184, 184, 185, 185]","[14000, 24207, 14000, 24207, 14000, 24207, 140...","[I, I, I, I, I, I, I, I]"


In [80]:
df.shape

(14779, 4)

In [81]:
df_exploded = df.apply(pd.Series.explode)
print(df_exploded.shape)
df_exploded.head()

(38389, 4)


Unnamed: 0,description_id,token_ids,ann_ids,tags
0,1,11,14384,I
0,1,11,24275,I
0,1,11,52952,I
0,1,12,14384,I
0,1,12,24275,I


In [89]:
df_exploded = df_exploded.drop_duplicates()
print(df_exploded.shape)
print(df_exploded.loc[df_exploded.token_id.isna()].shape)

(38389, 4)
(13882, 4)


In [90]:
df_exploded.loc[df_exploded.token_id.isna()].head()

Unnamed: 0,description_id,token_id,ann_id,tag
7,14,,,
8,16,,,
9,17,,,
10,22,,,
11,27,,,


Ignore descriptions without tokens

In [91]:
df_exploded.loc[df_exploded.description_id == 14].head()

Unnamed: 0,description_id,token_id,ann_id,tag
7,14,,,


#### 3.2 Assign O Tags to All Tokens in Unannotated Descriptions

Join the B and I tag data to the entire token dataset and assign all tokens without tags an O.

In [64]:
# Get the descriptions without any annotations
subdf_withoutann = subdf.loc[subdf.agg_ann_id == ""]
print(subdf_withoutann.shape)
# Create a dictionary of the remaining offsets and ID data
withoutann_dict = subdf_withoutann.to_dict(orient="index")
print(withoutann_dict[0])

(13129, 5)
{'sentence_id': [0, 0, 0], 'token_id': [0, 1, 2], 'token_offsets': [(0, 10), (10, 11), (12, 15)], 'agg_ann_id': '', 'ann_offsets': ''}


In [74]:
remaining_tokens = pd.DataFrame.from_dict(withoutann_dict, orient="index")
remaining_tokens = remaining_tokens.reset_index()
remaining_tokens = remaining_tokens.rename(columns={"index":"description_id", "agg_ann_id":"ann_id"})
remaining_tokens = remaining_tokens.drop(columns=["sentence_id","token_offsets", "ann_offsets"])
remaining_tokens_exploded = remaining_tokens.apply(pd.Series.explode)
tags = ["O"]*(remaining_tokens_exploded.shape[0])
remaining_tokens_exploded.insert(3, "tag", tags)
remaining_tokens_exploded.head()

Unnamed: 0,description_id,token_id,ann_id,tag
0,0,0,,O
0,0,1,,O
0,0,2,,O
1,2,16,,O
1,2,17,,O


#### 3.3 Add Label Names to B and I Tags

Join the annotation data to the token data get the label associated with each B and I tag.

In [82]:
# df_anns.head()
subdf_anns = df_anns[["agg_ann_id","text","description_id","label"]]
subdf_anns = subdf_anns.rename(columns={"agg_ann_id":"ann_id"})
subdf_anns.head()

Unnamed: 0,ann_id,text,description_id,label
0,0,knighted,2364,Gendered-Role
1,1,knighthood,4542,Gendered-Role
2,2,Prince Regent,3660,Gendered-Role
3,3,knighthood,4678,Gendered-Role
4,4,Sir,4732,Gendered-Role


In [86]:
df_exploded = df_exploded.rename(columns={"token_ids":"token_id", "ann_ids":"ann_id", "tags":"tag"})
df_exploded.set_index(["ann_id", "description_id"])
all_tokens_labeled = df_exploded.join(subdf_anns.set_index(["ann_id", "description_id"]), on=["ann_id","description_id"], how="left")
all_tokens_labeled.head()

Unnamed: 0,description_id,token_id,ann_id,tag,text,label
0,1,11,14384,I,The Very Rev Prof James Whyte,Unknown
0,1,11,24275,I,The Very Rev Prof James Whyte,Masculine
0,1,11,52952,I,The Very Rev Prof James Whyte,Stereotype
0,1,12,14384,I,The Very Rev Prof James Whyte,Unknown
0,1,12,24275,I,The Very Rev Prof James Whyte,Masculine


In [87]:
print(all_tokens_labeled.loc[all_tokens_labeled.token_id.isna()].shape)

(13882, 6)


In [88]:
df_exploded.loc[df_exploded.token_id.isna()].shape

(13882, 4)

In [162]:
o_tags = all_tokens_labeled.loc[all_tokens_labeled.tag == "O"]
o_tags = o_tags.drop(columns=["label"])
o_tags.head()

Unnamed: 0,description_id,token_id,ann_id,tag,token,offsets,text
285,639,,,O,,,
14778,639,11466.0,,O,Scope,"(3303, 3308)",
14778,639,11467.0,,O,and,"(3309, 3312)",
14778,639,11468.0,,O,Contents,"(3313, 3321)",
14778,639,11469.0,,O,:,"(3321, 3322)",


In [163]:
bi_tags = all_tokens_labeled.loc[all_tokens_labeled.tag != "O"]
complete_tags = bi_tags["tag"] +"-"+bi_tags["label"]
bi_tags = bi_tags.drop(columns=["tag","label"])
bi_tags.insert(3, "tag", complete_tags)
bi_tags.head()

Unnamed: 0,description_id,token_id,ann_id,tag,token,offsets,text
0,1,7.0,14384.0,B-Unknown,The,"(34, 37)",The Very Rev Prof James Whyte
0,1,8.0,14384.0,I-Unknown,Very,"(38, 42)",The Very Rev Prof James Whyte
0,1,9.0,14384.0,I-Unknown,Rev,"(43, 46)",The Very Rev Prof James Whyte
0,1,10.0,14384.0,I-Unknown,Prof,"(47, 51)",The Very Rev Prof James Whyte
0,1,11.0,14384.0,I-Unknown,James,"(52, 57)",The Very Rev Prof James Whyte


#### 3.3 Combine the Data

In [None]:
df_exploded = df_exploded.rename(columns={"token_ids":"token_id", "ann_ids":"ann_id", "tags":"tag"})
df_exploded.head()

In [77]:
all_tokens = pd.concat([remaining_tokens_exploded,df_exploded])
print(all_tokens.shape)
all_tokens = all_tokens.sort_values(by=["description_id","token_id", "ann_id", "tag"])
all_tokens.head()

(237161, 4)


Unnamed: 0,description_id,token_id,ann_id,tag
0,0,0,,O
0,0,1,,O
0,0,2,,O
0,1,11,14384.0,I
0,1,11,24275.0,I


In [78]:
print(all_tokens.loc[all_tokens.tag.isna()].shape) # assign these tag `O`
# all_tokens[["tag"]] = all_tokens[["tag"]].fillna("O")
# print(all_tokens.loc[all_tokens.tag.isna()].shape)

(13882, 4)
(13882, 4)


In [136]:
print(all_tokens.shape)
all_tokens = all_tokens.drop_duplicates()
print(all_tokens.shape)

(784375, 6)
(784375, 6)


In [164]:
df = pd.concat([bi_tags,o_tags], sort=True)
df.head()

Unnamed: 0,ann_id,description_id,offsets,tag,text,token,token_id
0,14384.0,1,"(34, 37)",B-Unknown,The Very Rev Prof James Whyte,The,7.0
0,14384.0,1,"(38, 42)",I-Unknown,The Very Rev Prof James Whyte,Very,8.0
0,14384.0,1,"(43, 46)",I-Unknown,The Very Rev Prof James Whyte,Rev,9.0
0,14384.0,1,"(47, 51)",I-Unknown,The Very Rev Prof James Whyte,Prof,10.0
0,14384.0,1,"(52, 57)",I-Unknown,The Very Rev Prof James Whyte,James,11.0


In [165]:
df = df.sort_values(by=["description_id","token_id"])
df.head()

Unnamed: 0,ann_id,description_id,offsets,tag,text,token,token_id
14778,,0,"(0, 10)",O,,Identifier,0.0
14778,,0,"(10, 11)",O,,:,1.0
14778,,0,"(12, 15)",O,,AA5,2.0
14778,,1,"(17, 22)",O,,Title,3.0
14778,,1,"(22, 23)",O,,:,4.0


Write the resulting data for token classification:

In [167]:
df.to_csv(config.tokc_path+"tagged_tokens.csv")