In [1]:
import csv
import json
import re
import pandas as pd
from pathlib import Path
from humannotator import Annotator

In [2]:
pd.set_option('display.max_columns', None)

# Load Szeged Uncertainty Corpus
## token-level, multi-class labels
source `merged_data` file: [http://people.rc.rit.edu/~bsm9339/corpora/szeged_uncertainty/](http://people.rc.rit.edu/~bsm9339/corpora/szeged_uncertainty/)

In [3]:
path = Path('../data')

The following cell parses the TSV file line by line and extracts each feature into a dictionary (the key is the feature name, the value is the feature value); the dictionaries are then loaded into a pandas DataFrame so that each feature is in its own column. This procedure is necessary (as opposed to loading the table directly into a pandas DataFrame) because:
- the features are not arranged in a fixed order in the TSV file
- if a feature is not relevant for a specific row, it is not mentioned at all, which results in a varying number of columns per row

In [4]:
# Parse the corpus TSV into {line index: {feature name: feature value}}.
# Columns 0-3 and 5 are read by position; every later column is an item that
# carries its own feature name, so rows may expose different feature sets.
# NOTE(review): column 4 is never read — confirm that this is intentional.

# Compiled once, outside the loop. The raw string keeps `\d` a regex escape
# instead of an invalid Python string escape.
feature_re = re.compile(r'([a-z]+_(?:-*\d|[a-z]+))')

data = dict()
with open(path / 'merged_data.tsv', 'r', encoding='utf8') as f:
    # QUOTE_NONE (numeric value 3): quote characters are treated as plain text.
    tsvreader = csv.reader(f, delimiter='\t', quoting=csv.QUOTE_NONE)
    for idx, line in enumerate(tsvreader):
        if not line:
            continue
        row = dict()
        row['sen_tok_id'] = line[0]
        row['words'] = line[1]
        row['stem'] = line[2]
        row['pos'] = line[3]
        row['labels'] = line[5]
        for colitem in line[6:]:
            # Items prefixed with 'L_' are not turned into features.
            if colitem.startswith('L_'):
                continue
            # split() with one capture group yields [prefix, name, remainder]
            # when the pattern is found, and a single-element list otherwise.
            parts = feature_re.split(colitem, maxsplit=1)
            if len(parts) != 3:
                raise ValueError(
                    f"line {idx}: cannot find a feature name in {colitem!r}"
                )
            _, name, remainder = parts
            # The value is the remainder minus its first character and its
            # last four characters.
            row[name] = remainder[1:-4]
        data[idx] = row

In [5]:
# One row per token. `sentence_id` is the number embedded in `sen_tok_id`;
# `sentence` is the concatenation (no separator) of all words that share a
# sentence_id. The helper columns are dropped afterwards.
szeged = (
    pd.DataFrame.from_dict(data, orient='index')
    .assign(
        sentence_id=lambda frame: (
            frame['sen_tok_id']
            .str.extract(r'sent(\d+)token', expand=False)
            .astype(int)
        ),
        sentence=lambda frame: (
            frame.groupby('sentence_id')['words']
            .transform(lambda toks: toks.str.cat())
        ),
    )
    .drop(columns=['sen_tok_id', 'lemma_0', 'pos_0'])
)

In [8]:
szeged.labels.value_counts()

C    1821704
E       8213
U       5740
I       1930
D       1408
N        831
Name: labels, dtype: int64

# Deduplicate sentences

In [9]:
# For every distinct sentence text keep only the tokens of its first
# occurrence, i.e. the occurrence with the smallest sentence_id.
szeged = (
    szeged
    .assign(
        first_sent_id=lambda frame: (
            frame.groupby('sentence')['sentence_id'].transform('min')
        )
    )
    .loc[lambda frame: frame['sentence_id'] == frame['first_sent_id']]
)

In [10]:
szeged.labels.value_counts()

C    1055973
E       4174
U       3536
D        915
I        881
N        501
Name: labels, dtype: int64

# Fix labels (re-label U class)
source `json` files: [http://people.rc.rit.edu/~bsm9339/corpora/szeged_uncertainty/](http://people.rc.rit.edu/~bsm9339/corpora/szeged_uncertainty/)

## Step 1: Collect all uncertainty cues from the json files

In [11]:
# Collect every uncertainty cue (`ccue`) found in the json annotation files
# into `cues`, one row per cue: its text, its type and its tail.
jpath = path / 'json'

utexts = list()
utypes = list()
utails = list()
cue_dict = dict()

# sorted(): glob() yields files in a filesystem-dependent order; sorting makes
# the row order of `cues` reproducible across machines.
for filename in sorted(jpath.glob('*.json')):
    # Explicit encoding, consistent with how the TSV file is opened, so the
    # result does not depend on the platform's default encoding.
    with open(filename, 'r', encoding='utf8') as f:
        annotations = json.load(f)
    print(f"Processing {filename.stem}")
    for document in annotations['Annotation']['DocumentSet']['Document']:
        for part in document['DocumentPart']:
            # Skip entries that are not dicts or that carry no sentences.
            if not isinstance(part, dict) or part.get('Sentence') is None:
                continue
            sentences = part['Sentence']
            # NOTE(review): a 'Sentence' value that is not a list is skipped,
            # exactly as before — confirm that no cues are lost this way.
            if not isinstance(sentences, list):
                continue
            for sent_dict in sentences:
                ccue = sent_dict.get('ccue')
                if ccue is None:
                    continue
                # A single cue is stored as a dict, several cues as a list.
                ccue = [ccue] if isinstance(ccue, dict) else ccue
                for cue in ccue:
                    utypes.append(cue['@type'])
                    utexts.append(cue['#text'])
                    utails.append(cue['#tail'])

cue_dict['utext'] = utexts
cue_dict['utype'] = utypes
cue_dict['utail'] = utails

cues = pd.DataFrame.from_dict(cue_dict)

Processing bio_bmc
Processing bio_fly
Processing bio_hbc
Processing factbank
Processing wiki


In [12]:
cues.utype.unique()

array(['speculation_modal_probable_', 'speculation_hypo_investigation _',
       'speculation_hypo_doxastic _', 'speculation_hypo_condition _',
       'speculation_modal_possible_'], dtype=object)

In [13]:
cues.shape

(9176, 3)

## Step 2: explode cues with multiple tokens
### so that each token has its own row (like in the szeged df)

In [14]:
# Split each cue text on whitespace, remember how many tokens it had, and
# give every token its own row (the other columns are repeated).
cues = (
    cues
    .assign(list_text=lambda frame: frame['utext'].str.split())
    .assign(len_text=lambda frame: frame['list_text'].apply(len))
    .explode('list_text')
)

cues.shape

(9603, 5)

## Step 3: convert label names

In [15]:
# json cue type -> single-letter label. The type strings are copied verbatim
# from the json data, including the irregular space before the last underscore.
label_map = {
    "speculation_modal_possible_": "E",
    "speculation_modal_probable_": "E",
    "speculation_hypo_doxastic _": "D",
    "speculation_hypo_investigation _": "I",
    "speculation_hypo_condition _": "N",
}

# new_label:   the mapped label
# u:           constant 'U', used later as a merge key
# text_n_tail: cue text immediately followed by its tail
cues = cues.assign(
    new_label=cues['utype'].map(label_map),
    u='U',
    text_n_tail=cues['utext'] + cues['utail'],
)

## Step 4: re-label based on matched sentence text

In [16]:
NONMATCHED = list()
def add_sent_id(row, sent_df):
    """
    Find the sentence in `sent_df` that contains the cue followed by its tail.

    `row.text_n_tail` is stripped of spaces and searched as a literal
    substring (no regex) in `sent_df.sentence`.

    Parameters
    ----------
    row : pandas.Series
        A cue row; only `text_n_tail` is read.
    sent_df : pandas.DataFrame
        Candidate sentences with `sentence` and `sentence_id` columns.

    Returns
    -------
    The `sentence_id` of the matching sentence when exactly one sentence
    matches, otherwise None.

    Side effects
    ------------
    When no sentence or several sentences match, a copy of `row` is appended
    to the module-level list `NONMATCHED`. For several matches the copy also
    gets a `matched_ids` entry listing all candidate sentence ids. The `row`
    passed in is never modified.
    """
    search_str = row.text_n_tail.replace(" ", "")
    crit = sent_df.sentence.str.contains(search_str, regex=False)
    n_hits = crit.sum()
    if n_hits == 1:
        return sent_df.loc[crit].sentence_id.iloc[0]
    # Record on a copy so the caller's row stays untouched.
    record = row.copy()
    if n_hits > 1:
        record['matched_ids'] = sent_df.loc[crit].sentence_id.to_list()
    NONMATCHED.append(record)
    return None

In [17]:
# Candidate sentences: the first 'U'-labeled token row of every distinct
# sentence text.
sent_df = szeged.query("labels == 'U'").drop_duplicates(subset=['sentence'])
# add_sent_id() appends to the global NONMATCHED list; empty it first so that
# re-running this cell does not accumulate duplicate records.
NONMATCHED.clear()
cues['sentence_id'] = cues.apply(lambda row: add_sent_id(row, sent_df), axis=1)
# One row per cue token that matched zero or several sentences.
not_matched = pd.concat(NONMATCHED, axis=1).T

In [18]:
print(
f"""The sentence_id was matched for 
{len(cues.query("sentence_id.notna()"))} out of {len(szeged.query("labels == 'U'"))} U-labeled tokens in szeged df. 
These tokens can now be re-labeled with the correct label from the json.
"""
)

The sentence_id was matched for 
3051 out of 3536 U-labeled tokens in szeged df. 
These tokens can now be re-labeled with the correct label from the json.



In [19]:
# Attach the json-derived label to every 'U' token whose word and sentence_id
# equal those of a cue token; all other rows get NaN in `new_label`.
# NOTE(review): this left merge is not one-to-one. The recorded label totals
# grow from 1,065,980 rows before this cell to 1,066,166 after it, so some
# tokens end up duplicated — presumably because several cue rows share the
# same (list_text, sentence_id) key. De-duplicating the right-hand side (or
# passing `validate=`) would address this, but it changes the row positions
# that the saved manual annotations are later joined on by index, so that
# artifact would have to be re-checked first.
szeged = szeged.merge(
    cues[['list_text', 'new_label', 'u', 'sentence_id']],
    how='left',
    left_on=['words', 'labels', 'sentence_id'],
    right_on=['list_text', 'u', 'sentence_id'],
)

# The right-hand key columns are only needed for the join.
szeged = szeged.drop(['list_text', 'u'], axis=1)

In [20]:
# Tokens without a json-derived label keep their original label.
szeged['new_label'] = szeged['new_label'].fillna(szeged['labels'])

In [21]:
szeged.new_label.value_counts()

C    1055973
E       6029
D       1417
I       1263
N        860
U        624
Name: new_label, dtype: int64

## Step 5: fix non-matched

### (a) check whether all appearances of a cue in the corpus are assigned the same label; if so, it can be safely re-labeled

In [22]:
def ambig_check(row, cues_df):
    """
    Return the distinct `new_label` values that `cues_df` assigns to the cue
    text `row.utext`.

    The comparison uses a boolean mask rather than a formatted `query` string,
    so cue texts containing quotes or backslashes are handled correctly.

    Parameters
    ----------
    row : pandas.Series
        Row with a `utext` entry.
    cues_df : pandas.DataFrame
        Frame with `utext` and `new_label` columns.

    Returns
    -------
    numpy.ndarray
        Unique labels in order of first appearance; empty when the cue text
        does not occur in `cues_df`.
    """
    return cues_df.loc[cues_df['utext'] == row.utext, 'new_label'].unique()

# For every non-matched cue token, list all labels its cue text receives
# anywhere in `cues`.
not_matched['possible_labels'] = not_matched.apply(ambig_check, axis=1, args=(cues,))

In [23]:
print(f"Unique non-matched cues: {not_matched.utext.nunique()}")
print(f"""Always have the same label: {not_matched.assign(n_p_labels=lambda df: df.possible_labels.apply(len)).query("n_p_labels==1").utext.nunique()}
""")

Unique non-matched cues: 286
Always have the same label: 267



In [24]:
# Non-matched cue tokens whose cue text has exactly one possible label,
# reduced to the columns needed for re-labeling and de-duplicated.
unambig = (
    not_matched
    .loc[
        lambda frame: frame['possible_labels'].apply(len) == 1,
        ['utext', 'list_text', 'new_label', 'u'],
    ]
    .drop_duplicates(ignore_index=True)
)

In [25]:
# Re-label the remaining 'U' tokens whose word equals a token of an
# unambiguous cue. The join uses the word and the 'U' marker only — there is
# no sentence constraint. Both frames carry a `new_label` column, so the
# result has `new_label_x` (from szeged) and `new_label_y` (from unambig).
# NOTE(review): this left merge is not one-to-one either. The recorded label
# totals grow from 1,066,166 rows before this cell to 1,066,469 after it —
# presumably the same `list_text` occurs under several `utext` values in
# `unambig`, possibly with different labels. Fixing it changes the row
# positions that the saved manual annotations are later joined on by index,
# so that artifact would have to be re-checked first.
szeged = szeged.merge(
    unambig,
    how='left',
    left_on=['words', 'new_label'],
    right_on=['list_text', 'u'],
)

# Drop the right-hand helper columns that were only needed for the join.
szeged = szeged.drop(['utext', 'list_text', 'u'], axis=1)

In [26]:
# Rows that did not receive a label from `unambig` keep their previous label.
szeged['new_label_y'] = szeged['new_label_y'].fillna(szeged['new_label_x'])

In [27]:
szeged.new_label_y.value_counts()

C    1055973
E       6455
D       1538
I       1493
N        908
U        102
Name: new_label_y, dtype: int64

### (b) fix remaining U labels manually

In [28]:
# Keep the corpus' original labels under an explicit name.
szeged = szeged.rename(columns={'labels': 'original_labels'})

In [29]:
# Tokens that are still labeled 'U', together with a human-readable version of
# their sentence (words joined by single spaces) for manual annotation.
fix = (
    szeged
    .assign(
        sent_w_space=lambda frame: (
            frame.groupby('sentence_id')['words']
            .transform(lambda toks: toks.str.cat(sep=' '))
        )
    )
    .loc[lambda frame: frame['new_label_y'] == 'U']
)

In [30]:
# labels = ['E', 'D', 'I', 'N']
# a = Annotator(fix[['words', 'sentence_id', 'sent_w_space']], name='Uncertainty labels')
# a.tasks['Label'] = labels, None, True

In [31]:
# a()

In [32]:
# a.save(path / 'manual_fix_annot.pkl')
# Reload the previously saved manual annotations instead of annotating again.
# NOTE(review): the `.pkl` suffix suggests a pickle-backed file — only load it
# from a trusted source (confirm against humannotator's documentation).
b = Annotator.load(path / 'manual_fix_annot.pkl')

In [33]:
# Attach the manually assigned `Label` to the remaining 'U' tokens.
# The join is on the row index with the default inner join, so tokens without
# a saved annotation are dropped from `fix`.
# NOTE(review): this only lines up if `fix` has the same index it had when
# the annotations were saved; any upstream change to the number or order of
# rows in `szeged` silently misaligns the labels.
fix = fix.merge(
    b.annotated[['Label']],
    left_index=True,
    right_index=True,
)

In [34]:
# Bring the manual labels back into the corpus, keyed by word, original label
# and sentence_id. Rows without a manual label get NaN in `Label`.
szeged = szeged.merge(
    fix[['words', 'original_labels', 'sentence_id', 'Label']],
    how='left',
    on=['words', 'original_labels', 'sentence_id'],
)

In [35]:
# Tokens without a manual label keep the label assigned automatically.
szeged['Label'] = szeged['Label'].fillna(szeged['new_label_y'])

In [36]:
szeged.Label.value_counts()

C    1055973
E       6525
D       1546
I       1496
N        931
Name: Label, dtype: int64

# Save fixed Szeged Uncertainty Corpus

In [38]:
# Final corpus: drop the intermediate columns and expose the corrected labels
# under the original column name `labels`.
final = (
    szeged
    .drop(columns=['original_labels', 'sentence', 'first_sent_id', 'new_label_x', 'new_label_y'])
    .rename(columns={'Label': 'labels'})
)

In [40]:
# Persist the corrected corpus next to the input data.
final.to_pickle(path / 'szeged_fixed.pkl')