In [1]:
import numpy as np
import pandas as pd
from datasets import load_dataset
import re
import sys
sys.path.insert(0, '../src')
from dict import replacement_dict

In [2]:
dataset = load_dataset("json", data_files="../data/english_balanced_10k.jsonl", split="train")

In [3]:
dataset

Dataset({
    features: ['masked_text', 'unmasked_text', 'token_entity_labels', 'tokenised_unmasked_text'],
    num_rows: 10912
})

In [4]:
print(dataset[0])

{'masked_text': "[PREFIX_1] [FIRSTNAME_1] [MIDDLENAME_1] [LASTNAME_1], as a [JOBDESCRIPTOR_1] [JOBTITLE_1] at [COMPANY_NAME_1], your knowledge of change management is vital for our company's transformation. We request you to create a change management strategy.", 'unmasked_text': "Mr. Adolphus Reagan Ziemann, as a Central Principal Applications Executive at McLaughlin, Nader and Purdy, your knowledge of change management is vital for our company's transformation. We request you to create a change management strategy.", 'token_entity_labels': ['B-PREFIX', 'I-PREFIX', 'B-FIRSTNAME', 'I-FIRSTNAME', 'B-MIDDLENAME', 'B-LASTNAME', 'I-LASTNAME', 'I-LASTNAME', 'O', 'O', 'O', 'B-JOBDESCRIPTOR', 'B-JOBTITLE', 'I-JOBTITLE', 'I-JOBTITLE', 'O', 'B-COMPANY_NAME', 'I-COMPANY_NAME', 'I-COMPANY_NAME', 'I-COMPANY_NAME', 'I-COMPANY_NAME', 'I-COMPANY_NAME', 'I-COMPANY_NAME', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'], 'token

In [5]:
ds = pd.DataFrame.from_dict(dataset[:])
ds.head()

Unnamed: 0,masked_text,unmasked_text,token_entity_labels,tokenised_unmasked_text
0,[PREFIX_1] [FIRSTNAME_1] [MIDDLENAME_1] [LASTN...,"Mr. Adolphus Reagan Ziemann, as a Central Prin...","[B-PREFIX, I-PREFIX, B-FIRSTNAME, I-FIRSTNAME,...","[mr, ., adolph, ##us, reagan, z, ##ie, ##mann,..."
1,"Hello [FIRSTNAME_1], would you please investig...","Hello Hannah, would you please investigate the...","[O, B-FIRSTNAME, O, O, O, O, O, O, O, O, O, O,...","[hello, hannah, ,, would, you, please, investi..."
2,We also request a review of our policies with ...,We also request a review of our policies with ...,"[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[we, also, request, a, review, of, our, polici..."
3,"Dear [FIRSTNAME_1], a company-wide presentatio...","Dear Devan, a company-wide presentation is req...","[O, B-FIRSTNAME, I-FIRSTNAME, O, O, O, O, O, O...","[dear, dev, ##an, ,, a, company, -, wide, pres..."
4,Can we also have a session on how to manage st...,Can we also have a session on how to manage st...,"[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[can, we, also, have, a, session, on, how, to,..."


In [6]:
new_df = []
document_id = 0
for row_num in range(len(ds)):
    document = document_id
    full_text = ds['unmasked_text'][row_num]
    
    tokens = ds['tokenised_unmasked_text'][row_num]

    trailing_whitespace = []
    for token in tokens:
        pattern = re.escape(token.replace('##','')) + r'\s'
        if re.search(pattern, full_text):
            trailing_whitespace.append(True)
        else:
            trailing_whitespace.append(False)
    

    labels = ds['token_entity_labels'][row_num]
    for word_pos in range(len(labels)):
        word = labels[word_pos]
        # if the label exists in the dictionary, replace it
        if word in replacement_dict:
            labels[word_pos] = replacement_dict[word]
    
    new_df.append([row_num, full_text, tokens, trailing_whitespace, labels])
    
new_df = pd.DataFrame(new_df, columns =['document', 'full_text', 'tokens', 'trailing_whitespace', 'labels'])

In [7]:
new_df.head()

Unnamed: 0,document,full_text,tokens,trailing_whitespace,labels
0,0,"Mr. Adolphus Reagan Ziemann, as a Central Prin...","[mr, ., adolph, ##us, reagan, z, ##ie, ##mann,...","[False, True, False, True, False, False, False...","[O, O, B-NAME_STUDENT, I-NAME_STUDENT, I-NAME_..."
1,1,"Hello Hannah, would you please investigate the...","[hello, hannah, ,, would, you, please, investi...","[False, False, True, True, True, True, True, T...","[O, B-NAME_STUDENT, O, O, O, O, O, O, O, O, O,..."
2,2,We also request a review of our policies with ...,"[we, also, request, a, review, of, our, polici...","[False, True, True, True, True, True, True, Tr...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
3,3,"Dear Devan, a company-wide presentation is req...","[dear, dev, ##an, ,, a, company, -, wide, pres...","[False, False, False, True, True, False, False...","[O, B-NAME_STUDENT, I-NAME_STUDENT, O, O, O, O..."
4,4,Can we also have a session on how to manage st...,"[can, we, also, have, a, session, on, how, to,...","[False, True, True, True, True, True, True, Tr...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."


In [8]:
new_df.to_csv('../data/pii_parsed_10k.csv')