In [1]:
import csv
import random
import jsonlines
import numpy as np
import pandas as pd
from collections import defaultdict

# Seed both Python's `random` module and NumPy's global generator with the same value
# so that the random choices made later in the notebook can be repeated.
seed = 1
random.seed(seed)
np.random.seed(seed)

## Dataset Setup

### Test Inputs

- 16 name sets with 20 names in each set.
- 100 clinical templates with an average of 3.5 unique names and 7.3 name mentions per template. Their lengths range from 3956 to 24836 characters, with a mean of 12893.0 characters.
- For each template and each name set, generate 10 copies, filling the name tags with names randomly sampled from that name set.
- Total: 16000 input notes for auditing = 100 templates * 10 copies per set per template * 16 name sets

### Name Tags

- Convention for labeling name tags: **\*\*NAME-1A\*\***
    - 1st position: person ID within a template, e.g., 1 = Person 1
    - 2nd position: gender, i.e., A = arbitrary gender, F = female, M = male
- Each name consists of a first name and a surname.
- Inputs: {(template ID, set ID, copy ID): note}
- Labels: {(template ID, set ID, copy ID): {(start index, end index): (name, name tag, name set)}}

In [2]:
# prepare the unannotated templates

templates_all = pd.read_csv('../Data/General/Raw/Note/discharges-all.csv')
patients_unseen = pd.read_csv('../Data/General/Raw/Note/subjectids-unseen.csv')

# keep a single template row per subject_id that is listed as unseen
unseen_ids = patients_unseen['subject_id'].unique()
is_unseen = templates_all['subject_id'].isin(unseen_ids)
templates_unseen = templates_all[is_unseen].drop_duplicates('subject_id')

# keep a single patient row per subject_id that actually has a template
covered_ids = templates_unseen['subject_id'].unique()
is_covered = patients_unseen['subject_id'].isin(covered_ids)
patients_valid = patients_unseen[is_covered].drop_duplicates('subject_id')

# draw the same number of patients for every gender, `size` patients overall
size = 100
genders = ['M', 'F']
patients_selected = []
for gender in genders:
    same_gender = patients_valid[patients_valid['gender'] == gender]
    patients_selected.extend(same_gender['subject_id'].sample(size // len(genders)))

is_selected = templates_unseen['subject_id'].isin(patients_selected)
templates_selected = templates_unseen[is_selected]
templates_selected.to_csv('../Data/General/Raw/Note/notes-raw.csv', index=False)

In [3]:
class Namer:
    """Turn tagged note templates into named input notes plus name-position labels.

    Calling an instance on a DataFrame of templates (one template per row, text in
    the 'text' column) replaces every non-name placeholder with a fixed dummy value,
    fills every name tag with a "first last" name taken from each name set, and
    records the character span of every inserted name.
    """

    def __init__(self):
        """Set the generation constants and load the first-name / surname sets from CSV."""

        # number of differently-named copies generated per (template, name set)
        self.copy_size = 10
        # (search prefix, full tag length): a tag such as '**NAME-1A**' is 11 characters long
        self.name_tag = ('**NAME-', 11)
        # fixed stand-ins substituted for every non-name placeholder
        self.other_tags = {'**AGE**':'38', '**CONTACT**':'0123456789', '**DATE**':'2100/01/01',
                           '**HOSPITAL**':'General Hospital', '**ID**':'100', '**LANGUAGE**':'English',
                           '**LOCATION**':'Building 1', '**OTHER**':'_', '**PROFESSION**':'worker'}

        # first-name set ID -> ID of the surname set it is paired with
        self.first2last = {1:1, 2:1, 3:2, 4:2, 5:3, 6:3, 7:4, 8:4, 9:5, 10:5, 11:6, 12:6, 13:1, 14:1, 15:1, 16:1}
        self.first_sets, self.last_sets = defaultdict(list), defaultdict(list)
        for sets, filename in zip([self.first_sets, self.last_sets], ['names-first.csv', 'names-last.csv']):
            # newline='' is how the csv module expects the files it reads to be opened
            with open(f'../Data/General/Input/{filename}', newline='') as file:
                reader = csv.reader(file)
                # discard the first row (presumably the header); tolerate an empty file
                next(reader, None)
                for row in reader:
                    # set ID comes from the first column, the name from the third column from the end
                    setID, name = int(row[0]), row[-3]
                    sets[setID].append(name)

    def replace_irrelevant(self, template):
        """Return the template with every non-name placeholder replaced by its dummy value."""

        for tag, fake in self.other_tags.items():
            template = template.replace(tag, fake)
        return template

    def find_occurrences(self, note, tag):
        """Return (start, end, text) for each non-overlapping match of a tag in a note.

        tag is a (pattern, length) pair: pattern is the substring searched for and
        length is how many characters, counted from the match start, belong to the
        occurrence. An empty pattern yields no occurrences.
        """

        pattern, length = tag
        if not pattern:
            # str.find('') succeeds at every index, so the scan below would never terminate
            return []
        # always advance by at least one character so the scan makes progress
        step = max(length, 1)
        occurrences = []
        index = note.find(pattern)
        while index != -1:
            occurrences.append((index, index+length, note[index:index+length]))
            index = note.find(pattern, index+step)
        return occurrences

    def sample_name(self, setID):
        """Shuffle and return the first names ('F') and surnames ('L') for a name set.

        The stored lists are shuffled in place and returned as-is (not copied), so
        every call changes the order kept on the instance.
        """

        names = {'F':self.first_sets[setID], 'L': self.last_sets[self.first2last[setID]]}
        for each in names.values():
            random.shuffle(each)
        return names

    @staticmethod
    def _drop_nested_spans(labels):
        """Delete, in place, every (start, end) key that lies inside another key of labels."""

        spans = list(labels)
        # keys are unique, so two different spans can never contain each other mutually
        nested = {inner for outer in spans for inner in spans
                  if inner != outer and outer[0] <= inner[0] and inner[1] <= outer[1]}
        for span in nested:
            del labels[span]

    # replace the irrelevant tags, identify the name tags, sample the names, replace the name tags, return the notes and names
    def __call__(self, templates):
        """Generate the named notes and their name labels for every template.

        templates is a DataFrame whose rows carry the template text in 'text'.
        Returns (input_notes, input_labels), both keyed by
        (template ID, set ID, copy ID): input_notes maps to the filled note and
        input_labels maps to {(start, end): (name, name tag, set ID)}.
        """

        input_notes, input_labels = {}, defaultdict(dict)
        # the person ID is the character right after the tag prefix
        id_offset = len(self.name_tag[0])
        for templateID, row in templates.iterrows():
            template = self.replace_irrelevant(row['text'])
            # sorted: the iteration order of a set of strings differs between interpreter runs,
            # which would make the label order (and tag ties) irreproducible
            tags = sorted({tag for _, _, tag in self.find_occurrences(template, self.name_tag)})
            for setID in self.first2last:
                for copyID in range(self.copy_size):
                    key = (templateID, setID, copyID)
                    names = self.sample_name(setID)

                    note, name2tag = template, {}
                    for tag in tags:
                        personID = int(tag[id_offset])
                        name = ' '.join([names['F'][personID], names['L'][personID]])
                        name2tag[name] = tag
                        note = note.replace(tag, name)
                    input_notes[key] = note

                    labels = input_labels[key]
                    for name, tag in name2tag.items():
                        for start, end, _ in self.find_occurrences(note, (name, len(name))):
                            labels[(start, end)] = (name, tag, setID)

                    # a name that is a substring of a longer inserted name also matches inside it;
                    # drop those inner spans regardless of the order in which they were recorded
                    self._drop_nested_spans(labels)

        return input_notes, input_labels

In [4]:
# generate the inputs

# Namer() loads the first-name and surname sets from CSV files when it is constructed.
namer = Namer()
# Each row is one template; Namer.__call__ reads the template text from the 'text' column.
# NOTE(review): the earlier cell writes notes-raw.csv, not this file — presumably the name
# tags are added to the raw notes in between; confirm where notes-base.csv comes from.
templates = pd.read_csv('../Data/General/Input/notes-base.csv')
# notes_input: {(template ID, set ID, copy ID): note}
# notes_label: {(template ID, set ID, copy ID): {(start index, end index): (name, name tag, set ID)}}
notes_input, notes_label = namer(templates)

In [5]:
# save the inputs

# one record per generated note
note_records = [{'ID':ID, 'note':note} for ID, note in notes_input.items()]
# one record per labelled name span; 'name' holds the whole label tuple stored for that span
label_records = [
    {'ID':ID, 'position':position, 'name':name}
    for ID, labels in notes_label.items()
    for position, name in labels.items()
]

outputs = [
    ('../Data/General/Input/notes-input.jsonl', note_records),
    ('../Data/General/Input/notes-label.jsonl', label_records),
]
for path, records in outputs:
    with jsonlines.open(path, 'w') as writer:
        writer.write_all(records)

In [6]:
# analyze the templates

# per-template statistics: text length, number of distinct name tags, number of name-tag mentions
lengths, uniques, mentions = [], [], []
for _, row in templates.iterrows():
    text = row['text']
    occurrences = namer.find_occurrences(text, namer.name_tag)
    distinct_tags = set(tag for _, _, tag in occurrences)
    mentions.append(len(occurrences))
    uniques.append(len(distinct_tags))
    lengths.append(len(text))

mean_uniques, mean_mentions, mean_length = np.mean(uniques), np.mean(mentions), np.mean(lengths)
shortest, longest = np.min(lengths), np.max(lengths)
print(f'# Unique Names: Mean {mean_uniques:.1f}')
print(f'# Name Mentions: Mean {mean_mentions:.1f}')
print(f'Template Length: Mean {mean_length:.1f}, Min {shortest}, Max {longest}')

# Unique Names: Mean 3.5
# Name Mentions: Mean 7.3
Template Length: Mean 12893.0, Min 3956, Max 24836
