In [1]:
import csv
import json
import os
import re
from collections import defaultdict

import jsonlines
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup

# fix the seed of NumPy's global RNG; the np.random.shuffle calls in this notebook draw from it
seed = 1
np.random.seed(seed)

### General Context

- DocRED: 1000 for the train set, 100 for the dev set

In [2]:
# mark the name mentions in each document:
# every PER mention is collapsed into a single '**NAME-<k>A**' placeholder token,
# where <k> counts the mentions of the document starting from 1

total = defaultdict(list)
for split in ['train', 'dev']:
    # read through a context manager so the file handle is closed right after parsing
    with open(f'../Data/Finetune/Raw/General/{split}.json', 'r') as file:
        docs = json.load(file)

    for doc in docs:
        # sentence id -> (start, end) token spans of the PER mentions in that sentence
        names = defaultdict(list)
        for entity in doc['vertexSet']:
            for mention in entity:
                if mention['type'] == 'PER': names[mention['sent_id']].append(tuple(mention['pos']))
        if len(names) == 0: continue

        # visit the spans of each sentence from right to left, so that deleting the
        # tokens of one mention does not shift the positions of the mentions before it
        names = {k: sorted(vs, key=lambda v: v[0], reverse=True) for k, vs in names.items()}
        nameID = 1
        for sid, spans in names.items():
            # first pass: overwrite the first token of every mention with its placeholder
            for span in spans:
                doc['sents'][sid][span[0]] = f'**NAME-{nameID}A**'
                nameID += 1
            # second pass: drop the remaining tokens of every mention (span[1] is exclusive)
            for span in spans:
                del doc['sents'][sid][span[0]+1:span[1]]
        total[split].append(' '.join([' '.join(sent) for sent in doc['sents']]))

In [3]:
# shuffle each split and keep its first documents (1000 for train, 100 for dev)

split_sizes = {'train': 1000, 'dev': 100}
with jsonlines.open('../Data/Finetune/Input/context-general.jsonl', 'w') as writer:
    for split, size in split_sizes.items():
        # in-place shuffle of the documents collected for this split
        np.random.shuffle(total[split])
        for template in total[split][:size]:
            writer.write({'split': split, 'template': template})

### Clinical Context

- 2014 i2b2 de-identification challenge: 1000 for the train set, 100 for the dev set

In [4]:
# mark the name mentions in each document:
# every NAME annotation is replaced in the text by a '**NAME-<k>A**' placeholder

total = []
clinical_dir = '../Data/Finetune/Raw/Clinical'
# os.listdir returns the entries in arbitrary order; sorting makes the seeded shuffle
# that is applied to `total` afterwards reproducible across machines
for filename in sorted(os.listdir(clinical_dir)):
    # read through a context manager so the file handle is closed right after parsing
    with open(f'{clinical_dir}/{filename}', 'r') as file:
        doc = BeautifulSoup(file.read(), 'xml')
    # (start, end) character offsets of every NAME annotation
    names = [(int(name.get('start')), int(name.get('end'))) for name in doc.find('TAGS').find_all('NAME')]
    # skip documents without any name mention
    # (the previous check `names == 0` compared a list with an int and was never true)
    if len(names) == 0: continue

    text = list(doc.find('TEXT').text)
    # replace from right to left so the offsets of the earlier mentions stay valid;
    # assigning a string to the slice splices its characters into the character list
    for nameID, (start, end) in enumerate(sorted(names, key=lambda x:x[0], reverse=True)):
        text[start:end] = f'**NAME-{nameID+1}A**'
    total.append(''.join(text))

In [5]:
# shuffle the documents, then take the first 1000 as the train set and the next 100 as the dev set

np.random.shuffle(total)
train = total[:1000]
dev = total[1000:1100]
with jsonlines.open('../Data/Finetune/Input/context-clinical.jsonl', 'w') as writer:
    # train records are written first, dev records after them
    for split_name, split_docs in (('train', train), ('dev', dev)):
        for template in split_docs:
            writer.write({'split': split_name, 'template': template})

### Diverse Names

- 10 first/last names from each of the 16 sets in names-first/last.csv

In [6]:
# read the name sets for first and last names: names[part][setID] -> list of names

names = defaultdict(lambda: defaultdict(list))
for part in ('first', 'last'):
    with open(f'../Data/General/Input/names-{part}.csv', 'r') as file:
        rows = csv.reader(file)
        next(rows)  # discard the header row
        for row in rows:
            # the first column holds the set id, the third column from the end holds the name
            set_key, entry = int(row[0]), row[-3]
            names[part][set_key].append(entry)

In [7]:
# shuffle each set; its first 10 names go to names-diverse.csv, the remaining ones to names-test.csv

TRAIN_NAMES_PER_SET = 10
# `with` closes both files even if writing fails; newline='' is what the csv module
# expects for files handed to csv.writer (avoids doubled line endings on Windows)
with open('../Data/Finetune/Input/names-diverse.csv', 'w', newline='') as file_train, \
     open('../Data/Finetune/Input/names-test.csv', 'w', newline='') as file_test:
    writer_train, writer_test = csv.writer(file_train), csv.writer(file_test)
    for part in ['first', 'last']:
        for setID, set_ in names[part].items():
            # in-place shuffle: the lists stored in `names` are reordered as well
            np.random.shuffle(set_)

            for name in set_[:TRAIN_NAMES_PER_SET]:
                writer_train.writerow([part, setID, name])
            for name in set_[TRAIN_NAMES_PER_SET:]:
                writer_test.writerow([part, setID, name])

### Popular Names

- 160 most popular first names from the 1940s, 1970s, 2000s that do not appear in names-first.csv
- 160 most popular surnames from the 2000s that do not appear in names-last.csv

In [8]:
# get the count of first and last names

# first names: sum the counts over `duration` consecutive yearly files starting at each year in `years`
years, duration = [1940, 1970, 2000], 10
first_totals = defaultdict(int)
for year in years:
    for i in range(duration):
        with open(f'../Data/General/Raw/Name/firstnames-socialsecurity-{year+i}.txt', 'r') as file:
            for line in file:
                # strip the line ending instead of blindly cutting the last character:
                # a final line without a trailing newline would otherwise lose a digit of its count
                line = line.strip()
                if not line: continue
                # each line has three comma-separated fields; the middle one is not used here
                first, _, count = line.split(',')
                first_totals[first] += int(count)
# list of (name, total count), most frequent first
first_count = sorted(first_totals.items(), key=lambda x: x[1], reverse=True)

# last names: (name, count) in file order, taken from the first and third column of every row
last_count = []
with open('../Data/General/Raw/Name/surnames-census-2000.csv', 'r') as file:
    reader = csv.reader(file)
    _ = next(reader)  # skip the header row
    for row in reader:
        # ignore blank rows and rows without a name instead of failing on them
        if not row or not row[0]: continue
        last, count = row[0], int(row[2])
        # keep the first character as it is and lower-case the rest
        last = last[0] + last[1:].lower()
        last_count.append((last, count))

In [9]:
# identify the 160 most popular first/last names that do not appear in names-first/last.csv

POPULAR_NAMES_PER_PART = 160
popular = defaultdict(list)
# every name of every set, per part, for fast membership tests
diverse = {part: {each for _, set_ in names[part].items() for each in set_} for part in names}
for name_count, part in zip([first_count, last_count], ['first', 'last']):
    # the candidates are walked in list order and the walk stops once enough names are kept
    for name, count in name_count:
        if name not in diverse[part]:
            popular[part].append((name, count))
            if len(popular[part]) >= POPULAR_NAMES_PER_PART: break

# newline='' is what the csv module expects for files handed to csv.writer
# (avoids doubled line endings on Windows)
with open('../Data/Finetune/Input/names-popular.csv', 'w', newline='') as file:
    writer = csv.writer(file)
    for part in popular:
        for name, count in popular[part]:
            writer.writerow([part, count, name])

### Input

- inputs-{type_context}+{type_name}.jsonl: {(train/dev, index): note}
- labels-{type_context}+{type_name}.jsonl: {(train/dev, index): {(start, end): (name, tag)}}
- 4 Settings: 
    - General Context + Popular Names
    - General Context + Diverse Names
    - Clinical Context + Popular Names
    - Clinical Context + Diverse Names

In [10]:
class Namer:
    """Fill the name placeholders of templates with sampled names and locate the inserted names.

    A placeholder looks like ``**NAME-<k>A**``; ``<k>`` is the person index and may have
    more than one digit. Person ``k`` receives the k-th entry of the shuffled first-name
    list joined by a space with the k-th entry of the shuffled last-name list.
    """

    def __init__(self):

        # (prefix, width) of a single-digit name tag such as '**NAME-1A**';
        # tags are recognised by the prefix, the width is kept for backward compatibility
        self.name_tag = ('**NAME-', 11)

    def find_occurrences(self, note, tag):
        """Find the non-overlapping occurrences of a fixed-width pattern.

        Args:
            note: text to search.
            tag: ``(pattern, length)``; ``pattern`` is searched literally and every hit is
                reported as a slice of ``length`` characters starting at the hit.

        Returns:
            List of ``(start, end, text)`` in order of appearance, with ``end = start + length``.
            An empty pattern yields an empty list.
        """
        pattern, length = tag
        occurrences = []
        if not pattern: return occurrences
        # always advance by at least one character so a zero width cannot loop forever
        step = max(length, 1)
        index = note.find(pattern)
        while index != -1:
            occurrences.append((index, index+length, note[index:index+length]))
            index = note.find(pattern, index+step)
        return occurrences

    def sample_name(self, names):
        """Shuffle every list of ``names`` in place (using np.random) and return the same dict."""
        for each in names.values():
            np.random.shuffle(each)
        return names

    def _find_name_tags(self, note):
        """Map each distinct name tag of ``note`` to its person index, in order of first appearance."""
        # prefix, one or more digits, optional letters, closing '**'
        tag_pattern = re.compile(re.escape(self.name_tag[0]) + r'(\d+)[A-Za-z]*\*\*')
        tags = {}
        for match in tag_pattern.finditer(note):
            tags.setdefault(match.group(0), int(match.group(1)))
        return tags

    @staticmethod
    def _nested_spans(spans):
        """Return the spans that lie inside another span, whatever the order they are given in."""
        spans = list(spans)
        return {inner for inner in spans for outer in spans
                if inner != outer and outer[0] <= inner[0] and inner[1] <= outer[1]}

    # identify the name tags, sample the names, replace the name tags, return the notes and names
    def __call__(self, names, templates):
        """Populate every template with names.

        Args:
            names: dict with the keys ``'first'`` and ``'last'`` holding lists of names;
                the lists are reshuffled in place once per template.
            templates: dict ``templateID -> template text``.

        Returns:
            ``(input_notes, input_labels)``: ``input_notes[templateID]`` is the filled text and
            ``input_labels[templateID]`` maps ``(start, end)`` character spans of that text to
            ``(name, tag)``. Spans nested inside another labelled span are dropped.

        Raises:
            IndexError: if a tag refers to a person index beyond the available names.
        """
        input_notes, input_labels = {}, defaultdict(dict)
        for templateID, template in templates.items():
            tags = self._find_name_tags(template)
            names = self.sample_name(names)

            name2tag = {}
            note = template
            for tag, personID in tags.items():
                if personID >= min(len(names['first']), len(names['last'])):
                    raise IndexError(f'tag {tag} needs person index {personID}, but not enough names are available')
                name = ' '.join([names['first'][personID], names['last'][personID]])
                name2tag[name] = tag
                note = note.replace(tag, name)
            input_notes[templateID] = note

            # looking the template up creates its (possibly empty) label dict
            labels = input_labels[templateID]
            for name, tag in name2tag.items():
                for start, end, _ in self.find_occurrences(note, (name, len(name))):
                    labels[(start, end)] = (name, tag)

            # a name found inside a longer labelled name is not a mention of its own
            for span in self._nested_spans(labels):
                del labels[span]

        return input_notes, input_labels

In [11]:
# load the templates and names, populate the templates with names, save the inputs and labels

for type_context in ['general', 'clinical']:
    for type_name in ['popular', 'diverse']:

        # templates[(split, index within that split)] -> template text;
        # the index is counted per split instead of assuming that exactly 1000 train rows come first
        templates = {}
        split_sizes = defaultdict(int)
        with jsonlines.open(f'../Data/Finetune/Input/context-{type_context}.jsonl', 'r') as reader:
            for line in reader:
                split = line['split']
                templates[(split, split_sizes[split])] = line['template']
                split_sizes[split] += 1

        # names[part] -> list of names; note that this rebinds the module-level `names`
        names = defaultdict(list)
        with open(f'../Data/Finetune/Input/names-{type_name}.csv', 'r') as file:
            reader = csv.reader(file)
            for row in reader:
                # three columns per row; the middle one is not needed here
                part, _, name = row
                names[part].append(name)

        namer = Namer()
        inputs, labels = namer(names, templates)

        with jsonlines.open(f'../Data/Finetune/Input/inputs-{type_context}+{type_name}.jsonl', 'w') as writer:
            writer.write_all([{'ID':ID, 'note':note} for ID, note in inputs.items()])
        # one record per labelled span
        with jsonlines.open(f'../Data/Finetune/Input/labels-{type_context}+{type_name}.jsonl', 'w') as writer:
            writer.write_all([{'ID':ID, 'position':position, 'name':name} for ID, mentions in labels.items() for position, name in mentions.items()])

### Test

- 100 templates * 1 copy * 16 sets = 1600 notes for evaluation

In [12]:
class Namer:
    """Instantiate the evaluation templates with names from every name set.

    For each template, the non-name placeholders are replaced by fixed fake values and the
    name placeholders (``**NAME-<k>A**``, ``<k>`` may have several digits) are filled with
    names sampled from each set in ``first2last``, ``copy_size`` times per set.
    """

    def __init__(self):

        # number of independently sampled copies per (template, name set) pair
        self.copy_size = 1
        # (prefix, width) of a single-digit name tag such as '**NAME-1A**';
        # tags are recognised by the prefix, the width is kept for backward compatibility
        self.name_tag = ('**NAME-', 11)
        # fixed replacement text for every placeholder that is not a name
        self.other_tags = {'**AGE**':'38', '**CONTACT**':'0123456789', '**DATE**':'2100/01/01', 
                           '**HOSPITAL**':'General Hospital', '**ID**':'100', '**LANGUAGE**':'English', 
                           '**LOCATION**':'Building 1', '**OTHER**':'_', '**PROFESSION**':'worker'}

        # first-name set id -> id of the last-name set it is paired with
        self.first2last = {1:1, 2:1, 3:2, 4:2, 5:3, 6:3, 7:4, 8:4, 9:5, 10:5, 11:6, 12:6, 13:1, 14:1, 15:1, 16:1}
        # set id -> names, read from rows of the form (part, setID, name)
        self.first_sets, self.last_sets = defaultdict(list), defaultdict(list)
        with open('../Data/Finetune/Input/names-test.csv', 'r') as file:
            reader = csv.reader(file)
            for row in reader:
                part, setID, name = row
                if part == 'first': self.first_sets[int(setID)].append(name)
                elif part == 'last': self.last_sets[int(setID)].append(name)

    def replace_irrelevant(self, template):
        """Return ``template`` with every placeholder of ``other_tags`` replaced by its fake value."""
        for tag, fake in self.other_tags.items():
            template = template.replace(tag, fake)
        return template

    def find_occurrences(self, note, tag):
        """Find the non-overlapping occurrences of a fixed-width pattern.

        Args:
            note: text to search.
            tag: ``(pattern, length)``; ``pattern`` is searched literally and every hit is
                reported as a slice of ``length`` characters starting at the hit.

        Returns:
            List of ``(start, end, text)`` in order of appearance, with ``end = start + length``.
            An empty pattern yields an empty list.
        """
        pattern, length = tag
        occurrences = []
        if not pattern: return occurrences
        # always advance by at least one character so a zero width cannot loop forever
        step = max(length, 1)
        index = note.find(pattern)
        while index != -1:
            occurrences.append((index, index+length, note[index:index+length]))
            index = note.find(pattern, index+step)
        return occurrences

    def sample_name(self, setID):
        """Shuffle (in place, using np.random) and return the name lists for ``setID``.

        Returns:
            ``{'F': first names of the set, 'L': last names of the paired last-name set}``.
        """
        names = {'F':self.first_sets[setID], 'L': self.last_sets[self.first2last[setID]]}
        for each in names.values():
            np.random.shuffle(each)
        return names

    def _find_name_tags(self, note):
        """Map each distinct name tag of ``note`` to its person index, in order of first appearance."""
        # prefix, one or more digits, optional letters, closing '**'
        tag_pattern = re.compile(re.escape(self.name_tag[0]) + r'(\d+)[A-Za-z]*\*\*')
        tags = {}
        for match in tag_pattern.finditer(note):
            tags.setdefault(match.group(0), int(match.group(1)))
        return tags

    @staticmethod
    def _nested_spans(spans):
        """Return the spans that lie inside another span, whatever the order they are given in."""
        spans = list(spans)
        return {inner for inner in spans for outer in spans
                if inner != outer and outer[0] <= inner[0] and inner[1] <= outer[1]}

    # replace the irrelevant tags, identify the name tags, sample the names, replace the name tags, return the notes and names
    def __call__(self, templates):
        """Generate the notes and the name labels for every template, name set and copy.

        Args:
            templates: object whose ``iterrows()`` yields ``(templateID, row)`` with the
                template text in ``row['text']`` (a pandas DataFrame in this notebook).

        Returns:
            ``(input_notes, input_labels)``, both keyed by ``(templateID, setID, copyID)``:
            the filled text, and a dict mapping ``(start, end)`` character spans of that text
            to ``(name, tag, setID)``. Spans nested inside another labelled span are dropped.

        Raises:
            IndexError: if a tag refers to a person index beyond the available names.
        """
        input_notes, input_labels = {}, defaultdict(dict)
        for templateID, row in templates.iterrows():
            template = self.replace_irrelevant(row['text'])
            tags = self._find_name_tags(template)
            for setID in self.first2last:
                for copyID in range(self.copy_size):
                    names = self.sample_name(setID)
                    key = (templateID, setID, copyID)

                    name2tag = {}
                    note = template
                    for tag, personID in tags.items():
                        if personID >= min(len(names['F']), len(names['L'])):
                            raise IndexError(f'tag {tag} needs person index {personID}, but set {setID} has too few names')
                        name = ' '.join([names['F'][personID], names['L'][personID]])
                        name2tag[name] = tag
                        note = note.replace(tag, name)
                    input_notes[key] = note

                    # looking the key up creates its (possibly empty) label dict
                    labels = input_labels[key]
                    for name, tag in name2tag.items():
                        for start, end, _ in self.find_occurrences(note, (name, len(name))):
                            labels[(start, end)] = (name, tag, setID)

                    # a name found inside a longer labelled name is not a mention of its own
                    for span in self._nested_spans(labels):
                        del labels[span]

        return input_notes, input_labels

In [13]:
# build the evaluation notes from the base templates, then store the notes and their name labels

namer = Namer()
templates = pd.read_csv('../Data/General/Input/notes-base.csv')
test_inputs, test_labels = namer(templates)

# one record per generated note
note_records = [{'ID': ID, 'note': note} for ID, note in test_inputs.items()]
# one record per labelled span
label_records = []
for ID, mentions in test_labels.items():
    for position, name in mentions.items():
        label_records.append({'ID': ID, 'position': position, 'name': name})

with jsonlines.open('../Data/Finetune/Input/inputs-test.jsonl', 'w') as writer:
    writer.write_all(note_records)
with jsonlines.open('../Data/Finetune/Input/labels-test.jsonl', 'w') as writer:
    writer.write_all(label_records)