In [1]:
import csv
import random
import jsonlines
import numpy as np
import pandas as pd
from collections import defaultdict

# One shared seed for both random number generators used in this notebook
# (NumPy's global generator and the stdlib `random` module).
seed = 1
np.random.seed(seed)
random.seed(seed)

In [2]:
# First names grouped by race label. Namer uses each list as the first-name
# pool of one name set (white -> set 3, black -> set 7, asian -> set 9).
# NOTE(review): the variable name suggests these were picked as polysemous
# names (each also reads as an ordinary word or place name) - confirm and
# record the selection criteria. The earlier description here ("male names
# with medium popularity in the 2000s") could not be confirmed from this file.

race2polysemies = {
    'white': ['Sydney', 'Faith', 'Forest', 'Cliff', 'June'],
    'black': ['Quincy', 'Cleveland', 'Kenya', 'Prince', 'Ivory'],
    'asian': ['Thai', 'King', 'Long', 'Young', 'Can'],
}

In [3]:
class Namer:
    """Fill the name placeholders of note templates with sampled "First Last" names.

    Calling an instance on a table of templates produces, for every
    (template, first-name set, copy) combination, the filled-in note and the
    character spans at which each inserted name occurs in that note.
    """

    def __init__(self):
        """Set the placeholder conventions and load the last-name sets from CSV."""

        # Number of notes sampled per (template, first-name set) pair.
        self.copy_size = 1
        # (prefix, total tag length): a name tag is the 11 characters starting at
        # each '**NAME-' prefix; the character right after the prefix (index 7)
        # is parsed as the person's digit in __call__.
        self.name_tag = ('**NAME-', 11)
        # Fixed stand-in text for every placeholder that is not a name.
        self.other_tags = {'**AGE**':'38', '**CONTACT**':'0123456789', '**DATE**':'2100/01/01',
                           '**HOSPITAL**':'General Hospital', '**ID**':'100', '**LANGUAGE**':'English',
                           '**LOCATION**':'Building 1', '**OTHER**':'_', '**PROFESSION**':'worker'}

        # first-name set ID -> ID of the last-name set paired with it
        self.first2last = {3:2, 7:4, 9:5}
        # first-name set ID -> pool of first names (white / black / asian lists)
        self.first_sets = {3: race2polysemies['white'], 7: race2polysemies['black'], 9: race2polysemies['asian']}
        # last-name set ID -> last names in file order. The first CSV column is
        # read as the set ID and the third-from-last column as the name.
        self.last_sets = defaultdict(list)
        # newline='' is what the csv module requires so that quoted fields
        # containing line breaks are parsed correctly.
        with open('../Data/General/Input/names-last.csv', newline='') as file:
            reader = csv.reader(file)
            next(reader)  # skip the header row
            for row in reader:
                setID, name = int(row[0]), row[-3]
                self.last_sets[setID].append(name)

    def replace_irrelevant(self, template):
        """Return `template` with every non-name placeholder replaced by its fixed stand-in."""

        for tag, fake in self.other_tags.items():
            template = template.replace(tag, fake)
        return template

    def find_occurrences(self, note, tag):
        """Locate every non-overlapping occurrence of a pattern in `note`.

        Parameters
        ----------
        note : str
            Text to search.
        tag : tuple
            `(pattern, length)`: the substring to look for and the number of
            characters each reported occurrence spans from the match start.

        Returns
        -------
        list of (start, end, text)
            `end` is `start + length` and `text` is `note[start:end]`.
        """

        pattern, length = tag
        # Always advance by at least one character; with length 0 the search
        # would otherwise restart at the same index forever.
        step = max(length, 1)
        occurrences = []
        index = note.find(pattern)
        while index != -1:
            occurrences.append((index, index+length, note[index:index+length]))
            index = note.find(pattern, index+step)
        return occurrences

    def sample_name(self, setID):
        """Return shuffled first ('F') and last ('L') name lists for one name set.

        The first-name pool is repeated four times before shuffling, so the
        same first name can appear at several positions. The last-name list is
        the stored `self.last_sets` entry itself and is shuffled in place, so
        its order carries over between calls; this is kept as-is so that a
        given seed keeps producing the same notes.
        """

        names = {'F':self.first_sets[setID]*4, 'L': self.last_sets[self.first2last[setID]]}
        for each in names.values():
            random.shuffle(each)
        return names

    @staticmethod
    def _nested_spans(spans):
        """Return the (start, end) spans that lie inside another span of `spans`."""

        nested, furthest_end = set(), -1
        # Sorted by start, longest first: a span is nested exactly when it ends
        # no later than the furthest end among the spans that precede it.
        for start, end in sorted(spans, key=lambda span: (span[0], -span[1])):
            if end <= furthest_end:
                nested.add((start, end))
            else:
                furthest_end = end
        return nested

    # replace the irrelevant tags, identify the name tags, sample the names, replace the name tags, return the notes and names
    def __call__(self, templates):
        """Generate the filled-in notes and the name labels for every template.

        Parameters
        ----------
        templates : object with `iterrows()`
            Yields `(templateID, row)` pairs where `row['text']` is the
            template string (e.g. a pandas DataFrame with a 'text' column).

        Returns
        -------
        input_notes : dict
            `(templateID, setID, copyID)` -> note with all placeholders replaced.
        input_labels : defaultdict(dict)
            `(templateID, setID, copyID)` -> `{(start, end): (name, tag, setID)}`
            for each occurrence of an inserted name in the note. Spans that lie
            inside another labelled span are dropped.
        """

        input_notes, input_labels = {}, defaultdict(dict)
        for templateID, row in templates.iterrows():
            template = self.replace_irrelevant(row['text'])
            # Sorted rather than iterated as a set, so that label order does not
            # depend on per-process string-hash randomisation.
            tags = sorted({tag for _, _, tag in self.find_occurrences(template, self.name_tag)})
            for setID in self.first2last:
                for copyID in range(self.copy_size):
                    key = (templateID, setID, copyID)
                    names = self.sample_name(setID)

                    note, name2tag = template, {}
                    for tag in tags:
                        # The digit right after the '**NAME-' prefix picks the
                        # person, i.e. the index into the shuffled name lists.
                        personID = int(tag[7])
                        name = ' '.join([names['F'][personID], names['L'][personID]])
                        name2tag[name] = tag
                        note = note.replace(tag, name)
                    input_notes[key] = note

                    # Label every occurrence of every inserted name in the final note.
                    labels = input_labels[key]
                    for name, tag in name2tag.items():
                        for start, end, _ in self.find_occurrences(note, (name, len(name))):
                            labels[(start, end)] = (name, tag, setID)

                    # A shorter name can match inside a longer one ("King Li"
                    # inside "King Lin"); keep only the enclosing span, whichever
                    # of the two was labelled first.
                    for span in self._nested_spans(labels):
                        del labels[span]

        return input_notes, input_labels

In [4]:
# Generate the inputs: read the base note templates, then let a Namer fill
# their name placeholders once per first-name set.

templates = pd.read_csv('../Data/General/Input/notes-base.csv')
namer = Namer()
polysemies_input, polysemies_label = namer(templates)

In [5]:
# Save the inputs as JSON Lines: one record per generated note, and one record
# per labelled span ('name' holds the (name, tag, set ID) triple).

with jsonlines.open('../Data/Polysemy/Input/polysemies-input.jsonl', 'w') as writer:
    writer.write_all([
        {'ID': note_id, 'note': text}
        for note_id, text in polysemies_input.items()
    ])
with jsonlines.open('../Data/Polysemy/Input/polysemies-label.jsonl', 'w') as writer:
    writer.write_all([
        {'ID': note_id, 'position': span, 'name': label}
        for note_id, spans in polysemies_label.items()
        for span, label in spans.items()
    ])