In [None]:
import pandas as pd
from pathlib import Path
import re
import itertools
from sklearn.model_selection import train_test_split
import os
import matplotlib.pyplot as plt
import seaborn as sns

## Load notes
- store each note in a dict, keyed by its file name (without the `.annotated.txt` suffix)

In [None]:
# Directory that holds one "<key>.annotated.txt" file per note.
directory = Path('annotated')

# glob() yields files in arbitrary filesystem order. Sorting makes the order of
# `notes` (and everything derived from it, e.g. the train/test splits) the same
# on every machine.
# NOTE(review): splits generated before this change may have used a different order.
note_names = sorted(directory.glob("*.txt"))

# Maps note key (file name without the ".annotated.txt" suffix) -> note text.
notes = {}

for filename in note_names:
    # Use the bare file name instead of stripping a hard-coded 'annotated/' prefix,
    # so the key does not depend on the path separator of the platform.
    key = filename.name.replace('.annotated.txt', '')
    with open(filename) as file:
        text = file.read()
    notes[key] = text

## Ensure uniform beginning of every note

#### Load template for the beginning of every note 
- by gender
- with added label for address

In [None]:
# Header templates that replace the beginning of every note, one per gender.
template_M = Path('note_template/note_start_template_M.txt').read_text()
template_F = Path('note_template/note_start_template_F.txt').read_text()

In [None]:
# Quick check of the header template length (the header length is used later as an insertion offset).
print(len(template_F))

#### Replace the beginning of each note with the corresponding template

In [None]:
# Replace everything from the start of each note up to (and including) the
# closing </sex> tag with the gender-specific header template.
for key in notes:

    # Keys start with the gender letter; anything that does not start with 'F'
    # is treated as male.
    template = template_F if key.startswith('F') else template_M

    # The template is passed through a function so it is inserted literally:
    # a plain replacement string would have its backslash escapes interpreted.
    # NOTE(review): the pattern is greedy with DOTALL, so it runs up to the LAST
    # </sex> in the note - this assumes the header holds the only <sex> label.
    notes[key] = re.sub(r'^(.*)</sex>', lambda _match: template, notes[key], flags = re.DOTALL)

## Error handling

#### Functions for handling some misplaced labels

In [None]:
# Strips the <name> label from everything that was not anonymized
# (an anonymized name is exactly '___'; anything else is not treated as a name).

name_search_mask = '(<name>)([^<]*)(</name>)'

def name_check(match):
    """Replacement callback for `name_search_mask`.

    Keeps the label when its content is the anonymization placeholder '___';
    otherwise returns only the content, i.e. drops the surrounding <name> tags.
    """
    content = match.group(2)
    return '<name>___</name>' if content == '___' else content



# Strips the pronoun label from everything that isn't a 3rd person pronoun.

pronoun_search_mask = '(<pronoun_[^>]*>)([^<]*)(</pronoun_[^>]*>)'

# Pronouns that are allowed to keep their label (compared in lower case).
_THIRD_PERSON_PRONOUNS = frozenset(['she', 'her', 'herself', 'he', 'his', 'him', 'himself'])

def pronoun_check(match):
    """Replacement callback for `pronoun_search_mask`.

    Returns the whole match unchanged when the labelled text is a 3rd person
    pronoun; otherwise returns only the labelled text, i.e. drops the tags.

    The comparison ignores case. re.IGNORECASE at the call site only affects how
    the tags are matched, so without this a capitalized pronoun such as "She" at
    the start of a sentence would lose its label and never be swapped.
    """
    alleged_pronoun = match.group(2)

    if alleged_pronoun.lower() in _THIRD_PERSON_PRONOUNS:
        return match.group()
    return alleged_pronoun



# Removes forms of 'year-old' that directly follow an age label, for grammatical
# correctness once the age (including ' year-old ') is swapped into the label.

# Recognised spellings. The dot is escaped so 'y.o' does not match e.g. 'yxo',
# and an optional trailing dot covers 'y.o.'.
yo = r'years old|year old|year-old|y/o|y\.o\.?|yo'

# Group 1: the age label plus the following space; group 2: the 'year-old' form.
# The negative lookahead requires the form to end at a word boundary, so the
# 'yo' alternative no longer eats the start of words such as 'young'.
age_search_mask = f'(<age>[^<]*</age> )({yo})(?!\\w)'

def delete_yo(match):
    """Replacement callback for `age_search_mask`: keep only the age label (group 1)."""
    return match.group(1)

#### Loop through all notes and handle possible errors

In [None]:
# Run the three label clean-up passes (names, pronouns, 'year-old' forms) over every note.
for key, note_text in notes.items():
    note_text = re.sub(name_search_mask, name_check, note_text)
    note_text = re.sub(pronoun_search_mask, pronoun_check, note_text, flags = re.IGNORECASE)
    notes[key] = re.sub(age_search_mask, delete_yo, note_text, flags = re.IGNORECASE)

In [None]:
#for key in notes:
#    print(key)
#    print(notes[key][:400])
#    print('-------------------------------------------------------')

## Load sampled_data_demographics

In [None]:
# Load the demographics table for the sampled notes; note_info is derived from it below.
demographics = pd.read_csv('demographics/sampled_data_demographics.csv')

In [None]:
# Display the column names of the demographics table.
demographics.columns

In [None]:
# Per-note personal information used for swapping and for building the splits.
note_info = demographics[[
    'abbrev', 'hadm_id', 'name', 'address', 'race', 'gender', 'dob', 'actual_age',
    'admittime_today', 'dischtime_today', 'thirty_day_readmission',
]].copy()

# Key under which the note is stored in `notes`: "<abbrev>_<hadm_id>".
note_info['filename'] = note_info['abbrev'].astype(str) + '_' + note_info['hadm_id'].astype(str)

# Drop the first two characters of the abbreviation (the gender letter and the
# separator that follows it) to obtain the race abbreviation.
note_info['race_abbrev'] = note_info['abbrev'].map(lambda abbrev: abbrev[2:])

## Functions for adding and removing baseline personal info

#### Add race info for every note
- precondition: call before the start of the note is changed (before adding in personal information)

In [None]:
def add_race(notes):
    """Insert an explicit 'race: <race>...</race>' entry after the header of every note.

    Precondition: call before the start of the note is changed (i.e. before
    personal information is swapped in), because the insertion position is
    derived from the length of the header template.

    Parameters
    ----------
    notes : dict
        Maps note key ("<abbrev>_<hadm_id>") to note text. Modified in place.

    Returns
    -------
    dict
        The same dict object, for call chaining.

    Raises
    ------
    IndexError
        If a note's hadm_id is not present in `note_info`.
    """
    for key in notes:

        # The hadm_id is the part of the key after the last underscore.
        hadm_id = int(key.rsplit('_', 1)[-1])
        race = note_info.loc[note_info['hadm_id'] == hadm_id, 'race'].iloc[0]

        # Use the template that was actually written into this note, so the
        # offset stays correct even if the two templates differ in length.
        header = template_F if key.startswith('F') else template_M
        offset = len(header) + 1  # one character past the header

        notes[key] = notes[key][:offset] + 'race: <race>' + str(race) + '</race>' + notes[key][offset:]

    return notes

#### Add age info for every note
- precondition: call before the start of the note is changed (before adding in personal information)

In [None]:
def add_age(notes):
    """Insert an explicit 'age: <age>...</age>' entry after the header of every note.

    Precondition: call before the start of the note is changed (i.e. before
    personal information is swapped in), because the insertion position is
    derived from the length of the header template.

    Parameters
    ----------
    notes : dict
        Maps note key ("<abbrev>_<hadm_id>") to note text. Modified in place.

    Returns
    -------
    dict
        The same dict object, for call chaining.

    Raises
    ------
    IndexError
        If a note's hadm_id is not present in `note_info`.
    """
    for key in notes:

        # The hadm_id is the part of the key after the last underscore.
        hadm_id = int(key.rsplit('_', 1)[-1])
        age = note_info.loc[note_info['hadm_id'] == hadm_id, 'actual_age'].iloc[0]

        # Use the template that was actually written into this note, so the
        # offset stays correct even if the two templates differ in length.
        header = template_F if key.startswith('F') else template_M
        offset = len(header) + 1  # one character past the header

        notes[key] = notes[key][:offset] + 'age: <age>' + str(age) + '</age>' + notes[key][offset:]

    return notes

#### Swap baseline personal information in and out
- takes information from note_info and puts it into the note if the corresponding parameter is set to True
- otherwise anonymizes this personal info in the note
- neutralize gendered terms

In [None]:
def swap_in(notes, name =  True, address = True, age = True, visitdates = True, gender = True, race = True):
    """Fill in or anonymize the labelled personal information of every note.

    For each flag that is truthy, the content of the corresponding label is
    replaced by the value from `note_info` (looked up via the hadm_id at the end
    of the note key). For each flag that is falsy, the label content is replaced
    by the placeholder '___' (gendered pronouns become 'the patient').

    Parameters
    ----------
    notes : dict
        Maps note key ("<abbrev>_<hadm_id>") to labelled note text. Modified in place.
    name, address, race : bool
        Fill the <name> / <address> / <race> labels.
    age : bool
        Fill <age> with "<actual_age> year-old ".
    visitdates : bool
        Fill <admission_date> and <discharge_date>.
    gender : bool
        Fill <sex>, pronoun, form-of-address and gendered-noun labels according to
        the first character of the key ('F' is female, anything else is male).

    The <dob> label is only filled when both `age` and `visitdates` are truthy.

    Returns
    -------
    dict
        The same dict object, for call chaining.

    Raises
    ------
    IndexError
        If a value has to be looked up for a hadm_id that is not in `note_info`.
    """

    def _set_tag(text, tag, value):
        # Replace the content of every <tag>...</tag> span with `value`.
        # A callable replacement inserts the value literally; a plain replacement
        # string would have backslashes in names/addresses interpreted as escapes.
        filled = '<' + tag + '>' + str(value) + '</' + tag + '>'
        return re.sub('<' + tag + '>[^<]*</' + tag + '>', lambda _match: filled, text)

    def _lookup(key, column):
        # Fetch one value for the note; the hadm_id follows the last underscore.
        hadm_id = int(key.rsplit('_', 1)[-1])
        return note_info.loc[note_info['hadm_id'] == hadm_id, column].iloc[0]

    placeholder = '___'

    female_terms = {
        'sex': 'F',
        'pronoun_subject': 'she',
        'pronoun_object': 'her',
        'pronoun_possessive': 'her',
        'pronoun_reflexive': 'herself',
        'form_of_address': 'Ms.',
        'gendered_noun': 'woman',
    }
    male_terms = {
        'sex': 'M',
        'pronoun_subject': 'he',
        'pronoun_object': 'him',
        'pronoun_possessive': 'his',
        'pronoun_reflexive': 'himself',
        'form_of_address': 'Mr.',
        'gendered_noun': 'man',
    }
    neutral_terms = {
        'sex': placeholder,
        'pronoun_subject': 'the patient',
        'pronoun_object': 'the patient',
        'pronoun_possessive': 'the patient\'s',
        'pronoun_reflexive': 'the patient',
        'form_of_address': placeholder,
        'gendered_noun': placeholder,
    }

    for key in notes:

        text = notes[key]

        # The flag parameters are never reassigned, so they keep their meaning
        # for every note in the loop.
        text = _set_tag(text, 'name', _lookup(key, 'name') if name else placeholder)
        text = _set_tag(text, 'address', _lookup(key, 'address') if address else placeholder)
        text = _set_tag(text, 'age', (str(_lookup(key, 'actual_age')) + ' year-old ') if age else placeholder)

        text = _set_tag(text, 'admission_date', _lookup(key, 'admittime_today') if visitdates else placeholder)
        text = _set_tag(text, 'discharge_date', _lookup(key, 'dischtime_today') if visitdates else placeholder)

        # The date of birth is only filled in when both age and visit dates are.
        text = _set_tag(text, 'dob', _lookup(key, 'dob') if (age and visitdates) else placeholder)

        if gender:
            gender_terms = female_terms if key[0] == 'F' else male_terms
        else:
            gender_terms = neutral_terms
        for tag, value in gender_terms.items():
            text = _set_tag(text, tag, value)

        text = _set_tag(text, 'race', _lookup(key, 'race') if race else placeholder)

        notes[key] = text

    return notes
                    

#### Functions for adding a specific info into a text note

In [None]:
def swap_name(text, name):
    """Return `text` with the content of every <name> label replaced by `name`.

    The value is inserted literally: it is passed through a replacement function,
    because a plain replacement string would have its backslashes interpreted.
    """
    labelled = '<name>' + str(name) + '</name>'
    return re.sub('<name>[^<]*</name>', lambda _match: labelled, text)

In [None]:
def swap_address(text, address):
    """Return `text` with the content of every <address> label replaced by `address`.

    The value is inserted literally: it is passed through a replacement function,
    because a plain replacement string would have its backslashes interpreted.
    """
    labelled = '<address>' + str(address) + '</address>'
    return re.sub('<address>[^<]*</address>', lambda _match: labelled, text)

In [None]:
def swap_age(text, age, dob):
    """Return `text` with every <age> label set to `age` and every <dob> label set to `dob`.

    Both values are inserted literally: they are passed through replacement
    functions, because a plain replacement string would have its backslashes
    interpreted (e.g. a date written with backslashes would be corrupted).
    """
    labelled_age = '<age>' + str(age) + '</age>'
    labelled_dob = '<dob>' + str(dob) + '</dob>'
    text = re.sub('<age>[^<]*</age>', lambda _match: labelled_age, text)
    text = re.sub('<dob>[^<]*</dob>', lambda _match: labelled_dob, text)

    return text

In [None]:
# Maps a race abbreviation (as used in the note keys) to the full race label
# that is written into the <race> tag.
race_map = {
    'AIAN': 'AMERICAN INDIAN/ALASKA NATIVE',
    'AS': 'ASIAN',
    'AS_AI': 'ASIAN - ASIAN INDIAN',
    'AS_CH': 'ASIAN - CHINESE',
    'AS_SEA': 'ASIAN - SOUTH EAST ASIAN',
    'BL_A': 'BLACK/AFRICAN',
    'BL_AA': 'BLACK/AFRICAN AMERICAN',
    'BL_CV': 'BLACK/CAPE VERDEAN',
    'BL_CI': 'BLACK/CARIBBEAN ISLAND',
    'HL': 'HISPANIC OR LATINO',
    'HL_CO': 'HISPANIC/LATINO - COLUMBIAN',
    'HL_DO': 'HISPANIC/LATINO - DOMINICAN',
    'HL_GU': 'HISPANIC/LATINO - GUATEMALAN',
    'HL_PR': 'HISPANIC/LATINO - PUERTO RICAN',
    'HL_SA': 'HISPANIC/LATINO - SALVADORAN',
    'PT': 'PORTUGUESE',
    'W': 'WHITE',
    'W_BR': 'WHITE - BRAZILIAN',
    'W_EE': 'WHITE - EASTERN EUROPEAN',
    'W_OE': 'WHITE - OTHER EUROPEAN',
    'W_RU': 'WHITE - RUSSIAN',
}


def swap_race(text, race):
    """Return `text` with every <race> label set to the full label for `race`.

    `race` is a race abbreviation and must be a key of `race_map`
    (a KeyError is raised otherwise).
    """
    replacement = f'<race>{race_map[race]}</race>'
    return re.sub('<race>[^<]*</race>', replacement, text)

## Function for removing labels (last step)

In [None]:
def remove_labels(notes):
    """Strip every '<...>' span from each note (in place) and return the dict.

    NOTE(review): the pattern removes anything between '<' and the next '>',
    not only label tags. Free text such as "BP <90 and HR >100" would lose the
    part between the two brackets - confirm the notes contain no such text.
    """
    label = re.compile('<[^>]*>')
    for key, text in notes.items():
        notes[key] = label.sub('', text)
    return notes

## Turn notes into DataFrame with hadm_id column

In [None]:
def get_hadm_id(new_notes):
    """Turn a notes dict into a DataFrame with the columns 'hadm_id' and 'text'.

    Parameters
    ----------
    new_notes : dict
        Maps note key ("<abbrev>_<hadm_id>") to note text.

    Returns
    -------
    pandas.DataFrame
        One row per note, in dict order; 'hadm_id' is the integer after the last
        underscore of the key.

    Raises
    ------
    ValueError
        If the part of a key after the last underscore is not an integer.
    """
    notes_df = pd.DataFrame.from_dict(new_notes, orient = 'index', columns = ['text'])
    notes_df = notes_df.reset_index(names = 'hadm_id')
    # Take the part after the last underscore. This does not depend on which
    # characters occur in the prefix or on the first digit of the id.
    notes_df['hadm_id'] = notes_df['hadm_id'].apply(lambda key: int(key.rsplit('_', 1)[-1]))

    return notes_df

## Create train/test splits by race-gender subgroup

rename columns to 'text' and 'thirty_day_readmission' (if needed)
- e.g. from more specific column names

In [None]:
def rename_col_t_r(df):
    """Rename the two columns of `df` to 'text' and 'thirty_day_readmission'.

    The frame is modified in place and also returned. It must have exactly
    two columns, in the order (text, readmission target).
    """
    text_col, target_col = 'text', 'thirty_day_readmission'
    df.columns = [text_col, target_col]
    return df

#### Create splits directly on the data

In [None]:
# Remember the position of every hadm_id in `notes`; create_splits() sorts by
# this position so the splits are built from the same note order every time.
hadm_order = get_hadm_id(notes.copy())
hadm_order = hadm_order.assign(hadm_index = hadm_order.index)[['hadm_index', 'hadm_id']]

In [None]:
def create_splits(new_notes, eval_target = 'thirty_day_readmission', main_attribute = 'text'):
    """Split the notes 50/50 into train and test within every race-gender subgroup.

    Parameters
    ----------
    new_notes : dict or pandas.DataFrame
        Either a notes dict (key "<abbrev>_<hadm_id>" -> text) or a DataFrame
        that has a 'hadm_id' column and, unless `main_attribute` is 'hadm_id',
        a `main_attribute` column.
    eval_target : str
        Column of `note_info` that becomes the target column of the splits.
    main_attribute : str
        Column that becomes the feature column of the splits.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        Train and test frames with the columns [main_attribute, eval_target].

    Raises
    ------
    TypeError
        If `new_notes` is neither a dict nor a DataFrame.
    ValueError
        If no note can be matched to `hadm_order` / `note_info`.

    Each subgroup ('abbrev') is split separately with a fixed random_state and
    stratified by 'thirty_day_readmission', whatever the evaluation target is.
    """
    # isinstance also accepts subclasses (e.g. OrderedDict).
    if isinstance(new_notes, dict):
        # Put the notes into a DataFrame and add their hadm_id as a column
        notes_df = get_hadm_id(new_notes)
    elif isinstance(new_notes, pd.DataFrame):
        if main_attribute != 'hadm_id':
            notes_df = new_notes[[f'{main_attribute}', 'hadm_id']]
        else:
            notes_df = new_notes['hadm_id']
    else:
        raise TypeError('Note Input has to be a dictionary or pandas DataFrame')

    # Sort into the original order of the notes to ensure reproducibility
    notes_df = pd.merge(notes_df, hadm_order, on = 'hadm_id', how = 'inner')
    notes_df = notes_df.sort_values(by = 'hadm_index').drop('hadm_index', axis = 1)

    # Get info about the evaluation target and the gender-race subgroup. The
    # readmission flag is always needed because it is the stratification variable.
    info_columns = ['hadm_id', f'{eval_target}', 'abbrev']
    if eval_target != 'thirty_day_readmission':
        info_columns.append('thirty_day_readmission')
    notes_df = pd.merge(notes_df, note_info[info_columns], on = 'hadm_id')

    grouped = notes_df.groupby('abbrev')

    # Split every subgroup on its own and collect the pieces; they are
    # concatenated once at the end instead of growing frames inside the loop.
    X_train_parts, X_test_parts, y_train_parts, y_test_parts = [], [], [], []

    for abbrev in grouped.groups:

        subgroup = grouped.get_group(abbrev)

        X = subgroup[f'{main_attribute}']
        y = subgroup[f'{eval_target}']
        strat = subgroup['thirty_day_readmission']

        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=42, stratify = strat)

        X_train_parts.append(X_train)
        X_test_parts.append(X_test)
        y_train_parts.append(y_train)
        y_test_parts.append(y_test)

    if not X_train_parts:
        raise ValueError('No notes could be matched to hadm_order / note_info, nothing to split')

    train = pd.concat([pd.concat(X_train_parts), pd.concat(y_train_parts)], axis = 1)
    test = pd.concat([pd.concat(X_test_parts), pd.concat(y_test_parts)], axis = 1)

    return train, test

#### extract hadm_id identifiers of different splits
- used when information is swapped between the notes
- (swapping only between notes in the test group)

##### Compute the prevalence of true thirty_day_readmission in the train and test split

In [None]:
# Split on the hadm_ids only: the result records which admission ended up in
# which split, together with its readmission flag.
notes_df = notes.copy()

train,test = create_splits(notes_df, eval_target = 'thirty_day_readmission', main_attribute = 'hadm_id')

train['split'] = 'train'
test['split'] = 'test'

prevalence = pd.concat([train, test], axis = 0)

# Create the output directory first; to_csv() does not create missing directories.
os.makedirs('prevalence', exist_ok = True)
prevalence.to_csv('prevalence/prevalence_by_hadm_id_strat.csv', index = False)

##### create separate data structures for the test split 

In [None]:
#filter by split
test_df = prevalence[prevalence['split'] == 'test'].drop('split', axis = 1)
train_df = prevalence[prevalence['split'] == 'train'].drop('split', axis = 1)

#split note_info by split group
note_info_test = pd.merge(test_df, note_info, how = 'inner')
note_info_train = pd.merge(train_df, note_info, how = 'inner')

test_notes = {}

for key in notes:

    hadm_id = int(re.sub('[^2]*_', '', key))

    if hadm_id in list(test_df['hadm_id']):
        test_notes[key] = notes[key]

##### (additional: compute prevalence in each group)

In [None]:
prev_info = pd.merge(note_info, prevalence, on = ['hadm_id', 'thirty_day_readmission'])
prev_info = prev_info[['thirty_day_readmission', 'split', 'gender', 'race_abbrev', 'actual_age']]


def _readmission_rate(rows):
    """Share of rows with thirty_day_readmission == True (None if there are none)."""
    return rows['thirty_day_readmission'].value_counts(normalize = True).get(True)


def _prevalence_record(category, subset):
    """Train and test prevalence of readmission within `subset`, as one result row."""
    return {'category': category,
            'train_prevalence': _readmission_rate(subset[subset['split'] == 'train']),
            'test_prevalence': _readmission_rate(subset[subset['split'] == 'test'])}


# One record per category; collected in a list and turned into a frame at the end.
prev_records = []

genders = ['F', 'M']

for gender in genders:
    prev_records.append(_prevalence_record(gender, prev_info[prev_info['gender'] == gender]))


races = note_info['race_abbrev'].unique()

for race in races:
    prev_records.append(_prevalence_record(race, prev_info[prev_info['race_abbrev'] == race]))


# Inclusive (youngest, oldest) age ranges; the last two overlap with the decades.
age_range = [(19,29),(30,39),(40,49),(50,59),(60,69),(70,79),(80,89),(90,98), (19,50), (74,96)]

for (young, old) in age_range:
    in_range = prev_info[(prev_info['actual_age'] >= young) & (prev_info['actual_age'] <= old)]
    prev_records.append(_prevalence_record(str((young, old)), in_range))


# A plain list has no to_csv(); build the DataFrame before exporting.
prev_df = pd.DataFrame(prev_records, columns = ['category', 'train_prevalence', 'test_prevalence'])

# Create the output directory first; to_csv() does not create missing directories.
os.makedirs('prevalence', exist_ok = True)
prev_df.to_csv('prevalence/prevalence_by_group.csv', index = False)

## Create notes with combination of personal information for readmission evaluation
- /grouped_anonymized

In [None]:
# Every possible on/off combination of the six kinds of personal information.
possible_combinations = list(itertools.product([True, False], repeat = 6))
attributes = ['name', 'address', 'age', 'visitdates', 'gender', 'race']

for combo in possible_combinations:

    # Directory name: the included attributes joined as '_a_b_' (just '_' if none is included).
    key = '_' + ''.join(f'{attribute}_' for attribute, included in zip(attributes, combo) if included)

    # Swap personal info in/out as needed, then drop the labels.
    new_notes = remove_labels(swap_in(notes.copy(), *combo))

    train, test = create_splits(new_notes, eval_target = 'thirty_day_readmission')

    path = f'readmission_prediction/data/{key}'
    if not os.path.exists(path):
        os.makedirs(path)
    train.to_csv(f'readmission_prediction/data/{key}/train.csv', index = False)
    test.to_csv(f'readmission_prediction/data/{key}/test.csv', index = False)

## Create notes for race, gender and age evaluation (/embedding_significance)
- info not given directly
  - every other possible info is given
  - note is anonymized as much as possible 
- info given directly
  - every other possible info is given
  - note is anonymized as much as possible 

In [None]:
possible_combinations = list(itertools.product(['given','no'], ['race', 'gender', 'age'], ['everything', 'nothing']))

dir_names = []

for combo in possible_combinations:

    # given:     'given' -> the attribute is stated in the note, 'no' -> it is anonymized
    # attribute: the attribute that is to be predicted
    # context:   'everything' -> all other info is kept, 'nothing' -> all other info is anonymized
    given, attribute, context = combo

    dir_name = '_'.join(combo)
    dir_names.append(dir_name)

    new_notes = notes.copy()

    # Race and age get an explicit extra entry when they are given; this has to
    # happen before swap_in() changes the start of the note.
    if given == 'given' and attribute == 'race':
        new_notes = add_race(new_notes)
    if given == 'given' and attribute == 'age':
        new_notes = add_age(new_notes)

    # All other kinds of info follow `context`; the target attribute follows `given`.
    keep_rest = context == 'everything'
    flags = {
        'name': keep_rest,
        'address': keep_rest,
        'age': keep_rest,
        'visitdates': keep_rest,
        'gender': keep_rest,
        'race': keep_rest,
    }
    flags[attribute] = given == 'given'

    new_notes = swap_in(new_notes, **flags)
    new_notes = remove_labels(new_notes)

    # Export notes; the age target is stored in the 'actual_age' column.
    eval_target = 'actual_age' if attribute == 'age' else attribute
    train, test = create_splits(new_notes, eval_target = eval_target)

    path = f'attribute_prediction/{attribute}/data/{dir_name}'
    if not os.path.exists(path):
        os.makedirs(path)
    train.to_csv(f'{path}/train.csv', index = False)
    test.to_csv(f'{path}/test.csv', index = False)

## Notes with different gender (/grouped_anonymized/gender_swapping)

In [None]:
# Build the gender-swapping test sets from the notes of the test split.
new_notes = test_notes.copy()

# f / m:     notes under their original keys (original gender)
# f_m / m_f: the same notes under keys with the first letter flipped, so that
#            swap_in() (which reads the gender from key[0]) fills in the
#            gendered terms of the other gender
f = {}
f_m = {}
m = {}
m_f = {}

#split notes into male and female
for key in new_notes:

    if key.startswith('F'):
        #exclude race that doesnt have a male subgroup
        if not key.startswith('F_HL_CO'):
            f[key] = new_notes[key] 
            f_m[f'M{key[1:]}'] = new_notes[key]
    else:
        m[key] = new_notes[key] 
        m_f[f'F{key[1:]}'] = new_notes[key]
            
#prepare the notes
# (all personal info is swapped in; the hadm_id part of the key is unchanged, so
#  the name, address etc. looked up for f_m / m_f are those of the original patient)
f = swap_in(f)
f_m = swap_in(f_m)
m = swap_in(m)
m_f = swap_in(m_f)

# get hadm_id for each note, put out a df
f = get_hadm_id(f)
f_m = get_hadm_id(f_m)
m = get_hadm_id(m)
m_f = get_hadm_id(m_f)

#get additional info about notes
#put notes back together by gender
# (after these merges: text_f / text_m hold the original-gender text,
#  text_f_m / text_m_f the gender-flipped text of the same note)
f = pd.merge(f, note_info[['hadm_id', 'race', 'name', 'address', 'actual_age', 'thirty_day_readmission']], how = 'inner', on = 'hadm_id')
f_complete = pd.merge(f, f_m, how = 'inner', on = 'hadm_id', suffixes = ['_f', '_f_m'])
m = pd.merge(m, note_info[['hadm_id', 'race', 'name', 'address', 'actual_age', 'thirty_day_readmission']], how = 'inner', on = 'hadm_id')
m_complete = pd.merge(m, m_f, how = 'inner', on = 'hadm_id', suffixes = ['_m', '_m_f'])

#add age_rank for each race
# (method='first' breaks ties by order of appearance, so ranks are unique within a race)
f_complete['age_rank'] = f_complete.groupby('race')['actual_age'].transform(lambda x: x.rank(method = 'first'))
m_complete['age_rank'] = m_complete.groupby('race')['actual_age'].transform(lambda x: x.rank(method = 'first'))

#merge genders together by race and age_rank
# (default inner join: a note without a partner of the same race and age rank is dropped;
#  validate raises if a (race, age_rank) pair is not unique on either side)
df = pd.merge(f_complete, m_complete, on = ['race', 'age_rank'], suffixes = ['_f', '_m'], validate = 'one_to_one')

# Column naming below: n = name swapped, a = address swapped, gender = gendered
# terms flipped; the trailing _f / _m is the original gender of the note.
#swap names
df['text_n_f'] = df.apply(lambda x: swap_name(x['text_f'], x['name_m']), axis = 1)
df['text_n_m'] = df.apply(lambda x: swap_name(x['text_m'], x['name_f']), axis = 1)
df['text_n_gender_f'] = df.apply(lambda x: swap_name(x['text_f_m'], x['name_m']), axis = 1)
df['text_n_gender_m'] = df.apply(lambda x: swap_name(x['text_m_f'], x['name_f']), axis = 1)
#swap address
df['text_a_f'] = df.apply(lambda x: swap_address(x['text_f'], x['address_m']), axis = 1)
df['text_a_m'] = df.apply(lambda x: swap_address(x['text_m'], x['address_f']), axis = 1)
df['text_a_gender_f'] = df.apply(lambda x: swap_address(x['text_f_m'], x['address_m']), axis = 1)
df['text_a_gender_m'] = df.apply(lambda x: swap_address(x['text_m_f'], x['address_f']), axis = 1)
#swap names and addresses (use columns with swapped names as a starting point)
df['text_n_a_f'] = df.apply(lambda x: swap_address(x['text_n_f'], x['address_m']), axis = 1)
df['text_n_a_m'] = df.apply(lambda x: swap_address(x['text_n_m'], x['address_f']), axis = 1)
df['text_n_a_gender_f'] = df.apply(lambda x: swap_address(x['text_n_gender_f'], x['address_m']), axis = 1)
df['text_n_a_gender_m'] = df.apply(lambda x: swap_address(x['text_n_gender_m'], x['address_f']), axis = 1)

#remove labels
# (same '<...>' pattern as remove_labels(), applied to every text column)
text_cols = list(filter(lambda x: x.startswith('text'), df.columns))

for col in text_cols:
    df[col] = df.apply(lambda x: re.sub('<[^>]*>', '', x[col]), axis = 1)


# separate into different dataframes
# (each frame pairs one text variant with the readmission flag of the note's original patient)
f = rename_col_t_r(df[['text_f', 'thirty_day_readmission_f']].copy())
f_m_address = rename_col_t_r(df[['text_a_f', 'thirty_day_readmission_f']].copy())
f_m_name = rename_col_t_r(df[['text_n_f', 'thirty_day_readmission_f']].copy())
f_m_name_address = rename_col_t_r(df[['text_n_a_f', 'thirty_day_readmission_f']].copy())
f_m_gender = rename_col_t_r(df[['text_f_m', 'thirty_day_readmission_f']].copy())
f_m_gender_address = rename_col_t_r(df[['text_a_gender_f', 'thirty_day_readmission_f']].copy())
f_m_gender_name = rename_col_t_r(df[['text_n_gender_f', 'thirty_day_readmission_f']].copy())
f_m_gender_name_address = rename_col_t_r(df[['text_n_a_gender_f', 'thirty_day_readmission_f']].copy())

m = rename_col_t_r(df[['text_m', 'thirty_day_readmission_m']].copy())
m_f_address = rename_col_t_r(df[['text_a_m', 'thirty_day_readmission_m']].copy())
m_f_name = rename_col_t_r(df[['text_n_m', 'thirty_day_readmission_m']].copy())
m_f_name_address = rename_col_t_r(df[['text_n_a_m', 'thirty_day_readmission_m']].copy())
m_f_gender = rename_col_t_r(df[['text_m_f', 'thirty_day_readmission_m']].copy())
m_f_gender_address = rename_col_t_r(df[['text_a_gender_m', 'thirty_day_readmission_m']].copy())
m_f_gender_name = rename_col_t_r(df[['text_n_gender_m', 'thirty_day_readmission_m']].copy())
m_f_gender_name_address = rename_col_t_r(df[['text_n_a_gender_m', 'thirty_day_readmission_m']].copy())


# (export name, frame) pairs; the name becomes the csv file name
groups = [('f', f),
            ('f_m_address', f_m_address),
            ('f_m_name', f_m_name),
            ('f_m_name_address', f_m_name_address),
            ('f_m_gender', f_m_gender),
            ('f_m_gender_address', f_m_gender_address),
            ('f_m_gender_name', f_m_gender_name),
            ('f_m_gender_name_address', f_m_gender_name_address),
            ('m', m),
            ('m_f_address', m_f_address),
            ('m_f_name', m_f_name),
            ('m_f_name_address', m_f_name_address),
            ('m_f_gender', m_f_gender),
            ('m_f_gender_address', m_f_gender_address),
            ('m_f_gender_name', m_f_gender_name),
            ('m_f_gender_name_address', m_f_gender_name_address)]


# export name -> frame, kept in memory in addition to the csv export
gender_test_group = {}

for name, group in groups:

    #save in a Test files dictionary
    gender_test_group[name] = group.reset_index(drop = True)

    #export to csv
    path = f'readmission_prediction/gender_swapping/data/'
    if not os.path.exists(path):
        os.makedirs(path)
    group.to_csv(path+f'{name}.csv', index = False)

## Notes with different races (/grouped_anonymized/race_swapping)
- leave out explicit race info, swap names and/or addresses

In [None]:
new_notes = test_notes.copy()

# Prepare the notes: everything is swapped in except the explicit race.
new_notes = swap_in(new_notes, race = False)

# Get additional info for every note
notes_df = get_hadm_id(new_notes)
notes_df = pd.merge(notes_df, note_info[['hadm_id', 'gender', 'race_abbrev', 'actual_age',
                                         'name', 'address', 'thirty_day_readmission']], on = 'hadm_id')

# Loop through all possible race combinations
races = note_info['race_abbrev'].unique()

# Save all test groups for sanity checks (export name -> frame)
race_test_group = {}


def _ranked_by_age(frame, race, gender):
    """Notes of one race-gender subgroup with a unique age rank (ties broken by order)."""
    subgroup = frame[(frame['race_abbrev'] == race) & (frame['gender'] == gender)].copy()
    subgroup['age_rank'] = subgroup['actual_age'].rank(method = 'first')
    return subgroup


def _swap_variants(base, new):
    """Pair base and new notes by age rank and swap name and/or address into the base text.

    Returns a dict {'name', 'address', 'name_address'} -> frame with the columns
    ['text', 'thirty_day_readmission'] (the text and flag of the base note).
    """
    paired = pd.merge(base, new, on = 'age_rank', suffixes = ['', '_new'], validate = 'one_to_one')

    name_text = [swap_name(text, name) for text, name in zip(paired['text'], paired['name_new'])]
    address_text = [swap_address(text, address) for text, address in zip(paired['text'], paired['address_new'])]
    # use the texts with swapped names as a starting point
    name_address_text = [swap_address(text, address) for text, address in zip(name_text, paired['address_new'])]

    reduced = paired[['text', 'thirty_day_readmission']]
    return {'name': reduced.assign(text = name_text),
            'address': reduced.assign(text = address_text),
            'name_address': reduced.assign(text = name_address_text)}


def _strip_labels(frame):
    """Copy of `frame` with every '<...>' span removed from its 'text' column."""
    return frame.assign(text = [re.sub('<[^>]*>', '', text) for text in frame['text']])


path = 'readmission_prediction/race_swapping/data/'
os.makedirs(path, exist_ok = True)

for baseline_race in races:

    # Baseline notes of this race, split by gender and ranked by age.
    base_f = _ranked_by_age(notes_df, baseline_race, 'F')
    # HL_CO has no male subgroup. Use None instead of silently keeping the male
    # notes of the previous baseline race around.
    base_m = _ranked_by_age(notes_df, baseline_race, 'M') if baseline_race != 'HL_CO' else None

    for new_race in races:

        if baseline_race == new_race:

            # Unswapped notes of this race
            group = notes_df[notes_df['race_abbrev'] == baseline_race]
            exports = [(baseline_race, _strip_labels(group[['text', 'thirty_day_readmission']]))]

        else:

            # Swap name and/or address of the new race into the baseline race
            variants = _swap_variants(base_f, _ranked_by_age(notes_df, new_race, 'F'))

            # Do everything again for the male subgroup, but only if BOTH races
            # have one. Checking only new_race would pair the stale male notes
            # of another race whenever the baseline race is HL_CO.
            if base_m is not None and new_race != 'HL_CO':
                male_variants = _swap_variants(base_m, _ranked_by_age(notes_df, new_race, 'M'))
                variants = {suffix: pd.concat([variants[suffix], male_variants[suffix]])
                            for suffix in variants}

            exports = [(f'{baseline_race}_to_{new_race}_{suffix}', _strip_labels(frame))
                       for suffix, frame in variants.items()]

        for name, group in exports:

            # Save in the test files dictionary
            race_test_group[name] = group.reset_index(drop = True)

            # Export to csv
            group.to_csv(path+f'{name}.csv', index = False)

Swap in race explicitly

In [None]:
new_notes = test_notes.copy()

# Prepare the notes: add an explicit race entry, then swap in all personal info.
new_notes = add_race(new_notes)
new_notes = swap_in(new_notes)

# Get additional info for every note
notes_df = get_hadm_id(new_notes)
notes_df = pd.merge(notes_df, note_info[['hadm_id', 'gender', 'race_abbrev', 'actual_age',
                                         'name', 'address', 'thirty_day_readmission']], on = 'hadm_id')

# Loop through all possible race combinations
races = note_info['race_abbrev'].unique()

# Save all test groups for sanity checks (export name -> frame).
# NOTE(review): this starts a fresh dict, so the groups collected by the
# name/address swapping cell are no longer in race_test_group afterwards.
race_test_group = {}

path = 'readmission_prediction/race_swapping/data/'
os.makedirs(path, exist_ok = True)

for baseline_race in races:

    # Notes of the baseline race; the filter does not depend on new_race.
    baseline_group = notes_df[notes_df['race_abbrev'] == baseline_race]

    for new_race in races:

        group = baseline_group.copy()

        if baseline_race != new_race:
            # Overwrite the explicit race with the label of the new race.
            group['text'] = group['text'].apply(lambda text: swap_race(text, new_race))
            name = f'{baseline_race}_to_{new_race}_race'
        else:
            name = f'{baseline_race}_race'

        # Remove labels
        group['text'] = group['text'].apply(lambda text: re.sub('<[^>]*>', '', text))

        group = group[['text','thirty_day_readmission']]

        # Save under the export name. Keying by baseline_race alone made every
        # new_race overwrite the previous entry, so only the last swap survived.
        race_test_group[name] = group.reset_index(drop = True)

        # Export to csv
        group.to_csv(path+f'{name}.csv', index = False)

## Create notes with swapped age in each subgroup (/grouped_anonymized/age_swapping)

In [None]:
# Build complete_age: within every race-gender subgroup the youngest notes are
# paired with the oldest notes, and for each pair the age, name and address
# (and all combinations of them) are swapped in both directions.

#swap in personal info
new_notes = test_notes.copy()
new_notes = swap_in(new_notes)

#get additional info in separate columns
notes_df = get_hadm_id(new_notes)
notes_df = pd.merge(note_info, notes_df, on = 'hadm_id')

#group by race-gender-subgroup 
#rank by age in each subgroup (method = 'first' gives every note a distinct rank, ties broken by row order)
notes_df['age_rank'] = notes_df.groupby('abbrev')['actual_age'].transform(lambda x: x.rank(method = 'first'))
notes_df['age_rank_reverse'] = notes_df.groupby('abbrev')['actual_age'].transform(lambda x: x.rank(method = 'first', ascending = False))

#rank thresholds that select the youngest and the oldest notes of a subgroup
#NOTE(review): 12 and 38 presumably assume 50 notes per subgroup (ranks 1-12 vs. 39-50) - confirm
young_max_rank = 12
old_rank_offset = 38

#loop through each race-gender-subgroup
subgroups = note_info['abbrev'].unique()

age_cols = ['hadm_id_y', 'thirty_day_readmission_y', 'text_y', 'name_y', 'address_y', 'actual_age_y', 'dob_y',
       'hadm_id_o','thirty_day_readmission_o', 'text_o', 'name_o', 'address_o', 'actual_age_o', 'dob_o',
       'text_o_y_age', 'text_o_y_name', 'text_o_y_address',
       'text_o_y_age_name', 'text_o_y_age_address', 'text_o_y_name_address',
       'text_o_y_age_name_address', 'text_y_o_age', 'text_y_o_name',
       'text_y_o_address', 'text_y_o_age_name', 'text_y_o_age_address',
       'text_y_o_name_address', 'text_y_o_age_name_address']

cols = ['age_rank','hadm_id', 'text', 'name', 'address', 'actual_age', 'dob', 'thirty_day_readmission']

#one frame per subgroup, concatenated once after the loop
age_frames = []

for subgroup in subgroups:
    
    #select rows belonging to that subgroup
    subgroup_df = notes_df[notes_df['abbrev'] == subgroup]

    #filter oldest and youngest quarter
    young = subgroup_df.loc[subgroup_df['age_rank'] <= young_max_rank, cols]
    #explicit copy: age_rank is overwritten below and must not touch notes_df
    old = subgroup_df.loc[subgroup_df['age_rank'] > old_rank_offset, cols].copy()

    #adjust old age_rank so that it lines up with the young ranks (1, 2, ...)
    old['age_rank'] = old['age_rank'] - old_rank_offset

    #merge old and young 
    merged = pd.merge(young, old, on = 'age_rank', suffixes = ['_y', '_o'])
    merged = merged.drop(columns = 'age_rank')

    #a subgroup without any young/old pair contributes nothing; the row-wise
    #apply below cannot create a column on an empty frame
    if merged.empty:
        continue

    #swap in both directions: 'o' -> 'y' puts the young partner's info into the
    #old note, 'y' -> 'o' puts the old partner's info into the young note
    for src, dst in [('o', 'y'), ('y', 'o')]:
        base = f'text_{src}'
        swapped = f'text_{src}_{dst}'

        merged[f'{swapped}_age'] = merged.apply(lambda x: swap_age(x[base], age = x[f'actual_age_{dst}'], dob = x[f'dob_{dst}']), axis = 1)
        merged[f'{swapped}_name'] = merged.apply(lambda x: swap_name(x[base], x[f'name_{dst}']), axis = 1)
        merged[f'{swapped}_address'] = merged.apply(lambda x: swap_address(x[base], x[f'address_{dst}']), axis = 1)
        #combinations are built on top of the single swaps above
        merged[f'{swapped}_age_name'] = merged.apply(lambda x: swap_name(x[f'{swapped}_age'], x[f'name_{dst}']), axis = 1)
        merged[f'{swapped}_age_address'] = merged.apply(lambda x: swap_address(x[f'{swapped}_age'], x[f'address_{dst}']), axis = 1)
        merged[f'{swapped}_name_address'] = merged.apply(lambda x: swap_address(x[f'{swapped}_name'], x[f'address_{dst}']), axis = 1)
        merged[f'{swapped}_age_name_address'] = merged.apply(lambda x: swap_address(x[f'{swapped}_age_name'], x[f'address_{dst}']), axis = 1)

    #remove labels
    text_cols = [col for col in merged.columns if col.startswith('text')]
    for col in text_cols:
        merged[col] = merged[col].map(lambda text: re.sub('<[^>]*>', '', text))
        
    age_frames.append(merged)

#single concat instead of growing the frame inside the loop; columns are put
#into the age_cols order
if age_frames:
    complete_age = pd.concat(age_frames)[age_cols]
else:
    complete_age = pd.DataFrame(columns = age_cols)
    

# (file name, two-column frame) pairs for the age-swapping export.
# Per direction: first the notes as they are ('old' / 'young'), then one entry per
# swapped attribute combination. The readmission label always comes from the note
# that provides the text.
age_variants = ['age', 'name', 'address', 'age_name', 'age_address', 'name_address', 'age_name_address']

groups = []
for src_name, dst_name, src, dst in [('old', 'young', 'o', 'y'), ('young', 'old', 'y', 'o')]:
    label_col = f'thirty_day_readmission_{src}'

    #notes without any swap
    groups.append((src_name, rename_col_t_r(complete_age[[f'text_{src}', label_col]])))

    #notes with the partner's attributes swapped in
    for variant in age_variants:
        groups.append((f'{src_name}_{dst_name}_{variant}',
                       rename_col_t_r(complete_age[[f'text_{src}_{dst}_{variant}', label_col]])))



#all exported age-swapping groups, keyed by file name (kept for sanity checks)
age_test_group = {}

#output folder is the same for every group: create it once, without a separate exists-check
path = 'readmission_prediction/age_swapping/data/'
os.makedirs(path, exist_ok = True)

for name, group in groups:

    #save in a Test files dictionary
    age_test_group[name] = group.reset_index(drop = True)

    #export to csv
    group.to_csv(path+f'{name}.csv', index = False)

## Create notes with shifted age (/grouped_anonymized/age_shifting)

#### Filter for notes that allow a plausible age shift (inside the age range of the dataset)
- only shift to ages the model has seen before

In [None]:
# minimum, quartiles and maximum of the age in the test notes
note_info_test['actual_age'].quantile([0,0.25,0.5,0.75,1])

In [None]:
# minimum and maximum age in the training notes
note_info_train['actual_age'].quantile([0,1])

#### Create notes with a shifted age

In [None]:
# Build age_shift: test notes inside the allowed age window, with one extra text
# column per age shift (+10, +20, -10, -20 years).
age_shift_notes = test_notes.copy()
age_shift_notes = swap_in(age_shift_notes) 
age_shift_notes = get_hadm_id(age_shift_notes)

#age window (inclusive) of the notes that get shifted
#NOTE(review): 39 and 78 presumably keep a shift of +/-20 years inside the age range
#of the training notes - confirm against the quantiles shown above
min_shift_age = 39
max_shift_age = 78
age_shift_info = note_info_test[(note_info_test['actual_age']>=min_shift_age) & (note_info_test['actual_age']<=max_shift_age)]

age_shift = pd.merge(age_shift_notes, age_shift_info, on = 'hadm_id', how = 'inner')

#shift age in the text
#dob moves by the same amount in the opposite direction
#(assumes dob is numeric, e.g. a birth year - TODO confirm)
for shift in [10, 20, -10, -20]:
    shift_col = f'text_plus_{shift}' if shift > 0 else f'text_minus_{-shift}'
    age_shift[shift_col] = age_shift.apply(lambda x: swap_age(x['text'], age = x['actual_age']+shift, dob = x['dob']-shift), axis = 1)

#remove labels
text_cols = [col for col in age_shift.columns if col.startswith('text')]
for col in text_cols:
    age_shift[col] = age_shift[col].map(lambda text: re.sub('<[^>]*>', '', text))


#(file name, two-column frame) pairs: the unshifted notes first, then one entry per shift
groups = [('age', age_shift[['text', 'thirty_day_readmission']])]
groups += [(f'age_{suffix}', rename_col_t_r(age_shift[[f'text_{suffix}', 'thirty_day_readmission']]))
           for suffix in ['plus_10', 'plus_20', 'minus_10', 'minus_20']]


#all exported age-shifting groups, keyed by file name (kept for sanity checks)
age_shift_test_group = {}

#output folder is the same for every group: create it once, without a separate exists-check
path = 'readmission_prediction/age_shifting/data/'
os.makedirs(path, exist_ok = True)

for name, group in groups:

    #save in a Test files dictionary
    age_shift_test_group[name] = group.reset_index(drop = True)

    #export to csv
    group.to_csv(path+f'{name}.csv', index = False)

In [None]:
# sanity check: (rows, columns) of the 'old' group stored by the age-swapping export
age_test_group['old'].shape

In [None]:
# sanity check: (rows, columns) of the test notes with an age below 39 or above 78,
# i.e. outside the window used to build age_shift_info
note_info_test[(note_info_test['actual_age']<39) | (note_info_test['actual_age']>78)].shape

In [None]:
# sanity check: (rows, columns) of the test notes selected for age shifting
age_shift_info.shape