In [None]:
import pandas as pd
from pathlib import Path
import json
import matplotlib.pyplot as plt
import seaborn as sns

## Define filepaths for output directory

In [None]:
# Output directory of every experiment, keyed by a short experiment name.
filepath = {
    'anonymized':  Path('readmission_prediction/outputs/'),
    'gender': Path('attribute_prediction/gender/outputs/'), 
    'race':  Path('attribute_prediction/race/outputs/'),
    'age': Path('attribute_prediction/age/outputs/'),
    'gender_swapping_anonModel': Path('readmission_prediction/gender_swapping/outputs/anonymized'),
    'gender_swapping_fullModel': Path('readmission_prediction/gender_swapping/outputs/full_info'),
    'race_swapping_anonModel': Path('readmission_prediction/race_swapping/outputs/anonymized'),
    'race_swapping_fullModel': Path('readmission_prediction/race_swapping/outputs/full_info'),
    'age_swapping_anonModel': Path('readmission_prediction/age_swapping/outputs/anonymized'),
    'age_swapping_fullModel': Path('readmission_prediction/age_swapping/outputs/full_info'),
    'age_shifting_anonModel': Path('readmission_prediction/age_shifting/outputs/anonymized'),
    'age_shifting_fullModel': Path('readmission_prediction/age_shifting/outputs/full_info'),
}
# Path.glob yields entries in arbitrary, filesystem-dependent order; sorting keeps
# the row order of every table built from these files reproducible across machines.
output_files = {name: sorted(directory.glob('*.json')) for name, directory in filepath.items()}

In [None]:
#output_files

## Read output into a list of dicts

In [None]:
# Parse every result file; the list for an experiment follows the order of
# output_files[experiment], so names and contents can be zipped later.
output_contents = {
    experiment: [json.loads(result_file.read_text()) for result_file in result_files]
    for experiment, result_files in output_files.items()
}

#### save name of each output

In [None]:
def multi_remove(string, removal_list):
    """Reduce a result-file path to a bare run name.

    Parameters
    ----------
    string : str
        Path of a result file, e.g. ``'some/dir/run_a.json'``.
    removal_list : iterable
        Directory prefixes (``Path`` or ``str``) to strip from ``string``.
        Each is removed together with its trailing path separator.

    Returns
    -------
    str
        ``string`` without the ``.json`` extension and without any of the
        given directory prefixes.
    """
    # Strip only the extension; a plain .replace() would also eat a '.json'
    # that happens to appear in the middle of the name.
    if string.endswith('.json'):
        string = string[:-len('.json')]

    for text in removal_list:
        prefix = str(text)
        # str(Path) uses '\\' on Windows and '/' elsewhere; accept both.
        for separator in ('/', '\\'):
            string = string.replace(prefix + separator, '')

    return string
    

# Short run name for every result file, in the same order as output_files.
output_names = {
    experiment: [multi_remove(str(result_file), filepath.values()) for result_file in result_files]
    for experiment, result_files in output_files.items()
}

In [None]:
# experiment -> {run name -> parsed JSON content}
output = {
    experiment: dict(zip(run_names, output_contents[experiment]))
    for experiment, run_names in output_names.items()
}

In [None]:
#output.keys()
#output['filepath_anonymized']
#output['anonymized']['_address_age_gender_8b_cv5']['metrics']

## Create DataFrame from output maps

In [None]:
# Long-format table per experiment: one row per (run, metric), with one column
# per key of that metric's dict (e.g. 'train', 'test', ...).
output_structured = {}

for key, runs in output.items():

    structured = []

    for file, run in runs.items():

        metrics = run['metrics']

        # Some runs additionally report a top-level 'mean_test_proba'; expose it
        # as a pseudo-metric. An explicit key check replaces the former bare
        # `except: pass`, which would also have hidden unrelated errors.
        if 'mean_test_proba' in run:
            metrics['mean_proba'] = {'test': run['mean_test_proba']}

        for metric, splits in metrics.items():
            row = {'file': file, 'metric': metric}
            row.update(splits)
            structured.append(row)

    structured = pd.DataFrame.from_records(structured)
    output_structured[key] = structured

# grouped_anonymized

## Make filenames easier to read

In [None]:
# Filename token -> human-readable attribute label (dict order = output order).
personal_info_map = {'address': 'Address', 
                     'age': 'Age', 
                     'gender': 'Gender', 
                     'name': 'Name', 
                     'visitdates': 'Admission and Discharge Time', 
                     'race': 'Race'}

def make_readable(filename):
    """Translate a run name such as ``'_address_age_8b_cv5'`` into ``'Address, Age'``.

    Parameters
    ----------
    filename : str
        Underscore-separated run name. The tokens ``8b`` and ``cv5`` are ignored.

    Returns
    -------
    str
        Comma-separated labels of the attributes named in ``filename`` (in the
        order of ``personal_info_map``), ``"NONE"`` when the name holds nothing
        besides the ignored tokens, or ``''`` when no token is recognised.
        A value that is already in readable form is returned unchanged, so
        re-running the cell that applies this function is harmless.
    """
    # Idempotence: leave already-translated values alone.
    readable_labels = set(personal_info_map.values())
    if filename == "NONE" or all(part in readable_labels for part in filename.split(', ')):
        return filename

    # Match whole tokens; substring matching would report e.g. 'age' inside 'image'.
    tokens = [token for token in filename.split('_') if token not in ('', '8b', 'cv5')]

    if not tokens:
        return "NONE"

    return ', '.join(label for key, label in personal_info_map.items() if key in tokens)

In [None]:
# Replace raw run names by readable attribute lists (in place on the shared table).
output_structured['anonymized']['file'] = output_structured['anonymized']['file'].map(make_readable)

## Separate by evaluation metric

In [None]:
# One independent frame per metric. The explicit .copy() matters: these frames
# get new columns later on, which must not be written into a view of
# output_structured['anonymized'] (SettingWithCopyWarning / ambiguous result).
anonymized_auroc, anonymized_auprc, anonymized_brier = (
    output_structured['anonymized']
    .loc[output_structured['anonymized']['metric'] == metric_name,
         ['metric', 'file', 'train', 'test', 'cv_mean', 'cv_std']]
    .copy()
    for metric_name in ('auroc', 'auprc', 'brier')
)

In [None]:
# Add one boolean column per attribute: True when the readable run name lists it.
# regex=False because the labels are literal text, and .assign() builds a new
# frame instead of writing into a filtered slice.
for attribute in ['Address', 'Age', 'Gender', 'Name', 'Admission and Discharge Time', 'Race']:
    anonymized_auroc = anonymized_auroc.assign(
        **{attribute: anonymized_auroc['file'].str.contains(attribute, regex=False)}
    )

In [None]:
# Per-attribute subsets of the AUROC table, each tagged with the attribute it was
# selected for. A run that includes several attributes appears in several subsets.
anon_address = anonymized_auroc[anonymized_auroc['Address'].eq(True)].assign(source='Address')
anon_age = anonymized_auroc[anonymized_auroc['Age'].eq(True)].assign(source='Age')
anon_gender = anonymized_auroc[anonymized_auroc['Gender'].eq(True)].assign(source='Gender')
anon_name = anonymized_auroc[anonymized_auroc['Name'].eq(True)].assign(source='Name')
anon_time = anonymized_auroc[anonymized_auroc['Admission and Discharge Time'].eq(True)].assign(source='Time')
anon_race = anonymized_auroc[anonymized_auroc['Race'].eq(True)].assign(source='Race')

anon_concat = pd.concat(
    [anon_address, anon_age, anon_gender, anon_name, anon_time, anon_race]
)

In [None]:
# Test AUROC of every run, grouped on the x-axis by the attribute the run was
# selected for ('source'); runs with several attributes show up once per attribute.
sns.scatterplot(
    data=anon_concat,
    x='source',
    y='test'
)
plt.title('Distribution comparison across datasets')
plt.show()

In [None]:
# Kernel density estimate of the test AUROC, one curve per 'source' attribute.
sns.kdeplot(
    data = anon_concat,
    x = 'test',
    hue = 'source',
)
plt.title('Distribution comparison across datasets')
plt.show()

In [None]:
# Sort each metric table ascending by test score and keep only the report columns
# (this drops the per-attribute flag columns added to the AUROC table).
sorted_anonymized_auroc, sorted_anonymized_auprc, sorted_anonymized_brier = (
    metric_table.sort_values(by='test').loc[:, ['metric', 'file', 'train', 'test', 'cv_mean', 'cv_std']]
    for metric_table in (anonymized_auroc, anonymized_auprc, anonymized_brier)
)

In [None]:
# First / last row of each ascending-sorted table = lowest / highest test score.
sorted_anonymized_auroc_min, sorted_anonymized_auroc_max = (
    sorted_anonymized_auroc.iloc[0], sorted_anonymized_auroc.iloc[-1]
)
sorted_anonymized_auprc_min, sorted_anonymized_auprc_max = (
    sorted_anonymized_auprc.iloc[0], sorted_anonymized_auprc.iloc[-1]
)
sorted_anonymized_brier_min, sorted_anonymized_brier_max = (
    sorted_anonymized_brier.iloc[0], sorted_anonymized_brier.iloc[-1]
)

# Stack the six rows into one table (Series become columns, then transpose).
min_max_anon = pd.concat(
    [
        sorted_anonymized_auroc_min, sorted_anonymized_auroc_max,
        sorted_anonymized_auprc_min, sorted_anonymized_auprc_max,
        sorted_anonymized_brier_min, sorted_anonymized_brier_max,
    ],
    axis=1,
).T

In [None]:
print(min_max_anon.to_latex(index=False, float_format='%.3f'))

In [None]:
# Keep only the two reference runs (all attributes / no attributes) and give them
# short model names. Built as a new frame via .assign() so nothing is written
# into a filtered slice of output_structured['anonymized'].
models = (
    output_structured['anonymized']
    .loc[lambda frame: frame['file'].isin(
        ['Address, Age, Gender, Name, Admission and Discharge Time, Race', 'NONE'])]
    .assign(file=lambda frame: frame['file'].replace({
        'Address, Age, Gender, Name, Admission and Discharge Time, Race': 'full_model',
        'NONE': 'anon_model',
    }))
)

In [None]:
print(models.sort_values(by='metric')[['metric', 'file', 'train', 'test', 'cv_mean', 'cv_std']].to_latex(index = False, float_format='%.3f'))

In [None]:
# Runs that include exactly one personal attribute.
anon_one = sorted_anonymized_auroc.loc[
    sorted_anonymized_auroc['file'].isin(
        ['Address', 'Age', 'Gender', 'Name', 'Admission and Discharge Time', 'Race']
    )
]

In [None]:
print(anon_one[['file', 'test']].to_latex())

#### Influence of each personal info

In [None]:
# Rank of each run within its table (1 = first row). The rank range is derived
# from the table length instead of a hard-coded 64, so the cell keeps working
# when the number of result files changes.
for ranked_results in (sorted_anonymized_auroc, sorted_anonymized_auprc, sorted_anonymized_brier):
    ranked_results['influence'] = list(range(1, len(ranked_results) + 1))

In [None]:
# For every attribute: sum of the ranks ('influence') of all runs whose readable
# name mentions that attribute, computed separately per metric table.
auroc_influence = {
    attribute: sorted_anonymized_auroc.loc[
        sorted_anonymized_auroc['file'].str.contains(attribute), 'influence'
    ].sum()
    for attribute in ['Address', 'Age', 'Gender', 'Name', 'Admission and Discharge Time', 'Race']
}

auprc_influence = {
    attribute: sorted_anonymized_auprc.loc[
        sorted_anonymized_auprc['file'].str.contains(attribute), 'influence'
    ].sum()
    for attribute in ['Address', 'Age', 'Gender', 'Name', 'Admission and Discharge Time', 'Race']
}

brier_influence = {
    attribute: sorted_anonymized_brier.loc[
        sorted_anonymized_brier['file'].str.contains(attribute), 'influence'
    ].sum()
    for attribute in ['Address', 'Age', 'Gender', 'Name', 'Admission and Discharge Time', 'Race']
}

In [None]:
print(sorted(auroc_influence.items(), key=lambda x: x[1]))
print(sorted(auprc_influence.items(), key=lambda x: x[1]))
print(sorted(brier_influence.items(), key=lambda x: x[1], reverse = True))

# embedding_significance/gender

## Make file names easier to read

In [None]:
# Dataset-variant name -> description of the information given to the model.
gender_info_map = {
    'no_gender_everything': 'other_info', 
    'given_gender_everything': 'gender_info, other_info', 
    'no_gender_nothing': 'no_info',
    'given_gender_nothing': 'gender_info'
    }

def make_readable_gender(filename):
    """Translate a gender-prediction run name into a readable information label.

    Parameters
    ----------
    filename : str
        Run name such as ``'no_gender_nothing_8b'``; the ``'_8b'`` part is ignored.

    Returns
    -------
    str
        The label(s) from ``gender_info_map`` whose key occurs in ``filename``,
        joined by ``', '``; ``"NONE"`` if nothing is left after removing
        ``'_8b'``; ``''`` if no key matches. A value that already is a readable
        label (or ``"NONE"``) is returned unchanged, so applying the function a
        second time does not wipe the column.
    """
    # Idempotence: leave already-translated values alone.
    if filename == "NONE" or filename in gender_info_map.values():
        return filename

    filename = filename.replace('_8b', '')

    if filename == '':
        return "NONE"

    return ', '.join(label for key, label in gender_info_map.items() if key in filename)

In [None]:
# Replace raw run names by readable information labels (in place on the shared table).
output_structured['gender']['file'] = output_structured['gender']['file'].map(make_readable_gender)

## Separate by evaluation metric

In [None]:
# One table per evaluation metric for the gender-prediction runs.
gender_auroc, gender_auprc, gender_brier = (
    output_structured['gender'].loc[
        output_structured['gender']['metric'] == metric_name,
        ['file', 'train', 'test', 'cv_mean', 'cv_std'],
    ]
    for metric_name in ('auroc', 'auprc', 'brier')
)

In [None]:
gender_auroc

In [None]:
print(gender_auroc.sort_values(by='test').to_latex(index = False, float_format='%.3f'))

In [None]:
gender_auprc.sort_values(by='test')

In [None]:
gender_brier.sort_values(by='test')

# embedding_significance/race

## Make filenames easier to read

In [None]:
# Dataset-variant name -> description of the information given to the model.
race_info_map = {
    'no_race_everything': 'other_info', 
    'no_race_nothing': 'no_info', 
    'given_race_everything': 'race_info, other_info',
    'given_race_nothing': 'race_info'
    }

def make_readable_race(filename):
    """Translate a race-prediction run name into a readable information label.

    Parameters
    ----------
    filename : str
        Run name such as ``'no_race_nothing_8b'``; the ``'_8b'`` part is ignored.

    Returns
    -------
    str
        The label(s) from ``race_info_map`` whose key occurs in ``filename``,
        joined by ``', '``; ``"NONE"`` if nothing is left after removing
        ``'_8b'``; ``''`` if no key matches. A value that already is a readable
        label (or ``"NONE"``) is returned unchanged, so applying the function a
        second time does not wipe the column.
    """
    # Idempotence: leave already-translated values alone.
    if filename == "NONE" or filename in race_info_map.values():
        return filename

    filename = filename.replace('_8b', '')

    if filename == '':
        return "NONE"

    return ', '.join(label for key, label in race_info_map.items() if key in filename)

In [None]:
# Replace raw run names by readable information labels (in place on the shared table).
output_structured['race']['file'] = output_structured['race']['file'].map(make_readable_race)

## Separate by evaluation metric

In [None]:
# One table per evaluation metric for the race-prediction runs.
race_auroc, race_brier, race_auprc = (
    output_structured['race'].loc[
        output_structured['race']['metric'] == metric_name,
        ['file', 'train', 'test', 'cv_mean', 'cv_std'],
    ]
    for metric_name in ('auroc', 'brier', 'auprc')
)

In [None]:
race_auprc.sort_values(by='test')

In [None]:
print(race_auroc.sort_values(by='test').to_latex(index = False, float_format='%.3f'))

## AUROC by group

Sort by subgroup

In [None]:
choose_metric = (output_structured['race']['metric'] == 'auroc_per_class')
race_auroc_per_class = output_structured['race'][choose_metric][['file', 'train', 'test']]


def _per_class_frame(information):
    """Expand the per-class AUROC values of one run into a table.

    Takes the first row of ``race_auroc_per_class`` whose 'file' equals
    ``information`` and returns a DataFrame with one row per class:
    'file' holds the class index (0, 1, ...), 'train' / 'test' the value stored
    for that class. The number of classes is taken from the data rather than
    being fixed at 21.
    Raises IndexError when no run with that label exists.
    """
    run = race_auroc_per_class[race_auroc_per_class['file'] == information]
    # assumes each cell holds one sequence with a value per class -- TODO confirm
    train_scores = run['train'].tolist()[0]
    test_scores = run['test'].tolist()[0]
    return pd.DataFrame({
        'file': list(range(len(test_scores))),
        'train': train_scores,
        'test': test_scores,
    })


no_info = _per_class_frame('no_info')
other_info = _per_class_frame('other_info')
race_info = _per_class_frame('race_info')
full_info = _per_class_frame('race_info, other_info')

Load class mapping for races

In [None]:
race_file_path = Path('attribute_prediction/race/artifacts')
# Materialised and sorted: glob order is arbitrary and a bare generator could only
# be iterated once.
race_file_list = sorted(race_file_path.glob('**/*metadata.json'))

# csv path (relative to the race data directory) -> label mapping of that dataset
race_map = {}

for metadata_path in race_file_list:

    # Distinct names for path and handle; the handle used to shadow the loop variable.
    with open(metadata_path) as metadata_file:
        contents = json.load(metadata_file)
    path = contents['csv_path'].replace('attribute_prediction/race/data', '')
    mapping = contents['label_map']

    race_map[path] = mapping

race_map Sanity Check - Everything has the same label mapping

In [None]:
# Train and test split of every dataset variant must share one label mapping.
# (The third check used to repeat 'given_race_nothing', so
# 'given_race_everything' was never compared.)
for variant in ('given_race_nothing', 'no_race_everything', 'given_race_everything', 'no_race_nothing'):
    print(race_map[f'/{variant}/test.csv'] == race_map[f'/{variant}/train.csv'])

# The mapping must also agree across dataset variants.
for first_variant, second_variant in (
    ('no_race_nothing', 'no_race_everything'),
    ('given_race_nothing', 'given_race_everything'),
    ('no_race_nothing', 'given_race_everything'),
):
    print(race_map[f'/{first_variant}/test.csv'] == race_map[f'/{second_variant}/test.csv'])

In [None]:
# Label mapping as a two-column table: 'race' (the dict key) and 'file' (the dict
# value); the value column is named 'file' so it can be merged with the per-class tables.
map_race = (
    pd.DataFrame
    .from_dict(race_map['/given_race_nothing/test.csv'], orient='index', columns=['file'])
    .reset_index(names='race')
)

Merge the DataFrames and map the class number to its race label

In [None]:
def _label_race_classes(per_class_scores, information):
    """Attach race labels to a per-class score table.

    Merges ``per_class_scores`` with ``map_race`` on the class index ('file'),
    sorts by race label, keeps 'race', 'train', 'test' and adds an 'information'
    column holding ``information``. The label is assigned as a scalar, so the
    result does not depend on there being exactly 21 classes.
    """
    return (
        pd.merge(map_race, per_class_scores, on = 'file')
        .sort_values(by = 'race')
        [['race', 'train', 'test']]
        .assign(information = information)
    )


sorted_other_info = _label_race_classes(other_info, 'other_info')
sorted_no_info = _label_race_classes(no_info, 'no_info')
sorted_full_info = _label_race_classes(full_info, 'race_info, other_info')
sorted_race_info = _label_race_classes(race_info, 'race_info')

In [None]:
race_grouped = pd.concat([sorted_other_info, sorted_no_info, sorted_full_info, sorted_race_info])

In [None]:
# Wide table: one row per race, one column per (information setting, split).
# All score columns start out as None and are filled in by the loop below.
race_structured = pd.DataFrame({'race': race_grouped['race'].unique(), 
                                'race_info, other_info [TRAIN]': None,
                                'race_info [TRAIN]': None,
                                'other_info [TRAIN]': None, 
                                'no_info [TRAIN]': None, 
                                'race_info, other_info [TEST]': None,
                                'race_info [TEST]': None,
                                'other_info [TEST]': None, 
                                'no_info [TEST]': None,
                               })

# Copy the train/test value of every (race, information) pair into its cell.
# NOTE(review): assumes exactly one row per (race, information) pair in
# race_grouped; confirm that duplicates cannot occur.
for race in race_grouped['race'].unique():
    for information in race_grouped['information'].unique():
        row_filter = (race_grouped['information'] == information) & (race_grouped['race'] == race)
        race_structured.loc[race_structured['race'] == race, f'{information} [TRAIN]'] = race_grouped[row_filter]['train'].values
        race_structured.loc[race_structured['race'] == race, f'{information} [TEST]'] = race_grouped[row_filter]['test'].values

In [None]:
race_structured

In [None]:
#sorted_no_info.drop('information', axis = 1)

# embedding_significance/age

#### Make filenames easier to read

In [None]:
# Dataset-variant name -> description of the information given to the model.
age_info_map = {
    'no_age_everything': 'other_info', 
    'given_age_everything': 'age_info, other_info', 
    'no_age_nothing': 'no_info',
    'given_age_nothing': 'age_info'
    }

def make_readable_age(filename):
    """Translate an age-prediction run name into a readable information label.

    Parameters
    ----------
    filename : str
        Run name such as ``'no_age_nothing_8b'``; the ``'_8b'`` part is ignored.

    Returns
    -------
    str
        The label(s) from ``age_info_map`` whose key occurs in ``filename``,
        joined by ``', '``; ``"NONE"`` if nothing is left after removing
        ``'_8b'``; ``''`` if no key matches. A value that already is a readable
        label (or ``"NONE"``) is returned unchanged, so applying the function a
        second time does not wipe the column.
    """
    # Idempotence: leave already-translated values alone.
    if filename == "NONE" or filename in age_info_map.values():
        return filename

    filename = filename.replace('_8b', '')

    if filename == '':
        return "NONE"

    return ', '.join(label for key, label in age_info_map.items() if key in filename)

In [None]:
# Replace raw run names by readable information labels (in place on the shared table).
output_structured['age']['file'] = output_structured['age']['file'].map(make_readable_age)

#### r2 by group

In [None]:
age_metrics = output_structured['age'].sort_values(by = 'test')

In [None]:
print(age_metrics.sort_values(by='test').to_latex(index = False, float_format='%.3f'))

#### Plot true vs. predicted age

In [None]:
# Readable information label -> 'pred_vs_true' payload of the matching age run.
pred_true_by_group = {
    information: output['age'][run_name]['pred_vs_true']
    for information, run_name in [
        ('no_info', 'no_age_nothing_8b'),
        ('other_info', 'no_age_everything_8b'),
        ('age_info', 'given_age_nothing_8b'),
        ('age_info, other_info', 'given_age_everything_8b'),
    ]
}

In [None]:
# One scatter plot of true vs. predicted age per information setting (test split
# only; a disabled variant of this cell drew the same plot from the
# 'y_train_pred' / 'y_train_true' entries).
for group in pred_true_by_group:

    test_pred = pred_true_by_group[group]['y_test_pred']
    test_true = pred_true_by_group[group]['y_test_true']

    sns.scatterplot(x = test_true, y = test_pred)
    plt.suptitle(f"{group}: age in test group")
    plt.xlabel("true age [years]")
    plt.ylabel("predicted age [years]")

    # Identity line spanning the full value range. min()/max() of the two
    # sequences themselves compares them lexicographically and returns a whole
    # sequence, which made plt.plot draw one segment per sample instead of a
    # single diagonal.
    lower = min(min(test_true), min(test_pred))
    upper = max(max(test_true), max(test_pred))
    plt.plot([lower, upper], [lower, upper], color = 'black')
    plt.show()

# grouped_anonymized/gender_swapping

#### Load gender_swapping outputs

In [None]:
# Independent copies of the gender-swapping tables, with the trailing 8 characters
# (the model part of the run name) cut off 'file' so that runs of both models
# share the same key.
gs_anon = output_structured['gender_swapping_anonModel'].assign(
    file=lambda frame: frame['file'].map(lambda run_name: run_name[:-8])
)
gs_full = output_structured['gender_swapping_fullModel'].assign(
    file=lambda frame: frame['file'].map(lambda run_name: run_name[:-8])
)

In [None]:
output_structured['gender_swapping_anonModel']

#### Put each metric into its own column

In [None]:
# Split each table into one frame per metric (the 'metric' column is dropped) ...
gs_anon_mean_proba = gs_anon.loc[gs_anon['metric'] == 'mean_proba'].drop(columns='metric')
gs_anon_auroc = gs_anon.loc[gs_anon['metric'] == 'auroc'].drop(columns='metric')
gs_anon_brier = gs_anon.loc[gs_anon['metric'] == 'brier'].drop(columns='metric')
gs_anon_auprc = gs_anon.loc[gs_anon['metric'] == 'auprc'].drop(columns='metric')

gs_full_mean_proba = gs_full.loc[gs_full['metric'] == 'mean_proba'].drop(columns='metric')
gs_full_auroc = gs_full.loc[gs_full['metric'] == 'auroc'].drop(columns='metric')
gs_full_brier = gs_full.loc[gs_full['metric'] == 'brier'].drop(columns='metric')
gs_full_auprc = gs_full.loc[gs_full['metric'] == 'auprc'].drop(columns='metric')

# ... and join them back on 'file', so every metric gets its own suffixed columns.
# The brier columns stay unsuffixed in the second merge and receive their suffix
# in the third one, when they collide with the auprc columns.
gs_anon = (
    gs_anon_mean_proba
    .merge(gs_anon_auroc, on='file', suffixes=['_mean_proba', '_auroc'])
    .merge(gs_anon_brier, on='file')
    .merge(gs_anon_auprc, on='file', suffixes=['_brier', '_auprc'])
)

gs_full = (
    gs_full_mean_proba
    .merge(gs_full_auroc, on='file', suffixes=['_mean_proba', '_auroc'])
    .merge(gs_full_brier, on='file')
    .merge(gs_full_auprc, on='file', suffixes=['_brier', '_auprc'])
)

In [None]:
#gs_full

#### Merge the models by their swapping categories

In [None]:
gs = pd.merge(gs_anon, gs_full, on = 'file', suffixes = ['_anonModel', '_fullModel'])

#### Split by original gender

In [None]:
# Runs whose name starts with 'f' / 'm' respectively, as independent copies.
gs_f = gs.loc[gs['file'].str.startswith('f')].copy()
gs_m = gs.loc[gs['file'].str.startswith('m')].copy()

#### Compute the delta of each test metric (mean_proba, auroc, brier, auprc) relative to the unswapped baseline run

In [None]:
# For every metric and model: difference between each run's test value and the
# value of the unswapped baseline run ('f' for gs_f, 'm' for gs_m).
for metric in ['mean_proba', 'auroc', 'brier', 'auprc']:

    for model in ('anonModel', 'fullModel'):

        test_column = f'test_{metric}_{model}'

        for swap_runs, baseline_name in ((gs_f, 'f'), (gs_m, 'm')):

            baseline = swap_runs.loc[swap_runs['file'] == baseline_name, test_column]

            # The baseline must be a single scalar. Subtracting the one-row
            # Series inside Series.apply (as before) yields a DataFrame and only
            # works by accident when exactly one baseline row exists.
            if len(baseline) != 1:
                raise ValueError(
                    f"expected exactly one baseline run '{baseline_name}', found {len(baseline)}"
                )

            swap_runs[f'delta_{metric}_{model}'] = swap_runs[test_column] - baseline.iloc[0]

In [None]:
# Recombine both genders and keep 'file' plus the two delta_mean_proba columns;
# the baseline runs 'f' and 'm' are dropped.
gs = pd.concat([gs_f, gs_m])
gs_cols = [col for col in gs.columns if 'delta_mean_proba' in col or col == 'file']
gs_delta_proba = gs.loc[~gs['file'].isin(['f','m']), gs_cols]
# NOTE(review): positional rename -- assumes the selected columns come in the
# order file, anonModel delta, fullModel delta; confirm against gs.columns.
gs_delta_proba.columns = ['file', 'anonModel', 'fullModel']

# create MultiIndex columns with 'f_to_m' and 'm_to_f' first
# Each new column only receives values for rows with the matching name prefix;
# all other rows get NaN through index alignment.
gs_delta_proba[('f_to_m', 'anonModel')] = gs_delta_proba.loc[gs_delta_proba['file'].str.startswith('f_m'), 'anonModel']
gs_delta_proba[('m_to_f', 'anonModel')] = gs_delta_proba.loc[gs_delta_proba['file'].str.startswith('m_f'), 'anonModel']
gs_delta_proba[('f_to_m', 'fullModel')] = gs_delta_proba.loc[gs_delta_proba['file'].str.startswith('f_m'), 'fullModel']
gs_delta_proba[('m_to_f', 'fullModel')] = gs_delta_proba.loc[gs_delta_proba['file'].str.startswith('m_f'), 'fullModel']

# Drop the first four characters of the run name (the swap-direction prefix), so
# both directions of one attribute set share the same 'file' value.
gs_delta_proba['file'] = gs_delta_proba['file'].str[4:]
gs_delta_proba = gs_delta_proba[['file', 
                                 ('f_to_m', 'anonModel'), 
                                 ('m_to_f', 'anonModel'),
                                 ('f_to_m', 'fullModel'),
                                 ('m_to_f', 'fullModel')]]
# Collapse the rows of one attribute set into a single row; max() skips the NaN
# placeholders, which leaves the one real value per column.
gs_delta_proba = gs_delta_proba.groupby('file', as_index=False).max()
gs_delta_proba = gs_delta_proba.set_index('file')
# Turn the tuple column names into a real two-level column index.
gs_delta_proba.columns = pd.MultiIndex.from_tuples(gs_delta_proba.columns)


In [None]:
gs_delta_proba

In [None]:
# Prepare the table for plotting: multi-line tick labels, fixed row order,
# and snake_case model names in the column index.
gs_delta_proba = (
    gs_delta_proba
    .set_axis(gs_delta_proba.index.str.replace('_', ',\n '), axis=0)
    .reindex(index=[
        'address',
        'name',
        'name,\n address',
        'gender',
        'gender,\n address',
        'gender,\n name',
        'gender,\n name,\n address',
    ])
    .rename(columns={'anonModel': 'anon_model', 'fullModel': 'full_model'})
)
#gs_delta_proba.index
#gs_delta_proba.index

Load test prevalence

In [None]:
prevalence = pd.read_csv('prevalence/prevalence_by_group_strat.csv')
#prevalence

In [None]:
def _first_prevalence(category, column):
    """Value of `column` in the first prevalence row whose 'category' equals `category`."""
    return prevalence.loc[prevalence['category'] == category, column].iloc[0]


F_train = _first_prevalence('F', 'train_prevalence')
M_train = _first_prevalence('M', 'train_prevalence')
F_test = _first_prevalence('F', 'test_prevalence')
M_test = _first_prevalence('M', 'test_prevalence')

# Prevalence gap between the two groups (F minus M) in each split.
print(F_train - M_train)
print(F_test - M_test)

In [None]:
def _plot_swap_delta(direction, colors, prevalence_delta, prevalence_label, target_file):
    """Bar chart of the probability deltas for one swap direction, saved to `target_file`.

    `direction` selects the top-level column group of gs_delta_proba,
    `prevalence_delta` is drawn as a dashed reference line labelled
    `prevalence_label`. Returns the Axes of the bar chart.
    """
    axes = gs_delta_proba[direction].plot.bar(color = colors, figsize=(6,6))
    axes.set_ylim(-0.06, 0.06)
    plt.xticks(rotation = 0, fontsize = 11)
    plt.yticks(fontsize = 11)
    plt.xlabel('Manipulated Attributes', fontsize = 14)
    plt.ylabel('Difference in Readmission-Probability', fontsize = 14)
    # Reference line from the train split only; the test-split line is not drawn.
    plt.axhline(y = prevalence_delta, color = 'black', linestyle = 'dashed', label = prevalence_label, linewidth=1.5)
    plt.axhline(y = 0, color = 'black', linewidth = 0.1)
    plt.legend(fontsize = 11)
    plt.tight_layout()
    plt.savefig(target_file)
    return axes


f_m_plot = _plot_swap_delta(
    'f_to_m',
    ['lightblue', 'blue'],
    M_train - F_train,
    'Delta of Readmission Rate (M - F) in the Train Split',
    'images/f_m_plot_strat.png',
)

m_f_plot = _plot_swap_delta(
    'm_to_f',
    ['moccasin', 'orange'],
    F_train - M_train,
    'Delta of Readmission Rate (F - M) in the Train Split',
    'images/m_f_plot_strat.png',
)

Mean absolute delta of the predicted probability. It should be smaller for the model that was not trained on personal information.

In [None]:
print((gs['delta_mean_proba_fullModel']).abs().mean())
print((gs['delta_mean_proba_anonModel']).abs().mean())

# grouped_anonymized/race_swapping

#### Load race swapping outputs

In [None]:
# Independent copies of the race-swapping tables, with the trailing 8 characters
# (the model part of the run name) cut off 'file' so that runs of both models
# share the same key.
rs_anon = output_structured['race_swapping_anonModel'].assign(
    file=lambda frame: frame['file'].map(lambda run_name: run_name[:-8])
)
rs_full = output_structured['race_swapping_fullModel'].assign(
    file=lambda frame: frame['file'].map(lambda run_name: run_name[:-8])
)

#### Put each metric into its own column

In [None]:
#Split by metric
rs_anon_mean_proba = rs_anon[rs_anon['metric'] == 'mean_proba'].drop('metric', axis = 1)
rs_anon_auroc = rs_anon[rs_anon['metric'] == 'auroc'].drop('metric', axis = 1)
rs_anon_brier = rs_anon[rs_anon['metric'] == 'brier'].drop('metric', axis = 1)
rs_anon_auprc = rs_anon[rs_anon['metric'] == 'auprc'].drop('metric', axis = 1)

rs_full_mean_proba = rs_full[rs_full['metric'] == 'mean_proba'].drop('metric', axis = 1)
rs_full_auroc = rs_full[rs_full['metric'] == 'auroc'].drop('metric', axis = 1)
rs_full_brier = rs_full[rs_full['metric'] == 'brier'].drop('metric', axis = 1)
rs_full_auprc = rs_full[rs_full['metric'] == 'auprc'].drop('metric', axis = 1)

#Merge back together
rs_anon = pd.merge(rs_anon_mean_proba, rs_anon_auroc, on = 'file', suffixes = ['_mean_proba', '_auroc'])
rs_anon = pd.merge(rs_anon, rs_anon_brier, on = 'file')
rs_anon = pd.merge(rs_anon, rs_anon_auprc, on = 'file', suffixes = ['_brier', '_auprc'])

rs_full = pd.merge(rs_full_mean_proba, rs_full_auroc, on = 'file', suffixes = ['_mean_proba', '_auroc'])
rs_full = pd.merge(rs_full, rs_full_brier, on = 'file')
rs_full = pd.merge(rs_full, rs_full_auprc, on = 'file', suffixes = ['_brier', '_auprc'])

#### Merge the models by their swapping categories

In [None]:
rs = pd.merge(rs_anon, rs_full, on = 'file', suffixes = ['_anonModel', '_fullModel'])

In [None]:
# Baseline runs: names that end in none of the swap suffixes.
is_baseline_run = (
    ~rs['file'].str.endswith('name')
    & ~rs['file'].str.endswith('address')
    & ~rs['file'].str.endswith('race')
)
races = list(rs[is_baseline_run]['file'])

# All runs whose name starts with the given baseline name.
# NOTE(review): plain prefix matching -- a name that is a prefix of another one
# would also collect the other one's runs; confirm this cannot happen.
rs_by_race = {race: rs[rs['file'].str.startswith(race)] for race in races}

In [None]:
#rs_by_race['AIAN']

View real Delta of Readmission Prevalence

In [None]:
# Pairwise prevalence differences (new - original) between all remaining groups.
race_prev = (
    prevalence
    # rows removed by positional label -- presumably the non-race groups; verify against the CSV
    .drop([0,1,12,23, 24, 25, 26, 27, 28, 29, 30])
    # every group paired with every group: *_x = original, *_y = new
    .pipe(lambda groups: pd.merge(groups, groups, how = 'cross'))
    .assign(**{
        'train_new-org': lambda pairs: pairs['train_prevalence_y'] - pairs['train_prevalence_x'],
        'test_new-org': lambda pairs: pairs['test_prevalence_y'] - pairs['test_prevalence_x'],
    })
    .rename(columns = {'category_x': 'original_race', 'category_y': 'new_race'})
)

# Matrices with one row per original group and one column per new group.
race_prev_train = race_prev.pivot(index='original_race', columns='new_race', values='train_new-org')
race_prev_test = race_prev.pivot(index='original_race', columns='new_race', values='test_new-org')

In [None]:
#race_prev_test.max().max()

In [None]:
# Heatmap of the test-split prevalence deltas (rows: original_race, columns:
# new_race of race_prev_test) on a fixed colour scale of +-0.17; saved as PNG.
print('')
race_prev_test_plot = sns.heatmap(race_prev_test, annot = False, cmap="coolwarm", vmin = -0.17, vmax=0.17)
plt.xticks(fontsize = 12)
plt.yticks(fontsize = 12)
plt.xlabel('r1', fontsize = 14)
plt.ylabel('r2', fontsize = 14)
plt.tight_layout()
# The name is rebound from the Axes to its Figure so that savefig can be called.
race_prev_test_plot = race_prev_test_plot.get_figure()
race_prev_test_plot.savefig('images/race_prev_test_strat.png')

In [None]:
# Heatmap of the train-split prevalence deltas (rows: original_race, columns:
# new_race of race_prev_train) on a fixed colour scale of +-0.17; saved as PNG.
print('Train Prevalence Delta (New - Original)')
race_prev_train_plot = sns.heatmap(race_prev_train, annot = False, cmap="coolwarm", vmin = -0.17, vmax=0.17)
plt.xticks(fontsize = 12)
plt.yticks(fontsize = 12)
plt.xlabel('r1', fontsize = 14)
plt.ylabel('r2', fontsize = 14)
plt.tight_layout()
# The name is rebound from the Axes to its Figure so that savefig can be called.
race_prev_train_plot = race_prev_train_plot.get_figure()
race_prev_train_plot.savefig('images/race_prev_train_strat.png')

Reduce rs to DELTA of test mean_proba for anon

In [None]:
#name
filter_cond = rs_anon_mean_proba['file'].str.endswith('name') | rs_anon_mean_proba['file'].isin(races)
rs_anon_mean_proba_name = rs_anon_mean_proba[filter_cond].copy()
rs_anon_mean_proba_name['file'] = rs_anon_mean_proba_name['file'].str.replace('_name', '')
rs_anon_mean_proba_name['race_tuple'] = rs_anon_mean_proba_name['file'].str.split('_to_').apply(tuple)
rs_anon_mean_proba_name['original_race'] = rs_anon_mean_proba_name['race_tuple'].str[0]
rs_anon_mean_proba_name['new_race'] = rs_anon_mean_proba_name['race_tuple'].str[1]
rs_anon_mean_proba_name.loc[rs_anon_mean_proba_name['new_race'].isna(), 'new_race'] = rs_anon_mean_proba_name['original_race']
rs_anon_name = rs_anon_mean_proba_name.pivot(index='original_race', columns='new_race', values='test')

#address
filter_cond = rs_anon_mean_proba['file'].str.match(r'.*[^e]_address') | rs_anon_mean_proba['file'].isin(races)
rs_anon_mean_proba_address = rs_anon_mean_proba[filter_cond].copy()
rs_anon_mean_proba_address['file'] = rs_anon_mean_proba_address['file'].str.replace('_address', '')
rs_anon_mean_proba_address['race_tuple'] = rs_anon_mean_proba_address['file'].str.split('_to_').apply(tuple)
rs_anon_mean_proba_address['original_race'] = rs_anon_mean_proba_address['race_tuple'].str[0]
rs_anon_mean_proba_address['new_race'] = rs_anon_mean_proba_address['race_tuple'].str[1]
rs_anon_mean_proba_address.loc[rs_anon_mean_proba_address['new_race'].isna(), 'new_race'] = rs_anon_mean_proba_address['original_race']
rs_anon_address = rs_anon_mean_proba_address.pivot(index='original_race', columns='new_race', values='test')

#name_address
filter_cond = rs_anon_mean_proba['file'].str.endswith('name_address') | rs_anon_mean_proba['file'].isin(races)
rs_anon_mean_proba_name_address = rs_anon_mean_proba[filter_cond].copy()
rs_anon_mean_proba_name_address['file'] = rs_anon_mean_proba_name_address['file'].str.replace('_name_address', '')
rs_anon_mean_proba_name_address['race_tuple'] = rs_anon_mean_proba_name_address['file'].str.split('_to_')#.apply(tuple)
rs_anon_mean_proba_name_address['original_race'] = rs_anon_mean_proba_name_address['race_tuple'].str[0]
rs_anon_mean_proba_name_address['new_race'] = rs_anon_mean_proba_name_address['race_tuple'].str[1]
rs_anon_mean_proba_name_address.loc[rs_anon_mean_proba_name_address['new_race'].isna(), 'new_race'] = rs_anon_mean_proba_name_address['original_race']
rs_anon_name_address = rs_anon_mean_proba_name_address.pivot(index='original_race', columns='new_race', values='test')

#race
# Select only files whose name ends in "race"; unlike the other sections,
# the files listed in `races` are not added to the selection here.
filter_cond = rs_anon_mean_proba['file'].str.endswith('race')
rs_anon_mean_proba_race = rs_anon_mean_proba[filter_cond].assign(
    # strip the swap-type suffix so only "<original>_to_<new>" (or a bare race) remains
    file=lambda df: df['file'].str.replace('_race', ''),
    # kept as the list produced by str.split (not converted to a tuple in this section)
    race_tuple=lambda df: df['file'].str.split('_to_'),
    original_race=lambda df: df['race_tuple'].str[0],
    # a file name without "_to_" has no second element: the race then maps to itself
    new_race=lambda df: df['race_tuple'].str[1].fillna(df['original_race']),
)
# original_race x new_race matrix of the 'test' values
rs_anon_race = rs_anon_mean_proba_race.pivot(values='test', index='original_race', columns='new_race')

#Create new data frames
# Each row is shifted by its own diagonal entry (the cell where new_race equals
# the row's original_race), so every value reads as a change relative to "no swap".
#Drop HL_CO (no male subgroup)
rs_anon_name_delta = (
    rs_anon_name
    .apply(lambda swap_row: swap_row - swap_row[swap_row.name], axis=1)
    .drop(index='HL_CO', columns='HL_CO')
)
rs_anon_address_delta = (
    rs_anon_address
    .apply(lambda swap_row: swap_row - swap_row[swap_row.name], axis=1)
    .drop(index='HL_CO', columns='HL_CO')
)
rs_anon_name_address_delta = (
    rs_anon_name_address
    .apply(lambda swap_row: swap_row - swap_row[swap_row.name], axis=1)
    .drop(index='HL_CO', columns='HL_CO')
)
rs_anon_race_delta = (
    rs_anon_race
    .apply(lambda swap_row: swap_row - swap_row[swap_row.name], axis=1)
    .drop(index='HL_CO', columns='HL_CO')
)

In [None]:
# Heatmap of rs_anon_name_delta on a fixed symmetric colour scale, written to images/.
heatmap_axes = sns.heatmap(rs_anon_name_delta, cmap="coolwarm", annot=False, vmin=-0.17, vmax=0.17)
plt.setp(heatmap_axes.get_xticklabels(), fontsize=12)
plt.setp(heatmap_axes.get_yticklabels(), fontsize=12)
heatmap_axes.set_xlabel('new_race', fontsize=14)
heatmap_axes.set_ylabel('original_race', fontsize=14)
# anon_name ends up holding the Figure, as before
anon_name = heatmap_axes.get_figure()
anon_name.tight_layout()
anon_name.savefig('images/race_anon_name_strat.png')

In [None]:
# Heatmap of rs_anon_address_delta on a fixed symmetric colour scale, written to images/.
heatmap_axes = sns.heatmap(rs_anon_address_delta, cmap="coolwarm", annot=False, vmin=-0.17, vmax=0.17)
plt.setp(heatmap_axes.get_xticklabels(), fontsize=12)
plt.setp(heatmap_axes.get_yticklabels(), fontsize=12)
heatmap_axes.set_xlabel('new_race', fontsize=14)
heatmap_axes.set_ylabel('original_race', fontsize=14)
# anon_address ends up holding the Figure, as before
anon_address = heatmap_axes.get_figure()
anon_address.tight_layout()
anon_address.savefig('images/race_anon_address_strat.png')

In [None]:
# Heatmap of rs_anon_name_address_delta on a fixed symmetric colour scale, written to images/.
heatmap_axes = sns.heatmap(rs_anon_name_address_delta, cmap="coolwarm", annot=False, vmin=-0.17, vmax=0.17)
plt.setp(heatmap_axes.get_xticklabels(), fontsize=12)
plt.setp(heatmap_axes.get_yticklabels(), fontsize=12)
heatmap_axes.set_xlabel('new_race', fontsize=14)
heatmap_axes.set_ylabel('original_race', fontsize=14)
# anon_name_address ends up holding the Figure, as before
anon_name_address = heatmap_axes.get_figure()
anon_name_address.tight_layout()
anon_name_address.savefig('images/race_anon_name_address_strat.png')

In [None]:
# Heatmap of rs_anon_race_delta on a fixed symmetric colour scale, written to images/.
heatmap_axes = sns.heatmap(rs_anon_race_delta, cmap="coolwarm", annot=False, vmin=-0.17, vmax=0.17)
plt.setp(heatmap_axes.get_xticklabels(), fontsize=12)
plt.setp(heatmap_axes.get_yticklabels(), fontsize=12)
heatmap_axes.set_xlabel('new_race', fontsize=14)
heatmap_axes.set_ylabel('original_race', fontsize=14)
# anon_race ends up holding the Figure, as before
anon_race = heatmap_axes.get_figure()
anon_race.tight_layout()
anon_race.savefig('images/race_anon_race_strat.png')

Reduce rs to DELTA of test mean_proba for the full-info model

In [None]:
#name
# Select files whose name ends in "name" together with every file listed in `races`.
filter_cond = rs_full_mean_proba['file'].str.endswith('name') | rs_full_mean_proba['file'].isin(races)
rs_full_mean_proba_name = rs_full_mean_proba[filter_cond].assign(
    # strip the swap-type suffix so only "<original>_to_<new>" (or a bare race) remains
    file=lambda df: df['file'].str.replace('_name', ''),
    race_tuple=lambda df: df['file'].str.split('_to_').apply(tuple),
    original_race=lambda df: df['race_tuple'].str[0],
    # a file name without "_to_" has no second element: the race then maps to itself
    new_race=lambda df: df['race_tuple'].str[1].fillna(df['original_race']),
)
# original_race x new_race matrix of the 'test' values
rs_full_name = rs_full_mean_proba_name.pivot(values='test', index='original_race', columns='new_race')

#address
# Select files matching "<anything not ending in e>_address" (presumably so that
# "..._name_address" files are left to the next section — confirm) together with
# every file listed in `races`.
filter_cond = rs_full_mean_proba['file'].str.match(r'.*[^e]_address') | rs_full_mean_proba['file'].isin(races)
rs_full_mean_proba_address = rs_full_mean_proba[filter_cond].assign(
    # strip the swap-type suffix so only "<original>_to_<new>" (or a bare race) remains
    file=lambda df: df['file'].str.replace('_address', ''),
    race_tuple=lambda df: df['file'].str.split('_to_').apply(tuple),
    original_race=lambda df: df['race_tuple'].str[0],
    # a file name without "_to_" has no second element: the race then maps to itself
    new_race=lambda df: df['race_tuple'].str[1].fillna(df['original_race']),
)
# original_race x new_race matrix of the 'test' values
rs_full_address = rs_full_mean_proba_address.pivot(values='test', index='original_race', columns='new_race')

#name_address
# Select files whose name ends in "name_address" together with every file listed in `races`.
filter_cond = rs_full_mean_proba['file'].str.endswith('name_address') | rs_full_mean_proba['file'].isin(races)
rs_full_mean_proba_name_address = rs_full_mean_proba[filter_cond].assign(
    # strip the swap-type suffix so only "<original>_to_<new>" (or a bare race) remains
    file=lambda df: df['file'].str.replace('_name_address', ''),
    # kept as the list produced by str.split (not converted to a tuple in this section)
    race_tuple=lambda df: df['file'].str.split('_to_'),
    original_race=lambda df: df['race_tuple'].str[0],
    # a file name without "_to_" has no second element: the race then maps to itself
    new_race=lambda df: df['race_tuple'].str[1].fillna(df['original_race']),
)
# original_race x new_race matrix of the 'test' values
rs_full_name_address = rs_full_mean_proba_name_address.pivot(values='test', index='original_race', columns='new_race')

#race
# Select only files whose name ends in "race"; unlike the other sections,
# the files listed in `races` are not added to the selection here.
filter_cond = rs_full_mean_proba['file'].str.endswith('race')
rs_full_mean_proba_race = rs_full_mean_proba[filter_cond].assign(
    # strip the swap-type suffix so only "<original>_to_<new>" (or a bare race) remains
    file=lambda df: df['file'].str.replace('_race', ''),
    race_tuple=lambda df: df['file'].str.split('_to_').apply(tuple),
    original_race=lambda df: df['race_tuple'].str[0],
    # a file name without "_to_" has no second element: the race then maps to itself
    new_race=lambda df: df['race_tuple'].str[1].fillna(df['original_race']),
)
# original_race x new_race matrix of the 'test' values
rs_full_race = rs_full_mean_proba_race.pivot(values='test', index='original_race', columns='new_race')

#Create new dataframes
# Each row is shifted by its own diagonal entry (the cell where new_race equals
# the row's original_race), so every value reads as a change relative to "no swap".
#Drop HL_CO (no male subgroup)
rs_full_name_delta = (
    rs_full_name
    .apply(lambda swap_row: swap_row - swap_row[swap_row.name], axis=1)
    .drop(index='HL_CO', columns='HL_CO')
)
rs_full_address_delta = (
    rs_full_address
    .apply(lambda swap_row: swap_row - swap_row[swap_row.name], axis=1)
    .drop(index='HL_CO', columns='HL_CO')
)
rs_full_name_address_delta = (
    rs_full_name_address
    .apply(lambda swap_row: swap_row - swap_row[swap_row.name], axis=1)
    .drop(index='HL_CO', columns='HL_CO')
)
rs_full_race_delta = (
    rs_full_race
    .apply(lambda swap_row: swap_row - swap_row[swap_row.name], axis=1)
    .drop(index='HL_CO', columns='HL_CO')
)

In [None]:
# Heatmap of rs_full_name_delta on a fixed symmetric colour scale, written to images/.
heatmap_axes = sns.heatmap(rs_full_name_delta, cmap="coolwarm", annot=False, vmin=-0.17, vmax=0.17)
plt.setp(heatmap_axes.get_xticklabels(), fontsize=12)
plt.setp(heatmap_axes.get_yticklabels(), fontsize=12)
heatmap_axes.set_xlabel('new_race', fontsize=14)
heatmap_axes.set_ylabel('original_race', fontsize=14)
# full_name ends up holding the Figure, as before
full_name = heatmap_axes.get_figure()
full_name.tight_layout()
full_name.savefig('images/race_full_name_strat.png')

In [None]:
# Heatmap of rs_full_address_delta on a fixed symmetric colour scale, written to images/.
heatmap_axes = sns.heatmap(rs_full_address_delta, cmap="coolwarm", annot=False, vmin=-0.17, vmax=0.17)
plt.setp(heatmap_axes.get_xticklabels(), fontsize=12)
plt.setp(heatmap_axes.get_yticklabels(), fontsize=12)
heatmap_axes.set_xlabel('new_race', fontsize=14)
heatmap_axes.set_ylabel('original_race', fontsize=14)
# full_address ends up holding the Figure, as before
full_address = heatmap_axes.get_figure()
full_address.tight_layout()
full_address.savefig('images/race_full_address_strat.png')

In [None]:
# Heatmap of rs_full_name_address_delta on a fixed symmetric colour scale, written to images/.
heatmap_axes = sns.heatmap(rs_full_name_address_delta, cmap="coolwarm", annot=False, vmin=-0.17, vmax=0.17)
plt.setp(heatmap_axes.get_xticklabels(), fontsize=12)
plt.setp(heatmap_axes.get_yticklabels(), fontsize=12)
heatmap_axes.set_xlabel('new_race', fontsize=14)
heatmap_axes.set_ylabel('original_race', fontsize=14)
# full_name_address ends up holding the Figure, as before
full_name_address = heatmap_axes.get_figure()
full_name_address.tight_layout()
full_name_address.savefig('images/race_full_name_address_strat.png')

In [None]:
# Heatmap of rs_full_race_delta on a fixed symmetric colour scale, written to images/.
heatmap_axes = sns.heatmap(rs_full_race_delta, cmap="coolwarm", annot=False, vmin=-0.17, vmax=0.17)
plt.setp(heatmap_axes.get_xticklabels(), fontsize=12)
plt.setp(heatmap_axes.get_yticklabels(), fontsize=12)
heatmap_axes.set_xlabel('new_race', fontsize=14)
heatmap_axes.set_ylabel('original_race', fontsize=14)
# full_race ends up holding the Figure, as before
full_race = heatmap_axes.get_figure()
full_race.tight_layout()
full_race.savefig('images/race_full_race_strat.png')

Race Mean Effects by bigger race group

In [None]:
# Collapse the fine-grained race-swap results of the anon frames into four broad
# race groups, average 'test' per (original group, new group) pair, and express
# every row relative to its own same-group cell.
#
# The same pipeline is applied to all four swap types, so it is written once and
# looped instead of being pasted four times with the code lists repeated inline.

# Fine-grained race codes that make up each broad group.
race_group_members = {
    'AS': ['AS', 'AS_AI', 'AS_CH', 'AS_SEA'],
    'BL': ['BL_A', 'BL_AA', 'BL_CV', 'BL_CI'],
    'HL': ['HL', 'HL_DO', 'HL_GU', 'HL_PR', 'HL_SA'],
    'W': ['W', 'W_BR', 'W_OE', 'W_EE', 'W_RU'],
}
all_grouped_races = [code for members in race_group_members.values() for code in members]

anon_frames_by_swap = {
    'address': rs_anon_mean_proba_address,
    'name': rs_anon_mean_proba_name,
    'name_address': rs_anon_mean_proba_name_address,
    'race': rs_anon_mean_proba_race,
}

anon_big_group_deltas = {}
for swap_kind, swap_frame in anon_frames_by_swap.items():
    # Keep only rows whose original race belongs to one of the broad groups.
    grouped_frame = swap_frame.loc[swap_frame['original_race'].isin(all_grouped_races)].copy()

    # Label both sides of the swap. Codes outside every group keep None and are
    # therefore left out by the groupby below (missing group keys are dropped by default).
    for race_col in ('original_race', 'new_race'):
        group_col = race_col + '_group'
        grouped_frame.loc[:, group_col] = None
        for group_label, members in race_group_members.items():
            grouped_frame.loc[grouped_frame[race_col].isin(members), group_col] = group_label

    group_means = grouped_frame.groupby(['original_race_group', 'new_race_group'], as_index = False)['test'].mean()
    group_matrix = group_means.pivot(index='original_race_group', columns='new_race_group', values='test')

    # Subtract each row's diagonal entry so the same-group cell becomes 0.
    anon_big_group_deltas[swap_kind] = group_matrix.apply(lambda row: row - row[row.name], axis = 1)

# Publish the results under the names the plotting cells use.
rs_big_groups_anon_address = anon_big_group_deltas['address']
rs_big_groups_anon_name = anon_big_group_deltas['name']
rs_big_groups_anon_name_address = anon_big_group_deltas['name_address']
rs_big_groups_anon_race = anon_big_group_deltas['race']

In [None]:
# Collapse the fine-grained race-swap results of the full frames into four broad
# race groups, average 'test' per (original group, new group) pair, and express
# every row relative to its own same-group cell.
#
# The same pipeline is applied to all four swap types, so it is written once and
# looped instead of being pasted four times with the code lists repeated inline.

# Fine-grained race codes that make up each broad group.
race_group_members = {
    'AS': ['AS', 'AS_AI', 'AS_CH', 'AS_SEA'],
    'BL': ['BL_A', 'BL_AA', 'BL_CV', 'BL_CI'],
    'HL': ['HL', 'HL_DO', 'HL_GU', 'HL_PR', 'HL_SA'],
    'W': ['W', 'W_BR', 'W_OE', 'W_EE', 'W_RU'],
}
all_grouped_races = [code for members in race_group_members.values() for code in members]

full_frames_by_swap = {
    'address': rs_full_mean_proba_address,
    'name': rs_full_mean_proba_name,
    'name_address': rs_full_mean_proba_name_address,
    'race': rs_full_mean_proba_race,
}

full_big_group_deltas = {}
for swap_kind, swap_frame in full_frames_by_swap.items():
    # Keep only rows whose original race belongs to one of the broad groups.
    grouped_frame = swap_frame.loc[swap_frame['original_race'].isin(all_grouped_races)].copy()

    # Label both sides of the swap. Codes outside every group keep None and are
    # therefore left out by the groupby below (missing group keys are dropped by default).
    for race_col in ('original_race', 'new_race'):
        group_col = race_col + '_group'
        grouped_frame.loc[:, group_col] = None
        for group_label, members in race_group_members.items():
            grouped_frame.loc[grouped_frame[race_col].isin(members), group_col] = group_label

    group_means = grouped_frame.groupby(['original_race_group', 'new_race_group'], as_index = False)['test'].mean()
    group_matrix = group_means.pivot(index='original_race_group', columns='new_race_group', values='test')

    # Subtract each row's diagonal entry so the same-group cell becomes 0.
    full_big_group_deltas[swap_kind] = group_matrix.apply(lambda row: row - row[row.name], axis = 1)

# Publish the results under the names the plotting cells use.
rs_big_groups_full_address = full_big_group_deltas['address']
rs_big_groups_full_name = full_big_group_deltas['name']
rs_big_groups_full_name_address = full_big_group_deltas['name_address']
rs_big_groups_full_race = full_big_group_deltas['race']

In [None]:
# Display the grouped race-swap delta matrix (rows: original_race_group,
# columns: new_race_group) for the full frames as a quick visual check.
rs_big_groups_full_race

In [None]:
# Heatmap of rs_big_groups_full_race with human-readable group labels, written to images/.
# The duplicated `plt.yticks(fontsize = 12)` was dropped: the y tick labels are
# already created at size 12 just before it, so it changed nothing.
heatmap_ax = sns.heatmap(rs_big_groups_full_race, annot = False, cmap="coolwarm", vmin = -0.08, vmax = 0.08)

# Cell centres of the 4x4 grid, labelled in the order AS, BL, HL, W
# (assumes the pivot's sorted group order — confirm if groups change).
tick_positions = [0.5, 1.5, 2.5, 3.5]
group_labels = ['Asian', 'Black', 'Hispanic/\nLatino', 'White']
heatmap_ax.set_xticks(tick_positions)
heatmap_ax.set_xticklabels(group_labels, fontsize = 12)
heatmap_ax.set_yticks(tick_positions)
heatmap_ax.set_yticklabels(group_labels, fontsize = 12)
heatmap_ax.set_xlabel('new race', fontsize = 12, labelpad = 10)
heatmap_ax.set_ylabel('original race', fontsize = 12, labelpad = 10)

# The *_plot name holds the Figure, as before.
rs_big_groups_full_race_plot = heatmap_ax.get_figure()
rs_big_groups_full_race_plot.tight_layout()
rs_big_groups_full_race_plot.savefig('images/race_grouped_full_race_strat.png')

In [None]:
# Heatmap of rs_big_groups_full_name with human-readable group labels, written to images/.
# The duplicated `plt.yticks(fontsize = 12)` was dropped: the y tick labels are
# already created at size 12 just before it, so it changed nothing.
heatmap_ax = sns.heatmap(rs_big_groups_full_name, annot = False, cmap="coolwarm", vmin = -0.08, vmax = 0.08)

# Cell centres of the 4x4 grid, labelled in the order AS, BL, HL, W
# (assumes the pivot's sorted group order — confirm if groups change).
tick_positions = [0.5, 1.5, 2.5, 3.5]
group_labels = ['Asian', 'Black', 'Hispanic/\nLatino', 'White']
heatmap_ax.set_xticks(tick_positions)
heatmap_ax.set_xticklabels(group_labels, fontsize = 12)
heatmap_ax.set_yticks(tick_positions)
heatmap_ax.set_yticklabels(group_labels, fontsize = 12)
heatmap_ax.set_xlabel('new race', fontsize = 12, labelpad = 10)
heatmap_ax.set_ylabel('original race', fontsize = 12, labelpad = 10)

# The *_plot name holds the Figure, as before.
rs_big_groups_full_name_plot = heatmap_ax.get_figure()
rs_big_groups_full_name_plot.tight_layout()
rs_big_groups_full_name_plot.savefig('images/race_grouped_full_name_strat.png')

In [None]:
# Heatmap of rs_big_groups_full_name_address with human-readable group labels, written to images/.
# The duplicated `plt.yticks(fontsize = 12)` was dropped: the y tick labels are
# already created at size 12 just before it, so it changed nothing.
heatmap_ax = sns.heatmap(rs_big_groups_full_name_address, annot = False, cmap="coolwarm", vmin = -0.08, vmax=0.08)

# Cell centres of the 4x4 grid, labelled in the order AS, BL, HL, W
# (assumes the pivot's sorted group order — confirm if groups change).
tick_positions = [0.5, 1.5, 2.5, 3.5]
group_labels = ['Asian', 'Black', 'Hispanic/\nLatino', 'White']
heatmap_ax.set_xticks(tick_positions)
heatmap_ax.set_xticklabels(group_labels, fontsize = 12)
heatmap_ax.set_yticks(tick_positions)
heatmap_ax.set_yticklabels(group_labels, fontsize = 12)
heatmap_ax.set_xlabel('new race', fontsize = 12, labelpad = 10)
heatmap_ax.set_ylabel('original race', fontsize = 12, labelpad = 10)

# The *_plot name holds the Figure, as before.
rs_big_groups_full_name_address_plot = heatmap_ax.get_figure()
rs_big_groups_full_name_address_plot.tight_layout()
rs_big_groups_full_name_address_plot.savefig('images/race_grouped_full_name_address_strat.png')

In [None]:
# Heatmap of rs_big_groups_full_address with human-readable group labels, written to images/.
# The duplicated `plt.yticks(fontsize = 12)` was dropped: the y tick labels are
# already created at size 12 just before it, so it changed nothing.
heatmap_ax = sns.heatmap(rs_big_groups_full_address, annot = False, cmap="coolwarm", vmin = -0.08, vmax=0.08)

# Cell centres of the 4x4 grid, labelled in the order AS, BL, HL, W
# (assumes the pivot's sorted group order — confirm if groups change).
tick_positions = [0.5, 1.5, 2.5, 3.5]
group_labels = ['Asian', 'Black', 'Hispanic/\nLatino', 'White']
heatmap_ax.set_xticks(tick_positions)
heatmap_ax.set_xticklabels(group_labels, fontsize = 12)
heatmap_ax.set_yticks(tick_positions)
heatmap_ax.set_yticklabels(group_labels, fontsize = 12)
heatmap_ax.set_xlabel('new race', fontsize = 12, labelpad = 10)
heatmap_ax.set_ylabel('original race', fontsize = 12, labelpad = 10)

# The *_plot name holds the Figure, as before.
rs_big_groups_full_address_plot = heatmap_ax.get_figure()
rs_big_groups_full_address_plot.tight_layout()
rs_big_groups_full_address_plot.savefig('images/race_grouped_full_address_strat.png')

In [None]:
# Heatmap of rs_big_groups_anon_race with human-readable group labels, written to images/.
# The duplicated `plt.yticks(fontsize = 12)` was dropped: the y tick labels are
# already created at size 12 just before it, so it changed nothing.
heatmap_ax = sns.heatmap(rs_big_groups_anon_race, annot = False, cmap="coolwarm", vmin = -0.08, vmax = 0.08)

# Cell centres of the 4x4 grid, labelled in the order AS, BL, HL, W
# (assumes the pivot's sorted group order — confirm if groups change).
tick_positions = [0.5, 1.5, 2.5, 3.5]
group_labels = ['Asian', 'Black', 'Hispanic/\nLatino', 'White']
heatmap_ax.set_xticks(tick_positions)
heatmap_ax.set_xticklabels(group_labels, fontsize = 12)
heatmap_ax.set_yticks(tick_positions)
heatmap_ax.set_yticklabels(group_labels, fontsize = 12)
heatmap_ax.set_xlabel('new race', fontsize = 12, labelpad = 10)
heatmap_ax.set_ylabel('original race', fontsize = 12, labelpad = 10)

# The *_plot name holds the Figure, as before.
rs_big_groups_anon_race_plot = heatmap_ax.get_figure()
rs_big_groups_anon_race_plot.tight_layout()
rs_big_groups_anon_race_plot.savefig('images/race_grouped_anon_race_strat.png')

In [None]:
# Heatmap of rs_big_groups_anon_address with human-readable group labels, written to images/.
# The duplicated `plt.yticks(fontsize = 12)` was dropped: the y tick labels are
# already created at size 12 just before it, so it changed nothing.
heatmap_ax = sns.heatmap(rs_big_groups_anon_address, annot = False, cmap="coolwarm", vmin = -0.08, vmax = 0.08)

# Cell centres of the 4x4 grid, labelled in the order AS, BL, HL, W
# (assumes the pivot's sorted group order — confirm if groups change).
tick_positions = [0.5, 1.5, 2.5, 3.5]
group_labels = ['Asian', 'Black', 'Hispanic/\nLatino', 'White']
heatmap_ax.set_xticks(tick_positions)
heatmap_ax.set_xticklabels(group_labels, fontsize = 12)
heatmap_ax.set_yticks(tick_positions)
heatmap_ax.set_yticklabels(group_labels, fontsize = 12)
heatmap_ax.set_xlabel('new race', fontsize = 12, labelpad = 10)
heatmap_ax.set_ylabel('original race', fontsize = 12, labelpad = 10)

# The *_plot name holds the Figure, as before.
rs_big_groups_anon_address_plot = heatmap_ax.get_figure()
rs_big_groups_anon_address_plot.tight_layout()
rs_big_groups_anon_address_plot.savefig('images/race_grouped_anon_address_strat.png')

In [None]:
# Heatmap of rs_big_groups_anon_name_address with human-readable group labels, written to images/.
# The duplicated `plt.yticks(fontsize = 12)` was dropped: the y tick labels are
# already created at size 12 just before it, so it changed nothing.
heatmap_ax = sns.heatmap(rs_big_groups_anon_name_address, annot = False, cmap="coolwarm", vmin = -0.08, vmax = 0.08)

# Cell centres of the 4x4 grid, labelled in the order AS, BL, HL, W
# (assumes the pivot's sorted group order — confirm if groups change).
tick_positions = [0.5, 1.5, 2.5, 3.5]
group_labels = ['Asian', 'Black', 'Hispanic/\nLatino', 'White']
heatmap_ax.set_xticks(tick_positions)
heatmap_ax.set_xticklabels(group_labels, fontsize = 12)
heatmap_ax.set_yticks(tick_positions)
heatmap_ax.set_yticklabels(group_labels, fontsize = 12)
heatmap_ax.set_xlabel('new race', fontsize = 12, labelpad = 10)
heatmap_ax.set_ylabel('original race', fontsize = 12, labelpad = 10)

# The *_plot name holds the Figure, as before.
rs_big_groups_anon_name_address_plot = heatmap_ax.get_figure()
rs_big_groups_anon_name_address_plot.tight_layout()
rs_big_groups_anon_name_address_plot.savefig('images/race_grouped_anon_name_address_strat.png')

In [None]:
# Heatmap of rs_big_groups_anon_name with human-readable group labels, written to images/.
# The duplicated `plt.yticks(fontsize = 12)` was dropped: the y tick labels are
# already created at size 12 just before it, so it changed nothing.
heatmap_ax = sns.heatmap(rs_big_groups_anon_name, annot = False, cmap="coolwarm", vmin = -0.08, vmax = 0.08)

# Cell centres of the 4x4 grid, labelled in the order AS, BL, HL, W
# (assumes the pivot's sorted group order — confirm if groups change).
tick_positions = [0.5, 1.5, 2.5, 3.5]
group_labels = ['Asian', 'Black', 'Hispanic/\nLatino', 'White']
heatmap_ax.set_xticks(tick_positions)
heatmap_ax.set_xticklabels(group_labels, fontsize = 12)
heatmap_ax.set_yticks(tick_positions)
heatmap_ax.set_yticklabels(group_labels, fontsize = 12)
heatmap_ax.set_xlabel('new race', fontsize = 12, labelpad = 10)
heatmap_ax.set_ylabel('original race', fontsize = 12, labelpad = 10)

# The *_plot name holds the Figure, as before.
rs_big_groups_anon_name_plot = heatmap_ax.get_figure()
rs_big_groups_anon_name_plot.tight_layout()
rs_big_groups_anon_name_plot.savefig('images/race_grouped_anon_name_strat.png')

In [None]:
# Aggregate the per-race values in `race_prev` into four broad race groups and
# build group-level delta matrices for the 'test_new-org' and 'train_new-org' columns.
# The race-code lists are declared once here instead of being repeated inline.

# Fine-grained race codes that make up each broad group.
race_group_members = {
    'AS': ['AS', 'AS_AI', 'AS_CH', 'AS_SEA'],
    'BL': ['BL_A', 'BL_AA', 'BL_CV', 'BL_CI'],
    'HL': ['HL', 'HL_DO', 'HL_GU', 'HL_PR', 'HL_SA'],
    'W': ['W', 'W_BR', 'W_OE', 'W_EE', 'W_RU'],
}
all_grouped_races = [code for members in race_group_members.values() for code in members]

# Keep only rows whose original race belongs to one of the broad groups.
race_prev_big = race_prev.loc[race_prev['original_race'].isin(all_grouped_races)].copy()

# Label both sides of the swap. Codes outside every group keep None and are
# therefore left out by the groupbys below (missing group keys are dropped by default).
for race_col in ('original_race', 'new_race'):
    group_col = race_col + '_group'
    race_prev_big.loc[:, group_col] = None
    for group_label, members in race_group_members.items():
        race_prev_big.loc[race_prev_big[race_col].isin(members), group_col] = group_label

group_keys = ['original_race_group', 'new_race_group']

# Mean per (original group, new group), pivoted to a matrix, then each row
# shifted by its own diagonal entry so the same-group cell becomes 0.
race_prev_big_test = (
    race_prev_big.groupby(group_keys, as_index = False)['test_new-org'].mean()
    .pivot(index='original_race_group', columns='new_race_group', values='test_new-org')
    .apply(lambda row: row - row[row.name], axis = 1)
)

race_prev_big_train = (
    race_prev_big.groupby(group_keys, as_index = False)['train_new-org'].mean()
    .pivot(index='original_race_group', columns='new_race_group', values='train_new-org')
    .apply(lambda row: row - row[row.name], axis = 1)
)

In [None]:
# Disabled scratch expression on race_prev_big_train, kept for reference:
#race_prev_big_train#.min().min()
# Cell output: the `prevalence` object, which is defined outside this section.
prevalence

In [None]:
# Heatmap of the grouped train-split matrix on a fixed, symmetric colour scale.
group_tick_positions = [0.5, 1.5, 2.5, 3.5]
# Labels are hard-coded in the order Asian, Black, Hispanic/Latino, White.
group_tick_labels = ['Asian', 'Black', 'Hispanic/\nLatino', 'White']

train_heatmap_ax = sns.heatmap(race_prev_big_train, annot = False, cmap = "coolwarm", vmin = -0.08, vmax = 0.08)
train_heatmap_ax.set_xticks(group_tick_positions)
train_heatmap_ax.set_xticklabels(group_tick_labels, fontsize = 12)
train_heatmap_ax.set_yticks(group_tick_positions)
train_heatmap_ax.set_yticklabels(group_tick_labels, fontsize = 12)
train_heatmap_ax.set_xlabel('r1', fontsize = 12, labelpad = 10)
train_heatmap_ax.set_ylabel('r2', fontsize = 12, labelpad = 10)

# The *_plot name is bound to the Figure (not the Axes) once the cell has run.
race_prev_big_train_plot = train_heatmap_ax.get_figure()
race_prev_big_train_plot.tight_layout()
race_prev_big_train_plot.savefig('images/race_grouped_prev_train.png')

In [None]:
# Heatmap of the grouped test-split matrix on a fixed, symmetric colour scale.
group_tick_positions = [0.5, 1.5, 2.5, 3.5]
# Labels are hard-coded in the order Asian, Black, Hispanic/Latino, White.
group_tick_labels = ['Asian', 'Black', 'Hispanic/\nLatino', 'White']

test_heatmap_ax = sns.heatmap(race_prev_big_test, annot = False, cmap = "coolwarm", vmin = -0.08, vmax = 0.08)
test_heatmap_ax.set_xticks(group_tick_positions)
test_heatmap_ax.set_xticklabels(group_tick_labels, fontsize = 12)
test_heatmap_ax.set_yticks(group_tick_positions)
test_heatmap_ax.set_yticklabels(group_tick_labels, fontsize = 12)
test_heatmap_ax.set_xlabel('r1', fontsize = 12, labelpad = 10)
test_heatmap_ax.set_ylabel('r2', fontsize = 12, labelpad = 10)

# The *_plot name is bound to the Figure (not the Axes) once the cell has run.
race_prev_big_test_plot = test_heatmap_ax.get_figure()
race_prev_big_test_plot.tight_layout()
race_prev_big_test_plot.savefig('images/race_grouped_prev_test.png')

# grouped_anonymized/age_swapping

#### Load age_swapping outputs

In [None]:
# Work on copies so the tables held in output_structured are not modified.
asw_anon = output_structured['age_swapping_anonModel'].copy()
asw_full = output_structured['age_swapping_fullModel'].copy()

# Remove the model from the file name by cutting its last 8 characters.
for swapped_outputs in (asw_anon, asw_full):
    swapped_outputs['file'] = swapped_outputs['file'].str[:-8]

#### Put each metric into its own column

In [None]:
# One frame per metric for the anonymized model; the constant 'metric' column is dropped.
asw_anon_mean_proba = asw_anon.loc[asw_anon['metric'].eq('mean_proba')].drop(columns = 'metric')
asw_anon_auroc = asw_anon.loc[asw_anon['metric'].eq('auroc')].drop(columns = 'metric')
asw_anon_brier = asw_anon.loc[asw_anon['metric'].eq('brier')].drop(columns = 'metric')
asw_anon_auprc = asw_anon.loc[asw_anon['metric'].eq('auprc')].drop(columns = 'metric')

# Same split for the full-info model.
asw_full_mean_proba = asw_full.loc[asw_full['metric'].eq('mean_proba')].drop(columns = 'metric')
asw_full_auroc = asw_full.loc[asw_full['metric'].eq('auroc')].drop(columns = 'metric')
asw_full_brier = asw_full.loc[asw_full['metric'].eq('brier')].drop(columns = 'metric')
asw_full_auprc = asw_full.loc[asw_full['metric'].eq('auprc')].drop(columns = 'metric')

# Join the per-metric frames on 'file'. The brier join passes no suffixes because the
# columns merged so far already carry theirs; the final join then tags the still
# un-suffixed brier columns together with the auprc columns.
asw_anon = (
    asw_anon_mean_proba
    .merge(asw_anon_auroc, on = 'file', suffixes = ('_mean_proba', '_auroc'))
    .merge(asw_anon_brier, on = 'file')
    .merge(asw_anon_auprc, on = 'file', suffixes = ('_brier', '_auprc'))
)

asw_full = (
    asw_full_mean_proba
    .merge(asw_full_auroc, on = 'file', suffixes = ('_mean_proba', '_auroc'))
    .merge(asw_full_brier, on = 'file')
    .merge(asw_full_auprc, on = 'file', suffixes = ('_brier', '_auprc'))
)

#### Merge the models by their swapping categories

In [None]:
# Put both models side by side per file; shared column names get a model suffix.
asw = asw_anon.merge(asw_full, on = 'file', suffixes = ('_anonModel', '_fullModel'))

#### Split by original age

In [None]:
# Partition the merged table by the prefix of the file name; copies keep later column additions independent of `asw`.
swap_file_names = asw['file']
asw_young = asw.loc[swap_file_names.str.startswith('young')].copy()
asw_old = asw.loc[swap_file_names.str.startswith('old')].copy()

#### Compute test-metric deltas relative to the unswapped baseline (mean probability, AUROC, Brier, AUPRC)

In [None]:
def _single_baseline(frame, label, column):
    """Return the scalar `column` value of the one row of `frame` whose 'file' equals `label`.

    Args:
        frame: table with a 'file' column and the metric column `column`.
        label: value of 'file' that identifies the baseline row.
        column: name of the metric column to read.

    Returns:
        The single matching value as a scalar.

    Raises:
        ValueError: if there is not exactly one row with that 'file' value.
    """
    matches = frame.loc[frame['file'] == label, column]
    if len(matches) != 1:
        raise ValueError(f"expected exactly one row with file == {label!r} in {column!r}, found {len(matches)}")
    return matches.iloc[0]


# For every test metric and both models, add a delta column holding the difference
# to the baseline row of the same age group ('young' / 'old').
# Subtracting a scalar keeps each delta a plain Series. Applying a function that
# returns a Series to every element would instead build a DataFrame, a behaviour
# pandas has deprecated.
for metric in ['mean_proba', 'auroc', 'brier', 'auprc']:
    for model in ['anonModel', 'fullModel']:
        source_col = f'test_{metric}_{model}'
        delta_col = f'delta_{metric}_{model}'
        asw_young[delta_col] = asw_young[source_col] - _single_baseline(asw_young, 'young', source_col)
        asw_old[delta_col] = asw_old[source_col] - _single_baseline(asw_old, 'old', source_col)

In [None]:
# Recombine the young/old halves and keep only 'file' plus the mean-probability delta columns.
asw = pd.concat([asw_young, asw_old])
asw_cols = [col for col in asw.columns if 'delta_mean_proba' in col or col == 'file']
# Leave out the baseline rows ('young', 'old') themselves.
asw_delta_proba = asw.loc[~asw['file'].isin(['young','old']), asw_cols]
# Positional rename: relies on the selected columns being ordered file, anonModel delta, fullModel delta.
asw_delta_proba.columns = ['file', 'anonModel', 'fullModel']
# Spread each model's delta into one column per swap direction. The assigned Series only
# covers the rows of that direction, so all other rows are NaN in the new column.
asw_delta_proba[('young_to_old','anonModel')] = asw_delta_proba.loc[asw_delta_proba['file'].str.startswith('young_old'),'anonModel']
asw_delta_proba[('old_to_young','anonModel')] = asw_delta_proba.loc[asw_delta_proba['file'].str.startswith('old_young'),'anonModel']
asw_delta_proba[('young_to_old','fullModel')] = asw_delta_proba.loc[asw_delta_proba['file'].str.startswith('young_old'),'fullModel']
asw_delta_proba[('old_to_young','fullModel')] = asw_delta_proba.loc[asw_delta_proba['file'].str.startswith('old_young'),'fullModel']
# Cut the first 10 characters; 'young_old_' and 'old_young_' are both 10 characters long.
asw_delta_proba['file'] = asw_delta_proba['file'].str[10:]
asw_delta_proba = asw_delta_proba[['file', ('young_to_old', 'anonModel'), ('old_to_young', 'anonModel'),
                                   ('young_to_old', 'fullModel'), ('old_to_young', 'fullModel')]]
# Collapse rows that now share a 'file' value; max() ignores NaN, so the non-missing value of each column survives.
asw_delta_proba = asw_delta_proba.groupby('file', as_index = False).max()
asw_delta_proba = asw_delta_proba.set_index('file')
# Turn the tuple column labels into a two-level column index.
asw_delta_proba.columns = pd.MultiIndex.from_tuples(asw_delta_proba.columns)

In [None]:
# Emit the LaTeX source of the table. print() is required because a notebook cell
# only displays the value of its last expression.
print(asw_delta_proba.to_latex())
# Rich display of the same table as the cell output.
asw_delta_proba

In [None]:
# Row order for the table, written with the display form of the index labels.
manipulated_attribute_order = ['address', 'name', 'name,\n address', 'age', 'age,\n address', 'age,\n name', 'age,\n name,\n address']

# Replace each underscore in the index labels with a comma followed by a line break.
asw_delta_proba.index = asw_delta_proba.index.str.replace('_', ',\n ')
asw_delta_proba = (
    asw_delta_proba
    .reindex(index = manipulated_attribute_order)
    .rename(columns = {'anonModel': 'anon_model', 'fullModel': 'full_model'})
)

In [None]:
# Train prevalence read by position; rows 31 and 32 are presumably the young and
# old groups — verify against the table printed below.
train_prevalence = prevalence['train_prevalence']
young_train = train_prevalence.iloc[31]
old_train = train_prevalence.iloc[32]

prevalence_gap = young_train - old_train
print(prevalence_gap)

print(prevalence)

In [None]:
def _plot_swap_direction(direction, bar_colors, image_path):
    """Draw and save the grouped bar chart for one swap direction of asw_delta_proba.

    Args:
        direction: top-level column key of asw_delta_proba to plot.
        bar_colors: list of bar colours passed to DataFrame.plot.bar.
        image_path: file the finished figure is saved to.

    Returns:
        The Axes returned by DataFrame.plot.bar.
    """
    bar_ax = asw_delta_proba[direction].plot.bar(color = bar_colors, figsize = (6, 6))
    bar_ax.set_ylim(-0.03, 0.03)
    plt.xticks(rotation = 0, fontsize = 11)
    plt.yticks(fontsize = 11)
    bar_ax.set_xlabel('Manipulated Attributes', fontsize = 14)
    bar_ax.set_ylabel('Difference in Readmission-Probability', fontsize = 14)
    # Thin reference line at zero difference.
    bar_ax.axhline(y = 0, color = 'black', linewidth = 0.1)
    bar_ax.legend(fontsize = 11)
    bar_fig = bar_ax.get_figure()
    bar_fig.tight_layout()
    bar_fig.savefig(image_path)
    return bar_ax


y_o_plot = _plot_swap_direction('young_to_old', ['lightblue', 'blue'], 'images/age_y_o_plot_strat.png')
o_y_plot = _plot_swap_direction('old_to_young', ['moccasin', 'orange'], 'images/age_o_y_plot_strat.png')

In [None]:
# Colour the table with one diverging scale shared by all cells (axis=None) and enlarge the font.
asw_delta_styler = asw_delta_proba.style.background_gradient(cmap = 'RdBu', axis = None)
asw_delta_styler.set_properties(**{'font-size': '20px'})

# grouped_anonymized/age_shifting

#### Load age_shifting outputs

In [None]:
# Work on copies so the tables held in output_structured are not modified.
ash_anon = output_structured['age_shifting_anonModel'].copy()
ash_full = output_structured['age_shifting_fullModel'].copy()

# Remove the model from the file name by cutting its last 8 characters.
for shifted_outputs in (ash_anon, ash_full):
    shifted_outputs['file'] = shifted_outputs['file'].str[:-8]

#### Put each metric into its own column

In [None]:
# One frame per metric for the anonymized model; the constant 'metric' column is dropped.
ash_anon_mean_proba = ash_anon.loc[ash_anon['metric'].eq('mean_proba')].drop(columns = 'metric')
ash_anon_auroc = ash_anon.loc[ash_anon['metric'].eq('auroc')].drop(columns = 'metric')
ash_anon_brier = ash_anon.loc[ash_anon['metric'].eq('brier')].drop(columns = 'metric')
ash_anon_auprc = ash_anon.loc[ash_anon['metric'].eq('auprc')].drop(columns = 'metric')

# Same split for the full-info model.
ash_full_mean_proba = ash_full.loc[ash_full['metric'].eq('mean_proba')].drop(columns = 'metric')
ash_full_auroc = ash_full.loc[ash_full['metric'].eq('auroc')].drop(columns = 'metric')
ash_full_brier = ash_full.loc[ash_full['metric'].eq('brier')].drop(columns = 'metric')
ash_full_auprc = ash_full.loc[ash_full['metric'].eq('auprc')].drop(columns = 'metric')

# Join the per-metric frames on 'file'. The brier join passes no suffixes because the
# columns merged so far already carry theirs; the final join then tags the still
# un-suffixed brier columns together with the auprc columns.
ash_anon = (
    ash_anon_mean_proba
    .merge(ash_anon_auroc, on = 'file', suffixes = ('_mean_proba', '_auroc'))
    .merge(ash_anon_brier, on = 'file')
    .merge(ash_anon_auprc, on = 'file', suffixes = ('_brier', '_auprc'))
)

ash_full = (
    ash_full_mean_proba
    .merge(ash_full_auroc, on = 'file', suffixes = ('_mean_proba', '_auroc'))
    .merge(ash_full_brier, on = 'file')
    .merge(ash_full_auprc, on = 'file', suffixes = ('_brier', '_auprc'))
)

#### Merge the models by their shifting categories

In [None]:
# Put both models side by side per file; shared column names get a model suffix.
ash = ash_anon.merge(ash_full, on = 'file', suffixes = ('_anonModel', '_fullModel'))

In [None]:
# The row with file == 'age' serves as the reference for both models.
reference_rows = ash['file'] == 'age'
real_proba_anon = ash.loc[reference_rows, 'test_mean_proba_anonModel']
real_proba_full = ash.loc[reference_rows, 'test_mean_proba_fullModel']

# Both selections share one mask, so checking one length covers both.
if len(real_proba_anon) != 1:
    raise ValueError(f"expected exactly one row with file == 'age', found {len(real_proba_anon)}")

# Subtract the scalar reference value so each delta stays a plain Series. Applying a
# function that returns a Series to every element would instead build a DataFrame,
# a behaviour pandas has deprecated.
ash['delta_test_mean_proba_anonModel'] = ash['test_mean_proba_anonModel'] - real_proba_anon.iloc[0]
ash['delta_test_mean_proba_fullModel'] = ash['test_mean_proba_fullModel'] - real_proba_full.iloc[0]

In [None]:
# Keep the file label and both delta columns, ordered by the full-info model's delta (ascending).
shift_delta_columns = ['file', 'delta_test_mean_proba_anonModel', 'delta_test_mean_proba_fullModel']
ash_delta_proba = ash[shift_delta_columns].sort_values(by = 'delta_test_mean_proba_fullModel')

In [None]:
# Colour the table with one diverging scale shared by all cells (axis=None) and set the font size.
ash_delta_styler = ash_delta_proba.style.background_gradient(cmap = 'RdBu', axis = None)
ash_delta_styler.set_properties(**{'font-size': '15px'})

In [None]:
# Short axis labels for each shift file name.
age_shift_labels = {'age_plus_20': '+20', 'age_plus_10': '+10',
                    'age_minus_20': '-20', 'age_minus_10': '-10',
                    'age': '0'}

ash_delta_proba = (
    ash_delta_proba
    .rename(columns = {'delta_test_mean_proba_anonModel': 'anon_model',
                       'delta_test_mean_proba_fullModel': 'full_model'})
    .assign(file = lambda frame: frame['file'].replace(age_shift_labels))
    # Rows are ordered by the anonymized model's delta, largest first.
    .sort_values(by = 'anon_model', ascending = False)
)

In [None]:
# Grouped bars per age shift, one bar per model.
age_shift_plot = ash_delta_proba.plot.bar(x = 'file', color = ['lightgreen', 'green'])
age_shift_plot.set_ylim(-0.03, 0.03)
plt.xticks(rotation = 0, fontsize = 11)
plt.yticks(fontsize = 11)
age_shift_plot.set_xlabel('Age Shift [years]', fontsize = 14)
age_shift_plot.set_ylabel('Difference in Readmission-Probability', fontsize = 14)
# Thin reference line at zero difference.
age_shift_plot.axhline(y = 0, color = 'black', linewidth = 0.1)
age_shift_plot.legend(fontsize = 11)

age_shift_figure = age_shift_plot.get_figure()
age_shift_figure.tight_layout()
age_shift_figure.savefig('images/age_shift_plot_strat.png')

In [None]:
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
prev_df = pd.read_csv('/home/wite10/prevalence_by_group_strat.csv')

# Select the categories whose label contains an opening parenthesis (presumably the
# age intervals — verify against prev_df), then leave out the last two of them.
has_open_paren = prev_df['category'].str.contains(r'\(')
age_rows = prev_df[has_open_paren].iloc[:-2]

# Only the train-split prevalence is drawn.
age_prev_ax = sns.lineplot(data = age_rows, x = 'category', y = 'train_prevalence', color = 'green')
plt.xticks(fontsize = 11)
plt.yticks(fontsize = 11)
age_prev_ax.set_xlabel('Age', fontsize = 14)
age_prev_ax.set_ylabel('Readmission Rate in the Train Split', fontsize = 14)

age_prev_figure = age_prev_ax.get_figure()
age_prev_figure.tight_layout()
age_prev_figure.savefig('images/age_prev_strat.png')

In [None]:
# Cell output: the full prevalence table read from the CSV.
prev_df

### Name and Address Quality

In [None]:
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
demographics = (
    pd.read_csv('/home/wite10/demographics/sampled_data_demographics.csv')[['name', 'address', 'gender', 'abbrev']]
    # 'race' is 'abbrev' without its first two characters; 'abbrev' itself is not kept.
    .assign(race = lambda frame: frame['abbrev'].str[2:])
    .drop(columns = 'abbrev')
)

In [None]:
# Attach to every row the number of distinct names / addresses within its (race, gender) group.
for attribute in ('name', 'address'):
    demographics[f'{attribute}_count'] = demographics.groupby(['race', 'gender'])[attribute].transform('nunique')

In [None]:
# One row per distinct (race, gender, name_count, address_count) combination.
demo_count = demographics[['race', 'gender', 'name_count', 'address_count']].drop_duplicates()

is_female = demo_count['gender'] == 'F'
is_male = demo_count['gender'] == 'M'
demo_f = demo_count.loc[is_female].drop(columns = 'gender')
demo_m = demo_count.loc[is_male].drop(columns = 'gender')

# Female and male counts side by side per race; the default inner join keeps only races present in both.
demo_count = demo_f.merge(demo_m, on = 'race', suffixes = ('_f', '_m'))

In [None]:
# Cell output: the per-race table of female/male name and address counts.
demo_count

In [None]:
# Prefix -> display label, applied in order so that a later entry overrides an earlier one:
# 'HL_CO' is reset to '' after the general 'HL' rule has labelled it.
race_prefix_labels = [
    ('AS', 'Asian'),
    ('BL', 'Black'),
    ('HL', 'Hispanic/\nLatino'),
    ('HL_CO', ''),
    ('W', 'White'),
]

# Races matching no prefix keep the empty label.
demo_count['race_group'] = ''
for race_prefix, group_label in race_prefix_labels:
    demo_count.loc[demo_count['race'].str.startswith(race_prefix), 'race_group'] = group_label

demo_count['name_count_f'] = demo_count['name_count_f'].astype(int)

# Broadcast each race group's mean count back onto its rows, for names and addresses of both genders.
for attribute in ('name', 'address'):
    for gender_suffix in ('f', 'm'):
        count_col = f'{attribute}_count_{gender_suffix}'
        demo_count[f'mean_{count_col}'] = demo_count.groupby('race_group')[count_col].transform('mean')

In [None]:
# One row per labelled race group with its mean unique-name counts; unlabelled ('') rows are left out.
name_mean_cols = ['mean_name_count_f', 'mean_name_count_m']
has_race_group = demo_count['race_group'] != ''
demo_plot = demo_count.loc[has_race_group, ['race_group'] + name_mean_cols].drop_duplicates()

# Long format: one row per (race group, gender column).
demo_plot_long = demo_plot.melt(
    id_vars = 'race_group',
    value_vars = name_mean_cols,
    var_name = 'gender',
    value_name = 'mean_name_count',
)

# Legend text for each source column.
name_legend_labels = {'mean_name_count_f': 'female names', 'mean_name_count_m': 'male names'}
demo_plot_long['gender'] = demo_plot_long['gender'].map(name_legend_labels)

names_ax = sns.barplot(data = demo_plot_long, x = 'race_group', y = 'mean_name_count', hue = 'gender')
names_ax.set_ylim(0, 102)
names_ax.set_xlabel('race', fontsize = 14)
plt.xticks(fontsize = 11)
names_ax.set_ylabel('mean number of unique names', fontsize = 14)
plt.yticks(fontsize = 11)
names_ax.legend(loc = 'lower left')

names_figure = names_ax.get_figure()
names_figure.tight_layout()
names_figure.savefig('images/count_names.png')

In [None]:
# One row per labelled race group with its mean unique-address counts; unlabelled ('') rows are left out.
address_mean_cols = ['mean_address_count_f', 'mean_address_count_m']
has_race_group = demo_count['race_group'] != ''
demo_plot = demo_count.loc[has_race_group, ['race_group'] + address_mean_cols].drop_duplicates()

# Long format: one row per (race group, gender column).
demo_plot_long = demo_plot.melt(
    id_vars = 'race_group',
    value_vars = address_mean_cols,
    var_name = 'gender',
    value_name = 'mean_address_count',
)

# Legend text for each source column.
address_legend_labels = {'mean_address_count_f': 'addresses of female patients', 'mean_address_count_m': 'addresses of male patients'}
demo_plot_long['gender'] = demo_plot_long['gender'].map(address_legend_labels)

addresses_ax = sns.barplot(data = demo_plot_long, x = 'race_group', y = 'mean_address_count', hue = 'gender')
addresses_ax.set_ylim(0, 102)
addresses_ax.set_xlabel('race', fontsize = 14)
plt.xticks(fontsize = 11)
addresses_ax.set_ylabel('mean number of unique addresses', fontsize = 14)
plt.yticks(fontsize = 11)
addresses_ax.legend(loc = 'lower left')

addresses_figure = addresses_ax.get_figure()
addresses_figure.tight_layout()
addresses_figure.savefig('images/count_addresses.png')