In [None]:
import pandas as pd
import os
import sklearn
from sklearn.metrics import roc_curve,auc,precision_recall_curve
import numpy as np
import scipy
import scipy.stats as st
import matplotlib.pyplot as plt
plt.rcParams["axes.grid"] = False
import seaborn as sns
import pandas as pd
import numpy as np
import pickle
from scipy import stats

In [None]:
# Load the SHAP feature-importance table written by the prediction stage for the UW data
# and reduce it to one summed absolute importance per column.
uw_real_feat = pd.read_csv("r_70_train_r_30_test_feature_importance.csv")
uw_real_abs = uw_real_feat.abs()
uw_real_abs_value = dict(uw_real_abs.sum(axis=0))
# 'Unnamed: 0' is not a feature (presumably the row index saved with the table — confirm),
# so it is removed from the totals before any ranking is done.
uw_real_abs_value.pop('Unnamed: 0')

In [None]:
# Order the UW feature names from most to least important (largest summed |SHAP| first).
feature_importance_rank = sorted(
    uw_real_abs_value,
    key=lambda feature_name: uw_real_abs_value[feature_name],
    reverse=True,
)

In [None]:
# Load the feature dictionary that is used to look up the concept name of each feature.
# NOTE: pickle.load can execute arbitrary code, so only point this at a trusted file.
with open("/data/users/yanyao/myproj/synpuf/uw_vumc_syn/binary_real/mapped_dataset/feature_dict.pickle", "rb") as file_to_read:
    # The context manager closes the handle as soon as the dictionary has been read,
    # instead of leaving it open for the lifetime of the kernel.
    feature_dict = pickle.load(file_to_read)

In [None]:
# Look up the 200 highest-ranked UW features in feature_dict and save the result as a CSV.
uw_200_index = [feature_dict[feature] for feature in feature_importance_rank[:200]]
uw_200 = pd.DataFrame(uw_200_index)
uw_200.to_csv("./uw_top_200_index.csv", index=False)

In [None]:
# Load the SHAP feature-importance table written by the prediction stage for the VUMC data
# and reduce it to one summed absolute importance per column.
vumc_real_feat = pd.read_csv("/data/users/yanyao/myproj/synpuf/uw_vumc_syn/vumc_top_3/r_70_train_r_30_test_feature_importance.csv")
vumc_real_abs = vumc_real_feat.abs()
vumc_real_abs_value = dict(vumc_real_abs.sum(axis=0))
# 'Unnamed: 0' is not a feature (presumably the row index saved with the table — confirm),
# so it is removed before ranking.
del vumc_real_abs_value['Unnamed: 0']
# Rank the VUMC feature names, most important first.
feature_importance_rank_vumc = sorted(
    vumc_real_abs_value,
    key=lambda feature_name: vumc_real_abs_value[feature_name],
    reverse=True,
)
# Keep the 200 top-ranked feature names and write them to a CSV.
vumc_200 = pd.DataFrame(feature_importance_rank_vumc[:200])
vumc_200.to_csv("./vumc_top_200_feat_name_new.csv", index=False)


# UW combined

In [None]:
# Mean absolute SHAP value of every column for the model trained on UW real data;
# these values are what the top 25 features are selected from.
uw_real_feat = pd.read_csv("r_70_train_r_30_test_feature_importance.csv")
uw_real_abs = uw_real_feat.abs()
uw_real_abs_value = uw_real_abs.sum(axis=0).div(uw_real_feat.shape[0])
real_list = list(uw_real_abs_value)
# Discard the first entry: it belongs to the leading non-feature column
# (presumably the saved row index 'Unnamed: 0' — confirm).
real_list.pop(0)

In [None]:
# np.argsort orders positions by ascending value, so the last 25 positions are the
# 25 largest mean |SHAP| values (smallest of them first).
top_25_idx = np.argsort(real_list)[-25:]
# Importance values of the real-trained model at those positions, in the same order.
top_25_values_real = [real_list[position] for position in top_25_idx]

In [None]:
# Cosine distance between real-data and synthetic-data feature importance (UW).
# For every generative model and each of its 3 runs, the mean |SHAP| values of the
# synthetic-trained model are compared with those of the real-trained model, restricted to
# the positions in top_25_idx (the real model's 25 most important features).
# NOTE(review): the section heading says "UW combined", but the files loaded here are the
# "_s_sep_" ones that the "UW separate" section also loads — confirm which is intended.

# Import the submodule explicitly: a bare `import scipy` is not guaranteed to expose
# scipy.spatial, so the original call only worked if another package had already loaded it.
from scipy.spatial import distance as spatial_distance

models = ["baseline","medgan","medbgan","emrwgan","medwgan","dpgan"]
for model in models:
    for run in range(3):
        # Build the file stem once so the printed label always matches the file that was read.
        shap_name = f"{model}_s_sep_train_r_30_test_shap_{run}"
        features = np.load(f"{shap_name}.npy")
        # Mean absolute SHAP value per feature (axis 0 is averaged away).
        avg_features = list(np.sum(np.absolute(features),axis=0)/features.shape[0])
        avg_features_25 = [avg_features[i] for i in top_25_idx]
        res = spatial_distance.cosine(avg_features_25,top_25_values_real)
        print(f"{shap_name}: {res}")

# UW separate

In [None]:
# UW synthetic data generated under the separate synthesis paradigm:
# for every model and run, count how many of the real-trained model's 25 most important
# features (top_25_idx) are also among the synthetic-trained model's 25 most important ones.
models = ["baseline","medgan","medbgan","emrwgan","medwgan","dpgan"]
for model in models:
    for run in range(3):
        features = np.load(f"{model}_s_sep_train_r_30_test_shap_{run}.npy")
        # Summed absolute SHAP value per feature.
        avg_features = np.absolute(features).sum(axis=0)
        # np.argsort is ascending, so the last 25 positions hold the largest totals.
        top_25_idx_syn = np.argsort(avg_features)[-25:]
        lst3 = sum(1 for real_idx in top_25_idx if real_idx in top_25_idx_syn)
        print(lst3)
        print(f"{model}_s_sep_train_r_30_test_shap_{run}: {lst3}")

        print("*******")
        

# UW combined

In [None]:
# UW synthetic data generated under the combined synthesis paradigm:
# for every model and run, count how many of the real-trained model's 25 most important
# features (top_25_idx) are also among the synthetic-trained model's 25 most important ones.
models = ["baseline","medgan","medbgan","emrwgan","medwgan","dpgan"]
for model in models:
    for run in range(3):
        features = np.load(f"{model}_s_train_r_30_test_shap_{run}.npy")
        # Summed absolute SHAP value per feature.
        avg_features = np.absolute(features).sum(axis=0)
        # np.argsort is ascending, so the last 25 positions hold the largest totals.
        top_25_idx_syn = np.argsort(avg_features)[-25:]
        lst3 = sum(1 for real_idx in top_25_idx if real_idx in top_25_idx_syn)
        print(lst3)
        print(f"{model}_s_train_r_30_test_shap_{run}: {lst3}")

        print("*******")
        

# Correlation calculation VUMC

In [None]:
# Mean absolute SHAP value of every column for the model trained on VUMC real data;
# these values are what the top 20 features are selected from.
# Note that real_list is reassigned here, replacing the UW values computed earlier.
vumc_real_feat = pd.read_csv("/data/users/yanyao/myproj/synpuf/uw_vumc_syn/vumc_top_3/r_70_train_r_30_test_feature_importance.csv")
vumc_real_abs = vumc_real_feat.abs()
vumc_real_abs_value = vumc_real_abs.sum(axis=0).div(vumc_real_feat.shape[0])
real_list = list(vumc_real_abs_value)
# Discard the first entry: it belongs to the leading non-feature column
# (presumably the saved row index 'Unnamed: 0' — confirm).
real_list.pop(0)

In [None]:
# np.argsort orders positions by ascending value, so the last 20 positions are the
# 20 largest mean |SHAP| values (smallest of them first).
top_20_idx = np.argsort(real_list)[-20:]
# Importance values of the real-trained model at those positions, in the same order.
top_20_values_real = [real_list[position] for position in top_20_idx]

In [None]:
# Correlation between real-data and synthetic-data feature importance (VUMC).
# For every generative model and each of its 3 runs, the mean |SHAP| values of the
# synthetic-trained model are correlated with those of the real-trained model, restricted
# to the positions in top_20_idx (the real model's 20 most important features).
shap_dir = "/data/users/yanyao/myproj/synpuf/uw_vumc_syn/vumc_top_3"
models = ["baseline","medgan","medbgan","emrwgan","medwgan","dpgan"]
for model in models:
    for run in range(3):
        # Build the file stem once so the printed label always matches the file that was read.
        shap_name = f"{model}_s_train_r_30_test_shap_{run}"
        features = np.load(f"{shap_dir}/{shap_name}.npy")
        # Mean absolute SHAP value per feature (axis 0 is averaged away).
        avg_features = list(np.sum(np.absolute(features),axis=0)/features.shape[0])
        avg_features_20 = [avg_features[i] for i in top_20_idx]
        # np.corrcoef returns the 2x2 correlation matrix of the two vectors;
        # element [0][1] is the coefficient between them.
        res = np.corrcoef(avg_features_20,top_20_values_real)
        # The label previously said "_s_sep_train_" although the "_s_train_" file is loaded.
        print(f"{shap_name}: {res[0][1]}")
        

In [None]:
# VUMC synthetic data generated under the combined synthesis paradigm:
# for every model and run, count how many of the real-trained model's 20 most important
# features (top_20_idx) are also among the synthetic-trained model's 20 most important ones.
models = ["baseline","medgan","medbgan","emrwgan","medwgan","dpgan"]
for model in models:
    for run in range(3):
        features = np.load(f"/data/users/yanyao/myproj/synpuf/uw_vumc_syn/vumc_top_3/{model}_s_train_r_30_test_shap_{run}.npy")
        # Summed absolute SHAP value per feature.
        avg_features = np.absolute(features).sum(axis=0)
        # np.argsort is ascending, so the last 20 positions hold the largest totals.
        top_20_idx_syn = np.argsort(avg_features)[-20:]
        lst3 = sum(1 for real_idx in top_20_idx if real_idx in top_20_idx_syn)
        print(f"{model}_s_train_r_30_test_shap_{run}: {lst3}")
        
        