In [None]:
# Import azure-core elements
import azureml.core
from azureml.core.workspace import Workspace
from azureml.core import ScriptRunConfig, Environment, Experiment
from azureml.core.environment import CondaDependencies
from azureml.core import Workspace, Datastore, Dataset
from azureml.data.dataset_factory import DataType

# Connect to the Azure ML workspace described by the local config file.
workspace = Workspace.from_config()

# Look up the registered datastore that holds the patient table.
datastore_name = 'sp_data'
datastore = Datastore.get(workspace, datastore_name)

# Read the patient parquet file into a pandas DataFrame.
datastore_paths = [(datastore, '/patients.parquet')]
ds = Dataset.Tabular.from_parquet_files(path=datastore_paths)
dfP = ds.to_pandas_dataframe()

# Preview the first rows.
dfP.head()

In [None]:
import math
import umap
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import numpy as np
import pandas as pd
import torch
import transformers
from transformers import AutoTokenizer, AutoModel
from torch.utils.data import DataLoader
from datasets import Dataset, DatasetDict
# Default seaborn theme and colour palette for the notebook; the plotting cells
# further down call sns.set_style again and pass their own palettes.
sns.set(style="whitegrid")
sns.set_palette('mako_r')

In [None]:
# Build an encounter -> age-at-discharge table from the events file and the patient table.
events = pd.read_parquet("../../data/acuteReadmission/events_acute_labels.parquet")

# Acute and non-acute rows carry their encounter key in different columns:
# take EncounterKey_dis when present, otherwise fall back to EncounterKey.
# A fixed 64-bit dtype avoids platform-dependent integer width.
events["encounter"] = (
    events["EncounterKey_dis"]
    .fillna(events["EncounterKey"])
    .astype("int64")
)

# Attach the birth date of each patient (inner join: events whose patient is
# missing from dfP are dropped), then discard the duplicate key column.
event = (
    events
    .merge(
        dfP[["DurableKey", "BirthDate"]],
        left_on='PatientDurableKey',
        right_on='DurableKey',
    )
    .drop(columns='DurableKey')
)

# Age in whole years at Date_dis, using 365.25 days per year.
age_in_days = (pd.to_datetime(event.Date_dis) - pd.to_datetime(event.BirthDate)).dt.days
event['Age'] = np.floor(age_in_days / 365.25).astype(int)

# NOTE(review): later lookups expect exactly one row per encounter in age_df - confirm
# that the events file never repeats an encounter.
age_df = event[["encounter", "Age"]].copy()

In [None]:
# Location of the fine-tuned checkpoint (directory + checkpoint name).
model_path = "../../finetuning/acutereadm_finetuned_models/dischargesum/"
model_name = "psyroberta_p4_epoch12"

# Use the GPU when one is available, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Load the encoder from local files only, configured to also return
# hidden states and attention weights.
model = AutoModel.from_pretrained(
    model_path + model_name,
    local_files_only=True,
    use_safetensors=True,
    output_hidden_states=True,
    output_attentions=True,
)
model = model.to(device)

tokenizer = AutoTokenizer.from_pretrained(
    model_path + model_name,
    local_files_only=True,
)

# The tokenization below requests offset mappings, so insist on a fast tokenizer.
assert isinstance(tokenizer, transformers.PreTrainedTokenizerFast)

In [None]:
from azure.ai.ml import MLClient#, Input, command
from azure.identity import DefaultAzureCredential
import azure_ml_configs

# Run options for this notebook.
discharge_notes_only = True
text_column_name = "text_names_removed_step2"

# Workspace coordinates come from the local azure_ml_configs module.
workspace_id = azure_ml_configs.workspace_id
subscription_id = azure_ml_configs.subscription_id
resource_group = azure_ml_configs.resource_group
workspace_name = azure_ml_configs.workspace_name

# Handle to the Azure ML workspace, authenticated with the default credential chain.
ml_client = MLClient(
    credential=DefaultAzureCredential(),
    subscription_id=subscription_id,
    resource_group_name=resource_group,
    workspace_name=workspace_name,
)

# Resolve the registered data asset that holds the clinical notes.
data_asset = ml_client.data.get(name="clinicalNote_AcuteReadmission", version=1)

print(f"Data asset URI: {data_asset.path}")

data_path = data_asset.path

In [None]:
# Loading and preparing data: one concatenated text per (patient, encounter).
cols = [text_column_name, "Acute", "set", "Type", "PatientDurableKey", "EncounterKey", "CreationInstant"]

# Read only the needed columns, order the notes by patient, encounter and creation
# time, and rename the two main columns of interest. Built as one chain so that
# re-running the cell never operates on a half-mutated frame.
df = (
    pd.read_csv(data_path, usecols=cols)
    .sort_values(by=["PatientDurableKey", "EncounterKey", "CreationInstant"])
    .rename(columns={text_column_name: "text", "Acute": "label"})
)

if discharge_notes_only:
    # Keep only notes whose Type matches the discharge-summary pattern; rows with a
    # missing Type are excluded (na=False).
    # NOTE(review): the second alternative looks like a mis-decoded "é". It is left
    # untouched because changing which rows are selected would change the row order
    # that the saved embeddings were generated from - confirm against the actual
    # values of the Type column before editing.
    is_discharge_summary = df["Type"].str.contains("Udskrivningsresume|Udskrivningsresum√©", na=False)
    df = df[is_discharge_summary].copy()

# Concatenate the texts of each encounter, separated by the tokenizer's separator token.
df = (
    df.groupby(["PatientDurableKey", "EncounterKey", "label", "set"])["text"]
    .apply(f'{tokenizer.sep_token}'.join)
    .reset_index()
)

In [None]:
# Tokenizing text: wrap each split of the prepared frame in a Hugging Face dataset.

# The bare name `Dataset` is bound by both azureml.core and datasets in this notebook,
# so which class it refers to depends on which import cell ran last. Use an explicit
# alias here so this cell always gets the Hugging Face class.
from datasets import Dataset as HFDataset

data_dict = {
    "train": HFDataset.from_pandas(df[df["set"] == "train"]),
    "validation": HFDataset.from_pandas(df[df["set"] == "val"]),
    "test": HFDataset.from_pandas(df[df["set"] == "test"]),
}

raw_datasets = DatasetDict(data_dict)

# NOTE(review): this rebinds the name that the data-loading cell uses to pick the
# raw text column from the CSV; re-running that cell afterwards without resetting
# the name would request a column called "text".
text_column_name = "text"

def tokenize_function(examples):
    """Tokenize a batch of encounter texts into fixed-length chunks.

    Each text is encoded with the module-level ``tokenizer`` to a maximum of 512
    tokens, padded to that length. Overflowing tokens are returned as additional
    chunks with a stride of 32, so one input row can yield several output rows;
    label, patient id and encounter id are repeated for every chunk.

    Args:
        examples: batch mapping with the keys "text", "label",
            "PatientDurableKey" and "EncounterKey", each a sequence of equal length.

    Returns:
        dict of equally long lists with the keys "inputs", "attn_masks",
        "labels", "patient_id", "encounter_id" and "text_split" (the decoded
        text of each chunk).

    Raises:
        ValueError: if the collected output columns differ in length.
    """
    input_ids = []
    attention_masks = []
    labs = []
    patientids = []
    encounterids = []
    texts = []

    rows = zip(
        examples["text"],
        examples["label"],
        examples["PatientDurableKey"],
        examples["EncounterKey"],
    )
    for text, label, patient_id, encounter_id in rows:
        # Calling the tokenizer directly is the current API for what
        # encode_plus did for a single text.
        encoded_dict = tokenizer(
            text,  # Text to encode
            add_special_tokens=True,  # Add '[CLS]' and '[SEP]' or equivalent for roberta
            max_length=512,  # Pad & truncate all chunks to this length.
            padding="max_length",
            truncation=True,
            return_overflowing_tokens=True,  # return the tokens beyond 512 as extra chunks
            return_offsets_mapping=True,
            stride=32,  # The stride used when the context is too large and is split across several features.
            return_attention_mask=True,  # Construct attn. masks.
            return_tensors='pt',  # Return pytorch tensors.
        )
        # One output row per chunk of this text.
        for chunk_ids, chunk_mask in zip(encoded_dict['input_ids'], encoded_dict['attention_mask']):
            input_ids.append(chunk_ids)
            texts.append(tokenizer.decode(chunk_ids))
            # The attention mask simply differentiates padding from non-padding.
            attention_masks.append(chunk_mask)
            labs.append(label)
            patientids.append(patient_id)
            encounterids.append(encounter_id)

    sample = {"inputs": input_ids,
              "attn_masks": attention_masks,
              "labels": labs,
              "patient_id": patientids,
              "encounter_id": encounterids,
              "text_split": texts}

    # Every output column (including the decoded texts) must describe the same chunks.
    column_lengths = {name: len(values) for name, values in sample.items()}
    if len(set(column_lengths.values())) > 1:
        raise ValueError(f"tokenized columns differ in length: {column_lengths}")
    return sample


# Tokenize every split; the original columns are dropped because tokenize_function
# returns a different number of rows than it receives.
tokenized_datasets = raw_datasets.map(
    tokenize_function,
    batched=True,
    num_proc=None,
    remove_columns=raw_datasets['validation'].column_names,
    desc="Running tokenizer on every text in dataset",
)

# Expose the numeric columns as PyTorch tensors in all three splits.
tensor_columns = ['inputs', 'attn_masks', 'labels', 'patient_id', 'encounter_id']
for split in ("train", "validation", "test"):
    tokenized_datasets[split].set_format(type='pt', columns=tensor_columns)

traindata = tokenized_datasets["train"]
valdata = tokenized_datasets["validation"]
testdata = tokenized_datasets["test"]

In [None]:
# Load the saved train-set embeddings (generated in embeddings_analyses_PCA_KNN.ipynb).
# NOTE(review): both arrays are indexed further down with row positions derived from
# traindata["encounter_id"], so row i is presumably the embedding of traindata row i -
# confirm that the files were produced from the same tokenized dataset.

train_cls_embeddings = np.load("train_cls_embeddings.npy")
train_mean_pooled_embeddings = np.load("train_mean_pooled_embeddings.npy")

In [None]:
def agegroups(x):
    """Map an age in years to a coarse age-group label.

    Args:
        x: age as a number, or a marker for a missing age ("Unknown", None or NaN).

    Returns:
        "Unknown" for a missing age, otherwise one of "Below 18", "18-34",
        "35-54" or "Above 54".
    """
    # Missing ages: None and NaN would otherwise fail or fall through every
    # comparison (NaN compares False to everything) and end up as "Above 54".
    if x is None:
        return "Unknown"
    if x == "Unknown":
        return x
    if x != x:  # only true for NaN
        return "Unknown"

    if x < 18:
        return "Below 18"
    if x < 35:
        return "18-34"
    if x < 55:
        return "35-54"
    return "Above 54"

In [None]:
# Encounter-level predictions: average the positive-class probability over all rows
# of an encounter and threshold it at 0.5 (values below 0.5 -> 0, everything else -> 1).
preds = pd.read_csv("../../result_files/dischargesum_psyroberta_p4_epoch12_AR_train_results.csv", usecols=["pos_prob", "pid", "eid"])
preds = preds.groupby(by=["eid","pid"])["pos_prob"].mean().reset_index()
preds["pred"] = np.where(preds["pos_prob"] < 0.5, 0, 1)

train_labels = traindata["labels"]
train_eids = traindata["encounter_id"]
train_pids = traindata["patient_id"]

_REQUIRED = object()  # sentinel: "no default, a missing key is an error"


def _rows_per_key(keys):
    """Return (sorted unique keys, list of ascending row-index arrays, one per key)."""
    keys = np.asarray(keys)
    unique_keys, inverse, counts = np.unique(keys, return_inverse=True, return_counts=True)
    # A stable sort keeps the row positions of each key in ascending order.
    order = np.argsort(inverse.reshape(-1), kind="stable")
    stops = np.cumsum(counts)
    starts = stops - counts
    return unique_keys, [order[start:stop] for start, stop in zip(starts, stops)]


def _lookup_unique(frame, key_col, value_col, keys, default=_REQUIRED):
    """Look up frame[value_col] for every key, requiring at most one row per key.

    Keys without a row yield `default`; without a default they raise ValueError.
    A requested key that matches several rows always raises ValueError.
    """
    keys = np.asarray(keys).tolist()
    hits = frame.loc[frame[key_col].isin(keys), [key_col, value_col]]
    if hits[key_col].duplicated().any():
        raise ValueError(f"several rows share one {key_col!r} value; expected exactly one")
    mapping = dict(zip(hits[key_col].tolist(), hits[value_col].tolist()))
    if default is _REQUIRED:
        missing = [key for key in keys if key not in mapping]
        if missing:
            raise ValueError(f"{len(missing)} {key_col!r} values have no row, e.g. {missing[:5]}")
        return [mapping[key] for key in keys]
    return [mapping.get(key, default) for key in keys]


# Group the chunk rows of the training set by encounter once, instead of scanning
# all encounter ids again for every encounter and every derived array.
_train_encounter_ids, _encounter_rows = _rows_per_key(train_eids)
_label_array = np.asarray(train_labels)
_pid_array = np.asarray(train_pids)

# Per-encounter aggregates, ordered by ascending encounter id.
train_encounter_embs_cls = np.array([train_cls_embeddings[rows].mean(axis=0) for rows in _encounter_rows])
train_encounter_embs_mean_pooled = np.array([train_mean_pooled_embeddings[rows].mean(axis=0) for rows in _encounter_rows])
train_encounter_labs = np.array([_label_array[rows].mean(axis=0) for rows in _encounter_rows])
train_num_notes_in_encounter = np.array([len(rows) for rows in _encounter_rows])
# Patient id of the first chunk of each encounter.
train_encounter_pid = np.array([_pid_array[rows[0]] for rows in _encounter_rows])

# Every encounter must have exactly one aggregated prediction row.
train_probs = np.array(_lookup_unique(preds, "eid", "pos_prob", _train_encounter_ids))
train_preds = np.array(_lookup_unique(preds, "eid", "pred", _train_encounter_ids))

# Demographics; patients or encounters without a matching row are marked "Unknown".
intersection_train = set(train_encounter_pid.tolist()).intersection(set(dfP.DurableKey.values.tolist()))
train_sex = _lookup_unique(dfP, "DurableKey", "Sex", train_encounter_pid, default="Unknown")
train_eth = _lookup_unique(dfP, "DurableKey", "Ethnicity", train_encounter_pid, default="Unknown")
train_age = _lookup_unique(age_df, "encounter", "Age", _train_encounter_ids, default="Unknown")
train_age_groups = [agegroups(i) for i in train_age]

print(train_encounter_embs_cls.shape)
print(train_encounter_embs_mean_pooled.shape)
print(train_encounter_labs.shape)
print(train_num_notes_in_encounter.shape)
print(train_encounter_pid.shape)

In [None]:
# Diagnosis codes for the training set, loaded from a previously saved array.
# NOTE(review): the values are stored next to the per-encounter arrays in the UMAP
# plot data, so they are presumably one code per encounter in ascending encounter-id
# order - confirm how train_diagnosis.npy was generated.
train_diagnosis = np.load("train_diagnosis.npy")

def skscode_to_diagnosis(sks):
    """Translate a diagnosis code string into a coarse diagnosis group.

    The code is matched by prefix; the first matching rule wins and codes that
    match no rule are reported as "Other".

    Args:
        sks: diagnosis code string, e.g. "DF200".

    Returns:
        The name of the diagnosis group as a string.
    """
    # Ordered rules: the specific "DF20" must be tested before the broader "DF2".
    prefix_rules = (
        (("DF20",), "Schizophrenia"),
        (("DF2",), "Other psychosis"),
        (("DF30", "DF31"), "Bipolar/manic"),
        (("DF32", "DF33"), "Depression"),
        (("DF40", "DF41", "DF42"), "Anxiety/OCD"),
        (("DF6",), "Personality disorder"),
        (("DF1",), "SUD"),
    )
    for prefixes, group in prefix_rules:
        if sks.startswith(prefixes):
            return group
    return "Other"

# Coarse grouping: characters 2-3 of every code that starts with "DF", else "Other".
train_diagnosis_simple = np.array(
    [code[1:3] if code.startswith("DF") else "Other" for code in train_diagnosis]
)

# Named diagnosis group for every code.
train_diagnosis_specific = np.array(
    [skscode_to_diagnosis(code) for code in train_diagnosis]
)

In [None]:
import pickle
from itertools import product

# Hyperparameter grid for the UMAP projections.
neighbors = [5, 20, 80, 320]
min_dists = [0.0125, 0.05, 0.2, 0.8]
metrics = ["cosine", "euclidean"]

# UMAP is stochastic; a fixed seed makes the saved projections reproducible across
# runs (at the cost of UMAP's parallel execution).
UMAP_RANDOM_STATE = 42

# Per-encounter metadata stored next to every projection; it does not depend on the
# UMAP settings, so it is built once.
plot_metadata = {
    'Label': [int(i) for i in train_encounter_labs],
    'Num notes': train_num_notes_in_encounter,
    "Sex": train_sex,
    "Ethnicity": train_eth,
    "Age": train_age_groups,
    "Prediction": train_preds,
    "Probability": train_probs,
    "Diagnosis": train_diagnosis_simple,
    "Diagnosis_specific": train_diagnosis_specific,
}

# Fail early if any metadata column does not line up with the embeddings, instead of
# pickling data that cannot be plotted later.
n_encounters = len(train_encounter_embs_mean_pooled)
mismatched = {name: len(values) for name, values in plot_metadata.items() if len(values) != n_encounters}
if mismatched:
    raise ValueError(f"metadata columns do not match {n_encounters} encounters: {mismatched}")

for k, j, m in product(neighbors, min_dists, metrics):
    reducer = umap.UMAP(n_neighbors=k, min_dist=j, metric=m, random_state=UMAP_RANDOM_STATE)
    umap_mean_pooled = reducer.fit_transform(train_encounter_embs_mean_pooled)

    plot_data_mean_pooled = {
        'UMAP 1': umap_mean_pooled[:, 0],
        'UMAP 2': umap_mean_pooled[:, 1],
        **plot_metadata,
    }
    # One pickle per (n_neighbors, min_dist, metric) combination.
    with open('umap_train_k{}_mindist{}_{}.pkl'.format(k,j,m), 'wb') as f:
        pickle.dump(plot_data_mean_pooled, f)

In [None]:
# 4x4 grid of the saved UMAP projections for one metric, coloured by the true label.
sns.set_style("white", {"axes.edgecolor": ".8"})

# Two shades of the reversed "mako" palette, one per hue level.
mako_shades = sns.color_palette(palette='mako_r', n_colors=6)
c0, c1 = mako_shades[0], mako_shades[4]
colors = [c0, c1]

neighbors = [5, 20, 80, 320]
min_dists = [0.0125, 0.05, 0.2, 0.8]
metric = "euclidean"

fig, ax = plt.subplots(4, 4, figsize=(10, 10))

# Columns follow n_neighbors, rows follow min_dist.
for i, k in enumerate(neighbors):
    for j, m_dist in enumerate(min_dists):
        panel = ax[j, i]
        panel.set_xticks([])
        panel.set_yticks([])
        # Column title and row label are (re)applied on every pass, before drawing.
        ax[0, i].set_title("n_neighbors={}".format(k), fontsize=9)
        ax[j, 0].set_ylabel("min_dist={}".format(m_dist), fontsize=9)

        # Projections were pickled by this notebook; only load files you trust.
        with open('umap_train_k{}_mindist{}_{}.pkl'.format(k,m_dist,metric), 'rb') as f:
            plot_data_mean_pooled = pickle.load(f)

        sns.scatterplot(
            data=plot_data_mean_pooled,
            x='UMAP 1',
            y='UMAP 2',
            hue='Label',
            s=0.5,
            palette=colors,
            ax=panel,
        )
        # Hide the per-panel legend; one shared legend is added below.
        panel.legend([], [], frameon=False)

# Shared legend built from the handles of the current axes.
handles, labels = plt.gca().get_legend_handles_labels()
by_label = dict(zip(labels, handles))
fig.legend(by_label.values(), by_label.keys(), loc=(0.42, 0.92), title="Label", ncol=2)
plt.subplots_adjust(wspace=0.05, hspace=0.05)

plt.savefig("../../output/UMAP_train_{}_label.pdf".format(metric), bbox_inches="tight")
plt.savefig("../../output/UMAP_train_{}_label.png".format(metric), bbox_inches="tight")

In [None]:
# 4x4 grid of the saved UMAP projections for one metric, coloured by the model prediction.
sns.set_style("white", {"axes.edgecolor": ".8"})

# Two shades of the reversed "mako" palette, one per hue level.
mako_shades = sns.color_palette(palette='mako_r', n_colors=6)
c0, c1 = mako_shades[0], mako_shades[4]
colors = [c0, c1]

neighbors = [5, 20, 80, 320]
min_dists = [0.0125, 0.05, 0.2, 0.8]
metric = "euclidean"

fig, ax = plt.subplots(4, 4, figsize=(10, 10))

# Columns follow n_neighbors, rows follow min_dist.
for i, k in enumerate(neighbors):
    for j, m_dist in enumerate(min_dists):
        panel = ax[j, i]
        panel.set_xticks([])
        panel.set_yticks([])
        # Column title and row label are (re)applied on every pass, before drawing.
        ax[0, i].set_title("n_neighbors={}".format(k), fontsize=9)
        ax[j, 0].set_ylabel("min_dist={}".format(m_dist), fontsize=9)

        # Projections were pickled by this notebook; only load files you trust.
        with open('umap_train_k{}_mindist{}_{}.pkl'.format(k,m_dist,metric), 'rb') as f:
            plot_data_mean_pooled = pickle.load(f)

        sns.scatterplot(
            data=plot_data_mean_pooled,
            x='UMAP 1',
            y='UMAP 2',
            hue='Prediction',
            s=0.5,
            palette=colors,
            ax=panel,
        )
        # Hide the per-panel legend; one shared legend is added below.
        panel.legend([], [], frameon=False)

# Shared legend built from the handles of the current axes.
handles, labels = plt.gca().get_legend_handles_labels()
by_label = dict(zip(labels, handles))
fig.legend(by_label.values(), by_label.keys(), loc=(0.42, 0.92), title="Prediction", ncol=2)
plt.subplots_adjust(wspace=0.05, hspace=0.05)

plt.savefig("../../output/UMAP_train_{}_prediction.pdf".format(metric), bbox_inches="tight")
plt.savefig("../../output/UMAP_train_{}_prediction.png".format(metric), bbox_inches="tight")

In [None]:
# 4x4 grid of the saved UMAP projections for one metric, coloured by diagnosis group.
sns.set_style("white", {"axes.edgecolor": ".8"})

# Seven hand-picked entries of the "colorblind" palette; black is appended for "Other".
colors = sns.color_palette(palette='colorblind', n_colors=11)
colors = [colors[0]] + colors[2:5] + colors[7:10]

neighbors = [5, 20, 80, 320]
min_dists = [0.0125, 0.05, 0.2, 0.8]
metric = "cosine"

fig, ax = plt.subplots(4, 4, figsize=(10, 10))

# Columns follow n_neighbors, rows follow min_dist.
for i, k in enumerate(neighbors):
    for j, m_dist in enumerate(min_dists):
        panel = ax[j, i]
        panel.set_xticks([])
        panel.set_yticks([])
        # Column title and row label are (re)applied on every pass, before drawing.
        ax[0, i].set_title("n_neighbors={}".format(k), fontsize=9)
        ax[j, 0].set_ylabel("min_dist={}".format(m_dist), fontsize=9)

        # Projections were pickled by this notebook; only load files you trust.
        with open('umap_train_k{}_mindist{}_{}.pkl'.format(k,m_dist,metric), 'rb') as f:
            plot_data_mean_pooled = pickle.load(f)

        sns.scatterplot(
            data=plot_data_mean_pooled,
            x='UMAP 1',
            y='UMAP 2',
            hue='Diagnosis_specific',
            palette=colors+["black"],
            # The named groups are listed in reverse, with "Other" always last.
            hue_order=['Anxiety/OCD',
                       'Personality disorder',
                       'Bipolar/manic',
                       'Depression',
                       'Other psychosis',
                       'SUD',
                       'Schizophrenia'
                       ][::-1]+["Other"],
            s=0.5,
            ax=panel,
        )
        # Hide the per-panel legend; one shared legend is added below.
        panel.legend([], [], frameon=False)

# Shared legend built from the handles of the current axes.
handles, labels = plt.gca().get_legend_handles_labels()
by_label = dict(zip(labels, handles))
fig.legend(by_label.values(), by_label.keys(), loc=(0.1, 0.92), title="Diagnosis", ncol=4)
plt.subplots_adjust(wspace=0.05, hspace=0.05)

plt.savefig("../../output/UMAP_train_{}_diagnosis_specific.pdf".format(metric), bbox_inches="tight")
plt.savefig("../../output/UMAP_train_{}_diagnosis_specific.png".format(metric), bbox_inches="tight")