In [None]:
import pandas as pd
import sys
from tqdm import tqdm
import numpy as np

In [None]:
# Build the paths of the 16 shard files (suffixes 0 through 15).
shard_template = "../data/processed_notes/clinicalNote44M_NamesRemoved_261123_{}.csv"
all_files = [shard_template.format(i) for i in range(0,16)]

# Read every shard first, then concatenate once into a single frame
# with a fresh 0..n-1 index.
li = [pd.read_csv(filename, index_col=None, header=0) for filename in all_files]
frame = pd.concat(li, axis=0, ignore_index=True)

# Persist the merged table; the index is written as the first (unnamed) column.
frame.to_csv("../data/processed_notes/clinicalNote44M_NamesRemoved.csv")

In [None]:
# Uploading processed data to Azure data store
from azureml.fsspec import AzureMachineLearningFileSystem

# Placeholder value: replace with the real location of the researcher data
# folder in the data store before running this cell.
path_researcherdata = "PATH TO RESEARCHER DATA FOLDER IN DATA STORE"
fs = AzureMachineLearningFileSystem(path_researcherdata)

# Send the merged notes CSV to the remote path "./".
merged_notes_csv = "../data/processed_notes/clinicalNote44M_NamesRemoved.csv"
fs.upload(
    lpath=merged_notes_csv,
    rpath="./",
)

### Preparing preprocessed datasets for pretraining
Removing all notes from patients who appear in the validation or test set of the acute readmission prediction task

In [None]:
from azure.ai.ml import MLClient#, Input, command
from azure.identity import DefaultAzureCredential

# Extend the import path by "../.." so the project-local `utils` package resolves.
sys.path.append("../..")
from utils import azure_ml_configs

# Workspace coordinates come from the local config module.
workspace_id = azure_ml_configs.workspace_id
subscription_id = azure_ml_configs.subscription_id
resource_group = azure_ml_configs.resource_group
workspace_name = azure_ml_configs.workspace_name

# Get a handle to the workspace
workspace_coordinates = dict(
    subscription_id=subscription_id,
    resource_group_name=resource_group,
    workspace_name=workspace_name,
)
ml_client = MLClient(credential=DefaultAzureCredential(), **workspace_coordinates)

# Fetch version 1 of the registered notes data asset and report its path.
data_asset = ml_client.data.get(name="clinicalNote44M_NamesRemoved", version=1)
print(f"Data asset URI: {data_asset.path}")

In [None]:
# Restrict the read to the columns selected here: the patient and encounter
# keys, the note text column, and the two timestamp columns.
cols = [
    "PatientDurableKey",
    "EncounterKey",
    "text_names_removed_step2",
    "CreationInstant",
    "LastEditedInstant",
]
df = pd.read_csv(
    data_asset.path,
    usecols=cols,
)

In [None]:
# Load only the patient/encounter keys and the "set" column (the split label,
# filtered on "train"/"val"/"test" further down) from the acute readmission file.
acute_readmission_csv = "../data/acuteReadmission/clinicalNote_AcuteReadmissions_NamesRemoved_161023.csv"
acute_columns = ["PatientDurableKey", "EncounterKey", "set"]
acutedf = pd.read_csv(acute_readmission_csv, usecols=acute_columns)

In [None]:
# Unique patient ids per split of the acute readmission data, in the order
# train, val, test.
train_p, val_p, test_p = (
    acutedf.loc[acutedf["set"] == split_name, "PatientDurableKey"].unique()
    for split_name in ("train", "val", "test")
)

In [None]:
# Patients that belong to the test or val split must not leak into pretraining:
# keep only notes whose patient id is absent from both lists.
heldout_patient_ids = test_p.tolist()+val_p.tolist()
belongs_to_heldout = df.PatientDurableKey.isin(heldout_patient_ids)
notes_wo_valtest = df[~belongs_to_heldout]

# Save the remaining notes (the original row index is written as the first column).
mlm_train_csv = "../data/processed_notes/clinicalNote44M_NamesRemoved_MLM_train.csv"
notes_wo_valtest.to_csv(mlm_train_csv)

In [None]:
# Dump the pretraining notes to a plain-text file, one note per line, with no
# trailing newline after the last note.
#
# pd.read_csv loads empty text fields as NaN (a float), and writing those would
# raise a TypeError on string concatenation, so missing notes are dropped first.
mlm_train_texts = notes_wo_valtest.text_names_removed_step2.dropna().values

# An explicit encoding keeps the output independent of the platform locale;
# plain "w" is enough because the file is never read back through this handle.
with open("../data/processed_notes/clinicalNote44M_NamesRemoved_MLM_train.txt", "w", encoding="utf-8") as f:
    for i, text in enumerate(tqdm(mlm_train_texts)):
        # Write the separator before every note except the first, which is
        # equivalent to omitting the newline after the last one.
        if i:
            f.write("\n")
        f.write(text)

In [None]:
# Upload the line-per-note text file through the existing `fs` handle
# (it must already be defined by the earlier upload cell).
mlm_train_txt = "../data/processed_notes/clinicalNote44M_NamesRemoved_MLM_train.txt"
fs.upload(
    lpath=mlm_train_txt,
    rpath="./",
)

##### Dividing pretraining data into four parts with separate patients

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from tqdm import tqdm

In [None]:
# Reload the filtered pretraining notes; the first CSV column holds the row
# index that was written out with the frame, so it is used as the index again.
mlm_train_csv = "../data/processed_notes/clinicalNote44M_NamesRemoved_MLM_train.csv"
train = pd.read_csv(mlm_train_csv, index_col=0)

In [None]:
# get unique patient ids
samplelist = train["PatientDurableKey"].unique()

# Split on patient ids (not on notes) so that no patient ends up in two parts:
# first halve the shuffled ids, then halve each half again -> four groups.
split_seed = 5
train_A, train_B = train_test_split(samplelist, test_size=0.5, random_state=split_seed, shuffle=True)
train_1, train_2 = train_test_split(train_A, test_size=0.5, random_state=split_seed)
train_3, train_4 = train_test_split(train_B, test_size=0.5, random_state=split_seed)

# check num patients in each and the percentage size of each set
patient_groups = (train_1, train_2, train_3, train_4)
n_patients_total = len(samplelist)
print(*(len(group) for group in patient_groups))
print(*(len(group)/n_patients_total for group in patient_groups))

In [None]:
# Select the notes that belong to each of the four patient groups ...
train1, train2, train3, train4 = (
    train[train.PatientDurableKey.isin(patient_ids)]
    for patient_ids in (train_1, train_2, train_3, train_4)
)

# ... and store every part in its own numbered CSV file (parts 1 to 4).
part_csv_template = "../data/processed_notes/clinicalNote44M_NamesRemoved_MLM_train_randompart{}.csv"
for part_no, part_notes in enumerate((train1, train2, train3, train4), start=1):
    part_notes.to_csv(part_csv_template.format(part_no))

In [None]:
# Write each of the four parts to its own text file, one note per line, with no
# trailing newline after the last note.
part_txt_template = "../data/processed_notes/clinicalNote44M_NamesRemoved_MLM_train_randompart{}.txt"

# The loop variable is deliberately not called `df`, so a frame of that name
# defined in an earlier cell is not silently overwritten.
for part_no, part_df in enumerate([train1, train2, train3, train4], start=1):
    print(part_no)

    # pd.read_csv loads empty text fields as NaN (a float); writing those would
    # raise a TypeError on string concatenation, so missing notes are dropped.
    part_texts = part_df.text_names_removed_step2.dropna().values

    # An explicit encoding keeps the output independent of the platform locale;
    # plain "w" is enough because the file is never read back through this handle.
    with open(part_txt_template.format(part_no), "w", encoding="utf-8") as f:
        for i, text in enumerate(tqdm(part_texts)):
            # Separator goes before every note except the first.
            if i:
                f.write("\n")
            f.write(text)

In [None]:
from azureml.fsspec import AzureMachineLearningFileSystem

# Placeholder value: replace with the real location of the researcher data
# folder in the data store before running this cell.
path_researcherdata = "PATH TO RESEARCHER DATA FOLDER IN DATA STORE"
fs = AzureMachineLearningFileSystem(path_researcherdata)

# Upload the four part files (1 to 4), one after the other, to the remote path "./".
part_txt_template = "../data/processed_notes/clinicalNote44M_NamesRemoved_MLM_train_randompart{}.txt"
for part_no in range(1, 5):
    fs.upload(lpath=part_txt_template.format(part_no), rpath="./")