In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.utils import shuffle
from tqdm import tqdm

In [2]:
#iterate throguh dir and get all the image path names
data_dir = "/home/yasin/Desktop/causalssl/datasets/mimic/"
file_paths = []
im_names = []
patient = []
study = []

for root,dirs, files in os.walk(data_dir):
    for file in files: 
        if file.endswith("small.jpg"):
            full_path = os.path.join(root,file)
            study.append(int(os.path.basename(os.path.dirname(full_path))[1:]))
            patient.append(int(os.path.basename(os.path.dirname(os.path.dirname(full_path)))[1:]))
            file_paths.append(full_path.lstrip(data_dir))
            im_name,_= os.path.splitext(file)
            im_names.append(im_name.removeprefix(data_dir+'files/').rstrip("_small"))

df = pd.DataFrame({
    'path_preproc':file_paths,
    'image_id':im_names,
    'subject_id': patient,
    'study_id':study
})

In [3]:
df

Unnamed: 0,path_preproc,image_id,subject_id,study_id
0,files/p15/p15092875/s57299949/1f99a5c7-4b8b443...,1f99a5c7-4b8b443a-3f65f9ac-be43fafa-9034393c,15092875,57299949
1,files/p15/p15329272/s50315533/9a6e72fe-1972803...,9a6e72fe-1972803e-16b8085f-57c32513-f8224f62,15329272,50315533
2,files/p15/p15329272/s50315533/e6d56f27-88edd2b...,e6d56f27-88edd2b6-b335ea30-adb8edc9-23892ed9,15329272,50315533
3,files/p15/p15329272/s50807296/cdbe2ecf-3db9a99...,cdbe2ecf-3db9a998-ada5dd64-5ebfc986-a029153d,15329272,50807296
4,files/p15/p15084854/s55517838/bedc6bd8-4a86481...,bedc6bd8-4a864815-c1be923f-efa7f080-7c03d3c0,15084854,55517838
...,...,...,...,...
243207,files/p17/p17287974/s59278816/3cf01810-ba0b124...,3cf01810-ba0b1249-459efac3-7274c452-a9f4f6c2,17287974,59278816
243208,files/p17/p17267806/s55359345/d97f4017-5769bcd...,d97f4017-5769bcd6-64d7d578-df80bb60-0865bfd6,17267806,55359345
243209,files/p17/p17074525/s50198756/045856cf-8ce242d...,045856cf-8ce242d1-b7f98f35-2e0079dc-2b969618,17074525,50198756
243210,files/p17/p17215130/s54394902/5b2ed827-ee959b9...,5b2ed827-ee959b9c-b22eea89-0ecb6ce7-44c612e2,17215130,54394902


In [4]:
admissions_df = pd.read_csv(data_dir+"admissions.csv")
ethnicity_df = admissions_df.loc[:,['subject_id', 'race']].drop_duplicates()

v = ethnicity_df.subject_id.value_counts()
subject_id_more_than_once = v.index[v.gt(1)]

ambiguous_ethnicity_df = ethnicity_df[ethnicity_df.subject_id.isin(subject_id_more_than_once)]
inconsistent_race = ambiguous_ethnicity_df.subject_id.unique()


patients_df = pd.read_csv(data_dir+"patients.csv")
patients_df = patients_df[["subject_id", "gender", "anchor_age"]].drop_duplicates()

diagnosis_df = pd.read_csv(data_dir + 'mimic-cxr-2.0.0-chexpert.csv')

In [5]:
merge_df = pd.merge(df,diagnosis_df,on=['subject_id', 'study_id'])
merge_df = pd.merge(merge_df,ethnicity_df,on='subject_id')
merge_df = merge_df[~merge_df.subject_id.isin(inconsistent_race)]
df_cxr = pd.merge(merge_df,patients_df, on='subject_id')

In [6]:
df_cxr = df_cxr.rename(columns={'anchor_age': 'age'})

In [7]:
mask = (df_cxr.race.str.contains("BLACK", na=False))
df_cxr.loc[mask, "race"] = "Black"

mask = (df_cxr.race.str.contains("WHITE", na=False))
df_cxr.loc[mask, "race"] = "White"

mask = (df_cxr.race.str.contains("PORTUGUESE", na=False))
df_cxr.loc[mask, "race"] = "White"

mask = (df_cxr.race.str.contains("ASIAN", na=False))
df_cxr.loc[mask, "race"] = "Asian"

mask = (df_cxr.race.str.contains("HISPANIC", na=False))
df_cxr.loc[mask, "race"] = "Hispanic"

mask = (df_cxr.race.str.contains("SOUTH AMERICAN", na=False))
df_cxr.loc[mask, "race"] = "Hispanic"

In [8]:
df_cxr = df_cxr[df_cxr.race.isin(["Asian","Black","White"])]
df_cxr['race'].unique()

array(['White', 'Black', 'Asian'], dtype=object)

In [9]:
df_cxr['race_label'] = df_cxr['race']

df_cxr.loc[df_cxr['race_label'] == 'White', 'race_label'] = 0
df_cxr.loc[df_cxr['race_label'] == 'Asian', 'race_label'] = 1
df_cxr.loc[df_cxr['race_label'] == 'Black', 'race_label'] = 2

In [10]:
df_cxr.loc[df_cxr['gender'] == 'F', 'sex'] = 'Female'
df_cxr.loc[df_cxr['gender'] == 'M', 'sex'] = 'Male'

In [11]:
print(sum(df_cxr['gender'] == 'F') / len(df_cxr['gender']))

0.46243053158953246


In [12]:
df_cxr['sex_label'] = df_cxr['sex']

df_cxr.loc[df_cxr['sex_label'] == 'Male', 'sex_label'] = 0
df_cxr.loc[df_cxr['sex_label'] == 'Female', 'sex_label'] = 1

In [13]:
df_cxr['sex'].unique()

array(['Male', 'Female'], dtype=object)

In [14]:
labels = [
    'No Finding',
    'Enlarged Cardiomediastinum',
    'Cardiomegaly',
    'Lung Opacity',
    'Lung Lesion',
    'Edema',
    'Consolidation',
    'Pneumonia',
    'Atelectasis',
    'Pneumothorax',
    'Pleural Effusion',
    'Pleural Other',
    'Fracture',
    'Support Devices']

In [15]:
df_cxr['disease'] = df_cxr[labels[0]]
df_cxr.loc[df_cxr[labels[0]] == 1, 'disease'] = labels[0]
df_cxr.loc[df_cxr[labels[10]] == 1, 'disease'] = labels[10]
df_cxr.loc[df_cxr['disease'].isna(), 'disease'] = 'Other'

df_cxr['disease_label'] = df_cxr['disease']
df_cxr.loc[df_cxr['disease_label'] == labels[0], 'disease_label'] = 0
df_cxr.loc[df_cxr['disease_label'] == labels[10], 'disease_label'] = 1
df_cxr.loc[df_cxr['disease_label'] == 'Other', 'disease_label'] = 2

In [16]:
df_cxr = df_cxr.drop(df_cxr[df_cxr.disease_label == 2].index)

In [17]:
df_cxr = df_cxr.drop(columns=['study_id', 'Atelectasis','Cardiomegaly', 'Consolidation', 'Edema',
       'Enlarged Cardiomediastinum', 'Fracture', 'Lung Lesion', 'Lung Opacity',
       'No Finding', 'Pleural Effusion', 'Pleural Other', 'Pneumonia',
       'Pneumothorax', 'Support Devices', 'race', 'gender', 'sex', 'disease'])

In [18]:
df_cxr

Unnamed: 0,path_preproc,image_id,subject_id,age,race_label,sex_label,disease_label
4,files/p15/p15084854/s55517838/bedc6bd8-4a86481...,bedc6bd8-4a864815-c1be923f-efa7f080-7c03d3c0,15084854,32,0,1,0
5,files/p15/p15084854/s53449257/d02bae78-c6084e1...,d02bae78-c6084e16-a5321712-02e3dfb6-d6cd1d5e,15084854,32,0,1,0
11,files/p15/p15467022/s55971724/7c14c513-4c38def...,7c14c513-4c38def0-4de59a7a-e9f5959c-07207003,15467022,87,0,1,0
12,files/p15/p15467022/s58996752/00a4310d-27a2428...,00a4310d-27a24281-a464ab8d-eec250bf-989c11a1,15467022,87,0,1,0
13,files/p15/p15097409/s50254818/3f60cbff-662b0df...,3f60cbff-662b0dfd-efe326a4-dc040d2b-c89e96c4,15097409,53,0,0,0
...,...,...,...,...,...,...,...
189813,files/p17/p17486028/s52003134/4f8fddab-fbc72ad...,4f8fddab-fbc72ad0-f2064c47-d4626012-ae552ea4,17486028,56,0,1,1
189817,files/p17/p17013248/s53038109/60ef66a7-b493d20...,60ef66a7-b493d204-891c70fb-f986453c-91184533,17013248,46,0,1,0
189819,files/p17/p17407744/s50924930/7004730d-a1c94c9...,7004730d-a1c94c9e-2a69a336-9f3df84e-88d9cc5e,17407744,82,0,1,0
189821,files/p17/p17407744/s58264221/faeed6e6-9c051bb...,faeed6e6-9c051bbb-8e35cd07-e8b0eff9-7e1cc3ed,17407744,82,0,1,1


In [19]:
df_cxr.insert(5, "split","none", True)
unique_sub_id = df_cxr.subject_id.unique()

train_percent, valid_percent, test_percent = 0.60, 0.10, 0.30

unique_sub_id = shuffle(unique_sub_id)
value1 = (round(len(unique_sub_id)*train_percent))
value2 = (round(len(unique_sub_id)*valid_percent))
value3 = value1 + value2
value4 = (round(len(unique_sub_id)*test_percent))

In [20]:
df_cxr = shuffle(df_cxr)

train_sub_id = unique_sub_id[:value1]
validate_sub_id = unique_sub_id[value1:value3]
test_sub_id = unique_sub_id[value3:]

In [21]:
df_cxr.loc[df_cxr.subject_id.isin(train_sub_id), "split"]="train"
df_cxr.loc[df_cxr.subject_id.isin(validate_sub_id), "split"]="validate"
df_cxr.loc[df_cxr.subject_id.isin(test_sub_id), "split"]="test"

In [22]:
df_cxr.split.value_counts(normalize=True)

split
train       0.602294
test        0.299369
validate    0.098337
Name: proportion, dtype: float64

In [23]:
df_cxr.split.value_counts(normalize=False)

split
train       56930
test        28297
validate     9295
Name: count, dtype: int64

In [24]:
df_cxr.to_csv(data_dir + 'mimic.sample.csv')

df_train = df_cxr[df_cxr.split=="train"].drop(columns=["subject_id", "split"])
df_val = df_cxr[df_cxr.split=="validate"].drop(columns=["subject_id", "split"])
df_test = df_cxr[df_cxr.split=="test"].drop(columns=["subject_id", "split"])

df_train.to_csv(data_dir + 'meta/train.csv')
df_val.to_csv(data_dir + 'meta/valid.csv')
df_test.to_csv(data_dir + 'meta/test.csv')

In [25]:
props = [0.3, 0.4, 0.5]
name = 'less'
for prop in tqdm(props):
    df_cxr.to_csv(data_dir + 'mimic.sample.csv')

    df_train = df_cxr[df_cxr.split=="train"].drop(columns=["subject_id", "split"])
    df_val = df_cxr[df_cxr.split=="validate"].drop(columns=["subject_id", "split"])
    df_test = df_cxr[df_cxr.split=="test"].drop(columns=["subject_id", "split"])

    if name == "less":
        remove_n = int(len(df_train)*(1-prop))
        drop_indices = np.random.choice(df_train.index, remove_n, replace=False)
        df_train = df_train.drop(drop_indices)
        df_train.to_csv(data_dir + 'meta/'+name+'/train_'+str(prop)+'.csv')
    
    if name == "missing":
        columns_to_set_nan = ['race_label', 'sex_label', 'disease_label', 'age']
        num_values_to_set_nan = int(len(df_train)*(1-prop))
        random_indices = np.random.choice(df_train.index, size=num_values_to_set_nan, replace=False)
        df_train.loc[random_indices, columns_to_set_nan] = float("nan")
        df_train_lab = df_train.loc[list(set(df_train.index).difference(set(random_indices)))]
        df_train_unlab = df_train.loc[random_indices]
        df_train_lab.to_csv(data_dir + 'meta/'+name+'/train_lab_'+str(prop)+'.csv')
        df_train_unlab.to_csv(data_dir + 'meta/'+name+'/train_unlab_'+str(prop)+'.csv')
    
    if name == "random":
        num_values_to_set_nan = int(len(df_train)*(1-prop))
        random_indices = np.random.choice(df_train.index, size=num_values_to_set_nan, replace=False)
        df_train.loc[random_indices, 'race_label'] = float("nan")
        random_indices = np.random.choice(df_train.index, size=num_values_to_set_nan, replace=False)
        df_train.loc[random_indices, 'sex_label'] = float("nan")
        random_indices = np.random.choice(df_train.index, size=num_values_to_set_nan, replace=False)
        df_train.loc[random_indices, 'disease_label'] = float("nan")
        random_indices = np.random.choice(df_train.index, size=num_values_to_set_nan, replace=False)
        df_train.loc[random_indices, 'age'] = float("nan")
        df_train.to_csv(data_dir + 'meta/'+name+'/train_'+str(prop)+'.csv')

    
    df_val.to_csv(data_dir + 'meta/'+name+'/valid_'+str(prop)+'.csv')
    df_test.to_csv(data_dir + 'meta/'+name+'/test_'+str(prop)+'.csv')

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:01<00:00,  1.63it/s]


In [26]:
df_train_lab

NameError: name 'df_train_lab' is not defined