# Create MIMIC dataset

Using the preprocessing script from the [Wild-Time GitHub repository](https://github.com/huaxiuyao/Wild-Time/blob/main/wildtime/data/mimic.py) as a reference, we (re-)create the cohort.

Reading through `run_dip.py`, we need a file called `mimic_diagnoses.csv` with columns including:
 - diagnoses
 - procedure
 - race
 - readmission
 - real_admit_year
 
We need the following MIMIC-IV csvs:
 - patients.csv
 - admissions.csv
 - diagnoses_icd.csv
 - procedures_icd.csv
 
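```wget -r -N -c -np --user iychen --ask-password https://physionet.org/files/mimiciv/2.2/hosp/patients.csv.gz```

The command above fetches only `patients.csv.gz`; repeat it for each file listed above and decompress the downloads, since the code below reads the uncompressed `.csv` files.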

Note that `admissions.csv` no longer seems to have an `ethnicity` column; the code below uses the `race` column instead.


In [1]:
import os
import pandas as pd

In [2]:
# Directory expected to contain the MIMIC-IV CSV tables read by this notebook.
data_dir = '../mimic-iv-data/'

# Fail fast if any input table is missing. admissions.csv is checked too,
# because the preprocessing code reads it alongside the other three tables.
required_files = ['patients.csv', 'admissions.csv', 'diagnoses_icd.csv', 'procedures_icd.csv']
for file in required_files:
    if not os.path.isfile(os.path.join(data_dir, file)):
        raise ValueError(f'Please download {file} to {data_dir}')

# File name of the processed cohort.
processed_file = 'mimic_diagnoses.csv'

In [3]:
# helper functions taken from: https://github.com/huaxiuyao/Wild-Time/blob/main/wildtime/data/mimic.py
def get_anchor_year(anchor_year_group):
    """Return the starting year of an anchor-year-group string.

    The first four and the last four characters are parsed as the start and
    end years (e.g. '2008 - 2010'). The group is asserted to span exactly
    two years.
    """
    first_year, last_year = (
        int(text) for text in (anchor_year_group[:4], anchor_year_group[-4:])
    )
    assert last_year - first_year == 2
    return first_year

def assign_readmission_label(row):
    """Return the readmission label (0 or 1) for one admission row.

    ``row`` must expose ``subject_id``, ``admittime``, ``next_row_subject_id``
    and ``next_row_admittime``. The label is 1 only when the next row belongs
    to the same subject and its admit time is at most 15 days after this one;
    otherwise it is 0.
    """
    subject, admitted, following_subject, following_admitted = (
        row.subject_id,
        row.admittime,
        row.next_row_subject_id,
        row.next_row_admittime,
    )

    # A different subject on the next row means there is no follow-up admission.
    if subject != following_subject:
        return 0

    gap_days = (following_admitted - admitted).days
    return 0 if gap_days > 15 else 1

def diag_icd_to_3digit(icd):
    """Truncate a version-prefixed diagnosis code to its 3-digit category.

    ``icd`` is expected to look like 'ICD9_<code>' or 'ICD10_<code>'. The
    part after the prefix is shortened by the version-specific helper and the
    prefix is re-attached.

    Raises:
        ValueError: if ``icd`` starts with neither 'ICD9' nor 'ICD10'.
    """
    if icd[:4] == 'ICD9':
        return 'ICD9_' + diag_icd9_to_3digit(icd[5:])
    if icd[:5] == 'ICD10':
        return 'ICD10_' + diag_icd10_to_3digit(icd[6:])
    # A bare `raise` here would only produce "No active exception to reraise";
    # report the offending value instead.
    raise ValueError(f'Unrecognized ICD version prefix in diagnosis code: {icd!r}')

def diag_icd9_to_3digit(icd9):
    """Shorten an ICD-9 diagnosis code to its category prefix.

    Codes starting with 'E' keep their first four characters; all other codes
    keep their first three. A code shorter than that is printed and returned
    unchanged.
    """
    keep = 4 if icd9.startswith('E') else 3
    if len(icd9) < keep:
        print(icd9)
        return icd9
    return icd9[:keep]


def diag_icd10_to_3digit(icd10):
    """Shorten an ICD-10 diagnosis code to its first three characters.

    A code with fewer than three characters is printed and returned unchanged.
    """
    if len(icd10) < 3:
        print(icd10)
        return icd10
    return icd10[:3]

def list_join(lst):
    """Join the strings in ``lst`` into one string separated by ' <sep> '."""
    separator = ' <sep> '
    return separator.join(lst)

def proc_icd9_to_3digit(icd9):
    """Shorten an ICD-9 procedure code to its first three characters.

    A code with fewer than three characters is printed and returned unchanged.
    """
    too_short = len(icd9) < 3
    if too_short:
        print(icd9)
    return icd9 if too_short else icd9[:3]


def proc_icd10_to_3digit(icd10):
    """Shorten an ICD-10 procedure code to its first three characters.

    A code with fewer than three characters is printed and returned unchanged.
    """
    if len(icd10) >= 3:
        shortened = icd10[:3]
    else:
        print(icd10)
        shortened = icd10
    return shortened


def proc_icd_to_3digit(icd):
    """Truncate a version-prefixed procedure code to its 3-digit category.

    ``icd`` is expected to look like 'ICD9_<code>' or 'ICD10_<code>'. The
    part after the prefix is shortened by the version-specific helper and the
    prefix is re-attached.

    Raises:
        ValueError: if ``icd`` starts with neither 'ICD9' nor 'ICD10'.
    """
    if icd[:4] == 'ICD9':
        return 'ICD9_' + proc_icd9_to_3digit(icd[5:])
    if icd[:5] == 'ICD10':
        return 'ICD10_' + proc_icd10_to_3digit(icd[6:])
    # A bare `raise` here would only produce "No active exception to reraise";
    # report the offending value instead.
    raise ValueError(f'Unrecognized ICD version prefix in procedure code: {icd!r}')


Need the following csvs:
 - patients.csv
 - admissions.csv
 - diagnoses_icd.csv
 - procedures_icd.csv

In [7]:
# Patients: load the table, derive the real anchor year from the anchor year
# group, then keep only the columns used later and drop incomplete rows.
patients = pd.read_csv(os.path.join(data_dir, 'patients.csv'))
patients['real_anchor_year'] = patients['anchor_year_group'].apply(get_anchor_year)
patients = (
    patients[['subject_id', 'gender', 'anchor_age', 'anchor_year', 'real_anchor_year']]
    .dropna()
    .reset_index(drop=True)
)
# Admissions: load the table, reduce admit times to calendar dates, and drop
# rows with missing values in the columns of interest.
admissions = pd.read_csv(os.path.join(data_dir, 'admissions.csv'))
admissions['admittime'] = pd.to_datetime(admissions['admittime']).dt.date
admissions = admissions[
    ['subject_id', 'hadm_id', 'admittime', 'hospital_expire_flag', 'race', 'admission_type']
].dropna()
admissions['mortality'] = admissions['hospital_expire_flag']

# Pair every admission with the row that follows it so that the readmission
# label can be derived row by row.
# NOTE(review): rows are ordered by hadm_id before admittime, so the "next"
# row is the chronologically next admission only if hadm_id increases with
# time for a given subject. Confirm before relying on the readmission label.
admissions = admissions.sort_values(by=['subject_id', 'hadm_id', 'admittime'])
admissions['next_row_subject_id'] = admissions['subject_id'].shift(-1)
admissions['next_row_admittime'] = admissions['admittime'].shift(-1)
admissions['readmission'] = admissions.apply(assign_readmission_label, axis=1)

admissions = (
    admissions[['subject_id', 'hadm_id', 'admittime', 'mortality', 'readmission', 'race', 'admission_type']]
    .dropna()
    .reset_index(drop=True)
)

# Diagnoses ICD: build one ' <sep> '-joined string of 3-digit diagnosis
# categories per admission, ordered by seq_num.
# icd_code is read as text so that numeric-looking codes are never parsed as
# numbers, which would drop leading zeros.
diagnoses_icd = pd.read_csv(os.path.join(data_dir, 'diagnoses_icd.csv'), dtype={'icd_code': str})
diagnoses_icd = diagnoses_icd.dropna()
diagnoses_icd = diagnoses_icd.drop_duplicates()
diagnoses_icd = diagnoses_icd.sort_values(by=['subject_id', 'hadm_id', 'seq_num'])
# Prefix each code with its ICD version, e.g. 'ICD9_4019'. Column-wise string
# concatenation avoids a slow row-by-row apply over the whole table.
diagnoses_icd['icd_code'] = (
    'ICD' + diagnoses_icd['icd_version'].astype(str) + '_' + diagnoses_icd['icd_code']
)
diagnoses_icd['icd_3digit'] = diagnoses_icd['icd_code'].apply(diag_icd_to_3digit)
diagnoses_icd = diagnoses_icd.groupby(['subject_id', 'hadm_id']).agg({'icd_3digit': list_join}).reset_index()
diagnoses_icd = diagnoses_icd.rename(columns={'icd_3digit': 'diagnoses'})

# Procedures ICD: build one ' <sep> '-joined string of 3-digit procedure
# categories per admission, ordered by seq_num.
# icd_code is read as text so that numeric-looking codes are never parsed as
# numbers, which would drop leading zeros.
procedures_icd = pd.read_csv(os.path.join(data_dir, 'procedures_icd.csv'), dtype={'icd_code': str})
procedures_icd = procedures_icd.dropna()
procedures_icd = procedures_icd.drop_duplicates()
procedures_icd = procedures_icd.sort_values(by=['subject_id', 'hadm_id', 'seq_num'])
# Prefix each code with its ICD version, e.g. 'ICD9_0066'. Column-wise string
# concatenation avoids a slow row-by-row apply over the whole table.
procedures_icd['icd_code'] = (
    'ICD' + procedures_icd['icd_version'].astype(str) + '_' + procedures_icd['icd_code']
)
procedures_icd['icd_3digit'] = procedures_icd['icd_code'].apply(proc_icd_to_3digit)
procedures_icd = procedures_icd.groupby(['subject_id', 'hadm_id']).agg({'icd_3digit': list_join}).reset_index()
procedures_icd = procedures_icd.rename(columns={'icd_3digit': 'procedure'})

# Merge: attach patient demographics to every admission.
df = admissions.merge(patients, on='subject_id', how='inner')

# Difference between the admission year and the patient's anchor year. Adding
# it to the real anchor year / anchor age gives the real admission year and
# the age at admission.
years_since_anchor = df.apply(lambda row: row.admittime.year - row.anchor_year, axis=1)
df['real_admit_year'] = years_since_anchor + df['real_anchor_year']
df['age'] = years_since_anchor + df['anchor_age']

df = df[[
    'subject_id', 'hadm_id',
    'admittime', 'real_admit_year',
    'age', 'gender', 'race',
    'mortality', 'readmission', 'admission_type',
]]

# Inner joins: only admissions with both diagnoses and procedures are kept.
df = (
    df.merge(diagnoses_icd, on=['subject_id', 'hadm_id'], how='inner')
      .merge(procedures_icd, on=['subject_id', 'hadm_id'], how='inner')
)
df.to_csv(os.path.join(data_dir, 'data_preprocessed.csv'))

# Cohort Selection: keep admissions where the patient is 18 to 89 years old
# (both bounds inclusive) and write the final cohort file.
# processed_file names the file that is written, and the path is built with
# os.path.join so it stays valid when data_dir has no trailing separator.
processed_file = os.path.join(data_dir, 'mimic_diagnoses.csv')
df = df[df['age'].between(18, 89)]
df.to_csv(processed_file, index=False)