In [1]:
from pyhealth.datasets import MIMIC3NoteDataset
from pyhealth.datasets import split_by_patient, get_dataloader
from pyhealth.models import Agent
from pyhealth.tasks import mortality_prediction_mimic3_fn
from pyhealth.tasks import readmission_prediction_mimic3_note_fn
from pyhealth.trainer import Trainer

# STEP 1: load data
# Root directory of the local MIMIC-III v1.4 files; adjust to your environment.
MIMIC3_ROOT = "/data/physionet.org/files/mimiciii/1.4"

# presumably dev=False loads the full cohort rather than a small development
# subset, and refresh_cache=False reuses any cached parse — confirm against
# the dataset class.
base_dataset = MIMIC3NoteDataset(
    root=MIMIC3_ROOT,
    dev=False,
    refresh_cache=False,
)

# stat() prints the summary and also returns it as a string, so leaving it as a
# bare last expression shows the same text twice. The trailing semicolon
# suppresses the echoed return value.
base_dataset.stat();

  from .autonotebook import tqdm as notebook_tqdm



Statistics of base dataset (dev=False):
	- Dataset: MIMIC3NoteDataset
	- Number of patients: 34560



'\nStatistics of base dataset (dev=False):\n\t- Dataset: MIMIC3NoteDataset\n\t- Number of patients: 34560\n'

In [2]:
# Sanity check: number of patients held by the base dataset.
len(base_dataset.patients)

34560

In [3]:
# Inspect the raw attributes stored for a single patient (key '100').
# NOTE(review): the displayed dict includes the full free-text note under
# 'TEXT' — consider truncating or clearing this output before sharing.
base_dataset.patients['100'].attr_dict

{'attr': {'ROW_ID': 91,
  'SUBJECT_ID': '100',
  'GENDER': 'F',
  'DOB': '2085-08-31 00:00:00',
  'DOD': nan,
  'DOD_HOSP': nan,
  'DOD_SSN': nan,
  'EXPIRE_FLAG': 0,
  'HADM_ID': '153952',
  'ADMITTIME': Timestamp('2157-08-10 07:15:00'),
  'DISCHTIME': Timestamp('2157-08-18 19:54:00'),
  'DAYS_NEXT_ADMIT': nan,
  'NEXT_ADMITTIME': NaT,
  'ADMISSION_TYPE': 'ELECTIVE',
  'DEATHTIME': NaT,
  'OUTPUT_LABEL': 0,
  'DURATION': 8.527083333333334,
  'CHARTDATE': Timestamp('2157-08-10 00:00:00'),
  'TEXT': "CSRU ADMISSION NOTE:\n\nPT WITH HISTORY OF [** 473**] IN [**2139**] NOW WITH SEVERE AI WITH PRESERVED LV FXN.  REFERRED FROM [**Hospital3 474**] FOR MINIMALLY INVASIVE [**Hospital3 473**].\n\nPMHX:  HTN\n       [**Hospital3 473**] ([**2139**])\n       APPY ('[**06**])\n       TIA ([**2-4**]) WITH NO RESIDUAL\n\nALLG:  SULFA\n\n[**8-10**]:  PT FOR MINIMALLY INVASIVE [**Month/Day (4) 473**], BUT UPON PLACEMENT OF PA LINE, PT ASYSTOLIC X5-10 SEC.  HR RESUMED WITH CHEST COMPRESSIONS, 100MG EPIX

In [4]:
# Build the task-specific sample dataset for readmission prediction from notes.
# (The task function is imported in the notebook's top import cell.)
# NOTE(review): emb=True appears to make set_task embed the note text with
# Bio_ClinicalBERT (this cell's log shows that checkpoint being loaded) —
# confirm against set_task.
sample_dataset = base_dataset.set_task(readmission_prediction_mimic3_note_fn, emb=True)

Generating samples for readmission_prediction_mimic3_note_fn: 100%|██████████| 34560/34560 [00:00<00:00, 272416.26it/s]
Some weights of the model checkpoint at emilyalsentzer/Bio_ClinicalBERT were not used when initializing BertModel: ['cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
100%|██████████| 1671

In [5]:
# Look at one generated sample (index 11) to check what the task function produced.
# NOTE(review): the output includes the full note under 'text' — consider
# trimming or clearing it before sharing the notebook.
sample_dataset[11]

{'patient_id': '10640',
 'text': 'date of birth: sex: f service: neurosurgery allergies: patient recorded as having no known allergies to drugs attending: chief complaint: my neck hurts major surgical or invasive procedure: ct angio of the brain history of present illness: 43 yo aaf with history of migraines, came home from work last night at 1 am - was not feeling well - had episode of "shaking" per s.o. with urinary incontinence - did not seek med attention right away. came to er this early am - ct done at 7:30 am showed diffuse sah with hydrocephalus. cta done in er before admission to icu - prelim report is significant for l mca lobulated aneurysm, acomm aneurysm as well as pcomm aneurysm. past medical history: hyperlipidemia migraines chrinic neck/back pain secondary to mva 2 yrs ago social history: smokes 1ppd no etoh no recreational drug use has two children - is not legally married to s.o. works as a collector family history: cerebral aneurysm for mother physical exam: vs 130\'

In [6]:
# Split the samples 80/10/10 into train / validation / test, grouped by patient.
# The split is random, so a fixed seed is passed to keep the partition (and
# every number computed from it further down) the same across kernel restarts.
SPLIT_SEED = 42
train_dataset, val_dataset, test_dataset = split_by_patient(
    sample_dataset, [0.8, 0.1, 0.1], seed=SPLIT_SEED
)

In [8]:
# Sample counts of the train / validation / test splits, in that order.
tuple(len(split) for split in (train_dataset, val_dataset, test_dataset))

(1330, 173, 168)

In [12]:
# Wrap each split in a dataloader. Only the training loader shuffles;
# validation and test are created with shuffle=False.
BATCH_SIZE = 4  # single place to tune the batch size for all three loaders

train_dataloader = get_dataloader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_dataloader = get_dataloader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)
test_dataloader = get_dataloader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

In [14]:
# Peek at the label of the first training sample.
train_dataset[0]['label']

0