<a href="https://colab.research.google.com/github/tanoManzo/mimic_trajectories/blob/dev_jamia/MIMIC_TRAJECTORY_NICU_01.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# MIMIC III v1.4 dataset, Table Note Events

#### load notes dataframe

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
import numpy as np

PATH_NOTES = "/content/drive/MyDrive/NIH/Data/NOTEEVENTS.csv"
df_notes = pd.read_csv(PATH_NOTES)
df_notes['CGID'] = df_notes['CGID'].astype('Int64')
df_notes.sample(5)

### neonatal notes

In [None]:
file_name = "/content/drive/MyDrive/NIH/Data/ICUSTAYS.csv"
df_icu = pd.read_csv(file_name)
df_icu

In [None]:
def subject_ids_nicu(x):
  condition = (x['FIRST_CAREUNIT']=='NICU') & (x['LAST_CAREUNIT']=='NICU')
  ids = x[condition]['SUBJECT_ID'].unique()
  return ids

# get nicu notes
df_notes_nicu = df_notes[df_notes['SUBJECT_ID'].isin(subject_ids_nicu(df_icu))]
df_notes_nicu

## pre-processing


In [None]:
df_notes_nicu['CATEGORY'].value_counts()

remove cgid nan

In [None]:
df_notes_nicu_with_cgid = df_notes_nicu[~df_notes_nicu['CGID'].isna()].reset_index(drop=True)
df_notes_nicu_with_cgid['CATEGORY'].value_counts()

## Exploration

In [None]:
def get_notes_min_num_per_cg_pt(df_to_infer, num=1, opt = 0):
  """
  Fuction to remove/get cg-pt notes with a minimum or egual number

  Parameters
    ----------
    df_to_infer: dataframe
      contains the columns 'CGID', 'SUBJECT_ID' and 'TEXT'
    num: int
      minimum or egual number of notes required for cg-pt pair
    opt: bool
      0 is for minimum of num, 1 for egual to num

  Returns
    -------
    Dataframe with the required minimum/egual number of notes for cg-pt pair

  Examples
    -------
    At least 2 notes for cg-pt pair:
    >>> get_notes_min_num_per_cg_pt(df_notes_nicu_with_cgid,1,0)

    Only gc-pt with 5 notes:
    >>> get_notes_min_num_per_cg_pt(df_notes_nicu_with_cgid,5,1)
  """
  group_cg_pt = df_to_infer.groupby(['CGID', 'SUBJECT_ID'])
  if opt:
    note_condition  = group_cg_pt['TEXT'].transform('count') == num
  else:
    note_condition  = group_cg_pt['TEXT'].transform('count') > num
  return df_notes_nicu_with_cgid[note_condition].reset_index(drop=True)

In [None]:
# at least 2 notes cg patient
df_notes_nicu_with_cgid_min_note = get_notes_min_num_per_cg_pt(df_notes_nicu_with_cgid)
info_cg_pt = df_notes_nicu_with_cgid_min_note.groupby(['CGID', 'SUBJECT_ID'])['TEXT'].count().describe()

# cg number
n_cg = df_notes_nicu_with_cgid_min_note['CGID'].nunique()
n_pt = df_notes_nicu_with_cgid_min_note['SUBJECT_ID'].nunique()
n_notes = len(df_notes_nicu_with_cgid_min_note)
print(f"number of: cg={n_cg}, pt={n_pt}, av_notes_cg_per_pt={info_cg_pt['mean']:.2f}, notes={n_notes}")

### select notes

In [None]:
df_notes_selected = get_notes_min_num_per_cg_pt(df_notes_nicu_with_cgid,round(info_cg_pt['mean']),1)
cg_ids = df_notes_selected.CGID.unique()

### save dataframe

whole cgs 5 notes cg-pt (av. # notes)

In [18]:
path_to_save = '/content/drive/MyDrive/NIH/Data/trajectory/'
name_to_save = 'NICU_pt5notes.csv'
df_notes_selected.to_csv(path_to_save+name_to_save,index=False)

50 cg 5 notes cg-pt (av. # notes)

In [19]:
condition = df_notes_selected['CGID'].isin(cg_ids[:50])
path_to_save = '/content/drive/MyDrive/NIH/Data/trajectory/'
name_to_save = 'NICU_50cg_pt5notes.csv'
df_notes_selected[condition].to_csv(path_to_save+name_to_save,index=False)

## Attitude


libraries

In [10]:
! pip install transformers -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m20.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m20.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m47.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m43.9 MB/s[0m eta [36m0:00:00[0m
[?25h

In [11]:
import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModel
from transformers import AutoModelForSequenceClassification
from sklearn.metrics.pairwise import cosine_similarity

models

In [12]:
# model name from huggingface.co/model name_id:model_name
models_name = {
  'roberta':'tanoManzo/roberta-attitude',
#  'distilbert':'tanoManzo/distilbert-attitude',
#  'minilm':'tanoManzo/minilm-attitude',
#  'bloom':'tanoManzo/bloom-attitude'
}

# load tokenizer and model
tokenizers = [AutoTokenizer.from_pretrained(model_name) for model_name in models_name.values()]
models= [AutoModelForSequenceClassification.from_pretrained(model_name) for model_name in models_name.values()]

Downloading (…)okenizer_config.json:   0%|          | 0.00/1.60k [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/280 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.04k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/499M [00:00<?, ?B/s]

get labels and score

In [13]:
def create_sentiment_labels(df_to_infer, batch_size=1):

  # parameters to set
  total_notes = len(df_to_infer)
  iterations = int(len(df_to_infer)/batch_size)

  # start label creation
  l_scores = {k:[] for k in models_name.keys()} # dict Model: labels
  l_labels = {k:[] for k in models_name.keys()} # dict Model: scoress

  for iter in range(iterations):
    idx_start = batch_size*iter     # first window sentence idx in the dataframe
    idx_end = batch_size*(iter+1)   # last window sentence idx in the dataframe

    # get row sentences for batch
    list_of_sentences = list(df_to_infer['TEXT'].iloc[idx_start:idx_end].values)


    for (index, (key, value)) in enumerate(models_name.items()):
      # tokenized input for the model
      tokenizer = tokenizers[index]
      batch = tokenizer(list_of_sentences, # list of sentence
                      padding=True, # add if short
                      truncation=True, # remove if long
                      max_length=512, # sent length
                      return_tensors="pt" # to return pytorch tensor (NO for TF)
                      )


      model = models[index]
      with torch.no_grad():
        outputs = model(**batch)
        predictions = F.softmax(outputs.logits, dim=1)
        label_ids = torch.argmax(predictions, dim=1)

        labels = [model.config.id2label[label_id] for label_id in label_ids.tolist()]
        scores = [round(float(predictions[idx][label_id].item()),4) for idx, label_id in enumerate(label_ids.tolist())]
        l_labels[key].extend(labels)
        l_scores[key].extend(scores)

  return l_labels, l_scores

In [26]:
condition = df_notes_selected['CGID'].isin(cg_ids[:50])
l_labels, l_scores = create_sentiment_labels(df_notes_selected[condition],50)
l_labels, l_scores

({'roberta': ['Overall Neutral Note',
   'Overall Neutral Note',
   'Overall Neutral Note',
   'Overall Neutral Note',
   'Overall Neutral Note',
   'Overall Neutral Note',
   'Overall Neutral Note',
   'Overall Neutral Note',
   'Overall Neutral Note',
   'Overall Neutral Note',
   'Overall Neutral Note',
   'Overall Neutral Note',
   'Overall Neutral Note',
   'Overall Neutral Note',
   'Overall Neutral Note',
   'Overall Neutral Note',
   'Overall Neutral Note',
   'Overall Neutral Note',
   'Overall Neutral Note',
   'Overall Neutral Note',
   'Overall Neutral Note',
   'Overall Neutral Note',
   'Overall Neutral Note',
   'Overall Neutral Note',
   'Overall Neutral Note',
   'Overall Neutral Note',
   'Overall Neutral Note',
   'Overall Neutral Note',
   'Overall Neutral Note',
   'Overall Neutral Note',
   'Overall Neutral Note',
   'Overall Neutral Note',
   'Overall Neutral Note',
   'Overall Neutral Note',
   'Overall Neutral Note',
   'Overall Neutral Note',
   'Overall Neutr

In [25]:
condition = df_notes_selected['CGID'].isin(cg_ids[:50])
df_notes_selected[condition]

Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,CHARTDATE,CHARTTIME,STORETIME,CATEGORY,DESCRIPTION,CGID,ISERROR,TEXT
0,1691704,862,186550.0,2127-03-16,2127-03-16 12:30:00,2127-03-16 12:35:00,Nursing/other,Report,18056,,"Neonatology Attending Note\nDOL# 130, CGA 45 w..."
1,1692878,962,124603.0,2140-02-11,2140-02-11 09:08:00,2140-02-11 09:09:00,Nursing/other,Report,19199,,Fellow Physical Examination\nGen: Alert\nHEENT...
2,1698761,1437,136436.0,2190-10-12,2190-10-12 09:51:00,2190-10-12 09:57:00,Nursing/other,Report,17335,,Neonatology Attending\nDOL 1\n\n[**Known lastn...
3,1698846,1456,154330.0,2190-11-28,2190-11-28 07:12:00,2190-11-28 07:20:00,Nursing/other,Report,14815,,Respiratory Care\nBaby rec'd on HFOV with MAP ...
4,1697982,1373,122508.0,2158-04-08,2158-04-08 15:30:00,2158-04-08 15:45:00,Nursing/other,Report,14415,,NPN\n\n\n#1 Resp:\nO: Remains in RA with sats ...
...,...,...,...,...,...,...,...,...,...,...,...
24251,2079354,32146,172363.0,2111-06-08,2111-06-08 09:37:00,2111-06-08 09:42:00,Nursing/other,Report,17335,,Neonatology Attending\nDOL 5 / PMA 40-6/7 week...
24255,2076757,31736,144915.0,2121-12-30,2121-12-30 13:59:00,2121-12-30 14:04:00,Nursing/other,Report,15318,,CLinical Nutrition\nO:\n~30 [**11-26**] wk CGA...
24262,2077837,31884,139535.0,2111-11-23,2111-11-23 12:59:00,2111-11-23 13:01:00,Nursing/other,Report,19211,,"Neonatology\nDOL #6, CGA 38 1/7 weeks.\n\nCVR:..."
24266,2077305,31804,143960.0,2116-07-24,2116-07-24 16:47:00,2116-07-24 16:52:00,Nursing/other,Report,19296,,NPN/0700-1900\n\n\n#1 FEN: TF ^ to 150cc/k/d o...


In [27]:
import pickle

path_to_save = "/content/drive/MyDrive/NIH/tmp results/"
# save dictionary to person_data.pkl file
with open(path_to_save+'l_labels.pkl', 'wb') as fp:
    pickle.dump(l_labels, fp)
    print('dictionary saved successfully to file')

# save dictionary to person_data.pkl file
with open(path_to_save+'l_scores.pkl', 'wb') as fp:
    pickle.dump(l_scores, fp)
    print('dictionary saved successfully to file')

dictionary saved successfully to file
dictionary saved successfully to file


In [28]:
import pickle

path_to_save = "/content/drive/MyDrive/NIH/tmp results/"
# Read dictionary pkl file
with open(path_to_save+'l_labels.pkl', 'rb') as fp:
    person = pickle.load(fp)
    print('Person dictionary')
    print(person)

Person dictionary
{'roberta': ['Overall Neutral Note', 'Overall Neutral Note', 'Overall Neutral Note', 'Overall Neutral Note', 'Overall Neutral Note', 'Overall Neutral Note', 'Overall Neutral Note', 'Overall Neutral Note', 'Overall Neutral Note', 'Overall Neutral Note', 'Overall Neutral Note', 'Overall Neutral Note', 'Overall Neutral Note', 'Overall Neutral Note', 'Overall Neutral Note', 'Overall Neutral Note', 'Overall Neutral Note', 'Overall Neutral Note', 'Overall Neutral Note', 'Overall Neutral Note', 'Overall Neutral Note', 'Overall Neutral Note', 'Overall Neutral Note', 'Overall Neutral Note', 'Overall Neutral Note', 'Overall Neutral Note', 'Overall Neutral Note', 'Overall Neutral Note', 'Overall Neutral Note', 'Overall Neutral Note', 'Overall Neutral Note', 'Overall Neutral Note', 'Overall Neutral Note', 'Overall Neutral Note', 'Overall Neutral Note', 'Overall Neutral Note', 'Overall Neutral Note', 'Overall Neutral Note', 'Overall Neutral Note', 'Overall Neutral Note', 'Overall 


## Appendix

#### load patients dataframe

In [None]:
PATH_PATIENTS = "/content/drive/MyDrive/NIH/Data/PATIENTS.csv"
df_patients = pd.read_csv(PATH_PATIENTS)
df_patients.sample(5)

#### load caregivers dataframe

In [None]:
PATH_CG = "/content/drive/MyDrive/NIH/Data/CAREGIVERS.csv"
df_cg = pd.read_csv(PATH_CG)
df_cg.sample(5)

In [None]:
cg_ids = df_notes_nicu_with_cgid_min_note['CGID'].unique()
df_cg[df_cg['CGID'].isin(cg_ids)]['LABEL'].value_counts()