<a href="https://colab.research.google.com/github/tanoManzo/mimic_trajectories/blob/dev/MIMIC_TRAJECTORY_embeddings_03_collect_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Data Collection, Embeddings and Attitude

In [38]:
! pip install transformers -q

In [39]:
import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModel
from transformers import AutoModelForSequenceClassification

load nursing notes subject-caregiver-chartdate

In [40]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [41]:
notes_path = "/content/drive/MyDrive/NIH/Data/"
notes_name = "NOTES_NURSING_TRAJECTORIES_01.csv"
df_notes = pd.read_csv(notes_path+notes_name)
df_notes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 531675 entries, 0 to 531674
Data columns (total 5 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   ROW_ID      531675 non-null  int64 
 1   SUBJECT_ID  531675 non-null  int64 
 2   CGID        531675 non-null  int64 
 3   CHARTDATE   531675 non-null  object
 4   TEXT        531675 non-null  object
dtypes: int64(3), object(2)
memory usage: 20.3+ MB


In [42]:
patients = "PATIENTS.csv"
df_patients = pd.read_csv(notes_path+patients)
df_patients.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 46520 entries, 0 to 46519
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   ROW_ID       46520 non-null  int64 
 1   SUBJECT_ID   46520 non-null  int64 
 2   GENDER       46520 non-null  object
 3   DOB          46520 non-null  object
 4   DOD          15759 non-null  object
 5   DOD_HOSP     9974 non-null   object
 6   DOD_SSN      13378 non-null  object
 7   EXPIRE_FLAG  46520 non-null  int64 
dtypes: int64(3), object(5)
memory usage: 2.8+ MB


In [43]:
ward_name = "ICUSTAYS.csv"
df_ward = pd.read_csv(notes_path+ward_name)
df_ward.info()
df_ward['FIRST_CAREUNIT'].unique()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 61532 entries, 0 to 61531
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   ROW_ID          61532 non-null  int64  
 1   SUBJECT_ID      61532 non-null  int64  
 2   HADM_ID         61532 non-null  int64  
 3   ICUSTAY_ID      61532 non-null  int64  
 4   DBSOURCE        61532 non-null  object 
 5   FIRST_CAREUNIT  61532 non-null  object 
 6   LAST_CAREUNIT   61532 non-null  object 
 7   FIRST_WARDID    61532 non-null  int64  
 8   LAST_WARDID     61532 non-null  int64  
 9   INTIME          61532 non-null  object 
 10  OUTTIME         61522 non-null  object 
 11  LOS             61522 non-null  float64
dtypes: float64(1), int64(6), object(5)
memory usage: 5.6+ MB


array(['MICU', 'CCU', 'NICU', 'TSICU', 'SICU', 'CSRU'], dtype=object)

In [53]:
drg = "DRGCODES.csv"
df_drg = pd.read_csv(notes_path+drg)
df_drg.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 125557 entries, 0 to 125556
Data columns (total 8 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   ROW_ID         125557 non-null  int64  
 1   SUBJECT_ID     125557 non-null  int64  
 2   HADM_ID        125557 non-null  int64  
 3   DRG_TYPE       125557 non-null  object 
 4   DRG_CODE       125557 non-null  int64  
 5   DESCRIPTION    125494 non-null  object 
 6   DRG_SEVERITY   66634 non-null   float64
 7   DRG_MORTALITY  66634 non-null   float64
dtypes: float64(2), int64(4), object(2)
memory usage: 7.7+ MB


## **Data Preprocessing**

analyzing words in future dataset

In [45]:
!pip install nltk

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


imports for counting words

In [46]:
import re
from collections import Counter

from sklearn.feature_extraction.text import TfidfVectorizer

import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('sentiwordnet')
from nltk.corpus import wordnet as wn
from nltk.corpus import sentiwordnet as swn
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package sentiwordnet to /root/nltk_data...
[nltk_data]   Package sentiwordnet is already up-to-date!


Grab tsicu notes, merge with drg codes

In [81]:
# get new type of notes (TSICU)
df_notes_TSICU = df_notes.merge(df_ward.drop('ROW_ID', axis=1), on='SUBJECT_ID')
df_notes_TSICU = df_notes_TSICU.merge(df_patients.drop('ROW_ID', axis=1), on='SUBJECT_ID')
df_notes_TSICU = df_notes_TSICU.merge(df_drg[['HADM_ID','DRG_CODE','DESCRIPTION']], on=['HADM_ID'])
df_notes_TSICU = df_notes_TSICU[df_notes_TSICU['FIRST_CAREUNIT'] == 'TSICU'].drop_duplicates()
df_notes_TSICU.info()
df_notes_TSICU.head(5)
#print(df_drg[df_drg.duplicated(['HADM_ID'], keep=False)])

<class 'pandas.core.frame.DataFrame'>
Int64Index: 98089 entries, 5852 to 1502532
Data columns (total 23 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   ROW_ID          98089 non-null  int64  
 1   SUBJECT_ID      98089 non-null  int64  
 2   CGID            98089 non-null  int64  
 3   CHARTDATE       98089 non-null  object 
 4   TEXT            98089 non-null  object 
 5   HADM_ID         98089 non-null  int64  
 6   ICUSTAY_ID      98089 non-null  int64  
 7   DBSOURCE        98089 non-null  object 
 8   FIRST_CAREUNIT  98089 non-null  object 
 9   LAST_CAREUNIT   98089 non-null  object 
 10  FIRST_WARDID    98089 non-null  int64  
 11  LAST_WARDID     98089 non-null  int64  
 12  INTIME          98089 non-null  object 
 13  OUTTIME         98089 non-null  object 
 14  LOS             98089 non-null  float64
 15  GENDER          98089 non-null  object 
 16  DOB             98089 non-null  object 
 17  DOD             54794 non-

Unnamed: 0,ROW_ID,SUBJECT_ID,CGID,CHARTDATE,TEXT,HADM_ID,ICUSTAY_ID,DBSOURCE,FIRST_CAREUNIT,LAST_CAREUNIT,...,OUTTIME,LOS,GENDER,DOB,DOD,DOD_HOSP,DOD_SSN,EXPIRE_FLAG,DRG_CODE,DESCRIPTION
5852,1373553,8406,17765,2149-04-28,SOCIAL WORK NOTE:\n\nNew trauma pt on T-SICU f...,114444,239553,carevue,TSICU,TSICU,...,2149-05-22 14:04:55,26.6668,F,2117-06-28 00:00:00,2149-09-07 00:00:00,2149-09-07 00:00:00,2149-09-07 00:00:00,1,483,TRACHEOSTOMY WITH MECHANICAL VENTILATION 96+ H...
5853,1373550,8406,16140,2149-04-27,SICU NPN\nROS:\nNeuro: Neuro exam unchanged th...,114444,239553,carevue,TSICU,TSICU,...,2149-05-22 14:04:55,26.6668,F,2117-06-28 00:00:00,2149-09-07 00:00:00,2149-09-07 00:00:00,2149-09-07 00:00:00,1,483,TRACHEOSTOMY WITH MECHANICAL VENTILATION 96+ H...
5854,1373547,8406,16140,2149-04-26,"SICU NPN\nPt hemodynamicallly unstable today, ...",114444,239553,carevue,TSICU,TSICU,...,2149-05-22 14:04:55,26.6668,F,2117-06-28 00:00:00,2149-09-07 00:00:00,2149-09-07 00:00:00,2149-09-07 00:00:00,1,483,TRACHEOSTOMY WITH MECHANICAL VENTILATION 96+ H...
5855,1373567,8406,15659,2149-05-01,T/SICU NSG PROGRESS NOTE.\n0700>>[**2076**]\n\...,114444,239553,carevue,TSICU,TSICU,...,2149-05-22 14:04:55,26.6668,F,2117-06-28 00:00:00,2149-09-07 00:00:00,2149-09-07 00:00:00,2149-09-07 00:00:00,1,483,TRACHEOSTOMY WITH MECHANICAL VENTILATION 96+ H...
5856,1373633,8406,21570,2149-07-02,Nursing Progress Note.\n\nCV: The pt has been ...,114444,239553,carevue,TSICU,TSICU,...,2149-05-22 14:04:55,26.6668,F,2117-06-28 00:00:00,2149-09-07 00:00:00,2149-09-07 00:00:00,2149-09-07 00:00:00,1,483,TRACHEOSTOMY WITH MECHANICAL VENTILATION 96+ H...


In [None]:
len(df_notes_TSICU['CGID'].unique())
deceased_TSICU = df_notes_TSICU[df_notes_TSICU['EXPIRE_FLAG'] == 1]
deceased_TSICU.info()

In [None]:
import seaborn as sns
sns.set()


gb_notes = deceased_TSICU.groupby(['CGID','SUBJECT_ID'])
num_notes = gb_notes['TEXT'].count()

print(f'number of notes: min={num_notes.min()}, max={num_notes.max()}, mean={num_notes.mean()},  median={num_notes.median()}')
sns.boxplot(x=num_notes)
sns.displot(num_notes[((num_notes>2) & (num_notes<15))])

In [None]:
grouping = df_notes_TSICU.groupby(['CGID','SUBJECT_ID']).size()
df_notes_TSICU['num_of_notes'] = df_notes_TSICU.set_index(['CGID','SUBJECT_ID']).index.map(grouping)
df_notes_TSICU_filtered = df_notes_TSICU[df_notes_TSICU['num_of_notes'] == 6]
df_notes_TSICU_filtered.info()
df_notes_TSICU_filtered.head(1)

Inverse Document Frequency

In [None]:
vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = vectorizer.fit_transform(df_notes_TSICU['TEXT'])

feature_names = vectorizer.get_feature_names_out()
tfidf_scores = tfidf_matrix.sum(axis=0).A1
word_score_dict = dict(zip(feature_names, tfidf_scores))

sorted_words = sorted(word_score_dict.items(), key=lambda x: x[1], reverse=True)

top_words = sorted_words[:50]

In [None]:
print("Rank\tWord\t\t\tCount")
for rank, (word, score) in enumerate(top_words, start=1):
  print(f"{rank}\t{word.ljust(20)}\t{score}")

Tokenized words

In [None]:
#text = df_notes_TSICU['TEXT'].str.cat(sep=' ')
#words = nltk.word_tokenize(text)  # very slow

Count words in notes

In [None]:
#stopwords = nltk.corpus.stopwords.words('english')
#words = [word.lower() for word in words if word.lower() not in stopwords and re.match(r'\b\w+\b', word)]  # also slow but only kinda slow
#word_counts = Counter(words)
#most_common_words = word_counts.most_common(300)

In [None]:
#print("Rank\tWord\t\t\tCount")
#for rank, word_count in enumerate(most_common_words, start=1):
  #word, count = word_count
  #print(f"{rank}\t{word.ljust(20)}\t{count}")

Emotional words

In [None]:
#emotional_words = []
#for word_count in most_common_words:
  #word = word_count[0]
  #synsets = wn.synsets(word)
  #if synsets:
    #senti_synset = swn.senti_synset(synsets[0].name())
    #if senti_synset.pos_score() > 0.45 or senti_synset.neg_score() > 0.45:
      #emotional_words.append(word)

#emotional_words

In [None]:
#from textblob import TextBlob
#emotional_words = []
#for word_count in most_common_words:
  #word = word_count[0]
  #tb = TextBlob(word)
  #sentiment = tb.sentiment.polarity
  #if abs(sentiment) > 0.35:
    #emotional_words.append(word)

#emotional_words

select emotional words then filter rows that dont have a minimum number of them

In [None]:
selected_words = ['pain', 'family', 'stable', 'care', 'well', 'social', 'support', 'able', 'decreased', 'warm', 'unable', 'strong', 'intact', 'good', 'times', 'aware', 'eyes', 'tolerated', 'denies', 'tolerating', 'please', 'palpable']
print("selected_words list length: ", len(selected_words))
pattern = '|'.join(selected_words)
filtered_data = df_notes_TSICU[df_notes_TSICU['TEXT'].str.contains(pattern, regex=True)].copy()
filtered_data['num_search_words'] = filtered_data['TEXT'].str.count(pattern)
filtered_data = filtered_data[filtered_data['num_search_words'] >= 18]
filtered_data.info()

Remove similar sentences

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

sentences = filtered_data['TEXT'].tolist()
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(sentences)
similarity_threshold = 0.8

similar_indices = []
for i, sentence in enumerate(sentences):
  if any(cosine_similarity(tfidf_matrix[i], tfidf_matrix[j])[0][0] > similarity_threshold for j in range(i)):  # very slow ~8 minutes
    similar_indices.append(i)

In [None]:
index_labels = filtered_data.index[similar_indices]
filtered_data.drop(index_labels, inplace=True)

filtered_data.info()

Seperate deceased and living

In [None]:
deceased_patients = filtered_data[filtered_data['EXPIRE_FLAG'] == 1]
deceased_patients.info()

In [None]:
living_patients = filtered_data[filtered_data['EXPIRE_FLAG'] == 0]
living_patients.info()

Random selection from sample set

In [None]:
filtered_data = filtered_data.sample(n=200, random_state=42)

Display notes

In [None]:
pd.set_option('display.max_colwidth', 250)
pd.set_option('display.max_rows', None)
print(filtered_data['TEXT'].head(200))
pd.reset_option('display.max_colwidth')
pd.reset_option('display.max_rows')

Save to CSV

In [None]:
# path and name of the cvs file 
PATH_TO_SAVE = "/content/drive/MyDrive/NIH/Data/"
name_file_to_save = 'NOTES_NURSING_TRAJECTORIES_TSICU.csv'
#filtered_data.to_csv(PATH_TO_SAVE+name_file_to_save,index=False)