# Explore MIMIC III

In this experiment I load the MIMIC-III data that I have access to (the `NOTEEVENTS` table) and do some initial exploration of it.

In [1]:
import os
import pandas as pd
import sys

from pathlib import Path




# Make the parent of the current working directory importable so that the
# `init` module (init.py, expected to live in the project root) can be found.
_parent_dir = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
if _parent_dir not in sys.path:  # avoid stacking duplicate entries when the cell is re-run
    sys.path.append(_parent_dir)

# Importing `init` is expected to set the PROJECT_ROOT environment variable,
# which stores the absolute path to the project folder.
import init

PROJECT_ROOT = os.getenv("PROJECT_ROOT")
if not PROJECT_ROOT:
    # Fail here with a clear message instead of later with an opaque TypeError
    # when paths are built via `PROJECT_ROOT + '/...'`.
    raise RuntimeError(
        "PROJECT_ROOT environment variable is not set; "
        "check that init.py in the project root defines it."
    )

In [2]:
# Show which files are available in the MIMIC-III folder of the data store
mimic_dir = PROJECT_ROOT + '/data_store/MIMIC-III'
os.listdir(mimic_dir)

['.DS_Store', 'NOTEEVENTS.csv', 'NOTEEVENTS.csv.gz']

In [3]:
# Location of the NOTEEVENTS table inside the project's data store
file_path = PROJECT_ROOT + '/data_store/MIMIC-III/NOTEEVENTS.csv'

# Read only the first 1000 rows for a quick look at the columns,
# rather than loading the whole file
df = pd.read_csv(
    file_path,
    nrows=1000,
)
df

Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,CHARTDATE,CHARTTIME,STORETIME,CATEGORY,DESCRIPTION,CGID,ISERROR,TEXT
0,174,22532,167853,2151-08-04,,,Discharge summary,Report,,,Admission Date: [**2151-7-16**] Dischar...
1,175,13702,107527,2118-06-14,,,Discharge summary,Report,,,Admission Date: [**2118-6-2**] Discharg...
2,176,13702,167118,2119-05-25,,,Discharge summary,Report,,,Admission Date: [**2119-5-4**] D...
3,177,13702,196489,2124-08-18,,,Discharge summary,Report,,,Admission Date: [**2124-7-21**] ...
4,178,26880,135453,2162-03-25,,,Discharge summary,Report,,,Admission Date: [**2162-3-3**] D...
...,...,...,...,...,...,...,...,...,...,...,...
995,1158,15198,137838,2184-10-21,,,Discharge summary,Report,,,Admission Date: [**2184-10-14**] Discha...
996,1159,2712,198266,2101-02-15,,,Discharge summary,Report,,,Admission Date: [**2101-2-10**] ...
997,1160,2712,157537,2103-07-18,,,Discharge summary,Report,,,Admission Date: [**2103-7-17**] ...
998,1161,18511,123903,2191-03-03,,,Discharge summary,Report,,,Admission Date: [**2191-2-23**] ...


In [18]:

# Column whose distinct values are tallied over the whole file
column_name = 'CATEGORY'

# Rows per chunk; tune to the memory available on this machine
chunk_size = 10000

# Running total of occurrences per value, starting out empty
aggregated_counts = pd.Series(dtype=int)

# Stream the file, loading only the column of interest
reader = pd.read_csv(file_path, usecols=[column_name], chunksize=chunk_size)
for part in reader:
    part_counts = part[column_name].value_counts()
    # fill_value=0 lets values present on only one side of the addition still be summed
    aggregated_counts = aggregated_counts.add(part_counts, fill_value=0)

# The additions above produce floats; cast back to integers for display
aggregated_counts = aggregated_counts.astype(int)

# Totals per unique value of `column_name` across the entire CSV file
print(aggregated_counts)


CATEGORY
Case Management         967
Consult                  98
Discharge summary     59652
ECG                  209051
Echo                  45794
General                8301
Nursing              223556
Nursing/other        822497
Nutrition              9418
Pharmacy                103
Physician            141624
Radiology            522279
Rehab Services         5431
Respiratory           31739
Social Work            2670
dtype: int64


In [4]:

# Load social worker notes

# Rows per chunk; tune to the memory available on this machine
chunk_size = 10000

# Stream the CSV chunk by chunk, keeping only the rows of each chunk
# whose CATEGORY is 'Social Work'
filtered_chunks = [
    part[part['CATEGORY'] == 'Social Work']
    for part in pd.read_csv(file_path, chunksize=chunk_size)
]

# Stitch the per-chunk selections into one DataFrame with a fresh 0..n-1 index
soc = pd.concat(filtered_chunks, ignore_index=True)




In [5]:
# Flag the social-work notes whose text mentions "discharge".
# The match is a literal, case-sensitive substring test (regex=False).
# na=False makes rows with missing TEXT count as non-matches, so the mask is
# strictly boolean and safe to use for row selection.
soc['discharge'] = soc.TEXT.str.contains('discharge', regex=False, na=False)

# Collect the text of the flagged notes as a plain list of strings
discharge_notes = soc.loc[soc.discharge, 'TEXT'].tolist()

# Display one example note
discharge_notes[5]

'Family Information\n   Next of [**Doctor First Name **]: N/A\n   Health Care Proxy appointed:\n   Family Spokesperson designated:\n   Communication or visitation restriction:\n   Patient Information:\n   Previous living situation: Homeless\n   Previous level of functioning: Independent\n   Previous [**Hospital1 54**] or other hospital admissions: several previous [**Hospital1 54**]\n   admissions.  Most recent [**1-24**] d/t etoh intoxication\n   Past psychiatric history: n/a\n   Past addictions history: long hx of etoh abuse, currently drinking 2\n   bottles of vodka daily.  has hx of cocaine abuse but none current.\n   Employment status: Disable\n   Legal involvement: unknown\n   Mandated Reporting Information:\n   Additional Information:\n   Patient / Family Assessment: Pt referred to sw for etoh assessment and\n   counseling.  Pt is a 54 yo man with hx of cardiac disease, hep c and\n   etoh abuse who was admitted on [**5-9**] with etoh intoxication/management\n   of withdrawal.  [

## Query using Llama 2

In [6]:
import json

from langchain.llms import Ollama
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain.prompts import PromptTemplate

In [7]:
# Handler that streams generated tokens to stdout as they arrive
stdout_streamer = StreamingStdOutCallbackHandler()
callback_manager = CallbackManager([stdout_streamer])

# Ollama-served "llama2" model wired to the streaming callback manager
llm = Ollama(
    model="llama2",
    callback_manager=callback_manager,
)

In [8]:
# Prompt sent to the model for each note. The system message sets the persona,
# the instructions state the task, and {data} is replaced by the note text.
# The system sentence was previously ungrammatical and unfinished ("Your are an
# experience social worker ... can make their discharge from hospital."), which
# left the model to guess the intent; it now states the task explicitly.
template = """[INST]
<<SYS>>
You are an experienced social worker with a good understanding of how a patient's social situation (housing, family, transport issues, etc.) can complicate their discharge from hospital.
<</SYS>>

### INSTRUCTIONS ###

Give your judgement about any factors that might be pertinent in a patient's social background that make discharge from hospital more complicated.

Here is some information about the patient:

{data}
[/INST]
"""

# Template object with a single input variable, `data`
prompt = PromptTemplate.from_template(template)

# Pipeline: fill the template, then send the rendered prompt to the model
chain = prompt | llm

In [10]:
# Wrap the first five discharge-related notes as template inputs,
# one {"data": <note text>} mapping per note
batch = [{"data": discharge_notes[i]} for i in range(5)]

In [12]:
# Display the first discharge-related note (the one used as batch[0])
discharge_notes[0]

'Family Information\n   Next of [**Doctor First Name **]: unknown at this time\n   Health Care Proxy appointed: Deferred\n   Family Spokesperson designated: none - no family involved in pt\'s life\n   Communication or visitation restriction:  none\n   Patient Information:\n   Previous living situation: homeless, living on street in [**Location (un) 223**]\n   Previous level of functioning: Independent\n   Previous [**Hospital1 54**] or other hospital admissions: multiple admissions to\n   [**Hospital1 54**] this year - see OMR\n   Past psychiatric history: past dx of PTSD, Major Depression, Antisocial\n   Personality Disorder\n   Past addictions history: h/o heroin abuse\n   Employment status: Disabled, receives Disability\n   Legal involvement: unknown at this time\n   Mandated Reporting Information:  none\n   Additional Information:\n   60 y.o. single, Caucasian, Catholic man admitted to MICU [**2108-9-4**] with\n   dx of bradycardia.  SW met with pt at request of RN to assess resour

In [11]:
# Render the prompt for the first note. Each `batch` entry is already a
# {"data": <note text>} mapping, so unpack it into the template variables.
# Passing the dict itself as `data` would embed the dict's repr
# ("{'data': '...'}") in the prompt instead of the note text.
formatted_prompt = prompt.format_prompt(**batch[0])

# for a batch of items
# llm.generate([formatted_prompt.to_string()])

# for a single item
# `invoke` is used instead of calling `llm(...)` directly, which emits a
# deprecation warning (see the recorded `warn_deprecated` output).
response = llm.invoke(formatted_prompt.to_string())

  warn_deprecated(


Based on the information provided in the patient's chart, there are several factors that could make discharge from the hospital more complicated for this patient. Here are some of the key issues:

1. Lack of a support system: The patient has no family or friends involved in his life, which could make it difficult for him to cope with stressors after discharge.
2. History of substance abuse: The patient has a history of heroin abuse, which can increase the risk of relapse and other psychiatric problems.
3. Active PTSD and depressive symptoms: The patient is currently experiencing symptoms of post-traumatic stress disorder (PTSD) and depression, which could make it challenging for him to manage his emotions and behaviors after discharge.
4. Homelessness: The patient has been homeless in the past, and there is no clear plan for housing after discharge.
5. Lack of daily structure: The patient lacks a daily structure, which could make it difficult for him to manage his time and responsibili

In [None]:
# Run the `prompt | llm` chain over every {"data": ...} entry in `batch`
# and keep the results in `outputs`
outputs = chain.batch(batch)