# 01_data_exploration.ipynb

## 📌 Purpose
This notebook performs exploratory data analysis (EDA) on the pre-cleaned dataset
and clinical notes pulled from MIMIC-IV. It visualizes distributions, missingness,
class imbalance, and text characteristics.  

Helper functions are imported from `src/data_prep.py`, `src/utils.py`, and `src/evaluation.py`.


### Imports

In [1]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from src.data_prep import load_cleaned_data, inspect_dataframes
from src.evaluation import generate_table1
from src.utils import resolve_path, save_fig

### Load Structured Dataset

In [2]:
# Resolve the location of the cleaned structured dataset
# (path to be updated after the SQL fix), then load it.
cleaned_csv_path = resolve_path("data/raw/data_after_cleaning.csv")
df_clean = load_cleaned_data(cleaned_csv_path)

# Preview the first rows.
df_clean.head()


✅ Loaded cleaned dataset from C:\Users\tyler\OneDrive - University of Pittsburgh\BIOST 2021 Thesis\Masters-Thesis\data\raw\data_after_cleaning.csv with shape (5208, 48)


Unnamed: 0,subject_id,hospital_expire_flag,max_age,los_icu,first_hosp_stay,suspected_infection,sofa_score,sepsis3,avg_urineoutput,glucose_min,...,race_Hispanic or Latin,race_Others race,race_White,antibiotic_Vancomycin,antibiotic_Vancomycin Antibiotic Lock,antibiotic_Vancomycin Enema,antibiotic_Vancomycin Intrathecal,antibiotic_Vancomycin Oral Liquid,gender_F,gender_M
0,19986715,0,24,10.58,True,1,2,True,136.657143,82,...,0,0,0,1,0,0,0,0,1,0
1,19973083,0,58,2.33,True,1,8,True,34.263158,94,...,0,0,0,1,0,0,0,0,1,0
2,19907774,1,65,1.83,True,1,2,True,105.476191,65,...,0,0,0,1,0,0,0,0,1,0
3,19894745,1,76,1.08,True,1,4,True,34.0,267,...,0,0,0,1,0,0,0,0,1,0
4,19884808,1,64,20.46,True,1,5,True,107.229508,146,...,0,0,0,1,0,0,0,0,1,0


### Dataset Summary

In [None]:
# Dimensions of the structured dataset as (rows, columns).
print("Shape:", df_clean.shape)

# DataFrame.info() writes its report to stdout and returns None, so it is
# called directly; wrapping it in print() would append a stray "None" line.
df_clean.info()

# Summary statistics for every column (numeric and non-numeric alike).
df_clean.describe(include="all")


### Class Balance

In [None]:
# Bar chart of row counts for each value of `hospital_expire_flag`.
fig, ax = plt.subplots(figsize=(6, 5))
sns.countplot(data=df_clean, x="hospital_expire_flag", ax=ax)
ax.set(title="Class Distribution (Sepsis Mortality)")

# Hand the figure to the project helper for saving, then render it inline.
save_fig(fig, "class_distribution_sepsis_mortality")
plt.show()

### Missingness Check

In [None]:
# Boolean mask with the same shape as df_clean: True wherever a value is missing.
missing_mask = df_clean.isna()

# One heatmap cell per (row, column) entry of the mask; colour bar suppressed.
fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(missing_mask, ax=ax, cbar=False)
ax.set(title="Missing Values Heatmap")

save_fig(fig, "missing_values_heatmap")
plt.show()

In [None]:
nlp_ready_df = pd.read_csv(resolve_path("data/interim/data_nlp_ready.csv"))

# A note is treated as missing when it is NaN or an empty string.
radiology_col = nlp_ready_df["Radiology_notes"]
discharge_col = nlp_ready_df["Discharge_summary_notes"]

missing_radiology = nlp_ready_df.loc[radiology_col.isna() | radiology_col.eq("")]
missing_discharge = nlp_ready_df.loc[discharge_col.isna() | discharge_col.eq("")]

# Report how many rows lack each note type.
print(f"Patients missing radiology notes: {missing_radiology.shape[0]}")
print(f"Patients missing discharge notes: {missing_discharge.shape[0]}")


### Notes Data Preview

In [None]:
# After preprocessing, load NLP-ready dataset
df_notes = pd.read_csv("../data/interim/data_nlp_ready.csv")

# Preview
df_notes[["subject_id", "Radiology_notes", "Discharge_summary_notes", "combined_notes"]].head(10)


### Note Length Distributions

In [None]:
# Character count of each radiology note, stored in a scratch column.
df_notes["note_length"] = df_notes["Radiology_notes"].str.len()

# Histogram (50 bins) with a KDE overlay on an explicit Axes.
fig, ax = plt.subplots(figsize=(8, 6))
sns.histplot(df_notes["note_length"], ax=ax, kde=True, bins=50)
ax.set(
    title="Distribution of Radiology Note Lengths",
    xlabel="Characters per Note",
    ylabel="Frequency",
)

save_fig(fig, "distribution_radiology_note_lengths")
plt.show()


In [None]:
# Character count of each discharge summary. This overwrites the scratch
# `note_length` column, so the column always reflects the most recently run
# note-length cell.
df_notes["note_length"] = df_notes["Discharge_summary_notes"].str.len()

# Use an explicit Figure/Axes (rather than the implicit pyplot state) so the
# figure can be passed to save_fig, matching the radiology histogram cell.
fig, ax = plt.subplots(figsize=(8, 6))
sns.histplot(df_notes["note_length"], bins=50, kde=True, ax=ax)
ax.set_title("Distribution of Discharge Summary Note Lengths")
ax.set_xlabel("Characters per Note")
ax.set_ylabel("Frequency")

# Save this figure too, as the other plotting cells in this notebook do.
save_fig(fig, "distribution_discharge_summary_note_lengths")
plt.show()

In [None]:
# Character count of each combined note. This overwrites the scratch
# `note_length` column, so the column always reflects the most recently run
# note-length cell.
df_notes["note_length"] = df_notes["combined_notes"].str.len()

# Use an explicit Figure/Axes (rather than the implicit pyplot state) so the
# figure can be passed to save_fig, matching the radiology histogram cell.
fig, ax = plt.subplots(figsize=(8, 6))
sns.histplot(df_notes["note_length"], bins=50, kde=True, ax=ax)
ax.set_title("Distribution of Combined Note Lengths")
ax.set_xlabel("Characters per Note")
ax.set_ylabel("Frequency")

# Save this figure too, as the other plotting cells in this notebook do.
save_fig(fig, "distribution_combined_note_lengths")
plt.show()

### Create Table 1

In [3]:
# Build the cohort summary ("Table 1") from the structured dataset.
# `generate_table1` comes from `src.evaluation` and is imported in the Imports
# cell at the top of the notebook; the redundant re-import of `resolve_path`
# (unused in this cell) has been dropped.
# Per the recorded cell output, the helper also reports saving
# table1_structured.csv and table1_structured.tex under reports/tables.
table1 = generate_table1(df_clean)


✅ Table 1 saved:
 - C:\Users\tyler\OneDrive - University of Pittsburgh\BIOST 2021 Thesis\Masters-Thesis\reports\tables\table1_structured.csv
 - C:\Users\tyler\OneDrive - University of Pittsburgh\BIOST 2021 Thesis\Masters-Thesis\reports\tables\table1_structured.tex


## Next Steps


- Move to `02_data_preprocessing.ipynb` for truncation, tokenization,
  and preparation of notes for Word2Vec / BERT.
