In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
import re
import numpy as np

csv_path = "/Users/ayush/Desktop/mimicDataset/mimic-iv-bhc.csv"

# creating a 10% sample of the dataset due to size and memory constraints
total_rows = sum(1 for _ in open(csv_path)) - 1
print("Total Rows:", total_rows)
sample_frac = 0.1  # 10%
sample_n = int(total_rows * sample_frac)

# randomly skip rows
skip_idx = sorted(np.random.choice(np.arange(1, total_rows + 1), total_rows - sample_n, replace=False))
df = pd.read_csv(csv_path, skiprows=skip_idx)

print(df.info())
print(df.head())


Total Rows: 270033
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27003 entries, 0 to 27002
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   note_id        27003 non-null  object
 1   input          27003 non-null  object
 2   target         27003 non-null  object
 3   input_tokens   27003 non-null  int64 
 4   target_tokens  27003 non-null  int64 
dtypes: int64(2), object(3)
memory usage: 1.0+ MB
None
          note_id                                              input  \
0  10000248-DS-10  <SEX> M <SERVICE> MEDICINE <ALLERGIES> No Know...   
1  10000764-DS-11  <SEX> M <SERVICE> MEDICINE <ALLERGIES> No Know...   
2  10000935-DS-19  <SEX> F <SERVICE> MEDICINE <ALLERGIES> Sulfa (...   
3   10001338-DS-6  <SEX> F <SERVICE> SURGERY <ALLERGIES> Patient ...   
4  10001401-DS-20  <SEX> F <SERVICE> MEDICINE <ALLERGIES> No Know...   

                                              target  input_tokens  \
0  Mr. ___ is 

In [2]:
# remove unnecessary characters 
def clean_text(text):
    text = text.replace('\n', ' ').replace('\r', ' ') # remove newline characters
    text = re.sub(r'\[\*\*.*?\*\*\]', '', text) # remove PHI markers
    text = re.sub(r'[-=]{2,}', ' ', text) # remove repeated punctuation of -, =, maintain _ for deidentified information
    text = re.sub(r'\s+', ' ', text) # remove extra spaces
    return text

# testing cleaning of data
example_row = df.iloc[1]
cleaned_input = clean_text(example_row["input"])
cleaned_output = clean_text(example_row["target"])
print("Original Input:", example_row["input"])
print("Cleaned Input:", cleaned_input)

# clean texts based on above criteria
df["body"] = df["input"].apply(clean_text)
df["summary"] = df["target"].apply(clean_text)

Original Input: <SEX> M <SERVICE> MEDICINE <ALLERGIES> No Known Allergies / Adverse Drug Reactions <ATTENDING> ___ <CHIEF COMPLAINT> Epistaxis <MAJOR SURGICAL OR INVASIVE PROCEDURE> None <HISTORY OF PRESENT ILLNESS> Mr. ___ is an ___ with history of AAA s/p repair complicated by MI, hypertension, and hyperlipidemia who presents upon transfer from outside hospital with nasal fractures and epistaxis secondary to fall. The patient reports that he was at the ___ earlier this afternoon. While coughing, he tripped on the curb and suffered trauma to his face. He had no loss of consciousness. However, he had a persistent nosebleed and appeared to have some trauma to his face, thus was transferred to ___ for further care. There, a CT scan of the head, neck, and face were remarkable for a nasal bone and septal fracture. Given persistent epistaxis, bilateral RhinoRockets were placed. He had a small abrasion to the bridge of his nose which was not closed. Bleeding was well controlled. While in the

In [3]:
# calculate the mean length and shortest length text
def text_lengths(text):
    mean = text.str.len().mean()
    shortest = text.str.len().min()
    return mean, shortest

# explore the lengths of the texts for the body and summary columns
mean_body, shortest_body = text_lengths(df["body"])
print("Mean Body Length:", mean_body)
print("Shortest Body Length:", shortest_body)

mean_summary, shortest_summary = text_lengths(df["summary"])
print("Mean Summary Length:", mean_summary)
print("Shortest Summary Length:", shortest_summary)

# filter out bad note/summary examples (notes less than 1300 characters & summaries less than 300 characters)
df = df[(df["body"].str.len() > 1300) & (df["summary"].str.len() > 300)]
df_ready = df[["body", "summary"]].dropna()

Mean Body Length: 7619.19301559086
Shortest Body Length: 240
Mean Summary Length: 2417.840202940414
Shortest Summary Length: 2


In [4]:
# create train, test, validation splits based on the cleaned dataset
train, test = train_test_split(df_ready, test_size=0.1, random_state=42)
train, val = train_test_split(train, test_size=0.1, random_state=42)

print(len(train), len(val), len(test))

21671 2408 2676


In [5]:
train.to_csv("train_clean.csv", index=False)
val.to_csv("val_clean.csv", index=False)
test.to_csv("test_clean.csv", index=False)