_No need to run it on local environments_

Mounting drive and importing my local kaggle key

In [1]:
from google.colab import drive
import os


drive.mount('/content/drive')
os.environ['KAGGLE_CONFIG_DIR'] = "/content/drive/MyDrive/Colab Notebooks/Kaggle Config/kaggle.json"

Mounted at /content/drive


Installing kagglehub if it's not already installed

In [2]:
pip install kagglehub



Loading the dataset from kaggle

In [3]:
import kagglehub
from kagglehub import KaggleDatasetAdapter
import pandas as pd


rawDataset = kagglehub.dataset_load(
  KaggleDatasetAdapter.PANDAS,
  "aminexdr/bhc-mimic-iv-summary", "BHC_MIMIC-IV.csv")

Downloading from https://www.kaggle.com/api/v1/datasets/download/aminexdr/bhc-mimic-iv-summary?dataset_version_number=1&file_name=BHC_MIMIC-IV.csv...


100%|██████████| 1.34G/1.34G [00:12<00:00, 112MB/s]


Displaying column names and total rows in the dataset

In [4]:
print(rawDataset.columns)

print(rawDataset.count())

Index(['note_id', 'subject_id', 'hadm_id', 'note_type', 'note_seq',
       'charttime', 'storetime', 'input', 'target', 'input_length',
       'target_length'],
      dtype='object')
note_id          269516
subject_id       269516
hadm_id          269516
note_type        269516
note_seq         269516
charttime        269516
storetime        269516
input            269516
target           269516
input_length     269516
target_length    269516
dtype: int64


The column **input** contains raw data. Extracting sections from this column.

In [5]:
import re
from collections import Counter
import pandas as pd


rawDataColumnName = 'input'
sectionRegexPattern = r'(^[A-Za-z \-/()]+?):'
sectionsPattern = re.compile(sectionRegexPattern, flags=re.MULTILINE)

headings = set()

for text in rawDataset[rawDataColumnName]:
    matches = sectionsPattern.findall(text)

    for m in matches:
        headings.add(m.strip())

uniqueHeadings = sorted(headings)

print(uniqueHeadings)
print(len(uniqueHeadings))

['Chief Complaint', 'Discharge Condition', 'Discharge Diagnosis', 'Discharge Instructions', 'Discharge Medications', 'History of Present Illness', 'Imaging', 'Medications on Admission', 'Past Medical History', 'Pertinent Results', 'Physical Exam', 'create a summary based on the following information', 'generate a brief hospital summary', 'summarize', 'write a discharge summary']
15


Removing prompts from the headings as they are useless

In [6]:
redundantHeadings = ["create a summary based on the following information", "generate a brief hospital summary", "summarize", "write a discharge summary"]

for a in redundantHeadings:
    try:
        uniqueHeadings.remove(a)
    except ValueError:
        continue

print(uniqueHeadings)
print(len(uniqueHeadings))

['Chief Complaint', 'Discharge Condition', 'Discharge Diagnosis', 'Discharge Instructions', 'Discharge Medications', 'History of Present Illness', 'Imaging', 'Medications on Admission', 'Past Medical History', 'Pertinent Results', 'Physical Exam']
11


Extracting all the text corresponding to these headings and adding to the original dataset

In [7]:
from difflib import get_close_matches
from collections import OrderedDict


sectionRegex = re.compile(
    r'^(?P<h>[A-Za-z][A-Za-z0-9\s/()&\-\.,]{0,100}?)\s*:\s*(?P<inline>.*)$',
    re.MULTILINE
)

def normalizeRowText(text: str) -> str:
    return text.replace('\r\n', '\n').replace('\r', '\n')

def extractSections(text):
    out = OrderedDict()
    if not isinstance(text, str) or not text.strip():
        return out

    text = normalizeRowText(text)
    lines = text.split('\n')

    currentHeading = None
    buffer = []

    def flush():
        nonlocal currentHeading, buffer, out

        if currentHeading is not None:
            content = '\n'.join(buffer).strip()
            out[currentHeading] = content
        buffer = []

    for rawLine in lines:
        line = rawLine.strip()
        m = sectionRegex.match(line)

        if m:
            # We hit a new heading line → flush the previous section first
            flush()

            # Start a new section
            currentHeading = m.group('h').strip()
            inline = (m.group('inline') or '').strip()

            # Initialize content with inline (if present); otherwise empty.
            buffer = [inline] if inline else []
        else:
            # Not a heading line → it belongs to the current section (if any).
            if currentHeading is None:
                # Text before the first heading: put it into a 'Preamble' bucket.
                currentHeading = 'Preamble'
                buffer = []
            buffer.append(line)

    # Flush the final section after loop
    flush()
    return out

parsedData = rawDataset[rawDataColumnName].apply(extractSections)
structuredRawData = pd.DataFrame(parsedData.tolist())
structuredRawData.head(10)

Unnamed: 0,summarize,Chief Complaint,History of Present Illness,Past Medical History,Physical Exam,Pertinent Results,Medications on Admission,Discharge Medications,Discharge Diagnosis,Discharge Condition,Discharge Instructions,generate a brief hospital summary,create a summary based on the following information,write a discharge summary,Imaging
0,,Worsening ABD distension and pain,"HCV cirrhosis cb ascites, hiv on ART, ho IVDU,...",1. HCV Cirrhosis 2. No history of abnormal Pap...,,10:25PM GLUCOSE-109 UREA N-25 CREAT-0.3 SODIUM...,The Preadmission Medication list is accurate a...,1. Albuterol Inhaler 2 PUFF IH Q4H:PRN wheezin...,Ascites from Portal HTN,,"Dear Ms. , It was a pleasure taking care of yo...",,,,
1,,abdominal fullness and discomfort,"with HIV on HAART, COPD, HCV cirrhosis complic...",1. HCV Cirrhosis 2. No history of abnormal Pap...,,,The Preadmission Medication list is accurate a...,"1. Acetaminophen 500 mg PO Q6H:PRN pain,fever ...",,,"Dear , was a pleasure to take care of you at ...",,,,
2,,dysphagia,w anxiety and several years of dysphagia who p...,- GERD - Hypercholesterolemia - Kidney stones ...,ADMISSIONDISCHARGE EXAM,ADMISSION LABS 08:27AM BLOOD WBC-5.0 RBC-4.8...,The Preadmission Medication list is accurate a...,1. Omeprazole 20 mg PO BID,,,"Dear Ms. , You were hospitalized at . You came...",,,,
3,,Left hip pain,,- GERD - Hypercholesterolemia - Kidney stones ...,,,The Preadmission Medication list is accurate a...,1. Acetaminophen 1000 mg PO Q6H:PRN Pain - Mil...,Left valgus impacted femoral neck fracture,"AVSS NAD, AOx3",,,,,
4,,Right flank bruising and pain sp fall,Mr. is a with history of factor VIII deficie...,-Factor VIII deficiency,ON ADMISSION,"ADMISSION, DISCHARGE, PERTINENT LABS: 07:03PM...",The Preadmission Medication list is accurate a...,1. Acetaminophen 1000 mg PO Q8H pain 2. Desmop...,,,"Mr. , It was our pleasure caring for you at ....",,,,
5,,renal mass,yo healthy female with incidental finding of r...,,,07:15AM BLOOD WBC-7.6 RBC-3.82 Hgb-11.9 Hct-33...,,1. Hydrocodone-Acetaminophen mg Tablet Sig: ...,renal cell carcinoma,stable,"-You may shower but do not bathe, swim or imme...",,,,
6,,Epistaxis,Mr. is an with history of AAA sp repair comp...,MI after AAA repair when he was yo HTN Hyperc...,,,The Preadmission Medication list is accurate a...,1. Clopidogrel 75 mg PO DAILY 2. Acetaminophen...,Nasal fracture Epistaxis NSTEMI,,"Mr. , You were admitted after you fell and bro...",,,,
7,,Abdominal distention.,This is a very nice woman with ETOH abuse who...,--Alcohol abuse --Chronic back pain,,04:50AM BLOOD WBC-12.2 RBC-3.37 Hgb-12.0 Hct-3...,.,1. Multivitamin Tablet Sig: One Tablet PO DAI...,,,You were admitted to the hospital for inflamma...,,,,
8,,"Abdominal distention, back pain, fever; leukoc...",This is a woman with a history of ETOH abuse ...,--Alcohol abuse --Chronic back pain,,,"Multivitamin, thiamine, folate, spironolactone...",1. Multivitamin Tablet Sig: One Tablet PO DAI...,,,You were admitted to the hospital for alcoholi...,,,,
9,,Abdominal distentionpain and fever,"with recently diagnosed alcoholic hepatitis, p...","- Alcohol abuse - Alcoholic hepatitis, with pe...",,,- AMITRIPTYLINE - 10 mg PO HS - OXYCODONE - 5 ...,1. Thiamine HCl 100 mg Tablet Sig: One Tablet...,,Alert and Oriented. Ambulating without help. H...,You were seen in the Associates with complain...,,,,


Validation of cleansed data by comparison of text reconstructed from divided sections with the raw text in the dataset

In [8]:
import difflib


uniqueHeadingsPattern = re.compile(
    r'(?m)^(?:' + "|".join(re.escape(h) for h in uniqueHeadings) + r')\s*:\s*(?:$|.+)'
)

def reconstructFromSections(sections: dict) -> str:
    pieces = []
    for h, body in sections.items():
        if body:
            pieces.append(f"{h}: {body}\n")
        else:
            pieces.append(f"{h}: ")
    return ''.join(pieces)

def validateRow(rawText: str):
    report = {
        "has_headings": False,
        "roundtrip_equal": False,
        "stray_heading_in_body": False,
        "empty_sections": [],
        "duplicate_headings": [],
        "diff_sample": "",
        "heading_count": 0
    }
    if not isinstance(rawText, str) or not rawText.strip():
        return report

    rawNormalizedText = normalizeRowText(rawText).strip()
    sections = extractSections(rawNormalizedText)
    report["heading_count"] = len(sections)
    report["has_headings"] = len(sections) > 0

    # Check duplicates (same heading encountered twice is suspicious unless expected)
    # Our parser keeps the last; if you expect repeats, adapt this.
    # Here, we quickly scan raw for heading occurrences:
    heads = [m.group('h').strip() for m in sectionRegex.finditer(rawNormalizedText)]
    dupes = [h for h in set(heads) if heads.count(h) > 1]
    report["duplicate_headings"] = dupes

    # Empty sections list
    report["empty_sections"] = [h for h, v in sections.items() if v.strip() == ""]

    # Reconstruct and compare (lenient compare)
    reconstructedFromSections = reconstructFromSections(sections)
    reconstructedFromRawText = '\n'.join(
        f"{m.group('h').strip()}: {(m.group('inline') or '').strip()}"
        for m in sectionRegex.finditer(rawNormalizedText)
    )

    # Compare reconstruction to a “headings+inline” projection of the raw
    # (since we can’t recreate inter-heading whitespace exactly)
    reconstructedFromSections = normalizeRowText(reconstructedFromSections).strip()
    reconstructedFromRawText = normalizeRowText(reconstructedFromRawText).strip()
    report["roundtrip_equal"] = (reconstructedFromSections == reconstructedFromRawText)

    # Stray heading detector: do any lines inside bodies look like headings?
    strayHeadingFound = False
    for h, body in sections.items():
        # Find heading pattern inside body at start of a line (which would mean mis-split)
        if uniqueHeadingsPattern.search(body):
            strayHeadingFound = True
            break
    report["stray_heading_in_body"] = strayHeadingFound

    # If round-trip not equal, capture a diff sample
    if not report["roundtrip_equal"]:
        diff = difflib.unified_diff(
            reconstructedFromSections.splitlines(), reconstructedFromRawText.splitlines(),
            fromfile='raw_heads_inline', tofile='reconstructed', lineterm=''
        )
        # keep first ~40 lines of diff
        report["diff_sample"] = '\n'.join(list(diff)[:40])

    return report

def validateCorpus(df: pd.DataFrame, rawTextCol: str, sampleN: int = 50):
    results = []
    for i, text in df[rawTextCol].dropna().sample(min(sampleN, df[rawTextCol].dropna().shape[0]), random_state=42).items():
        rep = validateRow(text)
        rep["row_index"] = i
        results.append(rep)
    report = pd.DataFrame(results)
    summary = {
        "notes_checked": len(report),
        "avg_headings": report["heading_count"].mean() if len(report) else 0,
        "pct_roundtrip_equal": 100.0 * (report["roundtrip_equal"].mean() if len(report) else 0),
        "pct_stray_heading_in_body": 100.0 * (report["stray_heading_in_body"].mean() if len(report) else 0),
        "avg_empty_sections_per_note": report["empty_sections"].apply(len).mean() if len(report) else 0
    }
    return report, summary


tempValidationCorpus = structuredRawData.join(rawDataset[rawDataColumnName])

_, summary = validateCorpus(tempValidationCorpus, rawTextCol=rawDataColumnName, sampleN=100000)
print(summary)

# Check for empty columns
emptyColReport = (tempValidationCorpus == "").mean().sort_values(ascending=False)
print("\nFraction of empty rows per heading:\n", emptyColReport)

# Check for suspiciously long text (possible concatenation errors)
longColReport = tempValidationCorpus.apply(lambda col: col.str.len().max())
print("\nMax text length per heading:\n", longColReport)

{'notes_checked': 100000, 'avg_headings': np.float64(10.80058), 'pct_roundtrip_equal': np.float64(55.342999999999996), 'pct_stray_heading_in_body': np.float64(0.066), 'avg_empty_sections_per_note': np.float64(3.33544)}

Fraction of empty rows per heading:
 Discharge Condition                                    0.760133
Physical Exam                                          0.630545
Pertinent Results                                      0.317751
generate a brief hospital summary                      0.250271
summarize                                              0.250174
create a summary based on the following information    0.249829
write a discharge summary                              0.249725
Discharge Diagnosis                                    0.248275
Past Medical History                                   0.185017
Medications on Admission                               0.060761
Discharge Instructions                                 0.056104
History of Present Illness             

Validation of cleansed dataset by simple word-by-word comparison between divided sections and raw text in the dataset

In [9]:
import re, unicodedata
from collections import Counter


whiteSpaceRegex = re.compile(r"\s+")
splitIntoTokensRegex = re.compile(r"[^A-Za-z0-9]+")

redundantHeadingsTokens = ["create" ,"a", "summary", "based", "on", "the",
                           "following", "information", "generate", "brief",
                           "hospital", "summarize", "write", "discharge"]

def unicodeNormalizationWithCaseInsenstivity(s: str) -> str:
    if s is None:
        return ""
    if not isinstance(s, str):
        s = str(s)
    # Unicode normalize & lowercase
    s = unicodedata.normalize("NFKC", s).lower()
    # Collapse whitespace
    s = whiteSpaceRegex.sub(" ", s).strip()
    return s

def tokenizeIntoWords(s: str):
    s = unicodeNormalizationWithCaseInsenstivity(s)
    # Split on non-alphanum; drop empties
    toks = [t for t in splitIntoTokensRegex.split(s) if t]

    for a in redundantHeadingsTokens:
        while a in toks:
            try:
                toks.remove(a)
            except ValueError:
                continue

    return toks

def tokenizeTextIntoBOW(s: str) -> Counter:
    return Counter(tokenizeIntoWords(s))

def concateColumnsofRow(row: pd.Series, cols: list) -> str:
    parts = []
    for c in cols:
        v = row.get(c, "")
        if pd.isna(v):
            continue
        v = c+": "+str(v).strip()
        if v:
            parts.append(v)
    return "\n".join(parts)

def compareRow(rawText: str, cleanText: str):
    tokenizedRawText = tokenizeTextIntoBOW(rawText)
    tokenizedCleanText = tokenizeTextIntoBOW(cleanText)

    # exact bag-of-words equality (counts & tokens)
    equalBow = (tokenizedRawText == tokenizedCleanText)

    # set-level similarity (order-insensitive, count-insensitive)
    rawSet, cleanSet = set(tokenizedRawText.keys()), set(tokenizedCleanText.keys())
    inter = len(rawSet & cleanSet)
    union = len(rawSet | cleanSet) or 1
    jaccard = inter / union

    # what got lost or added (by counts)
    missing = (tokenizedRawText - tokenizedCleanText)  # tokens present more times in raw
    extra   = (tokenizedCleanText - tokenizedRawText)  # tokens present more times in cleaned

    return {
        "equal_bow": equalBow,
        "jaccard_set_sim": round(jaccard, 4),
        "missing_token_count": sum(missing.values()),
        "extra_token_count":   sum(extra.values())
    }

def validateRawAndCleanText(df: pd.DataFrame, sampleNRows):
    reports = []
    for i, row in df.iterrows():
        if i > sampleNRows:
            break
        rep = compareRow(row[rawDataColumnName], concateColumnsofRow(row, uniqueHeadings))
        reports.append(rep)

    report = pd.DataFrame(reports)
    summary = {
        "rows_compared": len(report),
        "pct_equal_bow": round(100 * report["equal_bow"].mean(), 2),
        "avg_jaccard_set_sim": round(report["jaccard_set_sim"].mean(), 4),
        "avg_missing_tokens": round(report["missing_token_count"].mean(), 2),
        "avg_extra_tokens": round(report["extra_token_count"].mean(), 2),
    }
    return report, summary

_, summary = validateRawAndCleanText(tempValidationCorpus, 10000)
print(summary)

{'rows_compared': 10001, 'pct_equal_bow': np.float64(100.0), 'avg_jaccard_set_sim': np.float64(1.0), 'avg_missing_tokens': np.float64(0.0), 'avg_extra_tokens': np.float64(0.0)}


Finalizing the dataset clensing

In [10]:
structuredRawData = structuredRawData.drop(columns=redundantHeadings, errors="ignore")
structuredRawData = structuredRawData.join(rawDataset['target'])
structuredRawData = structuredRawData.join(rawDataset['note_id'])

summaryOutputColumn = "target"
summaryInputColumn = "source"

def makeSourceRow(row):
    parts = []
    # Short header helps the model understand the task
    parts.append("Summarize the following hospitalization for the discharge summary.")

    # Add sections if non-empty
    for sec in uniqueHeadings:
        txt = row.get(sec, "")
        # Handle NaN or non-string values safely
        if isinstance(txt, float) or txt is None:
            txt = ""
        else:
            txt = str(txt).strip()

        # Add only non-empty text
        if txt:
            parts.append(f"{sec}: {txt}")

    return "\n".join(parts)

def makeTargetRow(row):
    txt = row.get(summaryOutputColumn)
    if isinstance(txt, float) or txt is None:
        txt = ""
    else:
        txt = str(txt).strip()

    # Short header helps the model understand the boundary
    return f"Summary: {txt}"

def cleanFinalizedText(s):
    s = s.replace("\r\n", "\n").replace("\r", "\n")
    s = "\n".join(line.strip() for line in s.split("\n"))
    return s.strip()

# Drop rows without targets or with very short targets
structuredRawData = structuredRawData[structuredRawData[summaryOutputColumn].astype(str).str.strip().str.len() > 20]
structuredRawData[summaryInputColumn] = structuredRawData.apply(makeSourceRow, axis=1)
structuredRawData[summaryOutputColumn] = structuredRawData.apply(makeTargetRow, axis=1)

structuredRawData[summaryInputColumn] = structuredRawData[summaryInputColumn].map(cleanFinalizedText)
structuredRawData[summaryOutputColumn] = structuredRawData[summaryOutputColumn].map(cleanFinalizedText)


print(structuredRawData.columns)
print("\n--- SOURCE ---\n")
print(structuredRawData[summaryInputColumn].head(2)[:600])
print("\n--- TARGET ---\n")
print(structuredRawData[summaryOutputColumn].head(2)[:400])

Index(['Chief Complaint', 'History of Present Illness', 'Past Medical History',
       'Physical Exam', 'Pertinent Results', 'Medications on Admission',
       'Discharge Medications', 'Discharge Diagnosis', 'Discharge Condition',
       'Discharge Instructions', 'Imaging', 'target', 'note_id', 'source'],
      dtype='object')

--- SOURCE ---

0    Summarize the following hospitalization for th...
1    Summarize the following hospitalization for th...
Name: source, dtype: object

--- TARGET ---

0    Summary: HCV cirrhosis cb ascites, hiv on ART,...
1    Summary: with HIV on HAART, HCV cirrhosis with...
Name: target, dtype: object


Writing cleansed data to a new csv on to the drive

In [11]:
destinationFilePath = "/content/drive/MyDrive/Colab Notebooks/Gen AI/Mini Project 1/cleaned_dataset.csv"

structuredRawData.to_csv(destinationFilePath, index=False)
print(os.path.exists(destinationFilePath))  # Should return True

True
