In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
import re
import numpy as np

from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer

csv_path = "/Users/harish/Documents/NLP/mimic-iv-ext-bhc-labeled-clinical-notes-dataset-for-hospital-course-summarization-1.2.0/mimic-iv-bhc.csv"

# Count the data lines (everything after the header) without loading the file.
# The handle is closed deterministically and decoded as UTF-8, which is also
# what pandas.read_csv assumes by default.
# NOTE(review): this counts physical lines; quoted fields that contain
# newlines would make it overestimate the number of records -- confirm
# against the actual file.
with open(csv_path, encoding="utf-8") as csv_file:
    total_rows = sum(1 for _ in csv_file) - 1
sample_frac = 0.1  # 10%
sample_n = int(total_rows * sample_frac)

# Pick the rows to *skip* (row 0 is the header and is always kept), leaving
# roughly `sample_n` rows to be parsed.  A seeded generator makes the sample
# reproducible, in line with the fixed random_state used for the splits later.
sample_rng = np.random.default_rng(42)
skip_idx = np.sort(
    sample_rng.choice(
        np.arange(1, total_rows + 1),
        size=total_rows - sample_n,
        replace=False,
    )
).tolist()

df = pd.read_csv(csv_path, skiprows=skip_idx)

# Bracketed placeholders of the form [** ... **]; DOTALL lets one placeholder
# span a line break.
_PLACEHOLDER_RE = re.compile(r'\[\*\*.*?\*\*\]', re.DOTALL)
_WHITESPACE_RE = re.compile(r'\s+')


def clean_text(text):
    """Normalize a note: drop ``[** ... **]`` placeholders and tidy whitespace.

    Args:
        text: Raw text. Non-string values (e.g. the NaN/None pandas uses for
            missing cells) are treated as empty text.

    Returns:
        The text with placeholders removed, every whitespace run (including
        newlines) collapsed to a single space, and no leading or trailing
        whitespace. Returns ``""`` for non-string input.
    """
    if not isinstance(text, str):
        return ""
    text = _PLACEHOLDER_RE.sub('', text)
    text = _WHITESPACE_RE.sub(' ', text)
    # Strip last: removing a placeholder at either end of the text can expose
    # whitespace that an earlier strip would have missed.
    return text.strip()

# Drop rows with a missing note or summary *before* cleaning, so the text
# cleaner is only ever applied to real strings.
df = df.dropna(subset=["input", "target"])

df["body"] = df["input"].apply(clean_text)
df["summary"] = df["target"].apply(clean_text)

# Keep only pairs whose cleaned note is longer than 200 characters and whose
# cleaned summary is longer than 50 characters.
long_enough = (df["body"].str.len() > 200) & (df["summary"].str.len() > 50)
df = df[long_enough]
df_ready = df[["body", "summary"]].dropna()

# Two successive 90/10 splits: first carve out the test set, then take a
# validation set out of what remains.  Both use a fixed seed.
train_df, test_df = train_test_split(df_ready, test_size=0.1, random_state=42)
train_df, val_df = train_test_split(train_df, test_size=0.1, random_state=42)

print(len(train_df), len(val_df), len(test_df))

# ---------------------------
# 4. Convert to HuggingFace Dataset
# ---------------------------

# Build one HuggingFace Dataset per split.  Each frame's index is reset first
# (presumably so the shuffled pandas index is not carried into the dataset).
train_ds, val_ds, test_ds = (
    Dataset.from_pandas(split_df.reset_index(drop=True))
    for split_df in (train_df, val_df, test_df)
)

# Bundle the three splits under their conventional names.
dataset = DatasetDict(
    dict(zip(("train", "validation", "test"), (train_ds, val_ds, test_ds)))
)

# ---------------------------
# 5. Tokenization
# ---------------------------

# Token budgets: notes are cut to 1024 tokens, summaries to 256.
max_input_length, max_target_length = 1024, 256

# Checkpoint whose tokenizer is used; swap the name to try another model.
model_name = "google/flan-t5-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def preprocess(examples):
    """Tokenize note/summary pairs into model inputs and labels.

    Args:
        examples: Mapping with a ``"body"`` and a ``"summary"`` entry, each
            either a list of strings (the batched form produced by
            ``Dataset.map(..., batched=True)``) or a single string.

    Returns:
        The tokenizer output for ``"body"`` (truncated and padded to
        ``max_input_length``) with an added ``"labels"`` entry holding the
        token ids of ``"summary"`` (truncated and padded to
        ``max_target_length``).  Padding positions in the labels are set to
        -100 so they can be ignored by the training loss.
    """
    model_inputs = tokenizer(
        examples["body"],
        max_length=max_input_length,
        truncation=True,
        padding="max_length"
    )
    # `text_target=` replaces the deprecated `as_target_tokenizer()` context
    # manager for tokenizing targets.
    labels = tokenizer(
        text_target=examples["summary"],
        max_length=max_target_length,
        truncation=True,
        padding="max_length"
    )

    pad_id = tokenizer.pad_token_id

    def _mask_padding(token_ids):
        # -100 is the label value the seq2seq loss skips; real pad ids would
        # otherwise be scored as if they were target tokens.
        return [tok if tok != pad_id else -100 for tok in token_ids]

    label_ids = labels["input_ids"]
    if label_ids and isinstance(label_ids[0], list):
        # Batched call: one id sequence per example.
        model_inputs["labels"] = [_mask_padding(seq) for seq in label_ids]
    else:
        # Single example: a flat id sequence.
        model_inputs["labels"] = _mask_padding(label_ids)
    return model_inputs

# Tokenize every split in batches; the raw text columns are dropped from the
# result.
_raw_text_columns = ["body", "summary"]
tokenized_dataset = dataset.map(
    preprocess,
    batched=True,
    remove_columns=_raw_text_columns,
)

# ---------------------------
# 6. Save tokenized dataset
# ---------------------------

# Written relative to the current working directory.
_output_dir = "bhc_tokenized_dataset"
tokenized_dataset.save_to_disk(_output_dir)

print("Tokenized dataset saved!")


21865 2430 2700


Downloading tokenizer_config.json: 0.00B [00:00, ?B/s]

Downloading spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading tokenizer.json: 0.00B [00:00, ?B/s]

Downloading (…)cial_tokens_map.json: 0.00B [00:00, ?B/s]

Map:   0%|          | 0/21865 [00:00<?, ? examples/s]

