In [47]:
import os
import os.path as osp
import pandas as pd
import numpy as np
import yaml
import json
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split

from datasets import Dataset, DatasetDict

In [53]:
# All of the files below result from running the preprocessing pipeline by Adams et al. (2020) on the raw CASI and MIMIC-III datasets.

MAX_SAMPLES_PER_SF = 500  # Upper bound on the number of examples kept per acronym
SAMPLE_SEED = 42          # Seed passed to DataFrame.sample so the subsample is reproducible


def sample_per_acronym(df, max_per_sf=MAX_SAMPLES_PER_SF, seed=SAMPLE_SEED):
    """Return a copy of `df` with at most `max_per_sf` rows per value of the 'sf' column.

    Rows are grouped by 'sf'; groups with more than `max_per_sf` rows are randomly
    down-sampled (using `seed`), smaller groups are kept whole. The index of the
    result is reset to a fresh RangeIndex.
    """
    return (
        df.groupby('sf')
        .apply(lambda x: x.sample(n=min(len(x), max_per_sf), random_state=seed))
        .reset_index(drop=True)
    )


CASI_DIR = '/data/casi-sense' # Modify as needed
casi_df = pd.read_csv(osp.join(CASI_DIR, 'preprocessed_dataset_window_10.csv'))
# Short-form to long-form "sense" mapping. Use a context manager so the file handle is closed.
with open(osp.join(CASI_DIR, 'sf_lf_map.json')) as fh:
    sf_lf_map = json.load(fh)
labeled_sf_lf_map = pd.read_csv(osp.join(CASI_DIR, 'labeled_sf_lf_map.csv'))

MIMIC_DIR = '/usr1/data/dataset/mimic-iii-sense' # Modify as needed
mimic_df = pd.read_csv(osp.join(MIMIC_DIR, 'mimic_rs_dataset_preprocessed_window_10.csv'))
# NOTE(review): FullLoader can construct more than plain YAML types. If mimic_sf.yaml is just a
# list of acronym strings (it is used that way below), yaml.safe_load would be the stricter choice.
with open(osp.join(MIMIC_DIR, 'mimic_sf.yaml')) as fh:
    mimic_sf_list = yaml.load(fh, Loader=yaml.FullLoader)

# Filter out the acronyms that are not present in MIMIC-III, as done in Adams et al. (2020)
casi_df = casi_df[casi_df['sf'].isin(mimic_sf_list)]

# Sample at most 500 examples per acronym
casi_df = sample_per_acronym(casi_df)
mimic_df = sample_per_acronym(mimic_df)

print('### Summary ###')
print(f'CASI: samples={casi_df.shape[0]}, acronyms={casi_df["sf"].nunique()}')
print(f'MIMIC-III: samples={mimic_df.shape[0]}, acronyms={mimic_df["sf"].nunique()}')

### Summary ###
CASI: samples=16586, acronyms=37
MIMIC-III: samples=17300, acronyms=37


In [83]:
# Display the first row of the sampled CASI frame to see which columns are available.
casi_df.iloc[0]

sf                                                                 AMA
target_lf                                       against medical advice
sf_rep                                                            AMA.
start_idx                                                        175.0
end_idx                                                          178.0
section                                                 IDENTIFICATION
context              His wife works as a CPA. He is unemployed. He ...
lf_in_sf                                                         False
target_lf_sense                                 against medical advice
tokenized_context    wife works cpa unemployed entered with long hi...
sf_occurrences                                                     0.0
trimmed_tokens       addiction heroin crack-cocaine drugs of choice...
target_lf_idx                                                        2
row_idx                                                           1832
sectio

In [90]:
# Take a train/val/test split per acronym and only select the relevant columns.
# NOTE: the split is done separately for every acronym (short form), so each acronym is
# represented proportionally in every split. train_test_split is called without `stratify`,
# so the senses *within* an acronym are not explicitly balanced across splits.
cols = ['sf', 'target_lf_sense', 'context']


def split_per_acronym(df, sf_list, desc, columns, holdout_frac=0.2, seed=42):
    """Split `df` into train/val/test record lists, one acronym at a time.

    For every acronym in `sf_list`, the rows of `df` whose 'sf' equals that acronym are
    restricted to `columns` and shuffled into three parts: `int(n * holdout_frac)` rows
    for test, the same number for val, and the remainder for train.

    Args:
        df: DataFrame with an 'sf' column and all of `columns`.
        sf_list: iterable of acronyms to process (also drives the progress bar).
        desc: description shown on the tqdm progress bar.
        columns: columns kept in the output records.
        holdout_frac: fraction of each acronym's rows used for test, and again for val.
        seed: random_state passed to both train_test_split calls.

    Returns:
        Tuple `(train, val, test)` of lists of row dicts.

    NOTE(review): an acronym with fewer than 1 / holdout_frac rows yields a hold-out size
    of 0, which train_test_split is expected to reject with an error - confirm every
    acronym in `sf_list` has enough rows in `df`.
    """
    train_records, val_records, test_records = [], [], []
    pbar = tqdm(sf_list, desc=desc)
    # Iterate over the progress bar itself (not the raw list) so that it actually advances.
    for sf in pbar:
        pbar.set_postfix(acronym=sf)
        sf_df = df[df['sf'] == sf].reset_index(drop=True)
        sf_df = sf_df[columns]
        n_sf = sf_df.shape[0]
        n_test = int(n_sf * holdout_frac)
        n_val = int(n_sf * holdout_frac)
        # First carve out the test rows, then take the validation rows from what is left.
        train_idx, test_idx = train_test_split(np.arange(n_sf), test_size=n_test, random_state=seed)
        train_idx, val_idx = train_test_split(train_idx, test_size=n_val, random_state=seed)

        train_records += sf_df.iloc[train_idx].to_dict(orient='records')
        val_records += sf_df.iloc[val_idx].to_dict(orient='records')
        test_records += sf_df.iloc[test_idx].to_dict(orient='records')
    return train_records, val_records, test_records


casi_train, casi_val, casi_test = split_per_acronym(
    casi_df, mimic_sf_list, 'Taking stratified split of CASI data', cols
)
mimic_train, mimic_val, mimic_test = split_per_acronym(
    mimic_df, mimic_sf_list, 'Taking stratified split of MIMIC-III data', cols
)

casi_train = Dataset.from_list(casi_train)
casi_val = Dataset.from_list(casi_val)
casi_test = Dataset.from_list(casi_test)
casi_dataset = DatasetDict(dict(train=casi_train, val=casi_val, test=casi_test))

mimic_train = Dataset.from_list(mimic_train)
mimic_val = Dataset.from_list(mimic_val)
mimic_test = Dataset.from_list(mimic_test)
mimic_dataset = DatasetDict(dict(train=mimic_train, val=mimic_val, test=mimic_test))

Taking stratified split of MIMIC-III data:   0%|          | 0/37 [00:38<?, ?it/s, acronym=FSH]
Taking stratified split of CASI data: 100%|██████████| 37/37 [00:00<00:00, 188.34it/s, acronym=FSH]
Taking stratified split of MIMIC-III data:   0%|          | 0/37 [00:00<?, ?it/s, acronym=FSH]

In [None]:
# Persist both DatasetDicts under DATA_DIR, one sub-directory per corpus
DATA_DIR = '/data'
casi_outdir = osp.join(DATA_DIR, 'casi-sense-preprocessed')
mimic_outdir = osp.join(DATA_DIR, 'mimic-iii-sense-preprocessed')

# CASI is written first, then MIMIC-III; each target directory is created if missing.
for _target_dir, _dataset in ((casi_outdir, casi_dataset), (mimic_outdir, mimic_dataset)):
    os.makedirs(_target_dir, exist_ok=True)
    _dataset.save_to_disk(_target_dir)

In [97]:
# Write the same short-form to long-form mapping next to each preprocessed dataset
for _target_dir in (casi_outdir, mimic_outdir):
    _map_path = osp.join(_target_dir, 'sf_lf_map.json')
    with open(_map_path, 'w') as fh:
        json.dump(sf_lf_map, fh)