In [2]:
def load_raw_data():
    data = []

    for file_idx in tqdm(range(1, 5), desc="Loading raw data from .mat files"):
        fname = f'Part_{file_idx}'
        file_data = mat73.loadmat(RAW_DATASET_PATH / f'{fname}.mat')[fname]
        data.extend(file_data)
    return data


def sample_subjects(subjects_ids, max_samples_per_subject):
    unique_ids = np.unique(subjects_ids)
    chosen_samples = []
    for _id in unique_ids:
        subject_ids = set(np.where(subjects_ids == _id)[0])
        if max_samples_per_subject > len(subject_ids):
            subject_samples = list(subject_ids)        
        else:
            subject_samples = random.sample(subject_ids, k=max_samples_per_subject)
        chosen_samples.extend(subject_samples)
    chosen_samples = np.array(sorted(chosen_samples))
    return chosen_samples


def prepare_mimic_clean_data(max_samples_per_subject=8, n_samples=1000, seed=42):
    random.seed(seed)
    
    # PPG, ABP, ECG
    raw_data = load_raw_data()
    data = np.concatenate([measurement.reshape(3, -1, n_samples) for measurement in raw_data], axis=1)
    data = torch.from_numpy(data)
    
    samples_per_subject = torch.tensor([measurement.shape[1] // n_samples for measurement in raw_data])
    subjects_ids = np.concatenate([[i] * samples_num for i, samples_num in enumerate(samples_per_subject)])
    
    sample_ids = sample_subjects(subjects_ids, max_samples_per_subject=max_samples_per_subject)
    
    subjects_ids = subjects_ids[sample_ids]
    data = data[:, sample_ids]
    
    abp = data[1].numpy()
    
    sbp = np.array([_abp[find_peaks(_abp, height=_abp.mean())[0]].mean() for _abp in tqdm(abp, "Gathering sbp target values")])
    dbp = np.array([_abp[find_peaks(-_abp, height=-_abp.mean())[0]].mean() for _abp in tqdm(abp, "Gathering dbp target values")])
    
    valid_mask = ~(np.isnan(sbp) | np.isnan(dbp))
    
    sample_ids = sample_ids[valid_mask]
    subjects_ids = subjects_ids[valid_mask]
    data = data[:, valid_mask]  
    data = torch.index_select(data, 0, torch.tensor([0, 2])) # PPG, ECG
    data = data.permute(1, 0, 2) # [2, S, 1000] -> [S, 2, 1000]
    sbp = sbp[valid_mask]
    dbp = dbp[valid_mask]
    
    info = pd.DataFrame({
        'sample_id': sample_ids,
        'subject_id': subjects_ids,
        'sbp': sbp,
        'dbp': dbp
    })

    return data, info


def save_data_and_targets_to_files(data, splits_info):
    splits = splits_info["split"].unique()
    for split in tqdm(splits, desc="Saving split files"):
        split_mask = splits_info['split'].values == split
        split_data = data[split_mask] # [S, 2, 1000]

        sbp = splits_info['sbp'].values[split_mask]
        dbp = splits_info['dbp'].values[split_mask]
        sbp_dbp_avg = torch.tensor(np.array([sbp, dbp])).T

        torch.save(split_data, RAW_TENSORS_DATA_PATH / f"{split}.pt")
        torch.save(sbp_dbp_avg, RAW_TENSORS_TARGETS_PATH / f"{split}.pt")


def create_mimic_clean_raw_tensors(max_samples_per_subject=8, n_samples=1000, seed=42):
    data, info = prepare_mimic_clean_data(
        max_samples_per_subject=max_samples_per_subject,
        n_samples=n_samples,
        seed=seed
    )
    
    splits_info = create_train_val_test_split_info(
        groups=info['subject_id'].values, 
        info=info,
        train_size=0.7,
        val_size=0.15,
        test_size=0.15,
        random_state=seed
    )
    
    splits_info.to_csv(SPLIT_INFO_PATH)
    save_data_and_targets_to_files(data, splits_info)

In [3]:
create_mimic_clean_raw_tensors(max_samples_per_subject=20, n_samples=1000, seed=42)

A Jupyter Widget

A Jupyter Widget

  sbp = np.array([_abp[find_peaks(_abp, height=_abp.mean())[0]].mean() for _abp in tqdm(abp, "Gathering sbp target values")])
  ret = ret.dtype.type(ret / rcount)


A Jupyter Widget

  dbp = np.array([_abp[find_peaks(-_abp, height=-_abp.mean())[0]].mean() for _abp in tqdm(abp, "Gathering dbp target values")])


A Jupyter Widget

A Jupyter Widget

A Jupyter Widget

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_info.loc[:, "split"] = "train"
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  val_info.loc[:, "split"] = "val"
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_info.loc[:, "split"] = "test"


A Jupyter Widget