In [3]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import wfdb
from tqdm.notebook import tqdm
import requests
from joblib import Parallel, delayed
from pathlib import Path
import wget
import time
import random
from copy import deepcopy
import os
import shutil
from itertools import groupby
from operator import itemgetter

In [7]:
def is_good_measurement(record, measurement):
    """Report which signals of interest a measurement's header lists.

    Downloads ``{record}/{measurement}.hea`` from the PhysioNet database named
    by the global ``MIMIC_DB_NAME`` and checks, for each of ``ABP``, ``PLETH``,
    ``II`` and ``V``, whether that name occurs in the header as a whole token.

    Whole-token matching matters: a plain substring test reports ``II`` for a
    header that only lists ``III``, and reports ``V`` for any header that
    contains ``mV`` or ``AVR``.

    Args:
        record: Record directory, relative to the database root.
        measurement: Measurement name inside that record directory.

    Returns:
        dict with key ``"path"`` (``"{record}/{measurement}"``) plus one boolean
        per signal name. Every signal flag is False when the server answers
        with a non-success status.
    """
    url = f"https://physionet.org/files/{MIMIC_DB_NAME}/1.0/{record}/{measurement}.hea"
    # A timeout keeps one stalled connection from blocking a parallel worker forever
    response = requests.get(url, timeout=30)
    # An error page is not a header; treat it as "no signals listed"
    header = response.text if response.ok else ""
    # Split on every non-alphanumeric character so names are compared as whole tokens
    tokens = set("".join(ch if ch.isalnum() else " " for ch in header).split())
    return {
        "path": f"{record}/{measurement}",
        **{signal: signal in tokens for signal in ["ABP", "PLETH", "II", "V"]}
    }

def find_good_measurements(all_records, start, n_records=200):
    """Scan a slice of records and save which signals each measurement has.

    For every record in ``all_records[start:start + n_records]`` the record's
    ``RECORDS`` listing is fetched from PhysioNet (database named by the global
    ``MIMIC_DB_NAME``) and each listed measurement is checked with
    ``is_good_measurement`` in parallel.

    Args:
        all_records: Sequence of record directories, relative to the database root.
        start: Index of the first record to scan.
        n_records: Number of records to scan.

    Returns:
        DataFrame with one row per measurement, as returned by
        ``is_good_measurement``. The same table is written to
        ``mimic_recordings/{start}-{start + n_records}.csv``; the file name
        uses the requested range even when the slice is shorter.
    """
    end = start + n_records
    current_records = all_records[start:end]

    # Create the output directory up front so a long scan is not lost at the final write
    output_dir = Path("mimic_recordings")
    output_dir.mkdir(parents=True, exist_ok=True)

    good_measurements = []
    for record in tqdm(current_records, desc="Records"):
        response = requests.get(f"https://physionet.org/files/{MIMIC_DB_NAME}/1.0/{record}/RECORDS", timeout=30)
        # Only parse a successful answer; the lines of an error page are not measurement names
        listing = response.text.splitlines() if response.ok else []
        record_measurements = [measurement.strip() for measurement in listing if len(measurement.strip()) > 0]
        results = Parallel(n_jobs=-1)(delayed(is_good_measurement)(record, measurement) for measurement in tqdm(record_measurements, desc="Record measurements"))
        good_measurements.extend(result for result in results if result is not None)
        # Short pause between records (presumably to go easy on the server)
        time.sleep(0.1)
    df = pd.DataFrame(good_measurements)
    df.to_csv(output_dir / f"{start}-{end}.csv", index=False)
    return df

def download_measurement(measurement_path):
    """Download the ``.dat`` and ``.hea`` files of one measurement.

    Files are fetched from the PhysioNet database named by the global
    ``MIMIC_DB_NAME`` (the same database the header scan uses) and stored in
    ``mimic_db/{directory}/{subject}/``. On success the measurement path is
    appended to ``downloaded_measurements.txt``.

    Args:
        measurement_path: ``"{directory}/{subject}/{recording}"``.

    Returns:
        True when both files are present afterwards, False when anything failed
        (the reason is printed).
    """
    try:
        directory, subject, recording = measurement_path.split("/")
        path = Path(f"mimic_db/{directory}/{subject}")
        path.mkdir(parents=True, exist_ok=True)
        base_url = f'https://physionet.org/files/{MIMIC_DB_NAME}/1.0/{directory}/{subject}/{recording}'
        for extension in (".dat", ".hea"):
            # Skip files left by an earlier, partly failed attempt instead of fetching them again.
            # NOTE(review): the `wget` package appears to save a repeated download under a new
            # " (1)"-style name rather than overwrite - confirm against the installed version.
            if (path / f"{recording}{extension}").exists():
                continue
            wget.download(f"{base_url}{extension}", out=str(path))
        with open("downloaded_measurements.txt", "a") as file:
            file.write(f"{measurement_path}\n")
        return True
    except Exception as e:
        print(f"   {measurement_path} failed ({e})")
        return False
        
def download_measurements(measurements, n_jobs=-1):
    """Download many measurements with a joblib worker pool.

    Args:
        measurements: Iterable of measurement paths accepted by ``download_measurement``.
        n_jobs: Forwarded to ``joblib.Parallel``.

    Returns:
        List with the result of ``download_measurement`` for each path, in input order.
    """
    with_progress = tqdm(measurements, desc="Downloading measurements")
    jobs = (delayed(download_measurement)(path) for path in with_progress)
    worker_pool = Parallel(n_jobs=n_jobs)
    return worker_pool(jobs)

def validate(path, desired_signals=['ABP', 'II', 'PLETH']):
    """Check that a downloaded recording has usable data for every desired signal.

    The recording ``mimic_db/{path}`` is read with ``wfdb.rdsamp``. It is valid
    when every name in ``desired_signals`` is present and fewer than 95% of the
    samples of each of those signals are NaN.

    Args:
        path: Recording path relative to ``mimic_db``.
        desired_signals: Signal names that must be present (never modified here).

    Returns:
        True when the recording passes, False otherwise, including when reading
        fails (the reason is printed).
    """
    try:
        data, info = wfdb.rdsamp(f'mimic_db/{path}')
        sig_names = info['sig_name']
        if any(sig_name not in sig_names for sig_name in desired_signals):
            return False
        n_samples = len(data)
        # Count NaNs straight on the sample columns; copying a whole recording
        # into a DataFrame just for this is wasteful.
        for sig_name in desired_signals:
            n_nans = np.isnan(data[:, sig_names.index(sig_name)]).sum()
            # An empty recording carries no usable data for the signal
            if n_samples == 0 or n_nans / n_samples >= 0.95:
                return False
        return True
    except Exception as e:
        print(f"{path} failed ({e})")
        return False
    
def remove_measurements(measurements):
    """Delete the downloaded files of the given measurements.

    For each path the files ``mimic_db/{path}.hea`` and ``mimic_db/{path}.dat``
    are removed if they exist, and ``mimic_db/{path}`` is appended to
    ``removed_measurements.txt``. Failures are printed and do not stop the loop.

    Args:
        measurements: Iterable of measurement paths relative to ``mimic_db``.
    """
    for path in tqdm(measurements, desc="Removing invalid measurements"):
        dir_path = f"mimic_db/{path}"
        try:
            for extension in (".hea", ".dat"):
                file_path = f"{dir_path}{extension}"
                # Tolerate files that are already gone so the function can be re-run
                if os.path.exists(file_path):
                    os.remove(file_path)
            # `dir_path` is the recording's base name, which is normally not a directory.
            # An unconditional rmtree raised here, so the removal was reported as failed
            # and never logged even though the files were already deleted.
            if os.path.isdir(dir_path):
                shutil.rmtree(dir_path)
            with open("removed_measurements.txt", "a") as file:
                file.write(f"{dir_path}\n")
        except Exception as e:
            print(dir_path, f" failed ({e})")

def find_repeats(arr, min_n_repeats):
    """Locate runs of identical consecutive values in a 1-D NumPy array.

    Args:
        arr: 1-D NumPy array; neighbouring elements are compared with ``==``
            (so NaN never counts as equal to NaN).
        min_n_repeats: Minimum number of equal consecutive samples for a run
            to be reported.

    Returns:
        List of ``[first, last]`` index pairs (both inclusive) such that
        ``arr[first] == arr[first + 1] == ... == arr[last]``.
    """
    # same_as_next[i] is True when arr[i] == arr[i + 1]; the last sample has no successor
    same_as_next = np.concatenate((arr[:-1] == arr[1:], np.array([False])))
    # A leading False makes a run that starts at index 0 produce a rising edge too
    padded = np.concatenate((np.array([False]), same_as_next))
    # Edges alternate rising/falling; a falling edge sits on the last sample of its run
    edges = np.flatnonzero(padded[1:] != padded[:-1])
    firsts, lasts = edges[0::2], edges[1::2]
    return [
        [int(first), int(last)]
        for first, last in zip(firsts, lasts)
        if (last - first) >= min_n_repeats - 1
    ]

def validate_signal(data, low=None, high=None, n_same=10, min_good_samps=60):
    """Build a per-sample boolean mask of usable samples for one signal.

    A sample is usable when it is not NaN, lies strictly between ``low`` and
    ``high`` (checked only when both are given) and is not part of a run of
    identical consecutive values reported by ``find_repeats(data, n_same)``.

    Args:
        data: 1-D NumPy array with the signal samples.
        low: Exclusive lower bound, or None.
        high: Exclusive upper bound, or None.
        n_same: Minimum run length of identical values that marks a flat stretch.
        min_good_samps: Signals shorter than this are rejected entirely.

    Returns:
        Boolean array shaped like ``data``; all False when the signal is too short.
    """
    if len(data) < min_good_samps:
        return np.zeros_like(data) != 0

    usable = ~np.isnan(data)

    # The range check applies only when both bounds are supplied
    if low is not None and high is not None:
        usable &= (data > low) & (data < high)

    # Knock out every flat stretch, bounds inclusive
    for first, last in find_repeats(data, n_same):
        usable[first:last + 1] = False

    return usable

def validate_recording(abp, ppg, ecg, fs=125, n_same=10, min_good_secs=30):
    """Build a per-sample mask of samples that are usable in all three signals.

    Each signal is checked with ``validate_signal``; ``abp`` must additionally
    lie strictly between 0 and 300. The minimum length ``min_good_secs * fs``
    is forwarded to ``validate_signal`` (it used to be computed and then
    dropped), so signals shorter than that are rejected entirely.

    Args:
        abp, ppg, ecg: 1-D NumPy arrays of equal length.
        fs: Sampling frequency used to convert ``min_good_secs`` to samples.
        n_same: Minimum run length of identical values that marks a flat stretch.
        min_good_secs: Minimum signal length in seconds.

    Returns:
        Boolean array, True where all three signals are usable.
    """
    min_good_samps = min_good_secs * fs
    abp_is_good = validate_signal(abp, low=0, high=300, n_same=n_same, min_good_samps=min_good_samps)
    ppg_is_good = validate_signal(ppg, low=None, high=None, n_same=n_same, min_good_samps=min_good_samps)
    ecg_is_good = validate_signal(ecg, low=None, high=None, n_same=n_same, min_good_samps=min_good_samps)
    return abp_is_good & ppg_is_good & ecg_is_good

def find_good_segments(abp, ppg, ecg, fs=125, n_same=10, min_good_secs=30):
    """Find long enough stretches where all three signals are usable.

    The per-sample mask from ``validate_recording`` is split into runs of
    consecutive True samples; a run is kept when
    ``(last - first) / fs > min_good_secs``.

    The runs are located with vectorised edge detection rather than a
    per-sample Python loop, which matters for long recordings.

    Args:
        abp, ppg, ecg: 1-D NumPy arrays of equal length.
        fs: Sampling frequency in samples per second.
        n_same: Forwarded to ``validate_recording``.
        min_good_secs: Minimum segment duration in seconds (exclusive).

    Returns:
        List of ``(first, last)`` tuples of Python ints; both indices are inclusive.
    """
    is_good = np.asarray(validate_recording(abp, ppg, ecg, fs, n_same, min_good_secs), dtype=bool)
    # Pad with False on both sides so runs touching either end still produce two edges
    padded = np.concatenate(([False], is_good, [False]))
    edges = np.flatnonzero(padded[1:] != padded[:-1])
    firsts = edges[0::2]
    lasts = edges[1::2] - 1  # a falling edge sits one past the last True sample
    long_enough = (lasts - firsts) / fs > min_good_secs
    return [(int(first), int(last)) for first, last in zip(firsts[long_enough], lasts[long_enough])]

def find_valid_segments(path, n_same=10, min_good_secs=120, return_segments=False):
    """Find and record the usable segments of one downloaded recording.

    Reads ``mimic_db/{path}`` with ``wfdb.rdsamp``, takes the ``ABP``,
    ``PLETH`` and ``II`` channels and runs ``find_good_segments`` on them using
    the sampling frequency stored in the recording itself.

    Side effects (all files are appended to, in the working directory):
        * ``valid_segments.txt``: one line ``mimic_db/{path}/{start}-{end}`` per segment
        * ``segmented_measurements.txt``: ``path``, marking the recording as processed
        * ``logs.txt``: an OK line with the segment count, or the failure reason

    Args:
        path: Recording path relative to ``mimic_db``.
        n_same: Forwarded to ``find_good_segments``.
        min_good_secs: Forwarded to ``find_good_segments``.
        return_segments: When True, return the segment list instead of True.

    Returns:
        The list of ``(start, end)`` segments when ``return_segments`` is True,
        otherwise True; False when anything failed.
    """
    try:
        data, info = wfdb.rdsamp(f'mimic_db/{path}')
        fs = info['fs']
        sig_names = info['sig_name']

        abp = data[:, sig_names.index("ABP")]
        ppg = data[:, sig_names.index("PLETH")]
        ecg = data[:, sig_names.index("II")]

        # Use the recording's own sampling frequency, not a notebook-level constant
        valid_segments = find_good_segments(abp, ppg, ecg, fs=fs, n_same=n_same, min_good_secs=min_good_secs)
        dir_path = Path("mimic_db") / path
        # Forward slashes on every platform: later cells split these lines on "/"
        segment_lines = [f"{(dir_path / f'{start}-{end}').as_posix()}\n" for start, end in valid_segments]
        if segment_lines:
            # One open/write per recording keeps a recording's lines together
            with open("valid_segments.txt", "a") as file:
                file.writelines(segment_lines)
        with open("segmented_measurements.txt", "a") as file:
            file.write(f"{path}\n")
        with open("logs.txt", "a") as file:
            file.write(f"{path} OK ({len(valid_segments)} segments)\n")
        if return_segments:
            return valid_segments
        return True
    except Exception as e:
        with open("logs.txt", "a") as file:
            file.write(f"{path} failed ({e})\n")
        print(f"{path} failed ({e})")
        return False
    
def load_segment_to_df(path, start, end):
    """Load rows ``start:end`` of a recording's ABP, PLETH and II channels.

    Args:
        path: Recording path handed to ``wfdb.rdsamp``.
        start: First row of the slice.
        end: Row at which the slice stops (exclusive).

    Returns:
        DataFrame with columns ``ABP``, ``PPG`` (from ``PLETH``) and ``ECG`` (from ``II``).
    """
    signals, header = wfdb.rdsamp(path)
    channel_names = header['sig_name']
    rows = slice(start, end)
    columns = {}
    for column, channel in (("ABP", "ABP"), ("PPG", "PLETH"), ("ECG", "II")):
        columns[column] = signals[rows, channel_names.index(channel)]
    return pd.DataFrame(columns)

def get_records_samples_bounds(records_bounds, sample_len_samples, shuffle_segments=True):
    """Cut every valid segment into consecutive fixed-length sample windows.

    Windows are ``(window_start, window_start + sample_len_samples)`` pairs,
    laid back to back from the segment start. A window is kept only while its
    end is strictly below the segment end.

    Args:
        records_bounds: Mapping ``record -> list of (start, end)`` segment bounds.
        sample_len_samples: Window length in samples; must be positive.
        shuffle_segments: When True, shuffle each record's windows in place with
            the global ``random`` state.

    Returns:
        Mapping ``record -> list of (start, end)`` window bounds.

    Raises:
        ValueError: If ``sample_len_samples`` is not positive (the window loop
            would never terminate).
    """
    if sample_len_samples <= 0:
        raise ValueError(f"sample_len_samples must be positive, got {sample_len_samples}")

    records_samples_bounds = {}
    for record, record_bounds in records_bounds.items():
        record_segments_bounds = []
        for start, end in record_bounds:
            segment_start = start
            segment_end = start + sample_len_samples
            while segment_end < end:
                record_segments_bounds.append((segment_start, segment_end))
                # Step by the requested window length, not by a notebook-level constant
                segment_start, segment_end = segment_end, segment_end + sample_len_samples
        records_samples_bounds[record] = record_segments_bounds
    if shuffle_segments:
        for segments_bounds in records_samples_bounds.values():
            random.shuffle(segments_bounds)
    return records_samples_bounds

def cut_segments_for_subjects(subjects, subjects_records, records_bounds, max_samples_per_subject, sample_len_samples):
    """Pick up to ``max_samples_per_subject`` sample windows for every subject.

    Windows come from ``get_records_samples_bounds`` and are drawn round-robin
    from the subject's records, so samples are spread over all of them. A record
    that runs out of windows is skipped; drawing stops once the limit is reached
    or every record of the subject is exhausted. (Previously the first exhausted
    record ended the loop even when other records still had windows.)

    Args:
        subjects: Iterable of subject identifiers.
        subjects_records: Mapping ``subject -> list of record names``.
        records_bounds: Mapping ``"{subject}/{record}" -> list of (start, end)``
            segment bounds.
        max_samples_per_subject: Upper limit of samples per subject.
        sample_len_samples: Window length in samples.

    Returns:
        Mapping ``subject -> list of {'path', 'start', 'end'}`` dicts, where
        ``path`` is ``"{subject}/{record}"``.
    """
    samples_bounds = get_records_samples_bounds(records_bounds, sample_len_samples)
    subjects_data = {}

    for subject in tqdm(subjects, desc="Cutting subjects recordings into segments"):
        subject_records = subjects_records[subject]
        subject_data = []
        exhausted = set()  # indices of records with no windows left
        record_idx = 0
        while len(subject_data) < max_samples_per_subject and len(exhausted) < len(subject_records):
            current_idx = record_idx
            record_idx = (record_idx + 1) % len(subject_records)
            record_path = f"{subject}/{subject_records[current_idx]}"
            possible_segments_bounds = samples_bounds[record_path]
            if not possible_segments_bounds:
                exhausted.add(current_idx)
                continue
            sample_start_idx, sample_end_idx = possible_segments_bounds.pop()
            subject_data.append({'path': record_path, 'start': sample_start_idx, 'end': sample_end_idx})
        subjects_data[subject] = subject_data
    return subjects_data

def create_dataset(valid_segments, max_samples_per_subject, sample_len_samples, seed=42):
    """Cut valid segments into fixed-length samples and save each one as CSV.

    Steps:
        1. Group the segments by record and by subject.
        2. Seed ``random`` with ``seed`` and choose the sample windows with
           ``cut_segments_for_subjects``.
        3. Save every sample to ``mimic_csv/{subject}_{i}.csv`` and list it in
           ``samples_segmentation_info.txt`` (columns ``sample,segment``).

    The info file is rewritten on every call: the sample CSVs are regenerated
    under the same names, so rows from an earlier run (and a second header
    line) would make the file inconsistent.

    Args:
        valid_segments: List of dicts with keys ``'path'``
            (``"mimic_db/{directory}/{subject}/{record}"``), ``'start'`` and ``'end'``.
        max_samples_per_subject: Upper limit of samples per subject.
        sample_len_samples: Sample length in samples.
        seed: Seed for the global ``random`` state.
    """
    # Group with plain dicts. itertools.groupby only merges *adjacent* items, so
    # building a dict from it on unsorted input silently drops all but the last
    # group of a record.
    records_bounds = {}
    subjects_record_names = {}
    for segment in valid_segments:
        parts = segment['path'].split("/")
        subject = "/".join(parts[1:3])
        record = "/".join(parts[1:])
        records_bounds.setdefault(record, []).append((segment['start'], segment['end']))
        # Exact subject match instead of a substring test on the path
        subjects_record_names.setdefault(subject, set()).add(parts[3])

    subjects = np.unique(list(subjects_record_names))
    print(f" Found {len(subjects)} unique subjects")
    print(f" Found {len(records_bounds)} valid records with total of {len(valid_segments)} valid segments")
    subjects_records = {subject: sorted(names) for subject, names in subjects_record_names.items()}

    print(f" Segments cutting in progress..")
    random.seed(seed)
    # Honour the arguments rather than the notebook-level constants
    subjects_dataset_info = cut_segments_for_subjects(subjects, subjects_records, records_bounds, max_samples_per_subject, sample_len_samples)

    Path("mimic_csv").mkdir(parents=True, exist_ok=True)
    with open("samples_segmentation_info.txt", "w") as file:
        file.write("sample,segment\n")

    def save_sample_to_csv(i, subject, sample):
        """Save one sample as CSV and log it; returns True on success, False on failure."""
        try:
            path, start, end = sample['path'], sample['start'], sample['end']
            # File names use the bare subject id (second component of the record path)
            subject_id = path.split("/")[1]
            path = f'mimic_db/{path}'
            df = load_segment_to_df(path, start, end)
            df.to_csv(f"mimic_csv/{subject_id}_{i}.csv", index=False)
            with open("samples_segmentation_info.txt", "a") as file:
                file.write(f"{subject_id}_{i},{path}/{start}-{end}\n")
            return True
        except Exception as e:
            print(f"{i}, {sample} failed ({e})")
            return False

    for subject, samples in tqdm(subjects_dataset_info.items(), desc=f"Saving subjects segments into csv files ({sample_len_samples}-sample windows)"):
        Parallel(n_jobs=4)(delayed(save_sample_to_csv)(i, subject, sample) for i, sample in enumerate(samples))

# **Defining global variables**

In [5]:
MIMIC_DB_NAME = "mimic3wdb-matched"         # which MIMIC database to use (mimic3wdb or mimic3wdb-matched)
N_RECORDS = 250                             # number of records for a single `find_good_measurements` run
FS = 125                                    # sampling frequency of mimic3wdb signals
MAX_SAMPLES_PER_SUBJECT = 30                # maximum number of samples per subject in created dataset
SAMPLE_LEN_SEC = 120                        # desired length of sample in seconds
SAMPLE_LEN_SAMPLES = SAMPLE_LEN_SEC * FS    # desired length of sample in samples (seconds * sampling frequency)

# **Finding good measurements using `*.hea` files**

In [None]:
# Fetch the master record list. Fail loudly on an HTTP error: otherwise the lines
# of an error page would be treated as record names and queried one by one.
records_response = requests.get(f"https://physionet.org/files/{MIMIC_DB_NAME}/1.0/RECORDS", timeout=30)
records_response.raise_for_status()
# Each line is a record directory with a trailing slash (e.g. "p00/p000020/"); drop the slash
all_records = [line.strip().rstrip("/") for line in records_response.text.splitlines()]
all_records = [record for record in all_records if len(record) > 0]
# One batch per run; change `start` to scan the next batch of N_RECORDS records
_ = find_good_measurements(all_records, start=8500, n_records=N_RECORDS)

# **Downloading measurements validated by `*.hea` files (measurements with `ABP`, `PLETH` and `II` signals)**

(in some cases `II` was mistaken for `III`)

In [None]:
# Collect the per-batch CSVs written by `find_good_measurements`; batches that were
# never scanned (or left an unreadable file) are reported and skipped.
batch_frames = []
for start in np.arange(0, 10000, N_RECORDS):
    end = start + N_RECORDS
    try:
        batch_frames.append(pd.read_csv(f"mimic_recordings/{start}-{end}.csv"))
    except (OSError, pd.errors.EmptyDataError, pd.errors.ParserError):
        print(f'No file for {start}-{end}')

# Concatenate once instead of growing a DataFrame inside the loop
if batch_frames:
    measurements_df = pd.concat(batch_frames)
else:
    measurements_df = pd.DataFrame(columns=["path", "ABP", "PLETH", "II", "V"])

good_measurements = measurements_df.query("ABP == True and PLETH == True and II == True")['path'].values

# The log does not exist before the first download; treat that as "nothing downloaded yet"
downloaded_log = Path('downloaded_measurements.txt')
if downloaded_log.exists():
    with open(downloaded_log) as f:
        downloaded_measurements = [path.replace("\n", "") for path in f.readlines()]
else:
    downloaded_measurements = []

# Set lookup keeps the filtering linear in the number of measurements
already_downloaded = set(downloaded_measurements)
measurements_to_download = [path for path in good_measurements if path not in already_downloaded]

print(f"Downloaded {len(downloaded_measurements)} measurements, attempting {len(measurements_to_download)} measurements download")

download_measurements(measurements_to_download, n_jobs=-1);

# **Basic validation**

Check if:
* `ABP`, `PLETH` and `II` are in the measurement
* each of these signals has less than 95% NaN values

In [None]:
# Both logs may be missing on a first run; a missing log means "nothing recorded yet"
downloaded_log = Path('downloaded_measurements.txt')
if downloaded_log.exists():
    with open(downloaded_log) as f:
        downloaded_measurements = np.array([path.replace("\n", "") for path in f.readlines()], dtype=str)
else:
    downloaded_measurements = np.array([], dtype=str)

valid_log = Path('valid_measurements.txt')
if valid_log.exists():
    with open(valid_log) as f:
        valid_measurements = np.array([path.replace("\n", "") for path in f.readlines()], dtype=str)
else:
    valid_measurements = np.array([], dtype=str)

# Only measurements that have not passed validation before are checked (again)
already_valid = set(valid_measurements.tolist())
measurements_to_validate = np.array([path for path in downloaded_measurements if path not in already_valid], dtype=str)
print(f" Validated {len(valid_measurements)} measurements\n Attempting validation of {len(measurements_to_validate)} measurements..")

valid_records = Parallel(n_jobs=-1)(delayed(validate)(path) for path in tqdm(measurements_to_validate, desc="Validating measurements"))

# Explicit boolean mask so an empty result list is not misread as an integer index
valid_mask = np.array(valid_records, dtype=bool)
valid_measurements = np.concatenate((valid_measurements, measurements_to_validate[valid_mask]))

with open("valid_measurements.txt", "w") as textfile:
    for measurement_path in valid_measurements:
        textfile.write(measurement_path + "\n")

unique_subjects = np.unique([m.split("/")[1] for m in valid_measurements])
len(unique_subjects)

# **Segmenting measurements**

Find segments in measurement which meet the conditions:
* At least 120 s of continuous good signal, defined by:
    * No NaNs in any of the `ABP`, `PLETH` and `II` signals
    * No run of 10 or more identical consecutive values (flat line) in any of the `ABP`, `PLETH` and `II` signals
    * `ABP` values strictly between 0 and 300

In [None]:
# `find_valid_segments` creates this log by appending to it, so it is missing on a first run
segmented_log = Path('segmented_measurements.txt')
if segmented_log.exists():
    with open(segmented_log) as f:
        segmented_measurements = [path.replace("\n", "") for path in f.readlines()]
else:
    segmented_measurements = []

# Set lookup keeps the filtering linear in the number of measurements
already_segmented = set(segmented_measurements)
measurements_to_segment = [path for path in valid_measurements if path not in already_segmented]
print(f" Segmented {len(segmented_measurements)} measurements\n Attempting segmentation of {len(measurements_to_segment)} measurements..")

# NOTE: from here on `segmented_measurements` holds the per-measurement results of
# `find_valid_segments` (True/False), not the paths read from the log above
segmented_measurements = Parallel(n_jobs=-1)(delayed(find_valid_segments)(path) for path in tqdm(measurements_to_segment, desc="Segmenting measurements"))

# **Removing invalid measurements**

Remove measurements which did not meet the conditions defined in the segmentation step

In [None]:
def _read_path_lines(file_name):
    """Return the non-empty lines of a log file, or an empty list when the file is missing."""
    log_file = Path(file_name)
    if not log_file.exists():
        return []
    return [line.strip() for line in log_file.read_text().splitlines() if line.strip()]

downloaded_measurements = np.array(_read_path_lines("downloaded_measurements.txt"), dtype=object)
# Removal log lines start with "mimic_db/"; drop that component to compare with download paths
removed_measurements = ["/".join(path.split("/")[1:]) for path in _read_path_lines("removed_measurements.txt")]

valid_segments = pd.DataFrame({'path': _read_path_lines("valid_segments.txt")})
# Segment lines are "mimic_db/{measurement}/{start}-{end}"; keep only the measurement part
valid_measurements = np.unique(["/".join(path.split("/")[1:-1]) for path in valid_segments['path'].values])

# Set lookups keep the filtering linear in the number of measurements
measurements_to_keep = set(valid_measurements.tolist()) | set(removed_measurements)
measurements_to_remove = [measurement for measurement in downloaded_measurements if measurement not in measurements_to_keep]

# Deleting files is destructive, so the call stays disabled; uncomment to run it
# remove_measurements(measurements_to_remove)

# **Creating dataset**

Save valid segments to CSV files (at most 30 samples per subject, each sample 120 s long)

In [None]:
# Every line of valid_segments.txt has the form "<recording path>/<start>-<end>"
with open('valid_segments.txt') as f:
    segment_lines = np.array([line.replace("\n", "") for line in f.readlines()])

valid_segments = []
for segment_line in segment_lines:
    recording_path, _, bounds = segment_line.rpartition("/")
    bound_fields = bounds.split("-")
    valid_segments.append({
        'path': recording_path,
        'start': int(bound_fields[0]),
        'end': int(bound_fields[1]),
    })

create_dataset(valid_segments, MAX_SAMPLES_PER_SUBJECT, SAMPLE_LEN_SAMPLES, seed=42)

# **Filling info about dataset**

In [None]:
seg_info = pd.read_csv("samples_segmentation_info.txt")

# Sample names look like "<subject>_<number>"
samples_path = np.array([f"{sample}.csv" for sample in seg_info['sample']])
subjects = np.array([sample.split("_")[0] for sample in seg_info['sample']])
subjects_sample_number = np.array([int(sample.split("_")[1]) for sample in seg_info['sample']])

# Segment entries look like "mimic_db/<recording path>/<start>-<end>"
recording_path = np.array(["/".join(segment_path.split("/")[1:-1]) for segment_path in seg_info['segment']])
samples_start_idxs = np.array([int(segment_path.split("/")[-1].split("-")[0]) for segment_path in seg_info['segment']])
samples_end_idxs = np.array([int(segment_path.split("/")[-1].split("-")[1]) for segment_path in seg_info['segment']])

# Every sample shares the same sampling frequency
fs = np.array([FS for _ in range(len(seg_info))])

dataset_columns = {
    "sample_path": samples_path,
    "subject": subjects,
    "sample_num": subjects_sample_number,
    "recording": recording_path,
    "sample_start_idx": samples_start_idxs,
    "sample_end_idx": samples_end_idxs,
    "fs": fs
}
dataset_info = pd.DataFrame(dataset_columns)
dataset_info = dataset_info.sort_values(["subject", "sample_num"]).reset_index(drop=True)
dataset_info.to_csv("dataset_info.csv", index=False)