# MIMIC-III Waveform Matched Subset Data Preparation for RAIM

In [1]:
import numpy as np
import pandas as pd
from datetime import datetime
from datetime import timedelta
from dateutil import tz
import random
import glob
import csv
import os.path
import pickle
import sys

In [2]:
# Window sizes shared by every processing cell below.
CNN_duration = 30 # seconds of ECG in one segment
L1RNN_num_states = 1*60 # number of segments grouped into one level-1 sequence
# NOTE(review): 60 segments * 30 s = 1800 s (30 minutes). The earlier comments
# called this "1 hour", and the labelling code below counts one hour per
# level-1 sequence (cur_hrs advances by 1 per step) -- confirm which is intended.
L1RNN_duration = L1RNN_num_states * CNN_duration # seconds covered by one level-1 sequence
L2RNN_num_states = 12 # number of level-1 sequences per episode
L2RNN_duration = L2RNN_num_states * L1RNN_duration # seconds covered by one episode
hz = 125 # samples per second assumed when indexing the ECG text files

shortest_length = 1.0 # hours
waveform_grace_start = 0.5 # hours
future_time_interval = 24.0 # For computing decompensation, i.e., mortality in the next xx hours

In [3]:
# Process per dir: this value is substituted into the mapping-file name that
# the cells below read (ICUSTAYS.waveform_matched.<p_dir>.csv).
p_dir = 'p00'

## 0.1. Construct benchmark dataset from clinical database using
```
https://github.com/YerevaNN/mimic3-benchmarks
```

## 0.2. Convert waveforms/numerics to .txt
```

rdsamp -r p00/p000020/p000020-2183-04-28-17-47 -p -v -s "II"
rdsamp -r p00/p000020/p000020-2183-04-28-17-47n -p -v -s "ABPSys"
...

```

## 0.3. Align the benchmark clinical data with the waveform data

```perl
data_preprocess/align_icustay_episode_waveform.perl: 

#!/usr/local/bin/perl

use Date::Parse;

$waveform_record_file = "MIMIC_data/waveform_matched_subset/RECORDS-waveforms";

$icustay_file = "MIMIC_data/clinical_data/from_benchmarks/all_stays.csv";

$test_split_dir = "MIMIC_data/clinical_data/from_benchmarks/test/";

$train_split_dir = "MIMIC_data/clinical_data/from_benchmarks/train/";

$out_file = "mimic_data/intermediate_data/ICUSTAYS.waveform_matched.csv";

%hashSPLIT = ();
%numICUSTAY = ();
%hashICUSTAY = ();

print "Reading split list...\n";
open( $fh, "-|", "find", $test_split_dir, "-type", "d" );
$line = <$fh>;
while ($line = <$fh>) {
        chomp $line;
        $line =~ s/$test_split_dir//g;
        $hashSPLIT{$line} = 'test';
}
close $fh;
...
```

## 1. Read the list of waveforms that will be processed

In [7]:
# Mapping file that pairs each benchmark episode with its waveform record,
# one file per waveform directory (selected through p_dir).
map_episode_waveform_file = "MIMIC_data/intermediate_data/ICUSTAYS.waveform_matched.%s.csv" % p_dir

# Read all rows inside a context manager so the file handle is closed as soon
# as the rows are in memory (it was previously left open for the whole session).
with open(map_episode_waveform_file, 'r') as csvfile:
    map_waveform = [row for row in csv.reader(csvfile, delimiter = ',')]

# The file has no header row, so the column names are supplied here. The
# positional indices used by the later cells (row[0], row[11], ...) follow
# this same order.
map_waveform_table = pd.DataFrame(map_waveform,columns =
                                  ["SPLIT","EPISODE","RECORD_WAVEFORM_FILE",
                                   "RECORD_WAVEFORM_START_TIME","SUBJECT_ID",
                                   "HADM_ID","ICUSTAY_ID","LAST_CAREUNIT",
                                   "DBSOURCE","INTIME","OUTTIME","LOS","ADMITTIME",
                                   "DISCHTIME","DEATHTIME","ETHNICITY","DIAGNOSIS",
                                   "GENDER","DOB","DOD","AGE","MORTALITY_INUNIT",
                                   "MORTALITY","MORTALITY_INHOSPITAL"])

map_waveform_table

Unnamed: 0,SPLIT,EPISODE,RECORD_WAVEFORM_FILE,RECORD_WAVEFORM_START_TIME,SUBJECT_ID,HADM_ID,ICUSTAY_ID,LAST_CAREUNIT,DBSOURCE,INTIME,...,DEATHTIME,ETHNICITY,DIAGNOSIS,GENDER,DOB,DOD,AGE,MORTALITY_INUNIT,MORTALITY,MORTALITY_INHOSPITAL
0,train,1932_episode1_timeseries.csv,p00/p001932/p001932-2127-04-18-15-25,2127-04-18 15:25:00,1932,123386,216461,CSRU,carevue,2127-04-18 09:43:40,...,,UNKNOWN/NOT SPECIFIED,MYOCARDIAL INFARCTION;UNSTABLE ANGINA\CATH,F,2045-05-03 00:00:00,2136-10-25 00:00:00,82.01206938102486,0,0,0


## 2.1. Read waveform file

In [None]:
# Root directory of the waveform records converted to text; later cells
# build their per-record paths from it as well.
waveform_dir = 'MIMIC_data/waveform_data'
waveform_file = "{}/{}.txt".format(waveform_dir, 'p00/p001932/p001932-2127-04-18-15-25')

# Load one record: tab-separated, first two lines skipped, columns named
# 'time' and 'II', with 'time' parsed as float64.
df = pd.read_csv(
    waveform_file,
    delimiter='\t',
    header=None,
    names=['time', 'II'],
    skipinitialspace=True,
    skiprows=2,
    dtype={'time': np.float64}
)

## 2.2. Process waveform file

In [22]:
# Cut every matched lead-II ECG record into episodes of L2RNN_num_states
# level-1 windows and pickle each episode together with its labels
# (remaining length of stay, LOS bucket, decompensation flag).
sample_cnt=0
out_dir = 'MIMIC_data/out_data/pickle_data'
time_format = "%Y-%m-%d %H:%M:%S"

# Text mode ("r", as used when the mapping table is first loaded): csv.reader
# needs str rows on Python 3, and "r" behaves the same as "rb" here on Python 2.
with open(map_episode_waveform_file, "r") as f:
    rd = csv.reader(f, delimiter = ',')
    for row in rd:
        split = row[0]
        episode = row[1]
        waveform = row[2]
        start_time = row[3]
        subject_id = row[4]
        intime = row[9]
        outtime = row[10]
        los = row[11]
        deathtime = row[19] # use DOD
        mortality = row[22]

        waveform_file = "%s/%s.txt" % (waveform_dir, waveform)
        if not os.path.exists(waveform_file):
            print("%s not exists!" % waveform_file)
            continue

        if los == '':
            print("%s los not exists!" % waveform_file)
            continue
        print("processing %s..." % waveform_file)

        # Scale by 24 so that LOS is in the same unit (hours) as lived_time and cur_hrs.
        los = float(los) * 24.0

        record_start = datetime.strptime(start_time, time_format)
        if deathtime == '':
            # No recorded death: use a value that never falls inside the look-ahead window.
            lived_time = 1e18
        else:
            lived_time = (datetime.strptime(deathtime, time_format) - record_start).total_seconds() / 3600.0

        # Seconds between ICU admission and the first waveform sample.
        waveform_offset = (record_start - datetime.strptime(intime, time_format)).total_seconds()
        # Skip at least waveform_grace_start hours of signal, and more if the
        # record begins less than shortest_length hours after admission.
        actual_start_offset = max(shortest_length * 3600 - waveform_offset, waveform_grace_start * 3600)
        los = los - (actual_start_offset/3600.0)
        lived_time = lived_time -(actual_start_offset/3600.0)

        actual_start_time = record_start + timedelta(seconds=actual_start_offset)

        # load csv file, skipping the two leading lines plus the samples before the start offset
        df = pd.read_csv(waveform_file, delimiter = '\t', header = None, names = ['time','II'],skipinitialspace=True, skiprows=2+int(actual_start_offset*hz),dtype={'time':np.float64})
        len_df = len(df)
        len_hr_df = len_df/(hz * 3600.0)

        # check empty files
        if len_df == 0:
            print("Error! empty file")
            continue

        # check empty II signal
        if df['II'].dtype == 'object':
            missing_mask = df['II'] == '-'
            if int(missing_mask.sum()) > int(len_df/2):
                print("Error! half of the II signal missing")
                continue

            # Convert the whole column to floats in one vectorised pass. '-'
            # placeholders become 0.0 as before; the other samples, which used
            # to stay as strings in an object column, are parsed to numbers
            # (anything unparseable becomes NaN).
            df['II'] = pd.to_numeric(df['II'], errors='coerce').mask(missing_mask, 0.0)

        # check sample hz: the last timestamp should roughly agree with the
        # duration implied by the number of rows at hz samples per second
        last_record_time = df['time'].iloc[-1]/3600.0
        if int(last_record_time) > int(len_hr_df) + 6:
            print("Skipped! %s has total record time %d vs. expected record time %d" %( waveform_file, int(last_record_time), int(len_hr_df)))
            continue

        # data quality check
        num_episodes = int(len_hr_df/L2RNN_num_states)
        if num_episodes == 0:
            print("Skipped! %s has less than %d hours ECG records" % (waveform_file, L2RNN_num_states))
            continue

        # process the file
        # NOTE(review): each level-1 step below advances L1RNN_duration seconds
        # of signal while cur_hrs advances by one per step; confirm that
        # L1RNN_duration is meant to equal one hour.
        lenEcgSeg = CNN_duration * hz
        ecg_values = df['II'].values

        index = 0
        for ep in range(num_episodes):
            final_x = []
            final_los = []
            final_decompensation = []
            final_los_bucket = []

            cur_start_time = actual_start_time + timedelta(seconds=ep * L2RNN_duration)
            out_file = "%s/%s/%s-%s.pickle" % (out_dir, split, episode.replace("_timeseries.csv",""), cur_start_time.strftime('%Y-%m-%d-%H-%M-%S'))

            for i in range(L2RNN_num_states):
                HourEcgSeg = []
                start_index = index + i * L1RNN_duration * hz
                for j in range(L1RNN_num_states):
                    start_in_index = start_index + j * lenEcgSeg
                    EcgSeg = ecg_values[start_in_index:(start_in_index + lenEcgSeg)]
                    HourEcgSeg.append(EcgSeg.reshape(lenEcgSeg,1))
                final_x.append(HourEcgSeg)

                cur_hrs = ep*L2RNN_num_states + i
                remaining_los = los - cur_hrs
                final_los.append(remaining_los)

                # Buckets 0-7 are whole days, 8 covers 8 up to 14 days and 9 is
                # 14 days or more. The >= 14 test must run before the cap at 8,
                # otherwise bucket 9 can never be assigned.
                cur_bucket = int(remaining_los/24)
                if cur_bucket >= 14:
                    cur_bucket = 9
                elif cur_bucket > 8:
                    cur_bucket = 8
                elif cur_bucket < 0:
                    # Signal recorded after the end of the stay: keep the label a valid class index.
                    cur_bucket = 0
                final_los_bucket.append(cur_bucket)

                if mortality == '0':
                    cur_mortality = 0
                else:
                    # 1 when death falls within the next future_time_interval hours.
                    cur_mortality = int(lived_time - cur_hrs < future_time_interval)
                if cur_mortality == 1:
                    print("%s start decompensating at hour %d" % (waveform_file, cur_hrs))
                final_decompensation.append(cur_mortality)

            index = index + L2RNN_duration * hz

            final_x = np.array(final_x)
            final_los = np.array(final_los)
            final_decompensation = np.array(final_decompensation)
            final_los_bucket = np.array(final_los_bucket)

            output = {'x_ecg':final_x, 'y_los':final_los, 'y_decompensation':final_decompensation, 'y_los_bucket':final_los_bucket}
            with open(out_file, 'wb') as o:
                pickle.dump(output, o, pickle.HIGHEST_PROTOCOL)
            print("Done")

processing MIMIC_data/waveform_data/p00/p001932/p001932-2127-04-18-15-25.txt...
Done
Done
Done
Done
Done
Done
Done


## 3. Process numerics/vital signs

In [18]:
# Build per-episode vital-sign pickles from the numerics records: one value
# per minute for each of the signals in signal_names, with the same labels
# as the ECG episodes.
out_dir = 'MIMIC_data/out_data/vital_pickle_data'

# File-name variants to look for; names with and without a space map to the
# same output column once the space is removed.
signals = ["ABPSys","ABPDias","ABP Sys", "ABP Dias","NBPSys", "NBPDias","NBP Sys","NBP Dias","PULSE","RESP","SpO2"]
signal_names = ["ABPSys","ABPDias","NBPSys", "NBPDias","PULSE","RESP","SpO2"]

freq = 60 # seconds

sample_cnt=0
time_format = "%Y-%m-%d %H:%M:%S"

# Text mode ("r", as used when the mapping table is first loaded): csv.reader
# needs str rows on Python 3, and "r" behaves the same as "rb" here on Python 2.
with open(map_episode_waveform_file, "r") as f:
    rd = csv.reader(f, delimiter = ',')
    for row in rd:
        split = row[0]
        episode = row[1]
        waveform = row[2]
        start_time = row[3]
        subject_id = row[4]
        intime = row[9]
        outtime = row[10]
        los = row[11]
        # DOD (column 19), the same column the ECG episode cell reads, so both
        # modalities get identical y_decompensation labels for an episode.
        deathtime = row[19]
        mortality = row[22]

        if los == '':
            # Report the record of this row; waveform_file is not set yet at this point.
            print("%s los not exists!" % waveform)
            continue
        # Scale by 24 so that LOS is in the same unit (hours) as lived_time and cur_hrs.
        los = float(los) * 24.0

        print("processing %s..." % waveform)
        nonempty_sigs = 0
        for sig in signals:
            waveform_file = "%s/%sn.%s.txt" % (waveform_dir, waveform, sig.replace(" ","_"))
            if not os.path.exists(waveform_file):
                print("%s not exists!" % waveform_file)
                continue
            sig_name = sig.replace(" ","")
            if nonempty_sigs == 1 and sig_name in df.columns:
                # Both spellings of the signal exist for this record; joining the
                # second would raise on the duplicate column name.
                continue

            sub_df = pd.read_csv(waveform_file, delimiter = '\t', header = None, names = ['time',sig_name],skipinitialspace=True, skiprows = 2, dtype={'time':np.int32})
            len_sub_df = len(sub_df)
            if len_sub_df == 0:
                continue
            if sub_df[sig_name].dtype == 'object':
                missing_mask = sub_df[sig_name] == '-'
                if int(missing_mask.sum()) > int(len_sub_df/2):
                    # More than half of the samples are missing: ignore this signal.
                    continue
                # '-' placeholders become 0.0; the rest is parsed to floats in one pass.
                sub_df[sig_name] = pd.to_numeric(sub_df[sig_name], errors='coerce').mask(missing_mask, 0.0)

            if nonempty_sigs == 0:
                # The first usable signal defines the time grid: one row per minute.
                df = sub_df.loc[sub_df['time'] % freq == 0].reset_index(drop=True)
                nonempty_sigs = 1
            else:
                df = df.join(sub_df.set_index('time'), on = 'time',how ='left')

        if nonempty_sigs == 0:
            print("Error! empty file")
            continue

        record_start = datetime.strptime(start_time, time_format)
        if deathtime == '':
            # No recorded death: use a value that never falls inside the look-ahead window.
            lived_time = 1e18
        else:
            lived_time = (datetime.strptime(deathtime, time_format) - record_start).total_seconds() / 3600.0

        # Seconds between ICU admission and the first sample of the record.
        waveform_offset = (record_start - datetime.strptime(intime, time_format)).total_seconds()
        actual_start_offset = max(shortest_length * 3600 - waveform_offset, waveform_grace_start * 3600)
        los = los - (actual_start_offset/3600.0)
        lived_time = lived_time -(actual_start_offset/3600.0)

        actual_start_time = record_start + timedelta(seconds=actual_start_offset)

        # Row position of the first sample to use (rows are freq seconds apart).
        index = int(actual_start_offset/freq)
        len_df = len(df) - index
        len_hr_df = len_df/60

        # data quality check
        num_episodes = int(len_hr_df/L2RNN_num_states)
        if num_episodes == 0:
            print("Skipped! less than %d hours vital sign records" % (L2RNN_num_states))
            continue

        # process the file
        print("In total %d episodes" % num_episodes)

        lenVitalSeg = 60 # rows (minutes) per hourly step
        num_signals = len(signal_names)
        for ep in range(num_episodes):
            final_x = []
            final_los = []
            final_decompensation = []
            final_los_bucket = []

            # NOTE(review): the file name advances by L2RNN_duration seconds per
            # episode while the rows consumed per episode are
            # lenVitalSeg * L2RNN_num_states minutes; confirm the two are meant to agree.
            cur_start_time = actual_start_time + timedelta(seconds=ep * L2RNN_duration)
            out_file = "%s/%s/%s-%sn.pickle" % (out_dir, split, episode.replace("_timeseries.csv",""), cur_start_time.strftime('%Y-%m-%d-%H-%M-%S'))

            for i in range(L2RNN_num_states):
                # One flat vector per hour: lenVitalSeg values for each signal, in
                # signal_names order; signals without a file stay at zero.
                HourVitalSeg = np.zeros(lenVitalSeg * num_signals)
                start_index = index + i * lenVitalSeg
                for s in range(num_signals):
                    sig_name = signal_names[s]
                    if sig_name in df.columns:
                        HourVitalSeg[(s * lenVitalSeg):((s+1) * lenVitalSeg)] = df[sig_name].values[start_index:(start_index+lenVitalSeg)]
                final_x.append(HourVitalSeg.reshape((lenVitalSeg*num_signals),1))

                cur_hrs = ep*L2RNN_num_states + i
                remaining_los = los - cur_hrs
                final_los.append(remaining_los)

                # Buckets 0-7 are whole days, 8 covers 8 up to 14 days and 9 is
                # 14 days or more. The >= 14 test must run before the cap at 8,
                # otherwise bucket 9 can never be assigned.
                cur_bucket = int(remaining_los/24)
                if cur_bucket >= 14:
                    cur_bucket = 9
                elif cur_bucket > 8:
                    cur_bucket = 8
                elif cur_bucket < 0:
                    # Samples recorded after the end of the stay: keep the label a valid class index.
                    cur_bucket = 0
                final_los_bucket.append(cur_bucket)

                if mortality == '0':
                    cur_mortality = 0
                else:
                    # 1 when death falls within the next future_time_interval hours.
                    cur_mortality = int(lived_time - cur_hrs < future_time_interval)
                final_decompensation.append(cur_mortality)

            index = index + lenVitalSeg * L2RNN_num_states

            final_x = np.array(final_x)
            final_los = np.array(final_los)
            final_decompensation = np.array(final_decompensation)
            final_los_bucket = np.array(final_los_bucket)

            output = {'x_vitals':final_x, 'y_los':final_los, 'y_decompensation':final_decompensation, 'y_los_bucket':final_los_bucket}
            with open(out_file, 'wb') as o:
                pickle.dump(output, o, pickle.HIGHEST_PROTOCOL)
            print("Done")

processing p00/p001932/p001932-2127-04-18-15-25...
MIMIC_data/waveform_data/p00/p001932/p001932-2127-04-18-15-25n.ABP_Sys.txt not exists!
MIMIC_data/waveform_data/p00/p001932/p001932-2127-04-18-15-25n.ABP_Dias.txt not exists!
MIMIC_data/waveform_data/p00/p001932/p001932-2127-04-18-15-25n.NBP_Sys.txt not exists!
MIMIC_data/waveform_data/p00/p001932/p001932-2127-04-18-15-25n.NBP_Dias.txt not exists!
In total 7 episodes
Done
Done
Done
Done
Done
Done
Done


## 4. Process Lab and demographics

In [None]:
# Root of the per-split benchmark output; getLabDemog reads
# <clinical_dir>/<split>/<subject>/<episode>.csv and ..._timeseries.csv from it.
clinical_dir = 'MIMIC_data/clinical_data/from_benchmarks'
# Columns copied from the episode file into the 'demographics' vector, with
# the dtypes used when that file is parsed.
demographic_features = ["Ethnicity","Gender","Age"]
demographic_dtype = {"Ethnicity":np.int32,"Gender":np.int32,"Age":np.float64}
# Time-series columns that getLabDemog stores with a value slot plus an
# "observed in this hour" indicator slot.
temporal_features = ["oxygen saturation","diastolic blood pressure",
                     "heart rate","mean blood pressure",
                     "respiratory rate","systolic blood pressure"]
# Time-series columns stored as a single value slot per hour.
lab_features = ["glucose","ph","temperature"]
# NOTE(review): getLabDemog parses the time series with only 'Hours' typed;
# this mapping is not referenced by the code shown in this notebook.
series_dtype = {"Hours":np.float64,"glucose":np.float64,
                "oxygen saturation":np.float64,"ph":np.float64,
                "temperature":np.float64,"diastolic blood pressure":np.float64,
                "heart rate":np.float64,"mean blood pressure":np.float64,
                "respiratory rate":np.float64,"systolic blood pressure":np.float64}

# The ECG episode pickles in in_dir determine which episodes/windows to build;
# results are written under out_dir with the same file names.
in_dir = 'MIMIC_data/out_data/pickle_data'
out_dir = 'MIMIC_data/out_data/lab_demog_pickle_data'

# basestring only exists on Python 2; fall back to str so the string clean-up
# in _parse_clinical_value also runs on Python 3.
try:
    _STRING_TYPES = basestring
except NameError:
    _STRING_TYPES = str


def _parse_clinical_value(raw, strip_tokens, slash_as_decimal=False):
    """Convert one time-series cell to float; unparseable text is reported and becomes 0.0."""
    if isinstance(raw, _STRING_TYPES):
        for token in strip_tokens:
            raw = raw.replace(token, "")
        if slash_as_decimal:
            raw = raw.replace("/", ".")
    try:
        return float(raw)
    except ValueError:
        print("xxxxx %s" % raw)
        return 0.0


def _dump_lab_demog(out_file, demographics, nonempty_labs, nonempty_labs_indicator, features):
    """Pickle the four per-episode outputs of getLabDemog to out_file."""
    output = {'demographics':demographics, 'nonempty_labs':nonempty_labs, 'nonempty_lab_indicators':nonempty_labs_indicator, 'features':features}
    with open(out_file, 'wb') as o:
        pickle.dump(output, o, pickle.HIGHEST_PROTOCOL)


def getLabDemog(hashEpisode, clinical_dir, in_dir, out_dir, split):
    """Write a lab/vital/demographic pickle for every episode pickle in in_dir/split.

    Each input file name is expected to look like
    "<subject>_<episode>-YYYY-mm-dd-HH-MM-SS.pickle"; the timestamp is the start
    of the window to extract. Reads the module-level lists demographic_features,
    demographic_dtype, lab_features and temporal_features, and L2RNN_num_states.

    Parameters:
        hashEpisode: dict mapping "<subject>_<episode>_timeseries.csv" to the
            ICU admission time ("%Y-%m-%d %H:%M:%S").
        clinical_dir: root holding <split>/<subject>/<episode>.csv and
            <split>/<subject>/<episode>_timeseries.csv.
        in_dir: directory whose <split> sub-directory lists the episodes.
        out_dir: directory whose <split> sub-directory receives the output.
        split: sub-directory name, e.g. "train".

    Output pickle keys:
        'demographics': one value per entry of demographic_features (zeros if
            the episode file is missing or empty).
        'nonempty_labs': 1 if any lab value was found in the window, else 0.
        'nonempty_lab_indicators': per-hour count of lab values found.
        'features': array of shape (L2RNN_num_states, n, 1). The first
            len(lab_features) slots hold lab values, the next
            len(temporal_features) slots hold temporal values, and the last
            len(temporal_features) slots are 1 where that temporal feature was
            observed in that hour.

    Raises:
        KeyError: if an episode is missing from hashEpisode.
    """
    time_format = "%Y-%m-%d %H:%M:%S"
    lab_strip_tokens = ("-", "cs", "CS", " ", "u")
    temporal_strip_tokens = ("-", "cs", "CS", " ")

    len_labs = len(lab_features)
    len_temporal_features = len(temporal_features)
    len_features = len_labs + 2 * len_temporal_features

    for f in os.listdir("%s/%s" % (in_dir,split)):
        f_name = f.replace(".pickle","")
        tokens = f_name.split("-")
        episode = tokens[0]

        # "<subject>_<episode>" becomes the "<subject>/<episode>" path of the benchmark layout.
        timeseries_file = "%s/%s/%s_timeseries.csv" % (clinical_dir, split, episode.replace("_","/"))
        episode_file = "%s/%s/%s.csv" % (clinical_dir, split, episode.replace("_","/"))
        out_file = "%s/%s/%s.pickle" %(out_dir, split, f_name)

        start_time = "%s-%s-%s %s:%s:%s" % (tokens[1],tokens[2], tokens[3], tokens[4], tokens[5], tokens[6])
        intime = hashEpisode["%s_timeseries.csv" % episode]
        print("processing %s..." % f_name)

        # for output
        demographics = np.zeros(len(demographic_features))
        nonempty_labs = 0
        nonempty_labs_indicator = np.zeros(L2RNN_num_states)
        features = np.zeros((L2RNN_num_states, len_features, 1))

        # read demographics
        if os.path.exists(episode_file):
            d_df = pd.read_csv(episode_file, dtype = demographic_dtype)
            if len(d_df) > 0:
                for d in range(len(demographic_features)):
                    demographics[d] = d_df[demographic_features[d]][0]

        # read labs
        if not os.path.exists(timeseries_file):
            _dump_lab_demog(out_file, demographics, nonempty_labs, nonempty_labs_indicator, features)
            print("No clinical data. Store empty data")
            continue

        s_df = pd.read_csv(timeseries_file, dtype = {'Hours':np.float64})
        # Hours since admission at which this episode's window starts.
        start_interval = (datetime.strptime(start_time, time_format) - datetime.strptime(intime, time_format)).total_seconds() / 3600.0
        s_df = s_df.loc[(s_df['Hours'] >= start_interval) & (s_df['Hours'] < (start_interval + L2RNN_num_states))]
        s_df = s_df.reset_index(drop=True)

        if len(s_df) == 0:
            _dump_lab_demog(out_file, demographics, nonempty_labs, nonempty_labs_indicator, features)
            print("No clinical data. Store empty data")
            continue

        # Last observed value per temporal feature; 0 means "nothing seen yet".
        last_temporal_features = np.zeros(len_temporal_features)
        for index in range(len(s_df)):
            cur_hr = int(s_df['Hours'][index] - start_interval)

            for l in range(len_labs):
                cur_lab = _parse_clinical_value(s_df[lab_features[l]][index], lab_strip_tokens, slash_as_decimal=True)
                if pd.isnull(cur_lab):
                    continue
                features[cur_hr][l] = cur_lab
                nonempty_labs = 1
                nonempty_labs_indicator[cur_hr] = nonempty_labs_indicator[cur_hr] + 1

            for t in range(len_temporal_features):
                cur_feature = _parse_clinical_value(s_df[temporal_features[t]][index], temporal_strip_tokens)
                if pd.isnull(cur_feature):
                    # Missing in this row: carry the last observed value forward.
                    if last_temporal_features[t] !=0:
                        features[cur_hr][len_labs + t] = last_temporal_features[t]
                    continue
                features[cur_hr][len_labs + len_temporal_features + t] = 1
                if last_temporal_features[t] == 0:
                    # First observation: back-fill the earlier hours of the window.
                    for h in range(cur_hr):
                        features[h][len_labs + t] = cur_feature
                features[cur_hr][len_labs + t] = cur_feature
                last_temporal_features[t] = cur_feature

        _dump_lab_demog(out_file, demographics, nonempty_labs, nonempty_labs_indicator, features)
        print("Done")

# Map each episode file name (column 1 of the mapping file) to its ICU
# admission time (column 9); getLabDemog uses it to convert window start times
# into hours since admission.
hashEpisode = {}
# Text mode ("r", as used when the mapping table is first loaded): csv.reader
# needs str rows on Python 3, and "r" behaves the same as "rb" here on Python 2.
with open(map_episode_waveform_file, "r") as f:
    rd = csv.reader(f, delimiter = ',')
    for row in rd:
        episode = row[1]
        intime = row[9]
        hashEpisode[episode] = intime

# Only the "train" split is processed here.
getLabDemog(hashEpisode, clinical_dir, in_dir, out_dir, "train")

## 5. Process interventions

In [None]:
# Root of the per-split benchmark output; getOthers looks for the event files
# below under <clinical_dir>/<split>/<subject>/.
clinical_dir = 'MIMIC_data/clinical_data/from_benchmarks'
# The ECG episode pickles in in_dir determine which episodes/windows to build;
# results are written under out_dir with the same file names.
in_dir = 'MIMIC_data/out_data/pickle_data'
out_dir = 'MIMIC_data/out_data/intervention_pickle_data'
# Per-subject event files to scan; getOthers reads a STARTTIME column from
# each one that exists.
files = ['PROCEDUREEVENTS_MV_unique.csv',
         'INPUTEVENTS_MV_unique.csv',
         'INPUTEVENTS_CV_unique.csv']

def getOthers(files,clinical_dir, in_dir, out_dir, split):
    """Count interventions per hour for every episode pickle in in_dir/split.

    Each input file name is expected to look like
    "<subject>_<episode>-YYYY-mm-dd-HH-MM-SS.pickle"; the timestamp is the start
    of a window of L2RNN_num_states hours (module-level constant).

    Parameters:
        files: event file names looked up under clinical_dir/split/<subject>/;
            missing or empty files are skipped. Each file must provide a
            STARTTIME column formatted as "%Y-%m-%d %H:%M:%S".
        clinical_dir: root directory of the per-subject event files.
        in_dir: directory whose <split> sub-directory lists the episodes.
        out_dir: directory whose <split> sub-directory receives the output.
        split: sub-directory name, e.g. "train".

    Output pickle keys:
        'nonempty_interventions': 1 if any event falls inside the window, else 0.
        'nonempty_interventions_indicators': per-hour event counts, length
            L2RNN_num_states.
    """
    time_format = "%Y-%m-%d %H:%M:%S"
    for f in os.listdir("%s/%s" % (in_dir,split)):
        f_name = f.replace(".pickle","")
        tokens = f_name.split("-")
        episode = tokens[0]
        sub_id = episode.split("_")[0]

        out_file = "%s/%s/%s.pickle" %(out_dir, split, f_name)
        start_time = "%s-%s-%s %s:%s:%s" % (tokens[1],tokens[2], tokens[3], tokens[4], tokens[5], tokens[6])
        # Parsed once per episode instead of once per event row.
        window_start = datetime.strptime(start_time, time_format)

        # for output
        nonempty_interventions = 0
        nonempty_interventions_indicator = np.zeros(L2RNN_num_states)
        print("processing %s..." % f_name)
        for ff in files:
            input_file = "%s/%s/%s/%s" % (clinical_dir, split, sub_id, ff)
            if not os.path.exists(input_file):
                continue
            df = pd.read_csv(input_file)
            if len(df) == 0:
                continue
            for cur_time in df['STARTTIME']:
                if pd.isnull(cur_time):
                    # An event without a start time cannot be placed in an hour.
                    continue
                cur_hr = (datetime.strptime(cur_time, time_format) - window_start).total_seconds() / 3600.0
                # Test the window on the fractional value: int() truncates toward
                # zero, so an event up to an hour before the window start would
                # otherwise be counted in hour 0.
                if 0 <= cur_hr < L2RNN_num_states:
                    nonempty_interventions_indicator[int(cur_hr)] += 1
                    nonempty_interventions = 1

        output = {'nonempty_interventions':nonempty_interventions, 'nonempty_interventions_indicators':nonempty_interventions_indicator}
        with open(out_file, 'wb') as o:
            pickle.dump(output, o, pickle.HIGHEST_PROTOCOL)

# Build the intervention indicators for the "train" split only.
getOthers(files,clinical_dir, in_dir, out_dir, "train")