In [15]:
import numpy as np
import pandas as pd
from collections import defaultdict
from sklearn.model_selection import train_test_split
import pickle
import csv

In [53]:
#Original Data Extraction

DATASET_PATH = './Data-for-Project.csv'
dataset = pd.read_csv(DATASET_PATH, encoding="ISO-8859-1", dtype={"RID": float, "VISCODE": "string", "AGE": float, "PTGENDER": float, "PTEDUCAT": float, "APOE4": float, "ABETA": float, "TAU": float, "Ventricles": float, "Hippocampus": float,
  "WholeBrain": float, "Entorhinal": float, "Fusiform": float,  "MidTemp": float, "ICV": float,"ICV_bl": float,"ADAS11": float, "ADAS13": float,  "MMSE": float, "DX": float,})

features = ['RID', 'VISCODE', 'AGE','PTGENDER', 'PTEDUCAT', 'Ventricles', 'Hippocampus', 'WholeBrain', 'Entorhinal', 'Fusiform', 'MidTemp', 'ICV_bl']
labels = ['RID', 'VISCODE', 'MMSE', 'ADAS11', 'ADAS13']
cols = ['RID', 'VISCODE', 'AGE','PTGENDER', 'PTEDUCAT', 'MMSE', 'ADAS11', 'ADAS13','ICV', 'DX', 'Ventricles', 'Hippocampus', 'WholeBrain', 'Entorhinal', 'Fusiform', 'MidTemp', 'ICV_bl']
df = pd.DataFrame(dataset, columns=cols)
vis_codes = sorted(df.VISCODE.unique(), key=lambda x: (len(x), x)) 
vis_codes_counts = df.VISCODE.value_counts()
patient_codes = df.RID.unique()
print(df.head())
print(vis_codes)
print(vis_codes_counts)
print(type(vis_codes_counts))
print('patient_codes', len(patient_codes))

   RID VISCODE   AGE  PTGENDER  PTEDUCAT  MMSE  ADAS11  ADAS13        ICV  \
0  2.0      bl  74.3       0.0      16.0  28.0   10.67   18.67  1984660.0   
1  3.0      bl  81.3       0.0      18.0  20.0   22.00   31.00  1920690.0   
2  3.0     m06  81.3       0.0      18.0  24.0   19.00   30.00  1906430.0   
3  3.0     m12  81.3       0.0      18.0  17.0   24.00   35.00  1903820.0   
4  3.0     m24  81.3       0.0      18.0  19.0   25.67   37.67  1903420.0   

    DX  Ventricles  Hippocampus  WholeBrain  Entorhinal  Fusiform  MidTemp  \
0  0.0    118233.0       8336.0   1229740.0      4177.0   16559.0  27936.0   
1  2.0     84599.0       5319.0   1129830.0      1791.0   15506.0  18422.0   
2  2.0     88580.0       5446.0   1100060.0      2427.0   14400.0  16972.0   
3  2.0     90099.0       5157.0   1095640.0      1596.0   14617.0  17330.0   
4  2.0     97420.0       5139.0   1088560.0      1175.0   14033.0  16398.0   

      ICV_bl  
0  1984660.0  
1  1920690.0  
2  1920690.0  
3  19206

In [54]:
#Transform data to time steps with format: columns
def getDefaultMap():
    return [[vis_codes[i]] for i in range(len(vis_codes))]

timestepIndex = {}
for i in range(len(vis_codes)):
    timestepIndex[vis_codes[i]] = i

patientsDf = defaultdict(getDefaultMap)

for i, row in df.iterrows():
    patientsDf[row['RID']][timestepIndex[row['VISCODE']]] = list(row)

#populate data into a list of patients with all the available timesteps
patientsDataAll = []
for patient in patientsDf.keys():
    for i in range(len(vis_codes)):
        if len(patientsDf[patient][i]) == 1:
            patientsDf[patient][i] = [patient] + patientsDf[patient][i]
        patientsDataAll.append(patientsDf[patient][i])

In [58]:
#find number number of patients for each number of records
conditionSet = ['bl', 'm12', 'm24', 'm36', 'm48','m60', 'm72','m84', 'm96', 'm108', 'm120', 'm132','m144', 'm156']
MIN_SEQUENCES = 3
patientsSequences = defaultdict(set)

def hasAllEntries(arr):
    return not pd.DataFrame(arr).isnull().any().any()

def getPatientsWithNTimeSteps(n):
    patientsData = {}
    for patient in patientsDf.keys():
        currPatient = []

        for condition in conditionSet:
            i = timestepIndex[condition]
            if len(patientsDf[patient][i]) == len(cols):
                currPatient.append(patientsDf[patient][i])
                if len(currPatient) == n:
                    break
            else:
                currPatient = []
        
        if len(currPatient) == n:
            patientsData[patient] = currPatient

    return patientsData

seqs = []
for n in range(MIN_SEQUENCES, len(conditionSet)):
    patientsSequences[n] = getPatientsWithNTimeSteps(n)
    seqs.append([n, len(patientsSequences[n])])

print('Number of Patients for each sequence', seqs)

Number of Patients for each sequence [[3, 1267], [4, 819], [5, 543], [6, 337], [7, 234], [8, 159], [9, 98], [10, 59], [11, 27], [12, 12], [13, 4]]


In [75]:
# choose how many timesteps will be used
total_steps = 5
sequence_key = total_steps-MIN_SEQUENCES

#Create time series - (samples, time steps, features)
N = seqs[sequence_key][1]
i = 0
n_cols = len(cols)-2

cleaned_data = np.zeros((N, total_steps, n_cols))

patientSet = patientsSequences[seqs[sequence_key][0]]
print('patients #', len(patientSet))

for patient in patientSet:
    currPatient = patientSet[patient]
    
    # filter patients with no ICV_bl
    currPatient = np.array(currPatient)
    if np.isnan(currPatient[:, 16].astype(float)).any(axis=0):
        continue

    for timestep in range(total_steps):
        dfcols = len(currPatient[0])
        cleaned_data[i][timestep] =currPatient[timestep][2:dfcols]
    i+=1


print("resulting data", cleaned_data.shape)
print('ex: ', cleaned_data[0][4])

# delete patients with no ICV_bl
""" ICV_bl = cleaned_data[:,:,14]
cleaned_data = cleaned_data[~np.isnan(ICV_bl).any()]
print("elete patients with no ICV_bl", cleaned_data.shape)
print('ex: ', cleaned_data[0][4]) """

# replace missing values with zeros
cleaned_data = np.where(np.isnan(cleaned_data), 0, cleaned_data)

print("replace NaNs with zeros", cleaned_data.shape)
print('ex: ', cleaned_data[0][4])

patients # 543
resulting data (543, 5, 15)
ex:  [7.43000e+01 0.00000e+00 1.60000e+01 2.80000e+01         nan         nan
         nan         nan         nan         nan         nan         nan
         nan         nan 1.98466e+06]
replace NaNs with zeros (543, 5, 15)
ex:  [7.43000e+01 0.00000e+00 1.60000e+01 2.80000e+01 0.00000e+00 0.00000e+00
 0.00000e+00 0.00000e+00 0.00000e+00 0.00000e+00 0.00000e+00 0.00000e+00
 0.00000e+00 0.00000e+00 1.98466e+06]


In [69]:
# check on data
import sys
np.set_printoptions(threshold=sys.maxsize)
print(cleaned_data)

[[[7.43000e+01 0.00000e+00 1.60000e+01 2.80000e+01 1.40000e+01
   2.30000e+01 0.00000e+00 0.00000e+00 0.00000e+00 0.00000e+00
   0.00000e+00 0.00000e+00 0.00000e+00 0.00000e+00 1.98466e+06]
  [7.43000e+01 0.00000e+00 1.60000e+01 2.30000e+01 1.20000e+01
   2.10000e+01 0.00000e+00 0.00000e+00 0.00000e+00 0.00000e+00
   0.00000e+00 0.00000e+00 0.00000e+00 0.00000e+00 1.98466e+06]
  [7.43000e+01 0.00000e+00 1.60000e+01 2.40000e+01 9.00000e+00
   1.40000e+01 0.00000e+00 1.00000e+00 0.00000e+00 0.00000e+00
   0.00000e+00 0.00000e+00 0.00000e+00 0.00000e+00 1.98466e+06]
  [7.43000e+01 0.00000e+00 1.60000e+01 2.50000e+01 1.00000e+01
   1.80000e+01 0.00000e+00 0.00000e+00 0.00000e+00 0.00000e+00
   0.00000e+00 0.00000e+00 0.00000e+00 0.00000e+00 1.98466e+06]
  [7.43000e+01 0.00000e+00 1.60000e+01 2.80000e+01 0.00000e+00
   0.00000e+00 0.00000e+00 0.00000e+00 0.00000e+00 0.00000e+00
   0.00000e+00 0.00000e+00 0.00000e+00 0.00000e+00 1.98466e+06]]

 [[7.17000e+01 0.00000e+00 1.40000e+01 2.60000e+

In [76]:
# split into 80/10/10 train, validate, and test sets
RANDOM_STATE = 0
SHUFFLE = False
validate_and_test_size = int(N*0.2)
train, validate_and_test = train_test_split(cleaned_data, test_size=validate_and_test_size, shuffle=SHUFFLE, random_state=RANDOM_STATE)
test_size = int(validate_and_test.shape[0] * 0.5)
validate, test = train_test_split(validate_and_test, test_size=test_size, shuffle=SHUFFLE, random_state=RANDOM_STATE)

print("train", train.shape)
print("validate", validate.shape)
print("test", test.shape)


train (435, 5, 15)
validate (54, 5, 15)
test (54, 5, 15)


In [79]:
# format data into X, Y
train_X = train[:,:total_steps,:]
validate_X = validate[:,:total_steps,:]
test_X = test[:,:total_steps,:]

print("train_X", train_X.shape)
print("validate_X", validate_X.shape)
print("test_X", test_X.shape)
print('ex train_X:', train_X[0])
#print('ex test_X:', test_X[7])


def get_one_hot_encoding(dx):
    '''
        Returns one-hot encoding of a diagnosis [CN, MCI, AD]
        CN = 1, MCI = 2, AD = 3
    '''
    if dx == 1:
        return np.array([1, 0, 0]).astype(float)
    elif dx == 2:
        return np.array([0, 1, 0]).astype(float)
    elif dx == 3:
        return np.array([0, 0, 1]).astype(float)
    else:
        return np.array([0, 0, 0]).astype(float)
        
train_Y = train[:,1:,3:6]
validate_Y = validate[:,1:,3:6]
test_Y = test[:,1:,3:6]

print("train_Y", train_Y.shape)
print("validate_Y", validate_Y.shape)
print("test_Y", test_Y.shape)
print('ex train_Y:', train_Y[0])
#print('ex test_Y:', test_Y[7])
#print('===== original')
#print('ex train:', train[0])
#print('ex test:', test[7])

train_X (435, 5, 15)
validate_X (54, 5, 15)
test_X (54, 5, 15)
ex train_X: [[7.43000e+01 0.00000e+00 1.60000e+01 2.80000e+01 1.40000e+01 2.30000e+01
  0.00000e+00 0.00000e+00 0.00000e+00 0.00000e+00 0.00000e+00 0.00000e+00
  0.00000e+00 0.00000e+00 1.98466e+06]
 [7.43000e+01 0.00000e+00 1.60000e+01 2.30000e+01 1.20000e+01 2.10000e+01
  0.00000e+00 0.00000e+00 0.00000e+00 0.00000e+00 0.00000e+00 0.00000e+00
  0.00000e+00 0.00000e+00 1.98466e+06]
 [7.43000e+01 0.00000e+00 1.60000e+01 2.40000e+01 9.00000e+00 1.40000e+01
  0.00000e+00 1.00000e+00 0.00000e+00 0.00000e+00 0.00000e+00 0.00000e+00
  0.00000e+00 0.00000e+00 1.98466e+06]
 [7.43000e+01 0.00000e+00 1.60000e+01 2.50000e+01 1.00000e+01 1.80000e+01
  0.00000e+00 0.00000e+00 0.00000e+00 0.00000e+00 0.00000e+00 0.00000e+00
  0.00000e+00 0.00000e+00 1.98466e+06]
 [7.43000e+01 0.00000e+00 1.60000e+01 2.80000e+01 0.00000e+00 0.00000e+00
  0.00000e+00 0.00000e+00 0.00000e+00 0.00000e+00 0.00000e+00 0.00000e+00
  0.00000e+00 0.00000e+00 1.9

In [74]:
# export into a pickle file in Jung's format
FILENAME = 'ADNI_DATA_JUNG_FORMAT_5.pkl'
DATA = {'Train_data': train_X, 'Valid_data': validate_X, 'Test_data': test_X,
'Train_label': train_Y, 'Valid_label': validate_Y, 'Test_label': test_Y
}

with open(FILENAME, 'wb') as handle:
    pickle.dump(DATA, handle, protocol=pickle.HIGHEST_PROTOCOL)