# AE LSTM for Spontaneous Using all temp data.
We will execute the AE-LSTM code for Spontaneous using entire days' data.
From previous version of AE-LSTM code, we will use a time distributed dense layer.


In [1]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"

import keras
from sklearn.model_selection import train_test_split
import numpy as np
from keras import layers

import math
import matplotlib.pyplot as plt
import tensorflow as tf
from sklearn.model_selection import KFold

from keras.preprocessing.sequence import pad_sequences
keras.utils.set_random_seed(912)


2024-07-24 18:16:10.787881: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-07-24 18:16:10.812584: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Total PID 51- Read the PIDS from the file Spontaneous_PIDs.txt 
pids = np.loadtxt('../Spontaneous_PIDs_v0.txt')
pids.sort()

exp_dir = '/home/chinmai/src/laborprediction/Autoencoder/ConvAE_GA245_Encodings_y'
max_seq_len = 0

In [3]:
def create_matrix_from_pid(plist):
    global exp_dir, max_seq_len
    # In this block we want to read 5min avg temperature data for training, validation, and test PIDs
    x_arr = []
    y_arr = []
    count = 0
    for pid in plist:
        fname = os.path.join(exp_dir,str(int(pid))+'_5temp_encoding.csv')
        #print('Processing pid: ',pid)
        data = np.loadtxt(fname,delimiter=',')
        mr,mc = data.shape
        # 0: Gestational Age, 1:97 - Temperature Data, 98 - Days to Labor 
        # NOTE: July 12, setting this value from 1:17 insteaf of 0:17 excludes
        # gestational age as a feature.
        d1 = data[:,1:65]
        d2 = data[:,65:]
        if np.isnan(d1).any():
            print(f"PID={pid} has NaN values")
        # Adjust the maximum length of the sequence.
        if mr > max_seq_len:
            max_seq_len = mr
        #print('Max Seq Len :',max_seq_len)
        x_arr.append(d1)
        y_arr.append(d2)
    
        count += 1
    """
    x_pad = pad_sequences(x_arr, maxlen=max_len, dtype='float32', padding='post', value=0.0)
    y_pad = pad_sequences(y_arr, maxlen=max_len, dtype='float32', padding='post', value=0.0)
    x_arr_pad = np.asarray(x_pad)
    y_arr_pad = np.asarray(y_pad)
    print(x_arr_pad.shape)
    """
    return x_arr, y_arr

def add_padding (x_list):
    global max_seq_len
    x_pad = pad_sequences(x_list, maxlen=max_seq_len, dtype='float32', padding='post', value=0.)
    x_arr_pad = np.asarray(x_pad)
    return x_arr_pad
    

In [4]:
def define_lstm(seq_len, enc_dim):
    # To this function, we will pass the sequence length and the encoding dimension.
    input_seq = keras.Input(shape=[seq_len,enc_dim])    # Shape =[47,16]
    m1 = layers.Masking(mask_value = 0.)(input_seq)
    #l1 = layers.LSTM(4,activation = 'tanh', use_bias=True, kernel_initializer="glorot_uniform",
    #             recurrent_initializer="orthogonal", bias_initializer="zeros", return_sequences = True)(m1)
    l1 = layers.LSTM(256,activation = 'tanh', return_sequences = True)(m1)
    ln1 = layers.LayerNormalization(axis=1)(l1)
    #x   = l1(m1)
    #ln1 = layers.LayerNormalization(axis=1)(x)
    d0 = layers.TimeDistributed(layers.Dense(128,activation='LeakyReLU', kernel_initializer='glorot_uniform'))(ln1)
    o1 = layers.TimeDistributed(layers.Dense(1,activation='linear', kernel_initializer='glorot_uniform'))(d0)
    lstm = keras.Model(input_seq,o1)
    return lstm
    #d1 = layers.Dense(1,activation='linear', kernel_initializer='glorot_uniform')(d0)


In [5]:
res_val = []
err = []
x_tr= []
y_tr= []
x_va = []
y_va = []
x_te= []
y_te= []
def main():
    global res_val, x_tr, x_te, y_tr, y_te, x_va, y_va
    # Start the cross Validation process
    kf = KFold(n_splits=10)
    kf.get_n_splits(pids)

    #print(kf)
    #train_pids = []
    #test_pids = []
    res_list = []
    best_res = 9999999999.0
    best_fold = ''
    # CROSS VALIDAION
    for i, (train_index, test_index) in enumerate(kf.split(pids)):
        print(f"Fold {i}:")
        tr_pids = []
        test_pids = []    
        for j in train_index:
            tr_pids.append(pids[j])
        for k in test_index:
            test_pids.append(pids[k])
    
        # Split training data into Training + Validation. 
        # Since we have 51 PIDS and 10 folds, we will assign 5 PIDs data to validation dataset.
        train_pids = tr_pids[:-5]
        val_pids   = tr_pids[-5:]

        print(train_pids,val_pids)
        print(test_pids)
        
        # In this block we want to read 5min avg temperature data for training, validation, and test PIDs
        x_train, y_train = create_matrix_from_pid(train_pids)    
        x_val  , y_val   = create_matrix_from_pid(val_pids)
        x_test , y_test  = create_matrix_from_pid(test_pids)

        x_tr = add_padding(x_train)
        y_tr = add_padding(y_train)
        x_va = add_padding(x_val)
        y_va = add_padding(y_val)
        x_te = add_padding(x_test)
        y_te = add_padding(y_test)
        
        #print(x_tr.shape, x_va.shape, x_te.shape)

        #lstm = define_lstm(47, 64)
        lstm = define_lstm(max_seq_len, 64)
        lstm.summary()
        # Define Callbacks
        early_stop = keras.callbacks.EarlyStopping(monitor='val_loss', patience=10)

        # Compile the model using Adam optimizer and mean absolute error loss function.
        lstm.compile(optimizer=keras.optimizers.Adam(), loss='mean_absolute_error')

        # Need to add fold name here. Otherwise it'll overwrite.
        checkpoint_filepath = './tmp/checkpointLSTM_'+ str(i)
        check_point = keras.callbacks.ModelCheckpoint(
            filepath=checkpoint_filepath,
            save_weights_only=True,
            monitor='val_loss',
            mode='min',
            save_best_only=True)
        
        # Train the model for 100 epocs, batch size as 32, and use validation data for hyperparameter tuning.
        history = lstm.fit(x_tr, y_tr,
                        epochs=100,
                        batch_size=2,
                        callbacks = [early_stop,check_point],
                        validation_data=(x_va, y_va))
        # Save the Encoder model weights, to load and generate encodings later on.
        lstm.save('./LSTM_weights/lstm_night_fold'+str(i)+'.keras')
        
        
        # Run the Evaluate function on the test dataset.
        res = lstm.evaluate(x_te,y_te)
        res_v = lstm.predict(x_te)
        res_val.append(res_v)
        err.append(res)
        print(res)
        #for item in res_v:
            #print((item))
        print(res_v.shape)
        
    
main()


Fold 0:
[15.0, 29.0, 35.0, 36.0, 48.0, 55.0, 61.0, 67.0, 69.0, 72.0, 73.0, 75.0, 77.0, 80.0, 88.0, 90.0, 91.0, 93.0, 97.0, 99.0, 102.0, 106.0, 107.0, 108.0, 110.0, 111.0, 113.0, 120.0, 126.0, 129.0, 136.0, 138.0, 141.0, 144.0, 152.0, 156.0, 161.0, 170.0, 173.0, 179.0] [183.0, 191.0, 193.0, 195.0, 197.0]
[1.0, 2.0, 7.0, 10.0, 11.0]


2024-07-24 18:16:16.202923: E tensorflow/compiler/xla/stream_executor/cuda/cuda_driver.cc:268] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected
2024-07-24 18:16:16.202942: I tensorflow/compiler/xla/stream_executor/cuda/cuda_diagnostics.cc:168] retrieving CUDA diagnostic information for host: chinmai-x17
2024-07-24 18:16:16.202945: I tensorflow/compiler/xla/stream_executor/cuda/cuda_diagnostics.cc:175] hostname: chinmai-x17
2024-07-24 18:16:16.203056: I tensorflow/compiler/xla/stream_executor/cuda/cuda_diagnostics.cc:199] libcuda reported version is: 535.183.6
2024-07-24 18:16:16.203065: I tensorflow/compiler/xla/stream_executor/cuda/cuda_diagnostics.cc:203] kernel reported version is: 535.183.6
2024-07-24 18:16:16.203067: I tensorflow/compiler/xla/stream_executor/cuda/cuda_diagnostics.cc:309] kernel version seems to match DSO: 535.183.6


Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 47, 64)]          0         
                                                                 
 masking (Masking)           (None, 47, 64)            0         
                                                                 
 lstm (LSTM)                 (None, 47, 256)           328704    
                                                                 
 layer_normalization (Layer  (None, 47, 256)           94        
 Normalization)                                                  
                                                                 
 time_distributed (TimeDist  (None, 47, 128)           32896     
 ributed)                                                        
                                                                 
 time_distributed_1 (TimeDi  (None, 47, 1)             129   

# Warning Message
## Error message: INVALID_ARGUMENT: expected compatible input types, but input 1:
The resulting waring message might be caused due to the GPU architecture. The link below specifies that
they do not see this type of error when GPU is disabled.\
[https://developer.apple.com/forums/thread/716638](https://developer.apple.com/forums/thread/716638)

In [6]:
print(sum(err)/len(err)*1.0)

4.890251088142395


In [7]:
max_seq_len

47

In [21]:
# WRITING Predictions/Errors to an output CSV File.

#print(len(err))
# Convert res_val which contains all the predictions into an array 
res_arr = np.array(res_val)
#print(res_arr.shape)
#print(sum(err)/10.0)

# Duplicate the split process.
kf = KFold(n_splits=10)
kf.get_n_splits(pids)
total_mr = 0
# Open a file for writing.
fd = open('Fold_out.csv','w')
# For Each Fold: i will go from 0 - 9
for i, (train_index, test_index) in enumerate(kf.split(pids)):
    test_pids = []
    # Extract the test pid's from their indices
    for j in test_index:
        test_pids.append(pids[j])
        
    #print (test_pids)
    test_len = len(test_pids)
    for k in range(0,test_len):
        print('Processing PID: ',test_pids[k])
        # For each PID in the test set, read the input file corresponding to that PID
        fname = os.path.join(exp_dir,str(int(test_pids[k]))+'_5temp_encoding.csv')
        # Get total no. of rows in that file.
        data = np.loadtxt(fname,delimiter=',')
        # Extract last column, which is days till labor onset.
        d2 = data[:,17:]
        # mr is the total number of days of data starting from 245 for the PID.
        mr,mc = data.shape
        print (mr,mc)
        total_mr += mr
        # i - Fold, k - PID of the fold, 0-mr is valid days ignoring padding.
        tmp = res_val[i][k][0:mr]
        temp = tmp.reshape(1,mr)
        #print(temp)
        # First Write PID
        fd.write(str(int(test_pids[k]))+',')
        if data[0][0] != 245:
            print('Starting GA for PID is not 245, instead: ',data[0][0])
            start_diff = data[0][0] - 245 
            for m in range(0,int(start_diff)):
                fd.write(' ,')
        # Write the absolute Error.
        for l in range(0,len(temp[0])):
            # Write model predictions.
            #fd.write(str(abs(temp[0][l]))+',')
            
            # Difference of prediction - days to labor onset = Error
            fd.write(str(abs(d2[l][0]- temp[0][l]))+',')
        fd.write('\n')

fd.close()

Processing PID:  1.0
29 18
Processing PID:  2.0
20 18
Processing PID:  7.0
32 18
Starting GA for PID is not 245, instead:  248.0
Processing PID:  10.0
42 18
Processing PID:  11.0
30 18
Processing PID:  15.0
28 18
Starting GA for PID is not 245, instead:  259.0
Processing PID:  29.0
45 18
Processing PID:  35.0
33 18
Processing PID:  36.0
35 18
Processing PID:  48.0
29 18
Processing PID:  55.0
25 18
Processing PID:  61.0
23 18
Processing PID:  67.0
23 18
Starting GA for PID is not 245, instead:  257.0
Processing PID:  69.0
41 18
Processing PID:  72.0
40 18
Processing PID:  73.0
28 18
Processing PID:  75.0
21 18
Processing PID:  77.0
29 18
Processing PID:  80.0
26 18
Starting GA for PID is not 245, instead:  257.0
Processing PID:  88.0
38 18
Processing PID:  90.0
19 18
Processing PID:  91.0
20 18
Processing PID:  93.0
26 18
Processing PID:  97.0
36 18
Processing PID:  99.0
38 18
Processing PID:  102.0
30 18
Processing PID:  106.0
28 18
Starting GA for PID is not 245, instead:  246.0
Proce

In [26]:
# OUTPUT RELATIVE TO LABOR DATE

# Convert res_val which contains all the predictions into an array 
res_arr = np.array(res_val)
#print(res_arr.shape)
#print(sum(err)/10.0)

# Duplicate the split process.
kf = KFold(n_splits=10)
kf.get_n_splits(pids)

# What is the maximum days before labor we are predicting results?
max_ld = 46

# Open a file for writing.
fd = open('Fold_out.csv','w')
# For Each Fold: i will go from 0 - 9
for i, (train_index, test_index) in enumerate(kf.split(pids)):
    test_pids = []
    # Extract the test pid's from their indices
    for j in test_index:
        test_pids.append(pids[j])
        
    #print (test_pids)
    test_len = len(test_pids)
    for k in range(0,test_len):
        print('Processing PID: ',test_pids[k])
        # For each PID in the test set, read the input file corresponding to that PID
        fname = os.path.join(exp_dir,str(int(test_pids[k]))+'_5temp_encoding.csv')
        # Get total no. of rows in that file.
        data = np.loadtxt(fname,delimiter=',')
        # Extract last column, which is days till labor onset.
        d2 = data[:,17:]
        # mr is the total number of days of data starting from 245 for the PID.
        mr,mc = data.shape
        print (mr,mc)

        # i - Fold, k - PID of the fold, 0-mr is valid days ignoring padding.
        tmp = res_val[i][k][0:mr]
        temp = tmp.reshape(1,mr)
        #print(temp)
        # First Write PID
        fd.write(str(int(test_pids[k]))+',')
        
        start_diff = 46 - d2[0][0] 
        for m in range(0,int(start_diff)):
                fd.write(' ,')
        # Write the absolute Error.
        for l in range(0,len(temp[0])):
            # Write model predictions.
            fd.write(str(abs(temp[0][l]))+',')
            
            # Difference of prediction - days to labor onset = Error
            #fd.write(str(abs(d2[l][0]- temp[0][l]))+',')
            #fd.write(str(d2[l][0]- temp[0][l])+',')
        fd.write('\n')

fd.close()

Processing PID:  1.0
29 18
Processing PID:  2.0
20 18
Processing PID:  7.0
32 18
Processing PID:  10.0
42 18
Processing PID:  11.0
30 18
Processing PID:  15.0
28 18
Processing PID:  29.0
45 18
Processing PID:  35.0
33 18
Processing PID:  36.0
35 18
Processing PID:  48.0
29 18
Processing PID:  55.0
25 18
Processing PID:  61.0
23 18
Processing PID:  67.0
23 18
Processing PID:  69.0
41 18
Processing PID:  72.0
40 18
Processing PID:  73.0
28 18
Processing PID:  75.0
21 18
Processing PID:  77.0
29 18
Processing PID:  80.0
26 18
Processing PID:  88.0
38 18
Processing PID:  90.0
19 18
Processing PID:  91.0
20 18
Processing PID:  93.0
26 18
Processing PID:  97.0
36 18
Processing PID:  99.0
38 18
Processing PID:  102.0
30 18
Processing PID:  106.0
28 18
Processing PID:  107.0
45 18
Processing PID:  108.0
33 18
Processing PID:  110.0
25 18
Processing PID:  111.0
41 18
Processing PID:  113.0
29 18
Processing PID:  120.0
41 18
Processing PID:  126.0
24 18
Processing PID:  129.0
47 18
Processing PI

In [17]:
print(total_mr)

1593
