In [9]:
import modin.pandas as pd
import numpy as np
import ast
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Flatten, Dense, Dropout
from tensorflow.keras.optimizers import Adam
from google.oauth2 import service_account
import gcsfs
import dask.dataframe as dd

BUCKET_NAME = 'events_trabalho_cdle'
TRAINING_FILE = 'training_padded.csv'
TESTING_FILE = 'testing_padded.csv'

# Authenticate using the service account key file with specified scopes
credentials = service_account.Credentials.from_service_account_file(
    '/Users/rcs/Downloads/cdla-trabalho-db9f3d742a66.json',
    scopes=["https://www.googleapis.com/auth/devstorage.read_write"]
)
fs = gcsfs.GCSFileSystem(project='cdla-trabalho', token=credentials)
# Load CSV files into Dask DataFrames using the authenticated GCSFileSystem
storage_options = {'project': 'cdla-trabalho', 'token': credentials}
ddf_train = dd.read_csv(f'gs://{BUCKET_NAME}/{TRAINING_FILE}', storage_options=storage_options)
ddf_test = dd.read_csv(f'gs://{BUCKET_NAME}/{TESTING_FILE}', storage_options=storage_options)

# Convert Dask DataFrames to Pandas DataFrames
train_df = ddf_train.compute()
test_df = ddf_test.compute()

# Optionally, you can now convert Pandas DataFrames to Modin DataFrames if needed
train_df_modin = pd.DataFrame(train_df)
test_df_modin = pd.DataFrame(test_df)



In [11]:
# Function to parse sequence string to list of sublists
def parse_sequence(sequence):
    try:
        # Convert the string representation of the list to an actual list
        sequence_list = ast.literal_eval(sequence)
        # Flatten each sublist: [timestamp, (a, b), (c, d)] -> [timestamp, a, b, c, d]
        flattened_sequence = []
        for sublist in sequence_list:
            if sublist == [-1]:
                flattened_sequence.append([-1] * 11)  # Adjust the length as per the maximum possible sublist length
            else:
                flattened_sublist = [sublist[0]] + [item for tup in sublist[1:] for item in tup]
                if len(flattened_sublist) < 11:
                    flattened_sublist += [-1] * (11 - len(flattened_sublist))
                flattened_sequence.append(flattened_sublist)
        return flattened_sequence
    except (SyntaxError, ValueError):
        return [[-1] * 11]  # Adjust the length as per the maximum possible sublist length

# Apply the parse_sequence function
train_df_modin['Padded_Sequence'] = train_df_modin['Padded_Sequence'].apply(parse_sequence)
test_df_modin['Padded_Sequence'] = test_df_modin['Padded_Sequence'].apply(parse_sequence)

# Ensure all sequences have the same length
max_length = max(train_df_modin['Padded_Sequence'].apply(len).max(), test_df_modin['Padded_Sequence'].apply(len).max())
input_dim = 11  # This should be the length of the flattened sublist

def pad_sequences(sequences, max_length, input_dim):
    padded_sequences = []
    for seq in sequences:
        padded_seq = seq + [[-1] * input_dim] * (max_length - len(seq))
        padded_sequences.append(padded_seq)
    return np.array(padded_sequences, dtype=np.float32)

X_train = pad_sequences(train_df_modin['Padded_Sequence'].tolist(), max_length, input_dim)
y_train = train_df_modin['LOS'].values

X_test = pad_sequences(test_df_modin['Padded_Sequence'].tolist(), max_length, input_dim)
y_test = test_df_modin['LOS'].values

# Print shapes to verify
print(X_train.shape)
print(X_test.shape)

(6998, 1, 11)
(1750, 1, 11)




In [20]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, Flatten, Dense, Dropout, LSTM, Input, LeakyReLU
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau

# Check the minimum sequence length
min_seq_length = max_length
print(f"Minimum sequence length: {min_seq_length}")

# Define the CNN-LSTM model with appropriate kernel sizes
model = Sequential([
    Input(shape=(max_length, input_dim)),
    Conv1D(32, 1, activation='relu'),  # Kernel size 1 to handle very short sequences
    Dropout(0.3),
    Conv1D(64, 1, activation='relu'),
    Dropout(0.3),
    LSTM(128, return_sequences=True),
    Dropout(0.3),
    LSTM(64),
    Dropout(0.3),
    Dense(64),
    LeakyReLU(alpha=0.1),
    Dense(32),
    LeakyReLU(alpha=0.1),
    Dense(1)
])

# Compile the model with a smaller learning rate
model.compile(optimizer=Adam(learning_rate=0.0001), loss='mse', metrics=['mae'])

# Callbacks for early stopping and reducing learning rate
early_stopping = EarlyStopping(monitor='val_loss', patience=20, restore_best_weights=True)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=10, min_lr=0.00001)

# Train the model
history = model.fit(X_train, y_train, epochs=100, batch_size=16, validation_data=(X_test, y_test), 
                    callbacks=[early_stopping, reduce_lr], verbose=2)

model.save("cnn_lstm_model.keras")

# Evaluate the model
loss, mae = model.evaluate(X_test, y_test)
print(f'Test Loss: {loss}')
print(f'Test MAE: {mae}')


Minimum sequence length: 1
Epoch 1/100
438/438 - 5s - 12ms/step - loss: 75.8038 - mae: 5.7209 - val_loss: 48.7984 - val_mae: 4.6746 - learning_rate: 1.0000e-04
Epoch 2/100
438/438 - 1s - 2ms/step - loss: 53.4863 - mae: 4.6215 - val_loss: 48.7600 - val_mae: 4.6182 - learning_rate: 1.0000e-04
Epoch 3/100
438/438 - 1s - 2ms/step - loss: 53.1771 - mae: 4.6470 - val_loss: 48.9073 - val_mae: 4.7513 - learning_rate: 1.0000e-04
Epoch 4/100
438/438 - 1s - 2ms/step - loss: 53.4353 - mae: 4.6549 - val_loss: 48.8165 - val_mae: 4.6911 - learning_rate: 1.0000e-04
Epoch 5/100
438/438 - 1s - 2ms/step - loss: 53.1843 - mae: 4.6572 - val_loss: 48.7553 - val_mae: 4.5964 - learning_rate: 1.0000e-04
Epoch 6/100
438/438 - 1s - 2ms/step - loss: 53.2457 - mae: 4.6636 - val_loss: 49.0021 - val_mae: 4.4113 - learning_rate: 1.0000e-04
Epoch 7/100
438/438 - 1s - 2ms/step - loss: 53.3094 - mae: 4.6458 - val_loss: 48.8088 - val_mae: 4.5044 - learning_rate: 1.0000e-04
Epoch 8/100
438/438 - 1s - 2ms/step - loss: 52.8