In [3]:
import pandas as pd
import warnings
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import LSTM, Dense, concatenate
from tensorflow.keras.callbacks import EarlyStopping
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from sklearn.preprocessing import StandardScaler
from tcn import TCN

warnings.filterwarnings('ignore')


In [4]:
file_path = "myData2.parquet"
df = pd.read_parquet(file_path)


In [5]:
selected_features = ['timestamp_seconds', # lowers the accuracy 
                     'node_memory_Percpu_bytes', 
                     'node_context_switches_total', 
                     'surfsara_power_usage', 
                     'node_netstat_Tcp_InSegs', 
                     'node_netstat_Tcp_OutSegs', 
                     'node_network_transmit_packets_total-sum', 
                     'node_filesystem_size_bytes-sum', 
                     'node_filesystem_files-sum', 
                     'node_memory_MemFree_bytes', 
                     'node_netstat_Tcp_InErrs']


In [6]:
# Add a new column 'failed_jobs' representing the target variable
df['failed_jobs'] = (df['state'] == 'FAILED').astype(int)

# Extract relevant columns
df_selected = df[['timestamp', 'state'] + selected_features].copy()

# Encode the target variable 'state' to binary (0 for "COMPLETED", 1 otherwise)
df_selected['target'] = (df_selected['state'] != 'COMPLETED').astype(int)

# Drop the original 'state' column
df_selected.drop('state', axis=1, inplace=True)

# Define time intervals
time_intervals = {'minute': '1T', 'hour': '1H', 'day': '1D'}

# Normalize selected features
scaler = MinMaxScaler()
df_selected[selected_features] = scaler.fit_transform(df_selected[selected_features])

# Set sequence length
sequence_length = 30

# Define early stopping
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)


In [7]:
# Function to prepare data
def prepare_data(data, time_interval):
    data.set_index('timestamp', inplace=True) # FixMe
    data_resampled = data.resample(time_interval).sum()
    data_resampled['target'] = data_resampled['target'].clip(upper=1)  # Clip values to 1
    return data_resampled

# Function to create sequences
def create_sequences(data, sequence_length):
    sequences, targets = [], []
    for i in range(len(data) - sequence_length):
        seq = data.iloc[i:i+sequence_length].values
        target = data.iloc[i+sequence_length]['target']
        sequences.append(seq)
        targets.append(target)
    return np.array(sequences), np.array(targets)


# Hour

In [8]:
# Prepare data with hourly intervals
data_hour = prepare_data(df_selected, time_intervals['hour'])


In [9]:
# Create sequences and targets
sequences_hour, targets_hour = create_sequences(data_hour, sequence_length)

# Split the data into training and testing sets
X_train_hour, X_test_hour, y_train_hour, y_test_hour = train_test_split(sequences_hour, targets_hour, test_size=0.3, random_state=42)


## LSTM

In [10]:
# Build the LSTM model
lstm_model_hour = Sequential()
lstm_model_hour.add(LSTM(50, input_shape=(X_train_hour.shape[1], X_train_hour.shape[2])))
lstm_model_hour.add(Dense(1, activation='sigmoid'))
lstm_model_hour.compile(loss='mean_absolute_error', optimizer='adam', metrics=['mean_absolute_error'])


In [11]:
# Train the model
lstm_model_hour.fit(X_train_hour, y_train_hour, epochs=20, batch_size=32, validation_split=0.1, callbacks=[early_stopping])


Epoch 1/20


Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20


<keras.src.callbacks.History at 0x7f5750640e10>

## TCN

In [12]:
# Build the TCN model
tcn_model_hour = Sequential([
    TCN(input_shape=(sequence_length, X_train_hour.shape[2])),
    Dense(1, activation='sigmoid')
])


In [13]:
# Compile the model
tcn_model_hour.compile(loss='mean_absolute_error', optimizer='adam', metrics=['mean_absolute_error'])

# Train the model
tcn_model_hour.fit(X_train_hour, y_train_hour, epochs=20, batch_size=32, validation_split=0.1)


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.src.callbacks.History at 0x7f5740692550>

## Hybrid Model - Naive

In [15]:
# Ensemble Model combining LSTM and TCN
def create_hybrid_model(lstm_model, tcn_model):
    lstm_input = lstm_model.input
    tcn_input = tcn_model.input

    # Get the output layers of both models
    lstm_output = lstm_model.layers[-1].output
    tcn_output = tcn_model.layers[-1].output

    # Concatenate the outputs
    merged = concatenate([lstm_output, tcn_output])

    # Add a dense layer for final prediction
    merged = Dense(1, activation='sigmoid')(merged)

    # Create the ensemble model
    ensemble_model = Model(inputs=[lstm_input, tcn_input], outputs=merged)

    # Compile the model
    ensemble_model.compile(loss='mean_absolute_error', optimizer='adam', metrics=['mean_absolute_error'])

    return ensemble_model


In [17]:
# Create the hybrid model
hybrid_model_hour = create_hybrid_model(lstm_model_hour, tcn_model_hour)

# Train the hybrid model with both LSTM and TCN data
hybrid_model_hour.fit([X_train_hour, X_train_hour], y_train_hour, epochs=20, batch_size=32, validation_split=0.1)


Epoch 1/20


Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.src.callbacks.History at 0x7f57401db890>

In [18]:
# Evaluate the model using Mean Absolute Error
mae_hour = hybrid_model_hour.evaluate([X_test_hour, X_test_hour], y_test_hour, verbose=0)[1]
print(f'Model Mean Absolute Error: {mae_hour:.4f}')


Model Mean Absolute Error: 0.3366


In [19]:
# Function to make predictions on new data for the hybrid model
def predict_future_failures_hybrid(model, input_data_lstm, input_data_tcn, sequence_length, prediction_steps):
    predictions = []

    for _ in range(prediction_steps):
        # Make predictions for the next time step using both LSTM and TCN models
        prediction = model.predict([input_data_lstm.reshape(1, sequence_length, input_data_lstm.shape[1]),
                                    input_data_tcn.reshape(1, sequence_length, input_data_tcn.shape[1])])
        predictions.append(prediction[0, 0])

        # Shift the input data by one time step and append the new prediction
        input_data_lstm = np.roll(input_data_lstm, shift=-1, axis=0)
        input_data_lstm[-1, -1] = prediction[0, 0]

        input_data_tcn = np.roll(input_data_tcn, shift=-1, axis=0)
        input_data_tcn[-1, -1] = prediction[0, 0]

    return predictions

# Select a starting point for predictions
input_data_lstm_hybrid = X_test_hour[10]
input_data_tcn_hybrid = X_test_hour[10]

# Number of time steps to predict into the future
prediction_steps_hybrid = 7



Predicted Failures for the Next 7 Time Steps (Hybrid):
[0.18048015, 0.17682935, 0.1784299, 0.17971909, 0.18065532, 0.18113437, 0.18167326]
Mean Absolute Error for Predictions: 0.5465


In [22]:
# Make predictions with the hybrid model
predicted_failures_hybrid = predict_future_failures_hybrid(hybrid_model_hour, input_data_lstm_hybrid, input_data_tcn_hybrid, sequence_length, prediction_steps_hybrid)

# Print the predicted failures
print("Predicted Failures for the Next 7 Time Steps (Hybrid):")
print(predicted_failures_hybrid)

# Evaluate the predictions using Mean Absolute Error
mae_predictions_hybrid = np.mean(np.abs(predicted_failures_hybrid - y_test_hour[10:10+prediction_steps_hybrid]))
print(f'Mean Absolute Error for Predictions: {mae_predictions_hybrid:.4f}')


Predicted Failures for the Next 7 Time Steps (Hybrid):
[0.16422383, 0.23447375, 0.26028836, 0.27101928, 0.27706224, 0.28129625, 0.2888697]
Mean Absolute Error for Predictions: 0.5240


## Hybrid Model - Attention

In [14]:
from tensorflow.keras.layers import Attention, Concatenate

# Function to create a hybrid model with attention mechanism
def create_attention_hybrid_model(lstm_model, tcn_model):
    lstm_input = lstm_model.input
    tcn_input = tcn_model.input

    # Get the output layers of both models
    lstm_output = lstm_model.layers[-1].output
    tcn_output = tcn_model.layers[-1].output

    # Use Attention mechanism to combine outputs
    attention = Attention()([lstm_output, tcn_output])
    merged = Concatenate()([lstm_output, tcn_output, attention])

    # Add a dense layer for the final prediction
    merged = Dense(1, activation='sigmoid')(merged)

    # Create the ensemble model
    ensemble_model = Model(inputs=[lstm_input, tcn_input], outputs=merged)

    # Compile the model
    ensemble_model.compile(loss='mean_absolute_error', optimizer='adam', metrics=['mean_absolute_error'])

    return ensemble_model


In [20]:
# Create the hybrid model
hybrid_model_hour_attention = create_attention_hybrid_model(lstm_model_hour, tcn_model_hour)

# Train the hybrid model with both LSTM and TCN data
hybrid_model_hour_attention.fit([X_train_hour, X_train_hour], y_train_hour, epochs=20, batch_size=32, validation_split=0.1)


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.src.callbacks.History at 0x7f56bef285d0>

In [21]:
# Evaluate the model using Mean Absolute Error
mae_hour_attention = hybrid_model_hour_attention.evaluate([X_test_hour, X_test_hour], y_test_hour, verbose=0)[1]
print(f'Model Mean Absolute Error: {mae_hour_attention:.4f}')


Model Mean Absolute Error: 0.2952


In [23]:
# Make predictions with the hybrid model
predicted_failures_hybrid_attention = predict_future_failures_hybrid(hybrid_model_hour_attention, input_data_lstm_hybrid, input_data_tcn_hybrid, sequence_length, prediction_steps_hybrid)

# Print the predicted failures
print("Predicted Failures for the Next 7 Time Steps (Hybrid):")
print(predicted_failures_hybrid_attention)

# Evaluate the predictions using Mean Absolute Error
mae_predictions_hybrid_attention = np.mean(np.abs(predicted_failures_hybrid_attention - y_test_hour[10:10+prediction_steps_hybrid]))
print(f'Mean Absolute Error for Predictions: {mae_predictions_hybrid_attention:.4f}')


Predicted Failures for the Next 7 Time Steps (Hybrid):
[0.18409212, 0.27685666, 0.31460607, 0.33073542, 0.33966762, 0.3459241, 0.3559103]
Mean Absolute Error for Predictions: 0.5131
