<a href="https://colab.research.google.com/github/TheophileZuber/2024_MLEES/blob/main/LSTM_Model_Monkey.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Import all packages

In [None]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, CSVLogger
import pandas as pd
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve, precision_recall_fscore_support
import matplotlib.pyplot as plt
import seaborn as sns
import os
import psutil  # For memory usage tracking
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.sequence import pad_sequences
from datetime import datetime

from google.colab import drive

# Mount Google Drive unless the mount point is already present on disk
drive_mount_point = '/content/drive'
if os.path.exists(drive_mount_point):
    print("Drive is already mounted.")
else:
    drive.mount(drive_mount_point)

Data formatting and creation of the training, validation, and test sets

Function to create sequences of data and transform/normalise timestamps

In [None]:
# Step 1: Function to create sequences with time-window padding
def create_time_window_sequences(data, window_hours=6, behavior_columns_range=(8, 15), pad_length=240):
    """
    Build padded, time-windowed feature sequences and their behavior targets.

    For every row of every individual ('Ind_ID'), a window is opened at that
    row's timestamp and spans `window_hours` hours forward (end exclusive).
    Each window holding at least two records yields one sequence (the feature
    rows inside the window) and one target (the behavior columns of the last
    row inside the window).

    Parameters
    ----------
    data : pd.DataFrame
        Must contain 'timestamp' and 'Ind_ID'. When windows are produced it
        must also contain 'speed_ms', 'stepLenght', 'turnAngle_sin' and
        'turnAngle_cos'. Behavior and landcover columns are selected by
        position. The caller's DataFrame is left untouched.
    window_hours : int or float
        Length of each time window in hours.
    behavior_columns_range : tuple
        (start, stop) positional slice of the target (behavior) columns.
    pad_length : int
        Length every sequence is padded or truncated to (both at the end).

    Returns
    -------
    tuple of (np.ndarray, np.ndarray)
        Padded float32 sequences and the matching targets.

    Raises
    ------
    ValueError
        If `data` is not a DataFrame or `behavior_columns_range` is not a tuple.
    """
    # Ensure 'data' is a DataFrame
    if not isinstance(data, pd.DataFrame):
        raise ValueError("Expected input 'data' to be a DataFrame")

    # Ensure behavior_columns_range is a tuple
    if not isinstance(behavior_columns_range, tuple):
        raise ValueError("behavior_columns_range must be a tuple")

    # Work on a private copy: the helper columns added below and the re-sorting
    # must not leak into the caller's DataFrame (callers pass filtered slices,
    # where in-place writes also raise pandas chained-assignment warnings).
    data = data.copy()

    # Convert 'timestamp' column to datetime objects (needed for time arithmetic)
    data['timestamp'] = pd.to_datetime(data['timestamp'])

    # Extract hour of the day and day of the week
    data['hour'] = data['timestamp'].dt.hour
    data['day_of_week'] = data['timestamp'].dt.dayofweek

    # Encode both as sine/cosine pairs so that the ends of each cycle
    # (23h -> 0h, Sunday -> Monday) stay close together in feature space
    data['hour_sin'] = np.sin(2 * np.pi * data['hour'] / 24)
    data['hour_cos'] = np.cos(2 * np.pi * data['hour'] / 24)
    data['day_sin'] = np.sin(2 * np.pi * data['day_of_week'] / 7)
    data['day_cos'] = np.cos(2 * np.pi * data['day_of_week'] / 7)

    # Chronological order within each individual
    data = data.sort_values(by=['Ind_ID', 'timestamp'])

    # Target (behavior) and input (feature) columns. The helper columns above
    # are appended at the end, so the positional slices still address the
    # original columns of the input file.
    behavior_columns = list(data.columns[behavior_columns_range[0]:behavior_columns_range[1]])
    feature_columns = [
        'speed_ms', 'stepLenght', 'turnAngle_sin', 'turnAngle_cos',  # Movement features
        'hour_sin', 'hour_cos', 'day_sin', 'day_cos'                 # Cyclic time features
    ] + list(data.columns[15:25])  # Landcover features

    sequences = []  # Input feature sequences, one per valid window
    targets = []    # Behavior values matching each sequence
    window_span = pd.Timedelta(hours=window_hours)

    # Process each individual separately so windows never mix individuals
    for _, group in data.groupby('Ind_ID'):
        group = group.reset_index(drop=True)  # Positional labels 0..n-1 for .loc below
        timestamps = group['timestamp']

        # One (overlapping) window per row, starting at that row's timestamp
        for start_idx in range(len(group)):
            start_time = group.loc[start_idx, 'timestamp']
            end_time = start_time + window_span

            # Records inside [start_time, end_time)
            window_data = group[(timestamps >= start_time) & (timestamps < end_time)]

            # A window needs at least two records to form a valid sequence
            if len(window_data) >= 2:
                sequences.append(window_data[feature_columns].values)
                # Target comes from the last record of the window
                targets.append(window_data[behavior_columns].values[-1])

    # Pad/truncate at the end so every sequence has exactly pad_length steps
    padded_sequences = pad_sequences(
        sequences,
        maxlen=pad_length,
        dtype='float32',
        padding='post',
        truncating='post'
    )

    # Return the padded sequences and corresponding targets as numpy arrays
    return np.array(padded_sequences), np.array(targets)


Split the data by individuals and timestamp

In [None]:
# Step 2: Function to split data chronologically by individual
def split_chronologically_by_individual(data, train_ratio=0.5, val_ratio=0.25, random_state=42):
    """
    Split a DataFrame into train/validation/test subsets by individual.

    Whole individuals ('Ind_ID') are assigned to exactly one subset, so no
    individual appears in more than one of them. Despite the name, no split
    is made along the time axis: rows are only sorted chronologically within
    each individual.

    Parameters
    ----------
    data : pd.DataFrame
        Must contain 'Ind_ID' and 'timestamp' (unless it is empty).
    train_ratio : float
        Fraction of individuals assigned to training (rounded down).
    val_ratio : float
        Fraction of individuals assigned to validation (rounded down).
        All remaining individuals go to the test set.
    random_state : int
        Seed for shuffling the individual IDs.

    Returns
    -------
    tuple of (pd.DataFrame, pd.DataFrame, pd.DataFrame)
        Train, validation and test subsets. For an empty input, the input
        itself is returned three times.
    """
    # Handle empty input first: sorting a DataFrame without the expected
    # columns would raise before the warning could be printed.
    if data.empty:
        print("Warning: Received an empty DataFrame during split.")
        return data, data, data

    # Chronological order within each individual
    data = data.sort_values(by=['Ind_ID', 'timestamp'])

    # Unique individual IDs (a fresh array, safe to shuffle in place)
    unique_ids = data['Ind_ID'].unique()

    # Shuffle with a private generator: same permutation as seeding the global
    # NumPy RNG with `random_state`, but without resetting the global random
    # state for the rest of the program.
    rng = np.random.RandomState(random_state)
    rng.shuffle(unique_ids)

    # Number of individuals per subset; int() rounds down, the remainder
    # ends up in the test set
    train_size = int(len(unique_ids) * train_ratio)
    val_size = int(len(unique_ids) * val_ratio)

    train_ids = unique_ids[:train_size]
    val_ids = unique_ids[train_size:train_size + val_size]
    test_ids = unique_ids[train_size + val_size:]

    # Select the rows belonging to each group of individuals
    train_data = data[data['Ind_ID'].isin(train_ids)]
    val_data = data[data['Ind_ID'].isin(val_ids)]
    test_data = data[data['Ind_ID'].isin(test_ids)]

    return train_data, val_data, test_data


Function that generates the chunks for training, validation, and testing

In [None]:
# Step 3: Generator to process data in chunks and split into train/val/test sets
def data_generator_fixed_split(file_path, chunk_size=1024, window_hours=6, pad_length=240):
    """
    Stream a CSV file in chunks and yield windowed train/val/test arrays.

    Each chunk is split by individual, and each split is converted to padded
    sequences. Chunks that are empty, or for which any of the three splits
    produces no sequences, are skipped.

    Parameters
    ----------
    file_path : str
        Path of the CSV file to read.
    chunk_size : int
        Number of CSV rows read per chunk.
    window_hours : int or float
        Window length forwarded to create_time_window_sequences.
    pad_length : int
        Sequence length forwarded to create_time_window_sequences.

    Yields
    ------
    tuple
        (X_train, X_val, X_test, y_train, y_val, y_test) for one chunk.
    """
    # Read the CSV lazily so that only one chunk is held in memory at a time
    chunk_iter = pd.read_csv(file_path, chunksize=chunk_size)

    # chunk_idx counts the raw CSV chunks (including skipped ones) and is used
    # in the messages below instead of dumping the whole DataFrame
    for chunk_idx, chunk in enumerate(chunk_iter):
        if chunk.empty:
            print(f"Warning: Chunk {chunk_idx} is empty.")
            continue

        # Split the current chunk into train, validation, and test sets by individual
        train_data, val_data, test_data = split_chronologically_by_individual(chunk)

        # Create time-window sequences for each split
        X_train_chunk, y_train_chunk = create_time_window_sequences(train_data, window_hours, pad_length=pad_length)
        X_val_chunk, y_val_chunk = create_time_window_sequences(val_data, window_hours, pad_length=pad_length)
        X_test_chunk, y_test_chunk = create_time_window_sequences(test_data, window_hours, pad_length=pad_length)

        # A chunk is only useful when all three splits produced sequences
        if X_train_chunk.size == 0 or X_val_chunk.size == 0 or X_test_chunk.size == 0:
            print(f"Skipping empty sequence chunk {chunk_idx}.")
            continue

        yield X_train_chunk, X_val_chunk, X_test_chunk, y_train_chunk, y_val_chunk, y_test_chunk


Generate training, validation and test sets

In [None]:
# Input CSV and a dated output folder for the exported chunk arrays
file_path = "/content/drive/MyDrive/data/AarhusAsseblief_sub.csv"
output_path = "/content/drive/MyDrive/data_chunks/"
directory = datetime.today().strftime("%Y-%m-%d")
full_output_path = os.path.join(output_path, directory)

if not os.path.exists(full_output_path):
    os.makedirs(full_output_path)

# Walk through the generator and write the six arrays of every chunk to disk
for i, chunk_arrays in enumerate(data_generator_fixed_split(file_path, chunk_size=5000)):
    X_train_chunk, X_val_chunk, X_test_chunk, y_train_chunk, y_val_chunk, y_test_chunk = chunk_arrays

    # File-name prefix paired with the array stored under that name
    arrays_by_prefix = (
        ("X_train", X_train_chunk),
        ("X_val", X_val_chunk),
        ("X_test", X_test_chunk),
        ("y_train", y_train_chunk),
        ("y_val", y_val_chunk),
        ("y_test", y_test_chunk),
    )
    for prefix, array in arrays_by_prefix:
        np.save(f"{full_output_path}/{prefix}_chunk_{i}.npy", array)

    print(f"Chunk {i} processed and saved. Memory usage: {psutil.virtual_memory().percent}%")

print("Data processing completed successfully!")

Layer and model definition

Attention layer definition

In [None]:
# Import TensorFlow for building custom layers
import tensorflow as tf

# Custom Keras layer: feature-wise attention followed by a sum over axis 1
class FeatureAttention(tf.keras.layers.Layer):
    """
    Keras layer that re-weights every input feature with softmax scores
    obtained from a learned square projection, then sums the weighted
    features over axis 1 (the time-step axis of the preceding LSTM output).
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def build(self, input_shape):
        """
        Create the trainable (num_features, num_features) scoring matrix.
        """
        num_features = input_shape[-1]
        self.W = self.add_weight(
            shape=(num_features, num_features),
            initializer="normal",
            trainable=True
        )
        super().build(input_shape)

    def call(self, x):
        """
        Score the features of `x`, weight them, and collapse axis 1.
        """
        # Project the last axis through W; the output keeps the shape of x
        projected = tf.matmul(x, self.W)

        # Softmax over the feature axis: scores at each position sum to 1
        score = tf.nn.softmax(projected, axis=-1)

        # Element-wise weighting of the inputs by their scores
        weighted = x * score

        # Summing over axis 1 drops that axis from the result
        return tf.reduce_sum(weighted, axis=1)



Model definition

In [None]:

# Assemble the network layer by layer: LSTM -> dropout -> attention -> softmax
model = Sequential()
model.add(LSTM(128, return_sequences=True, input_shape=(240, 18)))  # Keeps one output per time step
model.add(Dropout(0.5))                                             # Regularization against overfitting
model.add(FeatureAttention())                                       # Custom attention layer
model.add(Dense(7, activation='softmax'))                           # Seven-way classification output


Compile model

In [None]:
# Training configuration, applied to the model before any call to fit
compile_settings = {
    'optimizer': 'adam',                 # Weight-update rule
    'loss': 'categorical_crossentropy',  # Multi-class loss on one-hot targets
    'metrics': ['accuracy'],             # Reported alongside the loss
}
model.compile(**compile_settings)


Set up paths to the data chunks, the model checkpoint, early stopping, and logs

In [None]:
from google.colab import drive
drive.mount('/content/drive')

# Paths to saved chunks
train_path = "/content/drive/MyDrive/data_chunks/2024-12-05/"
val_path = "/content/drive/MyDrive/data_chunks/2024-12-05/"
test_path = "/content/drive/MyDrive/data_chunks/2024-12-05/"
base_output_path = "/content/drive/MyDrive/Model_results/"

# Files written by the callbacks
checkpoint_path = '/content/drive/MyDrive/Checkpoints/lstm_model.keras'
training_log_path = '/content/drive/MyDrive/logs/training_log.csv'

# Create every output folder up front so that neither the callbacks nor the
# later result exports fail on a missing directory
for required_dir in (base_output_path,
                     os.path.dirname(checkpoint_path),
                     os.path.dirname(training_log_path)):
    os.makedirs(required_dir, exist_ok=True)

# Callbacks
checkpoint = ModelCheckpoint(checkpoint_path, save_best_only=True, monitor='val_loss')
early_stopping = EarlyStopping(monitor='val_loss', patience=3)
# append=True: fit is called once per data chunk, and without it every call
# would overwrite the log and keep only the last chunk's rows.
# NOTE: rows from earlier runs stay in the file; delete it for a fresh log.
csv_logger = CSVLogger(training_log_path, append=True)


Model training

In [None]:
num_chunks = 227  # Number of saved chunk files to iterate over (indices 0..num_chunks-1)
num_epochs = 1    # Epochs per chunk

# Make sure the results folder exists before the training run starts, so the
# history export at the end cannot fail on a missing directory
os.makedirs(base_output_path, exist_ok=True)

# Loss and accuracy collected across all chunks (one entry per chunk and epoch)
train_losses, val_losses = [], []
train_accuracies, val_accuracies = [], []

# Train and validate on each chunk in turn; the model keeps its weights
# between chunks, so this is one continued training run over the whole dataset
for i in range(num_chunks):
    X_train_chunk = np.load(f"{train_path}X_train_chunk_{i}.npy")
    y_train_chunk = np.load(f"{train_path}y_train_chunk_{i}.npy")
    X_val_chunk = np.load(f"{val_path}X_val_chunk_{i}.npy")
    y_val_chunk = np.load(f"{val_path}y_val_chunk_{i}.npy")

    # Train the model on the current chunk and store history
    # NOTE(review): with num_epochs = 1 each fit call runs a single epoch, so
    # EarlyStopping (patience=3) presumably never triggers - confirm intent
    history = model.fit(X_train_chunk, y_train_chunk, epochs=num_epochs, batch_size=32,
                        validation_data=(X_val_chunk, y_val_chunk),
                        verbose=1, callbacks=[checkpoint, early_stopping, csv_logger])

    # Append metrics from this chunk
    train_losses.extend(history.history['loss'])
    val_losses.extend(history.history['val_loss'])
    train_accuracies.extend(history.history['accuracy'])
    val_accuracies.extend(history.history['val_accuracy'])

# Save training history to CSV
# 'epoch' is a running index over all recorded entries (chunk after chunk),
# not an epoch number within a single fit call
history_df = pd.DataFrame({
    'epoch': np.arange(len(train_losses)),
    'train_loss': train_losses,
    'val_loss': val_losses,
    'train_accuracy': train_accuracies,
    'val_accuracy': val_accuracies
})

history_df.to_csv(os.path.join(base_output_path, 'training_history.csv'), index=False)

Monitor the learning progress of the model

In [None]:

# Learning curves: one panel for the loss, one for the accuracy
# Each entry: (subplot position, train column, validation column,
#              train label, validation label, y-axis label, panel title)
curve_panels = [
    (1, 'train_loss', 'val_loss', 'Train Loss', 'Validation Loss', 'Loss', 'Loss Curve'),
    (2, 'train_accuracy', 'val_accuracy', 'Train Accuracy', 'Validation Accuracy', 'Accuracy', 'Accuracy Curve'),
]

plt.figure(figsize=(12, 5))

for position, train_col, val_col, train_label, val_label, y_label, panel_title in curve_panels:
    plt.subplot(1, 2, position)
    plt.plot(history_df['epoch'], history_df[train_col], label=train_label)
    plt.plot(history_df['epoch'], history_df[val_col], label=val_label)
    plt.xlabel('Epoch')
    plt.ylabel(y_label)
    plt.legend()
    plt.title(panel_title)

# Write the figure to the results folder and release it
plt.savefig(os.path.join(base_output_path, 'learning_curves.png'))
plt.close()

Evaluation of the model

In [None]:
# Run the model over every saved test chunk and collect class indices
true_label_parts = []
pred_label_parts = []

for i in range(num_chunks):
    X_test_chunk = np.load(f"{test_path}X_test_chunk_{i}.npy")
    y_test_chunk = np.load(f"{test_path}y_test_chunk_{i}.npy")

    # Class scores for this chunk
    y_pred_chunk = model.predict(X_test_chunk)

    # Index of the largest entry per row: one-hot target -> true class,
    # predicted scores -> predicted class
    true_label_parts.append(y_test_chunk.argmax(axis=1))
    pred_label_parts.append(y_pred_chunk.argmax(axis=1))

# Stitch the per-chunk results into two flat label arrays
y_true_all = np.concatenate(true_label_parts)
y_pred_all = np.concatenate(pred_label_parts)

Save all the metrics

In [None]:
# Class indices of the 7-way softmax output; used wherever a fixed class
# order is required
class_labels = list(range(7))

# Save classification report
report = classification_report(y_true_all, y_pred_all, output_dict=True)
df_report = pd.DataFrame(report).transpose()
df_report.to_csv('/content/drive/MyDrive/Model_results/classification_report.csv')

# Confusion Matrix
# labels= pins the matrix to one row/column per class even when a class never
# occurs in the test data, so it always matches the 7 tick labels below
conf_matrix = confusion_matrix(y_true_all, y_pred_all, labels=class_labels)
plt.figure(figsize=(10, 7))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', xticklabels=class_labels, yticklabels=class_labels)
plt.xlabel('Predicted')
plt.ylabel('True')
plt.title('Confusion Matrix')
plt.savefig('/content/drive/MyDrive/Model_results/confusion_matrix.png')
plt.close()

# Precision, Recall, and F1-Score Calculation (weighted by class support)
precision, recall, f1, _ = precision_recall_fscore_support(y_true_all, y_pred_all, average='weighted')
print(f"Precision: {precision:.4f}, Recall: {recall:.4f}, F1-Score: {f1:.4f}")

# Save precision, recall, F1
metrics_summary = pd.DataFrame({'Metric': ['Precision', 'Recall', 'F1-Score'], 'Value': [precision, recall, f1]})
metrics_summary.to_csv('/content/drive/MyDrive/Model_results/precision_recall_f1.csv', index=False)

# AUC-ROC Calculation (for multi-class)
# Calculate AUC for each class and average
y_true_bin = tf.keras.utils.to_categorical(y_true_all, num_classes=7)
# Predict chunk by chunk and concatenate only the class probabilities, so the
# full set of test inputs never has to sit in memory at once
y_pred_prob = np.concatenate(
    [model.predict(np.load(f"{test_path}X_test_chunk_{i}.npy")) for i in range(num_chunks)]
)

# auc_score = roc_auc_score(y_true_bin, y_pred_prob, average='weighted', multi_class='ovr')
# print(f"Average AUC-ROC: {auc_score:.4f}")

# # Save AUC-ROC score
# with open('/content/drive/MyDrive/Model_results/auc_roc.txt', 'w') as f:
#     f.write(f"Average AUC-ROC: {auc_score:.4f}\n")