In [12]:
from imblearn.over_sampling import SMOTE
from keras.models import Sequential
from keras.layers import Dense, Dropout, BatchNormalization
from keras.optimizers import Adam
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
import numpy as np
import pandas as pd

In [13]:
# Load the dataset
train_data = pd.read_csv('../data/featured_train_series.csv')

# Preprocess the data
train_data['event_binary'] = train_data['event'].apply(lambda x: 1 if x != 0 else 0)

X = train_data.drop(['event', 'event_binary', 'series_id'], axis=1)
y_event = train_data['event_binary']
y_type = train_data['event']

# Scale the data
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Encode the labels
encoder = LabelEncoder()
y_event = encoder.fit_transform(y_event)
y_type = encoder.fit_transform(y_type)

# Split data into training and validation sets
X_train, X_val, y_event_train, y_event_val, y_type_train, y_type_val = train_test_split(
    X, y_event, y_type, test_size=0.2, stratify=y_event, random_state=42
)

In [22]:
from keras import backend as K

def weighted_binary_crossentropy(weights):
    def loss(y_true, y_pred):
        # Clip predictions to avoid log(0) errors
        y_pred = K.clip(y_pred, K.epsilon(), 1 - K.epsilon())
        # Calculate weighted binary cross-entropy
        loss = - (weights[0] * y_true * K.log(y_pred) + weights[1] * (1 - y_true) * K.log(1 - y_pred))
        return K.mean(loss)
    return loss

# Stage 1: Event Detection
def build_event_detection_model(input_dim):
    model = Sequential()
    model.add(Dense(64, activation='relu', input_dim=input_dim))
    model.add(BatchNormalization())
    model.add(Dropout(0.4))
    model.add(Dense(32, activation='relu'))
    model.add(Dropout(0.3))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(optimizer=Adam(learning_rate=0.001), 
                  loss='binary_crossentropy',
                  metrics=['accuracy'])
    return model

# Stage 2: Onset vs. Wakeup Classification
def build_event_type_model(input_dim):
    model = Sequential()
    model.add(Dense(64, activation='relu', input_dim=input_dim))
    model.add(BatchNormalization())
    model.add(Dropout(0.4))
    model.add(Dense(32, activation='relu'))
    model.add(Dropout(0.3))
    model.add(Dense(2, activation='softmax'))
    model.compile(optimizer=Adam(learning_rate=0.001), 
                  loss='categorical_crossentropy', 
                  metrics=['accuracy'])
    return model

In [23]:
from sklearn.utils.class_weight import compute_class_weight

# 2. Apply SMOTE for Stage 1
smote = SMOTE(random_state=42)
X_train_smote, y_event_train_smote = smote.fit_resample(X_train, y_event_train)

# Calculate the class weights
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=np.unique(y_event_train_smote),
    y=y_event_train_smote
)

# Convert class weights to a dictionary format required by Keras
class_weights_dict = {i: weight for i, weight in enumerate(class_weights)}

# 3. Train Stage 1 Model
event_detection_model = build_event_detection_model(X_train.shape[1])
event_detection_model.fit(X_train_smote, y_event_train_smote, 
                          epochs=20, batch_size=64, 
                          validation_data=(X_val, y_event_val),
                          class_weight=class_weights_dict)

# 4. Predict events and filter for Stage 2
event_predictions = (event_detection_model.predict(X_train) > 0.5).astype(int)
event_indices = event_predictions.flatten() == 1

# Filter the event data for Stage 2
X_events = X_train[event_indices]
y_type_events = y_type_train[event_indices]

# Apply SMOTE for Stage 2
X_events_smote, y_type_events_smote = smote.fit_resample(X_events, y_type_events)

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/20
[1m17014/17014[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 2ms/step - accuracy: 0.7938 - loss: 0.4405 - val_accuracy: 0.8003 - val_loss: 0.3740
Epoch 2/20
[1m17014/17014[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 2ms/step - accuracy: 0.8504 - loss: 0.3556 - val_accuracy: 0.8065 - val_loss: 0.3538
Epoch 3/20
[1m17014/17014[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 2ms/step - accuracy: 0.8558 - loss: 0.3471 - val_accuracy: 0.8065 - val_loss: 0.3681
Epoch 4/20
[1m17014/17014[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 2ms/step - accuracy: 0.8587 - loss: 0.3414 - val_accuracy: 0.8184 - val_loss: 0.3355
Epoch 5/20
[1m17014/17014[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 2ms/step - accuracy: 0.8608 - loss: 0.3373 - val_accuracy: 0.8128 - val_loss: 0.3546
Epoch 6/20
[1m17014/17014[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 2ms/step - accuracy: 0.8615 - loss: 0.3350 - val_accuracy: 0.8046 - val_loss: 0.368

In [24]:
import keras

# One-hot encode y_type_events for multi-class classification
y_type_events_smote = keras.utils.to_categorical(y_type_events_smote - 1)  # Classes: 0 (onset), 1 (wakeup)

# 5. Train Stage 2 Model
event_type_model = build_event_type_model(X_events.shape[1])
event_type_model.fit(X_events_smote, y_type_events_smote, epochs=20, batch_size=64, validation_split=0.2)

Epoch 1/20


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m3815/3815[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 2ms/step - accuracy: 0.7683 - loss: 0.4577 - val_accuracy: 0.9981 - val_loss: 0.0064
Epoch 2/20
[1m3815/3815[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 2ms/step - accuracy: 0.8105 - loss: 0.3754 - val_accuracy: 0.9994 - val_loss: 0.0037
Epoch 3/20
[1m3815/3815[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 2ms/step - accuracy: 0.8191 - loss: 0.3631 - val_accuracy: 0.9998 - val_loss: 0.0014
Epoch 4/20
[1m3815/3815[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 2ms/step - accuracy: 0.8250 - loss: 0.3540 - val_accuracy: 0.9996 - val_loss: 0.0019
Epoch 5/20
[1m3815/3815[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 2ms/step - accuracy: 0.8294 - loss: 0.3484 - val_accuracy: 0.9996 - val_loss: 0.0013
Epoch 6/20
[1m3815/3815[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 2ms/step - accuracy: 0.8327 - loss: 0.3436 - val_accuracy: 0.9998 - val_loss: 9.3901e-04
Epoch 7/20
[1m3815/381

<keras.src.callbacks.history.History at 0x7f83c1813fb0>

In [25]:
from sklearn.metrics import (accuracy_score, precision_score, recall_score, 
                             f1_score, confusion_matrix, roc_auc_score, 
                             classification_report, precision_recall_curve)

y_scores = event_detection_model.predict(X_val).ravel()
precision, recall, thresholds = precision_recall_curve(y_event_val, y_scores)
optimal_threshold = thresholds[np.argmax(precision * recall)] 


# Stage 1: Event Detection Evaluation
# Predict events on validation data
y_event_pred_val = (event_detection_model.predict(X_val) > optimal_threshold).astype(int)

# Calculate evaluation metrics for Stage 1
stage1_accuracy = accuracy_score(y_event_val, y_event_pred_val)
stage1_precision = precision_score(y_event_val, y_event_pred_val)
stage1_recall = recall_score(y_event_val, y_event_pred_val)
stage1_f1 = f1_score(y_event_val, y_event_pred_val)
stage1_conf_matrix = confusion_matrix(y_event_val, y_event_pred_val)
stage1_roc_auc = roc_auc_score(y_event_val, y_event_pred_val)

print("Stage 1 - Event Detection Metrics:")
print(f"Accuracy: {stage1_accuracy:.4f}")
print(f"Precision: {stage1_precision:.4f}")
print(f"Recall: {stage1_recall:.4f}")
print(f"F1-Score: {stage1_f1:.4f}")
print(f"ROC-AUC: {stage1_roc_auc:.4f}")
print(f"Confusion Matrix:\n{stage1_conf_matrix}\n")

# Filter predicted events for Stage 2 evaluation
event_indices_val = y_event_pred_val.flatten() == 1
X_events_val = X_val[event_indices_val]
y_type_val_filtered = y_type_val[event_indices_val]

# Ensure only classes 1 and 2 are present in filtered validation data
valid_indices = (y_type_val_filtered == 1) | (y_type_val_filtered == 2)
X_events_val = X_events_val[valid_indices]
y_type_val_filtered = y_type_val_filtered[valid_indices]

# Predict onset vs. wakeup on filtered validation data
y_type_pred_val = event_type_model.predict(X_events_val)
y_type_pred_val = y_type_pred_val.argmax(axis=1) + 1  # Convert from one-hot to class labels 1 and 2

# Calculate evaluation metrics for Stage 2
stage2_accuracy = accuracy_score(y_type_val_filtered, y_type_pred_val)
stage2_precision = precision_score(y_type_val_filtered, y_type_pred_val, average='weighted')
stage2_recall = recall_score(y_type_val_filtered, y_type_pred_val, average='weighted')
stage2_f1 = f1_score(y_type_val_filtered, y_type_pred_val, average='weighted')
stage2_conf_matrix = confusion_matrix(y_type_val_filtered, y_type_pred_val)

# Specify labels to avoid class mismatch in classification report
stage2_class_report = classification_report(
    y_type_val_filtered, y_type_pred_val, target_names=['Onset', 'Wakeup'], labels=[1, 2]
)

# Print Stage 2 evaluation results
print("Stage 2 - Onset vs. Wakeup Metrics:")
print(f"Accuracy: {stage2_accuracy:.4f}")
print(f"Weighted Precision: {stage2_precision:.4f}")
print(f"Weighted Recall: {stage2_recall:.4f}")
print(f"Weighted F1-Score: {stage2_f1:.4f}")
print(f"Confusion Matrix:\n{stage2_conf_matrix}\n")
print(f"Classification Report:\n{stage2_class_report}\n")


[1m4294/4294[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 708us/step
[1m4294/4294[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 654us/step
Stage 1 - Event Detection Metrics:
Accuracy: 0.8872
Precision: 0.0563
Recall: 0.6968
F1-Score: 0.1041
ROC-AUC: 0.7929
Confusion Matrix:
[[121001  15110]
 [   392    901]]

[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
Stage 2 - Onset vs. Wakeup Metrics:
Accuracy: 0.9612
Weighted Precision: 0.9634
Weighted Recall: 0.9612
Weighted F1-Score: 0.9609
Confusion Matrix:
[[363  34]
 [  1 503]]

Classification Report:
              precision    recall  f1-score   support

       Onset       1.00      0.91      0.95       397
      Wakeup       0.94      1.00      0.97       504

    accuracy                           0.96       901
   macro avg       0.97      0.96      0.96       901
weighted avg       0.96      0.96      0.96       901




In [26]:
# Load the test data
test_data = pd.read_csv('../data/featured_test_series.csv')

# Preprocess the test data
X_test = test_data.drop(['series_id', 'event'], axis=1)
X_test = scaler.transform(X_test)

# Encode the test labels
y_test_event = test_data['event']
y_test_event = encoder.transform(y_test_event)
y_test_type = test_data['event'].apply(lambda x: 1 if x != 0 else 0)
y_test_type = encoder.transform(y_test_type)

# Predict events on test data
y_test_event_pred = (event_detection_model.predict(X_test) > optimal_threshold).astype(int)

# Filter predicted events for Stage 2
event_indices_test = y_test_event_pred.flatten() == 1
X_events_test = X_test[event_indices_test]
y_test_type_filtered = y_test_type[event_indices_test]

# Ensure only classes 1 and 2 are present in filtered test data
valid_indices_test = (y_test_type_filtered == 1) | (y_test_type_filtered == 2)
X_events_test = X_events_test[valid_indices_test]
y_test_type_filtered = y_test_type_filtered[valid_indices_test]

# Predict onset vs. wakeup on filtered test data
y_test_type_pred = event_type_model.predict(X_events_test)
y_test_type_pred = y_test_type_pred.argmax(axis=1) + 1  # Convert from one-hot to class labels 1 and 2

# Calculate evaluation metrics for Stage 2 on test data
test_accuracy = accuracy_score(y_test_type_filtered, y_test_type_pred)
test_precision = precision_score(y_test_type_filtered, y_test_type_pred, average='weighted')
test_recall = recall_score(y_test_type_filtered, y_test_type_pred, average='weighted')
test_f1 = f1_score(y_test_type_filtered, y_test_type_pred, average='weighted')
test_conf_matrix = confusion_matrix(y_test_type_filtered, y_test_type_pred)

# Specify labels to avoid class mismatch in classification report
test_class_report = classification_report(
    y_test_type_filtered, y_test_type_pred, target_names=['Onset', 'Wakeup'], labels=[1, 2]
)

# Print Stage 2 evaluation results on test data
print("Test Data - Onset vs. Wakeup Metrics:")
print(f"Accuracy: {test_accuracy:.4f}")
print(f"Weighted Precision: {test_precision:.4f}")
print(f"Weighted Recall: {test_recall:.4f}")
print(f"Weighted F1-Score: {test_f1:.4f}")
print(f"Confusion Matrix:\n{test_conf_matrix}\n")
print(f"Classification Report:\n{test_class_report}\n")



[1m9494/9494[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 628us/step
[1m54/54[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step
Test Data - Onset vs. Wakeup Metrics:
Accuracy: 0.3926
Weighted Precision: 1.0000
Weighted Recall: 0.3926
Weighted F1-Score: 0.5638
Confusion Matrix:
[[ 667 1032]
 [   0    0]]

Classification Report:
              precision    recall  f1-score   support

       Onset       1.00      0.39      0.56      1699
      Wakeup       0.00      0.00      0.00         0

    accuracy                           0.39      1699
   macro avg       0.50      0.20      0.28      1699
weighted avg       1.00      0.39      0.56      1699




  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
