In [5]:
# 03_Model_Training.ipynb
# Model training, time-series CV, threshold tuning, final test evaluation

import os
import numpy as np
import pandas as pd

from sklearn.model_selection import TimeSeriesSplit, train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    f1_score,
    classification_report,
    confusion_matrix
)

# Location of the processed feature table consumed by this notebook.
DATA_DIR = "../data/processed"
features_path = os.path.join(
    DATA_DIR,
    "train_features_FD001_no_leak.csv",
)

# Read the whole table into memory and report its dimensions.
df = pd.read_csv(filepath_or_buffer=features_path)
print("Processed features shape:", df.shape)

# Last expression of the cell: rich preview of the first rows.
df.head()


Processed features shape: (20631, 173)


Unnamed: 0,unit,op_set_1,op_set_2,op_set_3,sensor_1,sensor_2,sensor_3,sensor_4,sensor_5,sensor_6,...,sensor_12_max_20,sensor_13_max_20,sensor_14_max_20,sensor_15_max_20,sensor_16_max_20,sensor_17_max_20,sensor_18_max_20,sensor_19_max_20,sensor_20_max_20,sensor_21_max_20
0,1,-0.0007,-0.0004,100.0,518.67,641.82,1589.7,1400.6,14.62,21.61,...,521.66,2388.02,8138.62,8.4195,0.03,392.0,2388.0,100.0,39.06,23.419
1,1,0.0019,-0.0003,100.0,518.67,642.15,1591.82,1403.14,14.62,21.61,...,522.28,2388.07,8138.62,8.4318,0.03,392.0,2388.0,100.0,39.06,23.4236
2,1,-0.0043,0.0003,100.0,518.67,642.35,1587.99,1404.2,14.62,21.61,...,522.42,2388.07,8138.62,8.4318,0.03,392.0,2388.0,100.0,39.06,23.4236
3,1,0.0007,0.0,100.0,518.67,642.35,1582.79,1401.87,14.62,21.61,...,522.86,2388.08,8138.62,8.4318,0.03,392.0,2388.0,100.0,39.06,23.4236
4,1,-0.0019,-0.0002,100.0,518.67,642.37,1582.85,1406.22,14.62,21.61,...,522.86,2388.08,8138.62,8.4318,0.03,393.0,2388.0,100.0,39.06,23.4236


In [6]:
# ----- Separate features and target -----

# 'label' is the binary target column (0=healthy, 1=failure) created in file 2.
TARGET_COL = "label"

# Keep the ordered feature names alongside the matrix. The feature-list export
# cell at the end of this notebook looks for a global `feature_cols`; defining
# it here guarantees the saved list matches the columns (and their order) that
# the models are actually trained on.
feature_cols = [col for col in df.columns if col != TARGET_COL]

# NOTE(review): the preview of `df` shows "Unnamed: 0" (looks like a row index
# written out by an earlier `to_csv`) and "unit" among these columns. They are
# kept so the feature set is unchanged, but they are probably not real sensor
# features -- TODO confirm and drop them upstream if so.
y = df[TARGET_COL].values
X = df[feature_cols].values

print("Feature matrix shape:", X.shape)
# np.bincount requires non-negative integer labels.
print("Class balance (0=healthy, 1=failure):", np.bincount(y))


Feature matrix shape: (20631, 172)
Class balance (0=healthy, 1=failure): [17531  3100]


In [7]:
# ----- Time-series cross validation (RandomForest baseline model) -----

# Hyperparameters of the baseline forest, shared by every fold.
baseline_rf_params = dict(
    n_estimators=300,
    max_depth=None,
    n_jobs=-1,
    class_weight="balanced",
    random_state=42,
)

# Ordered 5-fold splitter: rows are never shuffled, so each validation slice
# comes after its training rows in file order.
# NOTE(review): this treats row order as time order -- confirm that holds for
# the way the processed CSV is sorted.
tscv = TimeSeriesSplit(n_splits=5)
fold_scores = []

for fold, (train_idx, val_idx) in enumerate(tscv.split(X), start=1):
    X_tr, y_tr = X[train_idx], y[train_idx]
    X_val, y_val = X[val_idx], y[val_idx]

    # A fresh forest is built for every fold.
    model = RandomForestClassifier(**baseline_rf_params)
    model.fit(X_tr, y_tr)

    # Score the fold on the failure class only.
    y_val_pred = model.predict(X_val)
    f1 = f1_score(y_val, y_val_pred, pos_label=1)
    fold_scores.append(f1)

    print(f"Fold {fold} F1-score (failure class=1): {f1:.4f}")

mean_cv_f1 = np.mean(fold_scores)
print("\nMean CV F1-score (failure class=1):", mean_cv_f1)


Fold 1 F1-score (failure class=1): 0.8459
Fold 2 F1-score (failure class=1): 0.8844
Fold 3 F1-score (failure class=1): 0.8366
Fold 4 F1-score (failure class=1): 0.8274
Fold 5 F1-score (failure class=1): 0.8323

Mean CV F1-score (failure class=1): 0.845315440921034


In [8]:
# ----- Hold-out test split (respect time order, no shuffling) -----

# The trailing 20% of rows (the latest ones in file order) become the test set.
test_size = 0.2
split_index = int(len(X) * (1 - test_size))

# Positional slices: everything before the cut trains, everything after tests.
train_rows = slice(0, split_index)
test_rows = slice(split_index, None)

X_train, y_train = X[train_rows], y[train_rows]
X_test, y_test = X[test_rows], y[test_rows]

# Sanity output: sizes and class counts of both partitions.
print("Train shape:", X_train.shape, y_train.shape)
print("Test shape :", X_test.shape,  y_test.shape)
print("Train label counts:", np.bincount(y_train))
print("Test label counts :", np.bincount(y_test))


Train shape: (16504, 172) (16504,)
Test shape : (4127, 172) (4127,)
Train label counts: [13993  2511]
Test label counts : [3538  589]


In [9]:
# ----- Train final RandomForest + threshold tuning -----

# Carve an ordered validation tail (last 20%, no shuffling) out of the training
# rows; it is used only to pick the decision threshold.
X_subtrain, X_val, y_subtrain, y_val = train_test_split(
    X_train, y_train, test_size=0.2, shuffle=False
)

final_model = RandomForestClassifier(
    n_estimators=400,
    max_depth=None,
    n_jobs=-1,
    class_weight="balanced",
    random_state=42,
)
final_model.fit(X_subtrain, y_subtrain)

# Column 1 of predict_proba is taken as the failure-class (1) probability.
y_val_proba = final_model.predict_proba(X_val)[:, 1]

print("Searching best decision threshold on validation set...")

# Candidate cut-offs: 0.1, 0.15, ..., 0.9
candidate_thresholds = np.linspace(0.1, 0.9, 17)
val_f1_by_threshold = [
    f1_score(y_val, (y_val_proba >= thr).astype(int), pos_label=1)
    for thr in candidate_thresholds
]

# argmax returns the first maximum, so ties resolve to the lowest threshold.
best_pos = int(np.argmax(val_f1_by_threshold))
best_thr = candidate_thresholds[best_pos]
best_f1 = val_f1_by_threshold[best_pos]

print(f"Best threshold on validation: {best_thr:.2f} with F1={best_f1:.4f}")


Searching best decision threshold on validation set...
Best threshold on validation: 0.60 with F1=0.8451


In [10]:
# ----- Final evaluation on HOLD-OUT test set -----

# Refit the same estimator (same hyperparameters) on all training rows,
# including the slice that was held back for threshold selection.
final_model.fit(X_train, y_train)

# Failure-class probabilities on the test rows, cut at the tuned threshold.
y_test_proba = final_model.predict_proba(X_test)[:, 1]
y_test_pred = (y_test_proba >= best_thr).astype(int)

# Compute every metric first, then print them together.
test_f1 = f1_score(y_test, y_test_pred, pos_label=1)
test_report = classification_report(y_test, y_test_pred, digits=3)
test_confusion = confusion_matrix(y_test, y_test_pred)

print(f"\nTest F1-score (failure class=1, thr={best_thr:.2f}):", test_f1)

print("\nClassification report (test set):")
print(test_report)

print("Confusion matrix (test set):")
print(test_confusion)



Test F1-score (failure class=1, thr=0.60): 0.8435498627630376

Classification report (test set):
              precision    recall  f1-score   support

           0      0.965     0.988     0.976      3538
           1      0.915     0.783     0.844       589

    accuracy                          0.959      4127
   macro avg      0.940     0.885     0.910      4127
weighted avg      0.958     0.959     0.957      4127

Confusion matrix (test set):
[[3495   43]
 [ 128  461]]


In [1]:
import json
import os

# Persist the ordered list of feature names next to the model artifacts so
# that consumers can rebuild the input matrix with the same column order.
MODELS_DIR = os.path.join("..","models")
os.makedirs(MODELS_DIR, exist_ok=True)

# feature_cols should be defined in your notebook (e.g. `feature_cols = X.columns.tolist()`)
# If not, derive it from the header of the processed CSV:
csv_path = os.path.join("..","data","processed","train_features_FD001_no_leak.csv")
if 'feature_cols' not in globals():
    import pandas as pd
    # Only the header is needed, so read zero data rows. A dedicated name is
    # used so a full `df` already loaded in this kernel is not overwritten
    # by this tiny frame.
    header_only = pd.read_csv(csv_path, nrows=0)
    # Filtering on the name covers both cases: "label" present or absent.
    feature_cols = [c for c in header_only.columns if c != "label"]

json_path = os.path.join(MODELS_DIR, "rf_FD001_features.json")
with open(json_path, "w", encoding="utf-8") as f:
    # list(...) keeps this serializable if feature_cols is a pandas Index.
    json.dump(list(feature_cols), f, indent=2, ensure_ascii=False)

print("Wrote feature list ->", json_path, "len:", len(feature_cols))


Wrote feature list -> ..\models\rf_FD001_features.json len: 172
