In [1]:
import pandas as pd

df = pd.read_csv("train_features.csv")

target = "failure_24h"
X = df.drop(columns=[target, "RUL"])
y = df[target]

from sklearn.model_selection import train_test_split

units = df["unit"].unique()

train_units, val_units = train_test_split(
    units,
    test_size=0.2,
    random_state=42
)

train_idx = df["unit"].isin(train_units)
val_idx   = df["unit"].isin(val_units)

X_train, X_val = X.loc[train_idx], X.loc[val_idx]
y_train, y_val = y.loc[train_idx], y.loc[val_idx]


In [2]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier

rf_pipe = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("model", RandomForestClassifier(
        n_estimators=200,
        max_depth=12,
        class_weight="balanced",
        random_state=42,
        n_jobs=-1
    ))
])

rf_pipe.fit(X_train, y_train)

 'sensor_4_roll_std_1' 'sensor_5_roll_std_1' 'sensor_6_roll_std_1'
 'sensor_7_roll_std_1' 'sensor_8_roll_std_1' 'sensor_9_roll_std_1'
 'sensor_10_roll_std_1' 'sensor_11_roll_std_1' 'sensor_12_roll_std_1'
 'sensor_13_roll_std_1' 'sensor_14_roll_std_1' 'sensor_15_roll_std_1'
 'sensor_16_roll_std_1' 'sensor_17_roll_std_1' 'sensor_18_roll_std_1'
 'sensor_19_roll_std_1' 'sensor_20_roll_std_1' 'sensor_21_roll_std_1']. At least one non-missing value is needed for imputation with strategy='median'.


In [3]:
from sklearn.metrics import average_precision_score, precision_score, recall_score

y_probs = rf_pipe.predict_proba(X_val)[:, 1]

prauc = average_precision_score(y_val, y_probs)

y_pred = (y_probs >= 0.5).astype(int)

precision = precision_score(y_val, y_pred)
recall = recall_score(y_val, y_pred)

prauc, precision, recall

 'sensor_4_roll_std_1' 'sensor_5_roll_std_1' 'sensor_6_roll_std_1'
 'sensor_7_roll_std_1' 'sensor_8_roll_std_1' 'sensor_9_roll_std_1'
 'sensor_10_roll_std_1' 'sensor_11_roll_std_1' 'sensor_12_roll_std_1'
 'sensor_13_roll_std_1' 'sensor_14_roll_std_1' 'sensor_15_roll_std_1'
 'sensor_16_roll_std_1' 'sensor_17_roll_std_1' 'sensor_18_roll_std_1'
 'sensor_19_roll_std_1' 'sensor_20_roll_std_1' 'sensor_21_roll_std_1']. At least one non-missing value is needed for imputation with strategy='median'.


(np.float64(0.9758468161399366), 1.0, 0.6666666666666666)

In [4]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    "model__n_estimators": [200, 400],
    "model__max_depth": [8, 12, 16],
    "model__min_samples_leaf": [1, 5],
}

grid = GridSearchCV(
    rf_pipe,
    param_grid,
    scoring="average_precision",
    cv=3,
    n_jobs=-1,
    verbose=2
)

grid.fit(X_train, y_train)

Fitting 3 folds for each of 12 candidates, totalling 36 fits


 'sensor_4_roll_std_1' 'sensor_5_roll_std_1' 'sensor_6_roll_std_1'
 'sensor_7_roll_std_1' 'sensor_8_roll_std_1' 'sensor_9_roll_std_1'
 'sensor_10_roll_std_1' 'sensor_11_roll_std_1' 'sensor_12_roll_std_1'
 'sensor_13_roll_std_1' 'sensor_14_roll_std_1' 'sensor_15_roll_std_1'
 'sensor_16_roll_std_1' 'sensor_17_roll_std_1' 'sensor_18_roll_std_1'
 'sensor_19_roll_std_1' 'sensor_20_roll_std_1' 'sensor_21_roll_std_1']. At least one non-missing value is needed for imputation with strategy='median'.


In [5]:
best_rf = grid.best_estimator_

y_probs = best_rf.predict_proba(X_val)[:, 1]

prauc = average_precision_score(y_val, y_probs)

y_pred = (y_probs >= 0.5).astype(int)

precision = precision_score(y_val, y_pred)
recall = recall_score(y_val, y_pred)

prauc, precision, recall

 'sensor_4_roll_std_1' 'sensor_5_roll_std_1' 'sensor_6_roll_std_1'
 'sensor_7_roll_std_1' 'sensor_8_roll_std_1' 'sensor_9_roll_std_1'
 'sensor_10_roll_std_1' 'sensor_11_roll_std_1' 'sensor_12_roll_std_1'
 'sensor_13_roll_std_1' 'sensor_14_roll_std_1' 'sensor_15_roll_std_1'
 'sensor_16_roll_std_1' 'sensor_17_roll_std_1' 'sensor_18_roll_std_1'
 'sensor_19_roll_std_1' 'sensor_20_roll_std_1' 'sensor_21_roll_std_1']. At least one non-missing value is needed for imputation with strategy='median'.


(np.float64(0.9806812043350756), 1.0, 0.7333333333333333)

In [6]:
import json

metrics = {
    "model": "random_forest_tuned",
    "prauc": float(prauc),
    "precision": float(precision),
    "recall": float(recall),
    "best_params": grid.best_params_
}

with open("tree_model_metrics.json", "w") as f:
    json.dump(metrics, f, indent=4)
