In [1]:
# XGBoost training pipeline for failure_risk (regression)
# Paste into a notebook cell and run.
import os
import pandas as pd
import numpy as np
from datetime import datetime
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import joblib
import xgboost as xgb
# import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")

RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)


In [16]:
from sklearn.metrics import root_mean_squared_error

In [2]:

# ---------- 1) load data ----------
CSV_PATH = "failure_risk_dataset_500.csv"   # ensure file is in current directory
assert os.path.exists(CSV_PATH), f"CSV not found at {CSV_PATH} — put the file in notebook folder."

df = pd.read_csv(CSV_PATH, parse_dates=["validation_date_of_FC", "last_maintenance_date"])
print("Rows, cols:", df.shape)
df.head()


Rows, cols: (500, 10)


Unnamed: 0,train_id,fitness_certificate_status,validation_date_of_FC,mileage_total,last_maintenance_date,days_since_last_maintenance,days_since_FC_validation,open_jobcard_count,high_priority_jobcard_count,failure_risk
0,TR001,valid,2025-10-19,147710,2025-08-09,113,42,7,2,0.727
1,TR002,expired,2025-10-03,154856,2025-09-28,63,58,8,0,0.873
2,TR003,expired,2025-10-02,134022,2025-08-02,120,59,7,3,1.0
3,TR004,expired,2025-10-12,134720,2025-11-21,9,49,7,2,0.759
4,TR005,expired,2025-10-16,93837,2025-11-01,29,45,0,0,0.54


In [3]:

# ---------- 2) basic feature engineering ----------
# Convert boolean-like / categorical into numeric
# fitness_certificate_status -> binary (valid=0, expired=1)
df["fc_expired_flag"] = (df["fitness_certificate_status"].str.lower() == "expired").astype(int)


In [4]:

# If days_since_* are not present compute them from dates (they are present but safe to compute)
today = pd.to_datetime("2025-11-30")
if "days_since_last_maintenance" not in df.columns or df["days_since_last_maintenance"].isnull().any():
    df["days_since_last_maintenance"] = (today - df["last_maintenance_date"]).dt.days

if "days_since_FC_validation" not in df.columns or df["days_since_FC_validation"].isnull().any():
    df["days_since_FC_validation"] = (today - df["validation_date_of_FC"]).dt.days


In [5]:

# Fill or clamp suspicious values
num_cols = ["mileage_total", "days_since_last_maintenance", "days_since_FC_validation",
            "open_jobcard_count", "high_priority_jobcard_count"]
for c in num_cols:
    if c in df.columns:
        df[c] = pd.to_numeric(df[c], errors="coerce").fillna(df[c].median())
        # clamp to non-negative
        df[c] = df[c].clip(lower=0)


In [23]:
import sys
sys.executable

'c:\\Users\\HP\\AppData\\Local\\Programs\\Python\\Python311\\python.exe'

In [6]:

# target
y = df["failure_risk"].astype(float)

# features to use
feature_cols = [
    "mileage_total",
    "days_since_last_maintenance",
    "days_since_FC_validation",
    "open_jobcard_count",
    "high_priority_jobcard_count",
    "fc_expired_flag"
]
X = df[feature_cols].copy()


In [7]:

# ---------- 3) train/test split ----------
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=RANDOM_SEED
)
print("Train rows:", X_train.shape[0], "Test rows:", X_test.shape[0])


Train rows: 400 Test rows: 100


In [13]:

# ---------- 4) XGBoost model ----------
# Use a modest set of hyperparameters appropriate for small tabular data.
xgb_model = xgb.XGBRegressor(
    objective="reg:squarederror",
    n_estimators=1000,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    early_stopping_rounds=50,
    random_state=RANDOM_SEED,
    tree_method="hist",     # fast for larger datasets; ok here
    eval_metric="rmse",
    n_jobs=-1
)


In [14]:

# Fit with early stopping on a validation set
eval_set = [(X_train, y_train), (X_test, y_test)]
xgb_model.fit(
    X_train, y_train,
    eval_set=eval_set,
    verbose=50,
    
)


[0]	validation_0-rmse:0.21777	validation_1-rmse:0.21599
[50]	validation_0-rmse:0.03742	validation_1-rmse:0.06592
[100]	validation_0-rmse:0.01298	validation_1-rmse:0.04933
[150]	validation_0-rmse:0.00797	validation_1-rmse:0.04694
[200]	validation_0-rmse:0.00591	validation_1-rmse:0.04629
[250]	validation_0-rmse:0.00442	validation_1-rmse:0.04609
[300]	validation_0-rmse:0.00336	validation_1-rmse:0.04600
[350]	validation_0-rmse:0.00257	validation_1-rmse:0.04597
[400]	validation_0-rmse:0.00200	validation_1-rmse:0.04592
[445]	validation_0-rmse:0.00158	validation_1-rmse:0.04594


0,1,2
,objective,'reg:squarederror'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,0.8
,device,
,early_stopping_rounds,50
,enable_categorical,False


In [17]:

# ---------- 5) evaluation ----------
y_pred = xgb_model.predict(X_test)
rmse = root_mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"Test RMSE: {rmse:.4f}")
print(f"Test R^2:   {r2:.4f}")


Test RMSE: 0.0459
Test R^2:   0.9580


In [18]:

# show some prediction samples
results = X_test.copy()
results["y_true"] = y_test.values
results["y_pred"] = np.round(y_pred, 3)
results_sorted = results.assign(diff=np.abs(results["y_true"] - results["y_pred"])).sort_values("diff", ascending=False)
print("\nTop 10 largest prediction errors (true vs pred):")
display(results_sorted.head(10))



Top 10 largest prediction errors (true vs pred):


Unnamed: 0,mileage_total,days_since_last_maintenance,days_since_FC_validation,open_jobcard_count,high_priority_jobcard_count,fc_expired_flag,y_true,y_pred,diff
321,53573,151,41,4,2,0,0.58,0.701,0.121
2,134022,120,59,7,3,1,1.0,0.885,0.115
194,66477,79,57,2,1,0,0.398,0.496,0.098
93,150064,25,2,4,1,0,0.343,0.44,0.097
18,136751,110,17,2,1,0,0.552,0.457,0.095
333,74669,80,58,2,2,1,0.813,0.722,0.091
394,25846,53,34,1,1,0,0.271,0.352,0.081
455,38775,140,2,4,0,0,0.376,0.453,0.077
381,29273,197,90,8,1,1,1.0,0.924,0.076
477,27134,10,78,0,0,1,0.459,0.534,0.075


In [20]:

# ---------- 7) save model and column info ----------
MODEL_PATH = "xgb_failure_risk.json"
xgb_model.save_model(MODEL_PATH)
print("Model saved to", MODEL_PATH)


Model saved to xgb_failure_risk.json


In [21]:

# Save metadata (feature columns) for later inference
joblib.dump({"feature_cols": feature_cols, "model_path": MODEL_PATH}, "model_meta.joblib")
print("Metadata saved to model_meta.joblib")


Metadata saved to model_meta.joblib


In [22]:

# ---------- 8) Optional: convert continuous risk -> classes ----------
# Useful if you want discrete labels (Low/Medium/High)
def risk_to_label(risk):
    if risk < 0.33:
        return "Low"
    elif risk < 0.66:
        return "Medium"
    else:
        return "High"

pred_labels = [risk_to_label(r) for r in y_pred]
true_labels = [risk_to_label(r) for r in y_test]
from sklearn.metrics import classification_report, confusion_matrix
print("\nClassification report (coarse bins):")
print(classification_report(true_labels, pred_labels, digits=3))
print("Confusion matrix:")
print(confusion_matrix(true_labels, pred_labels))



Classification report (coarse bins):
              precision    recall  f1-score   support

        High      0.964     1.000     0.982        54
         Low      1.000     0.333     0.500         3
      Medium      0.953     0.953     0.953        43

    accuracy                          0.960       100
   macro avg      0.973     0.762     0.812       100
weighted avg      0.961     0.960     0.955       100

Confusion matrix:
[[54  0  0]
 [ 0  1  2]
 [ 2  0 41]]
