In [17]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, recall_score, f1_score, accuracy_score, roc_auc_score
import xgboost as xgb
from sklearn.feature_extraction import DictVectorizer
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.pyll import scope
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder

In [2]:
train = pd.read_parquet('data/partition/train.parquet')
train.head()

Unnamed: 0,UDI,Product ID,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Machine failure,TWF,HDF,PWF,OSF,RNF
8094,8095,H37508,H,300.2,311.6,1662,28.8,162,0,0,0,0,0,0
1008,1009,L48188,L,296.2,307.1,1441,41.7,3,0,0,0,0,0,0
8016,8017,M22876,M,300.9,312.0,1447,41.4,193,0,0,0,0,0,0
1964,1965,M16824,M,297.8,307.5,1405,46.9,113,0,0,0,0,0,0
9844,9845,H39258,H,298.3,309.1,1484,37.0,28,0,0,0,0,0,0


In [3]:
train.shape[0]

7000

In [4]:
train.isnull().sum()

UDI                        0
Product ID                 0
Type                       0
Air temperature [K]        0
Process temperature [K]    0
Rotational speed [rpm]     0
Torque [Nm]                0
Tool wear [min]            0
Machine failure            0
TWF                        0
HDF                        0
PWF                        0
OSF                        0
RNF                        0
dtype: int64

In [5]:
train.dtypes

UDI                          int64
Product ID                  object
Type                        object
Air temperature [K]        float64
Process temperature [K]    float64
Rotational speed [rpm]       int64
Torque [Nm]                float64
Tool wear [min]              int64
Machine failure              int64
TWF                          int64
HDF                          int64
PWF                          int64
OSF                          int64
RNF                          int64
dtype: object

In [6]:
train['Machine failure'].value_counts()

Machine failure
0    6755
1     245
Name: count, dtype: int64

In [7]:
train = train.drop(['UDI', 'Product ID'], axis=1)

In [8]:
le = LabelEncoder()
train['Type_encoded'] = le.fit_transform(train['Type'])

In [9]:
train

Unnamed: 0,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Machine failure,TWF,HDF,PWF,OSF,RNF,Type_encoded
8094,H,300.2,311.6,1662,28.8,162,0,0,0,0,0,0,0
1008,L,296.2,307.1,1441,41.7,3,0,0,0,0,0,0,1
8016,M,300.9,312.0,1447,41.4,193,0,0,0,0,0,0,2
1964,M,297.8,307.5,1405,46.9,113,0,0,0,0,0,0,2
9844,H,298.3,309.1,1484,37.0,28,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3915,L,302.5,311.5,1700,29.5,133,0,0,0,0,0,0,1
9620,L,298.9,310.0,1470,45.9,109,0,0,0,0,0,0,1
7068,L,300.8,310.7,1461,42.8,173,0,0,0,0,0,0,1
7391,L,299.7,310.6,1675,31.3,129,0,0,0,0,0,0,1


In [10]:
binary_cols = ['Machine failure', 'TWF', 'HDF', 'PWF', 'OSF', 'RNF']
for col in binary_cols:
    train[col] = train[col].astype('category')

In [11]:
train = train.drop(['Type'], axis=1)

In [None]:
experiment_name = "predictive-maintenance-prediction"
bucket = "s3://predictive-maintenance-artifacts-victor-obi/mlflow"
mlflow_tracking_uri = "sql:///mlflow.db"
mlflow.set_tracking_uri(mlflow_tracking_uri)
experiment = mlflow.get_experiment_by_name(experiment_name)
if not experiment:
    mlflow.create_experiment(name=experiment_name, artifact_location=bucket)
mlflow.set_experiment(experiment_name)

In [18]:
def prepare_data(path):
    if path.endswith('.csv'):
        data = pd.read_csv(path)
    elif path.endswith('.parquet'):
        data = pd.read_parquet(path)
    
    mapping = {'L': 0, 'M': 1, 'H': 2}
    data['Type_encoded'] = data['Type'].map(mapping).astype(int)
        
    data = data.drop(['Type', 'UID', 'Product ID'], axis=1, errors='ignore')
    
    y = data['Machine failure']
    X = data.drop(['Machine failure'], axis=1)

    return X, y

In [25]:
def train_lr(X_train, y_train, X_test, y_test):
    def objective_lr(params):
        mlflow.sklearn.autolog(exclusive=False, log_models=False)
        
        with mlflow.start_run(nested=True):
            mlflow.log_params(params)
            
            lr = LogisticRegression(
                C=params['C'],
                solver=params['solver'],
                class_weight='balanced', # Critical
                max_iter=1000,
                random_state=42
            )
            
            lr.fit(X_train, y_train)
            
            y_pred = lr.predict(X_test)
            y_prob = lr.predict_proba(X_test)[:, 1]
            
            recall = recall_score(y_test, y_pred)
            roc_auc = roc_auc_score(y_test, y_pred)
            f1 = f1_score(y_test, y_pred)
            mlflow.log_metric('recall', recall)
            mlflow.log_metric('roc_auc', roc_auc)
            mlflow.log_metric('f1_score', f1)
            return {'loss': -roc_auc, 'status': STATUS_OK}
    
    search_space_lr = {
        'C': hp.loguniform('C', -4, 2), 
        'solver': hp.choice('solver', ['liblinear', 'lbfgs'])
    }
    
    best_result_lr = fmin(
        fn=objective_lr,
        space=search_space_lr,
        algo=tpe.suggest,
        max_evals=10,
        trials=Trials()
    )
    
    final_params_lr = best_result_lr.copy()
    solver_list = ['liblinear', 'lbfgs']
    final_params_lr['solver'] = solver_list[final_params_lr['solver']]
    
    with mlflow.start_run() as run:
        mlflow.log_params(final_params_lr)
        
        lr_final = LogisticRegression(**final_params_lr, class_weight='balanced', max_iter=1000)
        lr_final.fit(X_train, y_train)
        y_prob = lr_final.predict(X_test)      
        y_pred = (y_prob > 0.5).astype(int)  
            
        recall = recall_score(y_test, y_pred)
        
        mlflow.sklearn.log_model(
            sk_model=lr_final,
            artifact_path="model",
            registered_model_name="predictive-maintenance-logreg"
        )
        
        print("Logistic Regression Champion Saved.")
        # ARTIFACT 1: Confusion Matrix
        cm = confusion_matrix(y_test, y_pred)
        plt.figure(figsize=(6, 4))
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', 
                    xticklabels=['Normal', 'Fail'], 
                    yticklabels=['Normal', 'Fail'])
        plt.xlabel('Predicted')
        plt.ylabel('Actual')
        plt.title('Confusion Matrix')
        
        # Save locally, then upload
        plt.savefig("confusion_matrix.png")
        mlflow.log_artifact("confusion_matrix.png")
        plt.close()

        # ARTIFACT 2: ROC Curve 
        fpr, tpr, _ = roc_curve(y_test, y_prob)
        roc_auc_val = auc(fpr, tpr)
        
        plt.figure(figsize=(6, 4))
        plt.plot(fpr, tpr, label=f'AUC = {roc_auc_val:.2f}')
        plt.plot([0, 1], [0, 1], 'k--') # Random guess line
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title('ROC Curve')
        plt.legend(loc='lower right')
        
        plt.savefig("roc_curve.png")
        mlflow.log_artifact("roc_curve.png")
        plt.close()

        # ARTIFACT 3: Feature Importance 
        importances = np.abs(model.coef_[0])
        feature_names = X_train.columns
        indices = np.argsort(importances)[::-1]
         
        plt.figure(figsize=(10, 6))
        sns.barplot(x=importances[indices], y=feature_names[indices])
        plt.title('Feature Importance (Coefficients)')
        plt.savefig("feature_importance.png")
        mlflow.log_artifact("feature_importance.png")
        plt.close()
        return run.info.run_id, recall

In [26]:
def train_xgb(X_train, y_train, X_test, y_test):
    def objective_xgb(params):
        params['objective'] = 'binary:logistic'
        params['scale_pos_weight'] = 27.5 
        
        dtrain = xgb.DMatrix(X_train, label=y_train)
        dtest = xgb.DMatrix(X_test, label=y_test)
        
        mlflow.xgboost.autolog(exclusive=False, log_models=False, disable=True)
        
        with mlflow.start_run(nested=True):
            mlflow.log_params(params)
            
            booster = xgb.train(
                params=params,
                dtrain=dtrain,
                num_boost_round=100,
                evals=[(dtest, 'validation')],
                early_stopping_rounds=10,
                verbose_eval=False
            )
            
            y_prob = booster.predict(dtest)      
            y_pred = (y_prob > 0.5).astype(int)  
            
            recall = recall_score(y_test, y_pred)
            f1 = f1_score(y_test, y_pred)
            roc_auc = roc_auc_score(y_test, y_prob)
            
            mlflow.log_metric('recall', recall)
            mlflow.log_metric('f1_score', f1) 
            mlflow.log_metric('roc_auc', roc_auc)
        
            return {'loss': -roc_auc, 'status': STATUS_OK}
    
    search_space_xgb = {
        'max_depth': scope.int(hp.quniform('max_depth', 4, 20, 1)),
        'learning_rate': hp.loguniform('learning_rate', -3, 0),
        'reg_alpha': hp.loguniform('reg_alpha', -5, -1),
        'reg_lambda': hp.loguniform('reg_lambda', -6, -1),
        'min_child_weight': hp.loguniform('min_child_weight', -1, 3),
        'seed': 42
    }
    
    best_result_xgb = fmin(
        fn=objective_xgb,
        space=search_space_xgb,
        algo=tpe.suggest,
        max_evals=10,
        trials=Trials()
    )
    
    final_params = best_result_xgb.copy()
    final_params['max_depth'] = int(final_params['max_depth'])
    final_params['objective'] = 'binary:logistic'
    final_params['scale_pos_weight'] = 27.5
    
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dtest = xgb.DMatrix(X_test, label=y_test)
    
    with mlflow.start_run() as run:
        mlflow.log_params(final_params)
        
        xg_model = xgb.train(
            params=final_params, 
            dtrain=dtrain, 
            evals=[(dtest, 'validation')],
            num_boost_round=100,
            early_stopping_rounds=10
        )
        y_prob = xg_model.predict(dtest)      
        y_pred = (y_prob > 0.5).astype(int)  
            
        recall = recall_score(y_test, y_pred)
        
        mlflow.xgboost.log_model(
            xgb_model=xg_model, 
            artifact_path="model", # Correct argument name
            registered_model_name="predictive-maintenance-model"
        )
        # ARTIFACT 1: Confusion Matrix
        cm = confusion_matrix(y_test, y_pred)
        plt.figure(figsize=(6, 4))
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', 
                    xticklabels=['Normal', 'Fail'], 
                    yticklabels=['Normal', 'Fail'])
        plt.xlabel('Predicted')
        plt.ylabel('Actual')
        plt.title('Confusion Matrix')
        
        plt.savefig("confusion_matrix.png")
        mlflow.log_artifact("confusion_matrix.png")
        plt.close()

        # ARTIFACT 2: ROC Curve
        fpr, tpr, _ = roc_curve(y_test, y_prob)
        roc_auc_val = auc(fpr, tpr)
        
        plt.figure(figsize=(6, 4))
        plt.plot(fpr, tpr, label=f'AUC = {roc_auc_val:.2f}')
        plt.plot([0, 1], [0, 1], 'k--') # Random guess line
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title('ROC Curve')
        plt.legend(loc='lower right')
        
        plt.savefig("roc_curve.png")
        mlflow.log_artifact("roc_curve.png")
        plt.close()

        # ARTIFACT 3: Feature Importance 
        importance_dict = xg_model.get_score(importance_type='gain')
        
        if importance_dict:
            # Convert to lists for plotting
            features = list(importance_dict.keys())
            scores = list(importance_dict.values())
            
            # Sort by score (Descending)
            indices = np.argsort(scores)[::-1]
            sorted_features = [features[i] for i in indices]
            sorted_scores = [scores[i] for i in indices]
            
            plt.figure(figsize=(10, 6))
            sns.barplot(x=sorted_scores, y=sorted_features)
            plt.title('XGBoost Feature Importance (Gain)')
            plt.xlabel('Importance Score')
            
            plt.savefig("feature_importance.png")
            mlflow.log_artifact("feature_importance.png")
            plt.close()
        return run.info.run_id, recall


In [27]:
def train_rf(X_train, y_train, X_test, y_test):
    def objective_rf(params):
        mlflow.sklearn.autolog(exclusive=False, log_models=False, disable=True)
        
        with mlflow.start_run(nested=True):
            mlflow.log_params(params)
            
            rf = RandomForestClassifier(
                n_estimators=int(params['n_estimators']),
                max_depth=int(params['max_depth']),
                min_samples_split=int(params['min_samples_split']),
                min_samples_leaf=int(params['min_samples_leaf']),
                criterion=params['criterion'],
                class_weight='balanced',
                random_state=42,
                n_jobs=-1
            )
            
            rf.fit(X_train, y_train)
            
            y_pred = rf.predict(X_test)
            y_prob = rf.predict_proba(X_test)[:, 1] 
            recall = recall_score(y_test, y_pred)
            f1 = f1_score(y_test, y_pred)
            roc_auc = roc_auc_score(y_test, y_prob)
            
            mlflow.log_metric('recall', recall)
            mlflow.log_metric('f1_score', f1)
            mlflow.log_metric('roc_auc', roc_auc)            
            return {'loss': -roc_auc, 'status': STATUS_OK}

    search_space_rf = {
        'n_estimators': scope.int(hp.quniform('n_estimators', 50, 500, 10)),
        'max_depth': scope.int(hp.quniform('max_depth', 5, 50, 1)),
        'min_samples_split': scope.int(hp.quniform('min_samples_split', 2, 20, 1)),
        'min_samples_leaf': scope.int(hp.quniform('min_samples_leaf', 1, 10, 1)),
        'criterion': hp.choice('criterion', ['gini', 'entropy'])
    }

    best_result = fmin(
        fn=objective_rf,
        space=search_space_rf,
        algo=tpe.suggest,
        max_evals=10,
        trials=Trials()
    )

    criteria_list = ['gini', 'entropy']
    final_params = {
        'n_estimators': int(best_result['n_estimators']),
        'max_depth': int(best_result['max_depth']),
        'min_samples_split': int(best_result['min_samples_split']),
        'min_samples_leaf': int(best_result['min_samples_leaf']),
        'criterion': criteria_list[best_result['criterion']]
    }

    print(f"Training Final RF with params: {final_params}")

    with mlflow.start_run() as run:
        mlflow.log_params(final_params)
        
        rf_final = RandomForestClassifier(**final_params, class_weight='balanced', random_state=42)
        rf_final.fit(X_train, y_train)
        y_prob = rf_final.predict_proba(X_test)[:, 1]
            
        y_pred = rf_final.predict(X_test) 
        recall = recall_score(y_test, y_pred)
        roc_auc = roc_auc_score(y_test, y_prob)
        
        mlflow.log_metric("recall", recall)
        mlflow.log_metric('roc_auc', roc_auc)
        mlflow.sklearn.log_model(
            sk_model=rf_final,
            artifact_path="model",
            registered_model_name="predictive-maintenance-rf"
        )
        
        print(f"Random Forest Saved. Run ID: {run.info.run_id}")
        # ARTIFACT 1: 
        cm = confusion_matrix(y_test, y_pred)
        plt.figure(figsize=(6, 4))
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', 
                    xticklabels=['Normal', 'Fail'], 
                    yticklabels=['Normal', 'Fail'])
        plt.xlabel('Predicted')
        plt.ylabel('Actual')
        plt.title('Confusion Matrix')
        
        # Save locally, then upload
        plt.savefig("confusion_matrix.png")
        mlflow.log_artifact("confusion_matrix.png")
        plt.close()

        # ARTIFACT 2: ROC Curve
        fpr, tpr, _ = roc_curve(y_test, y_prob)
        roc_auc_val = auc(fpr, tpr)
        
        plt.figure(figsize=(6, 4))
        plt.plot(fpr, tpr, label=f'AUC = {roc_auc_val:.2f}')
        plt.plot([0, 1], [0, 1], 'k--') # Random guess line
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title('ROC Curve')
        plt.legend(loc='lower right')
        
        plt.savefig("roc_curve.png")
        mlflow.log_artifact("roc_curve.png")
        plt.close()

        # ARTIFACT 3: Feature Importance 
        
        if hasattr(model, 'feature_importances_'): 
            # Get importance
            importances = rf_final.feature_importances_
            # Get feature names (X_train must be a DataFrame)
            feature_names = X_train.columns
            
            # Sort them
            indices = np.argsort(importances)[::-1]
            
            plt.figure(figsize=(10, 6))
            sns.barplot(x=importances[indices], y=feature_names[indices])
            plt.title('Feature Importance')
            plt.xlabel('Importance Score')
            
            plt.savefig("feature_importance.png")
            mlflow.log_artifact("feature_importance.png")
            plt.close()
        return run.info.run_id, recall

In [None]:
from mlflow.tracking import MlFlowClient

client = MlFlowClient(tracking_uri=mlflow_tracking_uri)

client.list_experiments()

from mlflow.entities import ViewType
runs = client.search_runs(
    experiment_ids=1,
    filter_string="",
    run_view_type=ViewType.ACTIVE_ONLY,
    max_results=5,
    order_by=["metrics.recall DESC"]
)

for run in runs:
    print(f"run_id: {run.info.run_id}, recall:{run.data.metrics['recall']:.4f}")

In [None]:
def start():
mlflow.set_tracking_uri(mlflow_tracking_uri)
model_uri = f"runs:/{run_id}/model"
mlflow.register_model(model_uri=model_uri, name=experiment_name)

In [None]:
list_models_in_experiment = client.list_registered_models()
# get latest models
latest_models = client.get_latest_versions(name=experiment_name)

In [None]:
# moving models btw stages in the registry
model_verion=4
client.transition_model_version_stage(name=experiment_name, version=model_version, stage="Staging", archive_existing_version=False)
client.update_model_version(name=experiment_name, version=4)
