### Model development

In [76]:
import pandas as pd
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import OrdinalEncoder, MinMaxScaler, RobustScaler

from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier, BaggingClassifier, RandomForestClassifier
from imblearn.ensemble import BalancedBaggingClassifier, BalancedRandomForestClassifier, EasyEnsembleClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_validate, StratifiedShuffleSplit
from sklearn.metrics import confusion_matrix


filepath = "C:/Users/WALDMJN/OneDrive - Schaeffler/Uni/Data Exploration Project/Pred Maintenance Project/Predictive-Maintenance/Data/predictive_maintenance.csv"
df = pd.read_csv(filepath)
df = df.drop(["UDI", "Product ID"], axis = 1)
df.head()


Unnamed: 0,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Target,Failure Type
0,M,298.1,308.6,1551,42.8,0,0,No Failure
1,L,298.2,308.7,1408,46.3,3,0,No Failure
2,L,298.1,308.5,1498,49.4,5,0,No Failure
3,L,298.2,308.6,1433,39.5,7,0,No Failure
4,L,298.2,308.7,1408,40.0,9,0,No Failure


Let's drop out the target anomalies from notebook before.

In [77]:
fail_df = df[df['Target'] == 1]
indexPossibleFailure = fail_df[fail_df['Failure Type'] == 'No Failure'].index
df.drop(indexPossibleFailure, axis=0, inplace=True)
df.shape[0]

9991

In [78]:
fail_df  = df[df['Target'] == 0]
indexPossibleFailure = fail_df[fail_df['Failure Type'] == 'Random Failures'].index
df.drop(indexPossibleFailure, axis=0, inplace=True)
df.shape[0]

9973

In [79]:
# df['Power [W]'] = df['Torque [Nm]'] * (2 * np.pi * df['Rotational speed [rpm]'] / 60.0)
# df['Overstrain [minNm]'] = df['Torque [Nm]'] * df['Tool wear [min]']
# df['Heat dissipation [rpminK]'] = abs(df['Air temperature [K]'] - df['Process temperature [K]']) * df['Rotational speed [rpm]']

# df.head(5)

In [80]:
encoder = OrdinalEncoder()
df[['Type', 'Failure Type']] = encoder.fit_transform(df[['Type', 'Failure Type']])

The RobustScaler on Rotational Speed and Torque is necessary because of strong outliers.

In [81]:
df_scaled = df.copy()

columns = ['Rotational speed [rpm]', 'Torque [Nm]']
scaler = RobustScaler()
features_scaled = scaler.fit_transform(df[columns])
features_scaled = pd.DataFrame(features_scaled, columns=columns)
df_scaled.drop(columns, axis=1, inplace=True)
df_scaled = pd.concat([df,features_scaled], axis=1)

df_scaled.head(5)

Unnamed: 0,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Target,Failure Type,Rotational speed [rpm].1,Torque [Nm].1
0,2.0,298.1,308.6,1551.0,42.8,0.0,0.0,1.0,0.253968,0.2
1,1.0,298.2,308.7,1408.0,46.3,3.0,0.0,1.0,-0.502646,0.459259
2,1.0,298.1,308.5,1498.0,49.4,5.0,0.0,1.0,-0.026455,0.688889
3,1.0,298.2,308.6,1433.0,39.5,7.0,0.0,1.0,-0.37037,-0.044444
4,1.0,298.2,308.7,1408.0,40.0,9.0,0.0,1.0,-0.502646,-0.007407


Air temperature, Process temperature and tool wear get scaled over MinMaxScaler.

In [82]:
columns = ['Air temperature [K]', 'Process temperature [K]', 'Tool wear [min]']
scaler = MinMaxScaler()
features_scaled = scaler.fit_transform(df[columns])
features_scaled = pd.DataFrame(features_scaled, columns=columns)
df_scaled.drop(columns, axis=1, inplace=True)
df_scaled = pd.concat([df_scaled, features_scaled], axis=1)

df.head()

Unnamed: 0,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Target,Failure Type
0,2.0,298.1,308.6,1551,42.8,0,0,1.0
1,1.0,298.2,308.7,1408,46.3,3,0,1.0
2,1.0,298.1,308.5,1498,49.4,5,0,1.0
3,1.0,298.2,308.6,1433,39.5,7,0,1.0
4,1.0,298.2,308.7,1408,40.0,9,0,1.0


It is important that the values for training data and test data are well divided, as there is a small number of errors, especially in the existing data set. 

In [83]:
X = df.drop(['Target', 'Failure Type'], axis=1)
y = df['Target']

# Reset the indices
X.reset_index(drop=True, inplace=True)
y.reset_index(drop=True, inplace=True)

cv = StratifiedShuffleSplit(n_splits=5, test_size=0.25, random_state=42)

for train_index, test_index in cv.split(X, y):
    X_train, X_test = X.loc[train_index], X.loc[test_index]
    y_train, y_test = y.loc[train_index], y.loc[test_index]

print('Checking the stratified split...')
print('Target proportion in original dataset:')
print(df['Target'].value_counts(normalize=True))

print('Target proportion in y_train dataset:')
print(y_train.value_counts(normalize=True))

print('Target proportion in y_test dataset:')
print(y_test.value_counts(normalize=True))


Checking the stratified split...
Target proportion in original dataset:
Target
0    0.966911
1    0.033089
Name: proportion, dtype: float64
Target proportion in y_train dataset:
Target
0    0.966974
1    0.033026
Name: proportion, dtype: float64
Target proportion in y_test dataset:
Target
0    0.96672
1    0.03328
Name: proportion, dtype: float64


Y_Train and Y_Test have an equally good distribution and a small difference in the target values.

### Model Training and Testing

Specifically, we aim to classify whether a machine is functioning correctly or if it is experiencing a fault. This initial step of binary classification - distinguishing between "faulty" and "operational" states - serves several crucial purposes:

- Simplicity and Clarity
- Early Fault Detection
- Resource Allocation

Given our binary classification problem, we want to test a variety of machine learning models to determine which one performs best on our dataset. The models we plan to test include:

- Logistic Regression: A simple yet effective linear model for binary classification.
- Decision Tree Classifier: Easy to interpret and visualize, capturing non-linear relationships.
- Random Forest Classifier: An ensemble method that builds multiple decision trees to improve accuracy and reduce overfitting.
- Balanced Random Forest Classifier:
- Gradient Boosting Classifier: Sequentially builds trees, each one correcting the errors of the previous one. We will also test variants like XGBoost, LightGBM, and CatBoost.
- Bagging Classifier:
- Balanced Bagging Classifier:
- Easy Ensemble Classifier:
- Support Vector Machine:


In [84]:
# Define models
models = {
    'Logistic Regression': LogisticRegression(random_state=42),
    'Random Forest': RandomForestClassifier(random_state=42, n_jobs=-1),
    'Bagging Classifier': BaggingClassifier(random_state=42),
    'Balanced Random Forest': BalancedRandomForestClassifier(random_state=42, n_jobs=-1),
    'Balanced Bagging': BalancedBaggingClassifier(random_state=42, n_jobs=-1),
    'Easy Ensemble': EasyEnsembleClassifier(random_state=42),
    'SVC': SVC(random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(random_state=42),
    'Decision Tree Classifier': DecisionTreeClassifier(random_state=42)
}

results_df = pd.DataFrame()

for name, model in models.items():
    print(f'Fitting {name}')
    
    # Cross validation metrics test data
    scoring = ["f1_macro", "precision_macro", "recall_macro", "roc_auc"]
    cross_val_scores = cross_validate(model, X, y, scoring=scoring, cv=cv, n_jobs=-1)
    
    # Calculate metrics
    f1_test_cv = round(cross_val_scores["test_f1_macro"].mean(), 4)
    precision_test_cv = round(cross_val_scores["test_precision_macro"].mean(), 4)
    recall_test_cv = round(cross_val_scores["test_recall_macro"].mean(), 4)
    roc_auc_test_cv = round(cross_val_scores["test_roc_auc"].mean(), 4)
    
    # Summary table
    score_df = pd.DataFrame({
                     'f1': f1_test_cv,
                     'precision': precision_test_cv,
                     'recall': recall_test_cv,
                     'roc_auc': roc_auc_test_cv},
                     index=[name])

    results_df = pd.concat([results_df, score_df])

results_df = results_df.sort_values(by='f1', ascending=False)
print(results_df.round(4))

# Get the names of the top 3 classifiers
top_3_classifiers = results_df.head(3).index

# Train and display confusion matrices for the top 3 classifiers
for name in top_3_classifiers:
    model = models[name]
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    print(f'Confusion Matrix for {name}:')
    print(confusion_matrix(y_test, y_pred))
    print()


Fitting Logistic Regression
Fitting Random Forest
Fitting Bagging Classifier
Fitting Balanced Random Forest
Fitting Balanced Bagging
Fitting Easy Ensemble
Fitting SVC
Fitting Gradient Boosting
Fitting Decision Tree Classifier
                              f1  precision  recall  roc_auc
Bagging Classifier        0.8626     0.9214  0.8198   0.9373
Random Forest             0.8449     0.9349  0.7879   0.9775
Decision Tree Classifier  0.8437     0.8459  0.8431   0.8431
Gradient Boosting         0.8405     0.9089  0.7931   0.9779
Balanced Bagging          0.7023     0.6457  0.9189   0.9728
Easy Ensemble             0.6590     0.6159  0.9138   0.9638
Balanced Random Forest    0.6558     0.6142  0.9252   0.9779
Logistic Regression       0.6479     0.8284  0.5996   0.9152
SVC                       0.5034     0.7836  0.5060   0.9190
Confusion Matrix for Bagging Classifier:
[[2404    7]
 [  30   53]]

Confusion Matrix for Random Forest:
[[2405    6]
 [  30   53]]

Confusion Matrix for Decision T

The top three classifiers based on F1 score are:

1. Bagging Classifier: F1 score of 0.8626 with a confusion matrix showing high precision and recall.
2. Random Forest: F1 score of 0.8449, slightly lower than Bagging, but with excellent precision and roc_auc.
3. Decision Tree Classifier: F1 score of 0.8437, closely following Random Forest, with balanced precision and recall.
4. Gradient Boosting: F1 score of 0.8405, slightly lower than Decision Tree, but with high precision and roc_auc.

These models demonstrate strong performance, particularly in precision and roc_auc, indicating effective classification capabilities.