### Model development

In [None]:
import pandas as pd
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import OrdinalEncoder, MinMaxScaler, RobustScaler

from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from imblearn.ensemble import BalancedBaggingClassifier, BalancedRandomForestClassifier, EasyEnsembleClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_validate, StratifiedShuffleSplit
from sklearn.metrics import confusion_matrix
from imblearn.under_sampling import TomekLinks, NearMiss, ClusterCentroids
from imblearn.over_sampling import SMOTE, RandomOverSampler, BorderlineSMOTE
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import f1_score, roc_auc_score
from sklearn.neighbors import NearestNeighbors
import pandas as pd
import numpy as np


filepath = "C:/Users/WALDMJN/OneDrive - Schaeffler/Uni/Data Exploration Project/Pred Maintenance Project/Predictive-Maintenance/Data/predictive_maintenance.csv"
df = pd.read_csv(filepath)
df = df.drop(["UDI", "Product ID"], axis = 1)
df.head()


Let's drop out the target anomalies from notebook before.

In [None]:
fail_df = df[df['Target'] == 1]
indexPossibleFailure = fail_df[fail_df['Failure Type'] == 'No Failure'].index
df.drop(indexPossibleFailure, axis=0, inplace=True)
df.shape[0]

In [None]:
fail_df  = df[df['Target'] == 0]
indexPossibleFailure = fail_df[fail_df['Failure Type'] == 'Random Failures'].index
df.drop(indexPossibleFailure, axis=0, inplace=True)
df.shape[0]

In [None]:
df['Power [W]'] = df['Torque [Nm]'] * (2 * np.pi * df['Rotational speed [rpm]'] / 60.0)
df['Overstrain [minNm]'] = df['Torque [Nm]'] * df['Tool wear [min]']
df['Heat dissipation [rpminK]'] = abs(df['Air temperature [K]'] - df['Process temperature [K]']) * df['Rotational speed [rpm]']

df.head(5)

In [None]:
encoder = OrdinalEncoder()
df[['Type', 'Failure Type']] = encoder.fit_transform(df[['Type', 'Failure Type']])

In [11]:
print(df.columns)

Index(['Type', 'Air temperature [K]', 'Process temperature [K]',
       'Rotational speed [rpm]', 'Torque [Nm]', 'Tool wear [min]', 'Target',
       'Failure Type', 'Power [W]', 'Overstrain [minNm]',
       'Heat dissipation [rpminK]'],
      dtype='object')


The RobustScaler on Rotational Speed and Torque is necessary because of strong outliers.

In [13]:
df_scaled = df.copy()

columns = ['Rotational speed [rpm]', 'Torque [Nm]', 'Power [W]', 'Overstrain [minNm]', 'Heat dissipation [rpminK]']
scaler = RobustScaler()
features_scaled = scaler.fit_transform(df[columns])
features_scaled = pd.DataFrame(features_scaled, columns=columns)
df_scaled.drop(columns, axis=1, inplace=True)
df_scaled = pd.concat([df,features_scaled], axis=1)

df_scaled.head(5)

Unnamed: 0,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Target,Failure Type,Power [W],Overstrain [minNm],Heat dissipation [rpminK],Rotational speed [rpm].1,Torque [Nm].1,Power [W].1,Overstrain [minNm].1,Heat dissipation [rpminK].1
0,2.0,298.1,308.6,1551.0,42.8,0.0,0.0,1.0,6951.59056,0.0,16285.5,0.253968,0.2,0.474003,-0.92907,0.381501
1,1.0,298.2,308.7,1408.0,46.3,3.0,0.0,1.0,6826.722724,138.9,14784.0,-0.502646,0.459259,0.387271,-0.896863,-0.122967
2,1.0,298.1,308.5,1498.0,49.4,5.0,0.0,1.0,7749.387543,247.0,15579.2,-0.026455,0.688889,1.02815,-0.871797,0.144201
3,1.0,298.2,308.6,1433.0,39.5,7.0,0.0,1.0,5927.504659,276.5,14903.2,-0.37037,-0.044444,-0.237322,-0.864957,-0.082919
4,1.0,298.2,308.7,1408.0,40.0,9.0,0.0,1.0,5897.816608,360.0,14784.0,-0.502646,-0.007407,-0.257943,-0.845596,-0.122967


Air temperature, Process temperature and tool wear get scaled over MinMaxScaler.

In [None]:
columns = ['Air temperature [K]', 'Process temperature [K]', 'Tool wear [min]']
scaler = MinMaxScaler()
features_scaled = scaler.fit_transform(df[columns])
features_scaled = pd.DataFrame(features_scaled, columns=columns)
df_scaled.drop(columns, axis=1, inplace=True)
df_scaled = pd.concat([df_scaled, features_scaled], axis=1)

df.head()

It is important that the values for training data and test data are well divided, as there is a small number of errors, especially in the existing data set. 

In [None]:
X = df.drop(['Target', 'Failure Type'], axis=1)
y = df['Target']

# Reset the indices
X.reset_index(drop=True, inplace=True)
y.reset_index(drop=True, inplace=True)

cv = StratifiedShuffleSplit(n_splits=5, test_size=0.25, random_state=42)

for train_index, test_index in cv.split(X, y):
    X_train, X_test = X.loc[train_index], X.loc[test_index]
    y_train, y_test = y.loc[train_index], y.loc[test_index]

print('Checking the stratified split...')
print('Target proportion in original dataset:')
print(df['Target'].value_counts(normalize=True))

print('Target proportion in y_train dataset:')
print(y_train.value_counts(normalize=True))

print('Target proportion in y_test dataset:')
print(y_test.value_counts(normalize=True))


Y_Train and Y_Test have an equally good distribution and a small difference in the target values.

### Model Training and Testing

Specifically, we aim to classify whether a machine is functioning correctly or if it is experiencing a fault. This initial step of binary classification - distinguishing between "faulty" and "operational" states - serves several crucial purposes:

- Simplicity and Clarity
- Early Fault Detection
- Resource Allocation

Given our binary classification problem, we want to test a variety of machine learning models to determine which one performs best on our dataset. The models we plan to test include:

- Logistic Regression: A simple yet effective linear model for binary classification.
- Decision Tree Classifier: Easy to interpret and visualize, capturing non-linear relationships.
- Random Forest Classifier: An ensemble method that builds multiple decision trees to improve accuracy and reduce overfitting.
- Balanced Random Forest Classifier:
- Gradient Boosting Classifier: Sequentially builds trees, each one correcting the errors of the previous one. We will also test variants like XGBoost, LightGBM, and CatBoost.
- Bagging Classifier:
- Balanced Bagging Classifier:
- Easy Ensemble Classifier:
- Support Vector Machine:


In [14]:
# Define models
models = {
    'Logistic Regression': LogisticRegression(random_state=42),
    'Random Forest': RandomForestClassifier(random_state=42, n_jobs=-1),
    'Bagging Classifier': BaggingClassifier(random_state=42),
    'Balanced Random Forest': BalancedRandomForestClassifier(random_state=42, n_jobs=-1),
    'Balanced Bagging': BalancedBaggingClassifier(random_state=42, n_jobs=-1),
    'Easy Ensemble': EasyEnsembleClassifier(random_state=42),
    'SVC': SVC(random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(random_state=42),
    'Decision Tree Classifier': DecisionTreeClassifier(random_state=42)
}

results_df = pd.DataFrame()

for name, model in models.items():
    print(f'Fitting {name}')
    
    # Cross validation metrics test data
    scoring = ["f1_macro", "precision_macro", "recall_macro", "roc_auc"]
    cross_val_scores = cross_validate(model, X, y, scoring=scoring, cv=cv, n_jobs=-1)
    
    # Calculate metrics
    f1_test_cv = round(cross_val_scores["test_f1_macro"].mean(), 4)
    precision_test_cv = round(cross_val_scores["test_precision_macro"].mean(), 4)
    recall_test_cv = round(cross_val_scores["test_recall_macro"].mean(), 4)
    roc_auc_test_cv = round(cross_val_scores["test_roc_auc"].mean(), 4)
    
    # Summary table
    score_df = pd.DataFrame({
                     'f1': f1_test_cv,
                     'precision': precision_test_cv,
                     'recall': recall_test_cv,
                     'roc_auc': roc_auc_test_cv},
                     index=[name])

    results_df = pd.concat([results_df, score_df])

results_df = results_df.sort_values(by='f1', ascending=False)
print(results_df.round(4))

# Get the names of the top 3 classifiers
top_3_classifiers = results_df.head(3).index

# Train and display confusion matrices for the top 3 classifiers
for name in top_3_classifiers:
    model = models[name]
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    print(f'Confusion Matrix for {name}:')
    print(confusion_matrix(y_test, y_pred))
    print()


Fitting Logistic Regression
Fitting Random Forest
Fitting Bagging Classifier
Fitting Balanced Random Forest
Fitting Balanced Bagging
Fitting Easy Ensemble
Fitting SVC
Fitting Gradient Boosting
Fitting Decision Tree Classifier
                              f1  precision  recall  roc_auc
Random Forest             0.9344     0.9640  0.9086   0.9860
Bagging Classifier        0.9341     0.9714  0.9029   0.9522
Gradient Boosting         0.9307     0.9590  0.9061   0.9910
Decision Tree Classifier  0.9072     0.9047  0.9100   0.9100
Balanced Bagging          0.7426     0.6781  0.9317   0.9792
Easy Ensemble             0.7246     0.6624  0.9396   0.9847
Balanced Random Forest    0.6899     0.6362  0.9433   0.9882
Logistic Regression       0.6559     0.8112  0.6111   0.9174
SVC                       0.5081     0.8836  0.5084   0.9302
Confusion Matrix for Random Forest:
[[2407    4]
 [  15   68]]

Confusion Matrix for Bagging Classifier:
[[2406    5]
 [  18   65]]

Confusion Matrix for Gradient B

The top three classifiers based on F1 score are:

1. Bagging Classifier: F1 score of 0.8626 with a confusion matrix showing high precision and recall.
2. Random Forest: F1 score of 0.8449, slightly lower than Bagging, but with excellent precision and roc_auc.
3. Decision Tree Classifier: F1 score of 0.8437, closely following Random Forest, with balanced precision and recall.
4. Gradient Boosting: F1 score of 0.8405, slightly lower than Decision Tree, but with high precision and roc_auc.

These models demonstrate strong performance, particularly in precision and roc_auc, indicating effective classification capabilities.

### Summary on Sampling and Hyperparameter Optimization

In machine learning, especially when dealing with imbalanced datasets, it is crucial to apply sampling techniques to ensure that models are trained effectively. Alongside sampling, hyperparameter optimization is essential to fine-tune model performance and achieve the best results.

Sampling involves adjusting the dataset to balance the class distribution. This can be done through:

- Oversampling: Increasing the number of instances in the minority class.
- Undersampling: Reducing the number of instances in the majority class.

Hyperparameter Optimization involves searching for the optimal set of parameters for a machine learning model. 

In [15]:
from imblearn.combine import SMOTETomek

# Define models
RdFo = RandomForestClassifier(random_state=42, criterion='entropy')
BBC = BaggingClassifier(random_state=42, n_jobs=-1)
DTC = DecisionTreeClassifier(random_state=42)
GBC = GradientBoostingClassifier(random_state=42)

# Create NearestNeighbors object with n_jobs set
nn = NearestNeighbors(n_jobs=-1)

# Define sampling methods with NearestNeighbors object
OverSamp_1 = RandomOverSampler(random_state=42)
OverSamp_2 = SMOTE(random_state=42, k_neighbors=nn)
OverSamp_3 = BorderlineSMOTE(random_state=42, k_neighbors=nn)
UnderSamp_1 = ClusterCentroids(random_state=42)
UnderSamp_2 = TomekLinks(n_jobs=-1)
UnderSamp_3 = NearMiss(version=3, n_jobs=-1)
Samp_7 = SMOTETomek()

# Combine over- and undersampling methods into a list
Samp_list = [OverSamp_1, OverSamp_2, OverSamp_3, UnderSamp_1, UnderSamp_2, UnderSamp_3, Samp_7]

# Initialize results DataFrame
results_df = pd.DataFrame()

# Loop through each model and each sampling method
for model in [RdFo, BBC, DTC, GBC]:
    print("Fitting: ", model)
    if isinstance(model, RandomForestClassifier):
        grid_param = {
            'n_estimators': np.arange(10, 300, 10),
            'max_depth': np.arange(10, 100, 10),
            'min_samples_split': [2, 5, 10]
        }
    elif isinstance(model, BaggingClassifier):
        grid_param = {
            'n_estimators': np.arange(10, 160, 10)
        }
    elif isinstance(model, DecisionTreeClassifier):
        grid_param = {
            'max_depth': np.arange(1, 50, 5),
            'min_samples_split': [2, 5, 10],
            'min_samples_leaf': [1, 2, 4]
        }
    elif isinstance(model, GradientBoostingClassifier):
        grid_param = {
            'n_estimators': np.arange(10, 300, 10),
            'learning_rate': np.logspace(-3, 0, 4),
            'max_depth': np.arange(1, 10, 2),
            'min_samples_split': [2, 5, 10]
        }
                      
    for samp in Samp_list:
        # Resample the training data
        X_train_resampled, y_train_resampled = samp.fit_resample(X_train, y_train)

        # Perform Randomized Search with cross-validation
        random_search = RandomizedSearchCV(model, grid_param, cv=3, n_jobs=-1, scoring='f1_macro', refit='f1_macro', random_state=42)
        random_search.fit(X_train_resampled, y_train_resampled)
        y_pred = random_search.predict(X_test)
        
        # Calculate metrics
        f1 = f1_score(y_test, y_pred, average="macro")
        auc = roc_auc_score(y_test, y_pred)
        
        # Create DataFrame for results
        score_df = pd.DataFrame({
            'model': [str(model).split('(')[0]],
            'f1': [f1],
            'auc': [auc],
            'sampling_method': [str(samp).split('(')[0]]
        })
        
        # Append best parameters
        best_params = random_search.best_params_
        for param in best_params:
            score_df[param] = best_params[param]
        
        results_df = pd.concat([results_df, score_df])

# Sort results by f1 score and display
results_df = results_df.sort_values(by='f1', ascending=False)
print(results_df.round(4))


Fitting:  RandomForestClassifier(criterion='entropy', random_state=42)
Fitting:  BaggingClassifier(n_jobs=-1, random_state=42)
Fitting:  DecisionTreeClassifier(random_state=42)
Fitting:  GradientBoostingClassifier(random_state=42)
                        model      f1     auc    sampling_method  \
0      RandomForestClassifier  0.9338  0.9086         TomekLinks   
0           BaggingClassifier  0.9244  0.9022         TomekLinks   
0  GradientBoostingClassifier  0.9188  0.9017         TomekLinks   
0      RandomForestClassifier  0.9069  0.8951  RandomOverSampler   
0      RandomForestClassifier  0.9051  0.9121    BorderlineSMOTE   
0  GradientBoostingClassifier  0.9020  0.8833  RandomOverSampler   
0      DecisionTreeClassifier  0.8941  0.8826         TomekLinks   
0           BaggingClassifier  0.8941  0.8941  RandomOverSampler   
0  GradientBoostingClassifier  0.8904  0.9109         SMOTETomek   
0  GradientBoostingClassifier  0.8891  0.8937    BorderlineSMOTE   
0           BaggingCl

The best results were achieved with various models and the sampling method TomekLinks. The top three models are:

1. RandomForestClassifier with TomekLinks: f1-Score of 0.9338 and AUC of 0.9086.
2. BaggingClassifier with TomekLinks: f1-Score of 0.9244 and AUC of 0.9022.
3. GradientBoostingClassifier with TomekLinks: f1-Score of 0.9188 and AUC of 0.9017.