### Model development

In [1]:
import pandas as pd
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import OrdinalEncoder, MinMaxScaler, RobustScaler

from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from imblearn.ensemble import BalancedBaggingClassifier, BalancedRandomForestClassifier, EasyEnsembleClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_validate, StratifiedShuffleSplit
from sklearn.metrics import confusion_matrix
from imblearn.under_sampling import TomekLinks, NearMiss, ClusterCentroids
from imblearn.over_sampling import SMOTE, RandomOverSampler, BorderlineSMOTE
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import f1_score, roc_auc_score, precision_score, recall_score
from sklearn.neighbors import NearestNeighbors
import pandas as pd
import numpy as np


filepath = "C:/Users/WALDMJN/OneDrive - Schaeffler/Uni/Data Exploration Project/Pred Maintenance Project/Predictive-Maintenance/Data/predictive_maintenance.csv"
df = pd.read_csv(filepath)
df = df.drop(["UDI", "Product ID"], axis = 1)
df.head()


Unnamed: 0,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Target,Failure Type
0,M,298.1,308.6,1551,42.8,0,0,No Failure
1,L,298.2,308.7,1408,46.3,3,0,No Failure
2,L,298.1,308.5,1498,49.4,5,0,No Failure
3,L,298.2,308.6,1433,39.5,7,0,No Failure
4,L,298.2,308.7,1408,40.0,9,0,No Failure


Let's drop out the target anomalies from notebook before.

In [2]:
fail_df = df[df['Target'] == 1]
indexPossibleFailure = fail_df[fail_df['Failure Type'] == 'No Failure'].index
df.drop(indexPossibleFailure, axis=0, inplace=True)
df.shape[0]

9991

In [3]:
fail_df  = df[df['Target'] == 0]
indexPossibleFailure = fail_df[fail_df['Failure Type'] == 'Random Failures'].index
df.drop(indexPossibleFailure, axis=0, inplace=True)
df.shape[0]

9973

### Feature Engineering

In [4]:
df['Power [W]'] = df['Torque [Nm]'] * (2 * np.pi * df['Rotational speed [rpm]'] / 60.0)
df['Overstrain [minNm]'] = df['Torque [Nm]'] * df['Tool wear [min]']
df['Heat dissipation [rpminK]'] = abs(df['Air temperature [K]'] - df['Process temperature [K]']) * df['Rotational speed [rpm]']

df.head(5)

Unnamed: 0,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Target,Failure Type,Power [W],Overstrain [minNm],Heat dissipation [rpminK]
0,M,298.1,308.6,1551,42.8,0,0,No Failure,6951.59056,0.0,16285.5
1,L,298.2,308.7,1408,46.3,3,0,No Failure,6826.722724,138.9,14784.0
2,L,298.1,308.5,1498,49.4,5,0,No Failure,7749.387543,247.0,15579.2
3,L,298.2,308.6,1433,39.5,7,0,No Failure,5927.504659,276.5,14903.2
4,L,298.2,308.7,1408,40.0,9,0,No Failure,5897.816608,360.0,14784.0


In [5]:
encoder = OrdinalEncoder()
df[['Type', 'Failure Type']] = encoder.fit_transform(df[['Type', 'Failure Type']])

In [6]:
print(df.columns)

Index(['Type', 'Air temperature [K]', 'Process temperature [K]',
       'Rotational speed [rpm]', 'Torque [Nm]', 'Tool wear [min]', 'Target',
       'Failure Type', 'Power [W]', 'Overstrain [minNm]',
       'Heat dissipation [rpminK]'],
      dtype='object')


The RobustScaler on Rotational Speed and Torque is necessary because of strong outliers.

In [7]:
# Create a copy of the original dataframe
df_scaled = df.copy()

# Define the columns to be scaled
columns = ['Rotational speed [rpm]', 'Torque [Nm]', 'Power [W]', 'Overstrain [minNm]', 'Heat dissipation [rpminK]']

# Initialize the RobustScaler
scaler = RobustScaler()

# Fit and transform the specified columns in the dataframe
features_scaled = scaler.fit_transform(df[columns])

# Convert the scaled features back to a DataFrame
features_scaled = pd.DataFrame(features_scaled, columns=columns)

# Drop the original columns from the dataframe
df_scaled.drop(columns, axis=1, inplace=True)

# Concatenate the scaled features back to the dataframe
df_scaled = pd.concat([df, features_scaled], axis=1)

# Display the first 5 rows of the scaled dataframe
df_scaled.head(5)


Unnamed: 0,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Target,Failure Type,Power [W],Overstrain [minNm],Heat dissipation [rpminK],Rotational speed [rpm].1,Torque [Nm].1,Power [W].1,Overstrain [minNm].1,Heat dissipation [rpminK].1
0,2.0,298.1,308.6,1551.0,42.8,0.0,0.0,1.0,6951.59056,0.0,16285.5,0.253968,0.2,0.474003,-0.92907,0.381501
1,1.0,298.2,308.7,1408.0,46.3,3.0,0.0,1.0,6826.722724,138.9,14784.0,-0.502646,0.459259,0.387271,-0.896863,-0.122967
2,1.0,298.1,308.5,1498.0,49.4,5.0,0.0,1.0,7749.387543,247.0,15579.2,-0.026455,0.688889,1.02815,-0.871797,0.144201
3,1.0,298.2,308.6,1433.0,39.5,7.0,0.0,1.0,5927.504659,276.5,14903.2,-0.37037,-0.044444,-0.237322,-0.864957,-0.082919
4,1.0,298.2,308.7,1408.0,40.0,9.0,0.0,1.0,5897.816608,360.0,14784.0,-0.502646,-0.007407,-0.257943,-0.845596,-0.122967


Air temperature, Process temperature and tool wear get scaled over MinMaxScaler.

In [8]:
# Define the columns to be scaled
columns = ['Air temperature [K]', 'Process temperature [K]', 'Tool wear [min]']

# Initialize the MinMaxScaler
scaler = MinMaxScaler()

# Fit and transform the specified columns in the dataframe
features_scaled = scaler.fit_transform(df[columns])

# Convert the scaled features back to a DataFrame
features_scaled = pd.DataFrame(features_scaled, columns=columns)

# Drop the original columns from the dataframe
df.drop(columns, axis=1, inplace=True)

# Concatenate the scaled features back to the dataframe
df_scaled = pd.concat([df, features_scaled], axis=1)

# Display the first few rows of the scaled dataframe
df_scaled.head()


Unnamed: 0,Type,Rotational speed [rpm],Torque [Nm],Target,Failure Type,Power [W],Overstrain [minNm],Heat dissipation [rpminK],Air temperature [K],Process temperature [K],Tool wear [min]
0,2.0,1551.0,42.8,0.0,1.0,6951.59056,0.0,16285.5,0.304348,0.358025,0.0
1,1.0,1408.0,46.3,0.0,1.0,6826.722724,138.9,14784.0,0.315217,0.37037,0.011858
2,1.0,1498.0,49.4,0.0,1.0,7749.387543,247.0,15579.2,0.304348,0.345679,0.019763
3,1.0,1433.0,39.5,0.0,1.0,5927.504659,276.5,14903.2,0.315217,0.358025,0.027668
4,1.0,1408.0,40.0,0.0,1.0,5897.816608,360.0,14784.0,0.315217,0.37037,0.035573


It is important that the values for training data and test data are well divided, as there is a small number of errors, especially in the existing data set. 

In [9]:
# Prepare features and target
X = df.drop(['Target', 'Failure Type'], axis=1)
y = df['Target']

# Reset indices
X.reset_index(drop=True, inplace=True)
y.reset_index(drop=True, inplace=True)

# Stratified shuffle split
cv = StratifiedShuffleSplit(n_splits=5, test_size=0.25, random_state=42)

# Create train-test splits
for train_index, test_index in cv.split(X, y):
    X_train, X_test = X.loc[train_index], X.loc[test_index]
    y_train, y_test = y.loc[train_index], y.loc[test_index]

# Check target proportions
print('Checking the stratified split...')
print('Target proportion in original dataset:')
print(df['Target'].value_counts(normalize=True))

print('Target proportion in y_train dataset:')
print(y_train.value_counts(normalize=True))

print('Target proportion in y_test dataset:')
print(y_test.value_counts(normalize=True))


Checking the stratified split...
Target proportion in original dataset:
Target
0    0.966911
1    0.033089
Name: proportion, dtype: float64
Target proportion in y_train dataset:
Target
0    0.966974
1    0.033026
Name: proportion, dtype: float64
Target proportion in y_test dataset:
Target
0    0.96672
1    0.03328
Name: proportion, dtype: float64


Y_Train and Y_Test have an equally good distribution and a small difference in the target values.

### Model Training and Testing

Specifically, we aim to classify whether a machine is functioning correctly or if it is experiencing a fault. This initial step of binary classification - distinguishing between "faulty" and "operational" states - serves several crucial purposes:

- Simplicity and Clarity
- Early Fault Detection
- Resource Allocation

Given our binary classification problem, we want to test a variety of machine learning models to determine which one performs best on our dataset. The models we plan to test include:

- Logistic Regression: A simple yet effective linear model for binary classification.
- Decision Tree Classifier: Easy to interpret and visualize, capturing non-linear relationships.
- Random Forest Classifier: An ensemble method that builds multiple decision trees to improve accuracy and reduce overfitting.
- Balanced Random Forest Classifier:
- Gradient Boosting Classifier: Sequentially builds trees, each one correcting the errors of the previous one. We will also test variants like XGBoost, LightGBM, and CatBoost.
- Bagging Classifier:
- Balanced Bagging Classifier:
- Easy Ensemble Classifier:
- Support Vector Machine:


In [10]:
# Define models
models = {
    'Logistic Regression': LogisticRegression(random_state=42),
    'Random Forest': RandomForestClassifier(random_state=42, n_jobs=-1),
    'Bagging Classifier': BaggingClassifier(random_state=42),
    'Balanced Random Forest': BalancedRandomForestClassifier(random_state=42, n_jobs=-1),
    'Balanced Bagging': BalancedBaggingClassifier(random_state=42, n_jobs=-1),
    'Easy Ensemble': EasyEnsembleClassifier(random_state=42),
    'SVC': SVC(random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(random_state=42),
    'Decision Tree Classifier': DecisionTreeClassifier(random_state=42)
}

results_df = pd.DataFrame()

for name, model in models.items():
    print(f'Fitting {name}')
    
    # Cross validation metrics test data
    scoring = ["f1_macro", "precision_macro", "recall_macro", "roc_auc"]
    cross_val_scores = cross_validate(model, X, y, scoring=scoring, cv=cv, n_jobs=-1)
    
    # Calculate metrics
    f1_test_cv = round(cross_val_scores["test_f1_macro"].mean(), 4)
    precision_test_cv = round(cross_val_scores["test_precision_macro"].mean(), 4)
    recall_test_cv = round(cross_val_scores["test_recall_macro"].mean(), 4)
    roc_auc_test_cv = round(cross_val_scores["test_roc_auc"].mean(), 4)
    
    # Summary table
    score_df = pd.DataFrame({
                     'f1': f1_test_cv,
                     'precision': precision_test_cv,
                     'recall': recall_test_cv,
                     'roc_auc': roc_auc_test_cv},
                     index=[name])

    results_df = pd.concat([results_df, score_df])

results_df = results_df.sort_values(by='f1', ascending=False)
print(results_df.round(4))

# Get the names of the top 3 classifiers
top_3_classifiers = results_df.head(3).index

# Train and display confusion matrices for the top 3 classifiers
for name in top_3_classifiers:
    model = models[name]
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    print(f'Confusion Matrix for {name}:')
    print(confusion_matrix(y_test, y_pred))
    print()


Fitting Logistic Regression
Fitting Random Forest
Fitting Bagging Classifier
Fitting Balanced Random Forest
Fitting Balanced Bagging
Fitting Easy Ensemble
Fitting SVC
Fitting Gradient Boosting
Fitting Decision Tree Classifier
                              f1  precision  recall  roc_auc
Random Forest             0.9315     0.9575  0.9084   0.9748
Bagging Classifier        0.9308     0.9621  0.9038   0.9450
Gradient Boosting         0.9230     0.9356  0.9113   0.9815
Decision Tree Classifier  0.9098     0.9099  0.9102   0.9102
Balanced Bagging          0.7745     0.7076  0.9335   0.9827
Balanced Random Forest    0.7230     0.6616  0.9274   0.9843
Easy Ensemble             0.6928     0.6389  0.9077   0.9702
Logistic Regression       0.5381     0.8111  0.5248   0.8320
SVC                       0.5129     0.9837  0.5108   0.9235
Confusion Matrix for Random Forest:
[[2405    6]
 [  16   67]]

Confusion Matrix for Bagging Classifier:
[[2405    6]
 [  19   64]]

Confusion Matrix for Gradient B

The top three classifiers based on F1 score are:

1. Bagging Classifier: F1 score of 0.8626 with a confusion matrix showing high precision and recall.
2. Random Forest: F1 score of 0.8449, slightly lower than Bagging, but with excellent precision and roc_auc.
3. Decision Tree Classifier: F1 score of 0.8437, closely following Random Forest, with balanced precision and recall.
4. Gradient Boosting: F1 score of 0.8405, slightly lower than Decision Tree, but with high precision and roc_auc.

These models demonstrate strong performance, particularly in precision and roc_auc, indicating effective classification capabilities.

### Summary on Sampling and Hyperparameter Optimization

In machine learning, especially when dealing with imbalanced datasets, it is crucial to apply sampling techniques to ensure that models are trained effectively. Alongside sampling, hyperparameter optimization is essential to fine-tune model performance and achieve the best results.

Sampling involves adjusting the dataset to balance the class distribution. This can be done through:

- Oversampling: Increasing the number of instances in the minority class.
- Undersampling: Reducing the number of instances in the majority class.

Hyperparameter Optimization involves searching for the optimal set of parameters for a machine learning model. 

In [11]:
from imblearn.combine import SMOTETomek
import joblib

# Define models
RdFo = RandomForestClassifier(random_state=42, criterion='entropy')
BBC = BaggingClassifier(random_state=42, n_jobs=-1)
DTC = DecisionTreeClassifier(random_state=42)
GBC = GradientBoostingClassifier(random_state=42)

# Create NearestNeighbors object with n_jobs set
nn = NearestNeighbors(n_jobs=-1)

# Define sampling methods with NearestNeighbors object
OverSamp_1 = RandomOverSampler(random_state=42)
OverSamp_2 = SMOTE(random_state=42, k_neighbors=nn)
OverSamp_3 = BorderlineSMOTE(random_state=42, k_neighbors=nn)
UnderSamp_1 = ClusterCentroids(random_state=42)
UnderSamp_2 = TomekLinks(n_jobs=-1)
UnderSamp_3 = NearMiss(version=3, n_jobs=-1)
Samp_7 = SMOTETomek()

# Combine over- and undersampling methods into a list
Samp_list = [OverSamp_1, OverSamp_2, OverSamp_3, UnderSamp_1, UnderSamp_2, UnderSamp_3, Samp_7]

# Initialize results DataFrame
results_df = pd.DataFrame()

#Placeholder for models
best_models = []

# Loop through each model and each sampling method
for model in [RdFo, BBC, DTC, GBC]:
    print("Fitting: ", model)
    if isinstance(model, RandomForestClassifier):
        grid_param = {
            'n_estimators': np.arange(10, 300, 10),
            'max_depth': np.arange(10, 100, 10),
            'min_samples_split': [2, 5, 10]
        }
    elif isinstance(model, BaggingClassifier):
        grid_param = {
            'n_estimators': np.arange(10, 160, 10)
        }
    elif isinstance(model, DecisionTreeClassifier):
        grid_param = {
            'max_depth': np.arange(1, 50, 5),
            'min_samples_split': [2, 5, 10],
            'min_samples_leaf': [1, 2, 4]
        }
    elif isinstance(model, GradientBoostingClassifier):
        grid_param = {
            'n_estimators': np.arange(10, 300, 10),
            'learning_rate': np.logspace(-3, 0, 4),
            'max_depth': np.arange(1, 10, 2),
            'min_samples_split': [2, 5, 10]
        }
                      
    for samp in Samp_list:
        # Resample the training data
        X_train_resampled, y_train_resampled = samp.fit_resample(X_train, y_train)

        # Perform Randomized Search with cross-validation
        random_search = RandomizedSearchCV(model, grid_param, cv=3, n_jobs=-1, scoring='f1_macro', refit='f1_macro', random_state=42)
        random_search.fit(X_train_resampled, y_train_resampled)
        y_pred = random_search.predict(X_test)
        
        # Calculate metrics
        f1 = f1_score(y_test, y_pred, average="macro")
        auc = roc_auc_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred, average="macro")
        recall = recall_score(y_test, y_pred, average="macro")
        
        # Create DataFrame for results
        score_df = pd.DataFrame({
            'model': [str(model).split('(')[0]],
            'f1': [f1],
            'auc': [auc],
            'precision': [precision],
            'recall': [recall],
            'sampling_method': [str(samp).split('(')[0]]
        })
        
        # Append best parameters
        best_params = random_search.best_params_
        for param in best_params:
            
            score_df[param] = best_params[param]
        
        results_df = pd.concat([results_df, score_df])

        if len(best_models) < 2:
            best_models.append((random_search.best_estimator_, samp, f1))
        else:
            min_f1 = min(best_models, key=lambda x: x[2])
            if f1 > min_f1[2]:
                best_models.remove(min_f1)
                best_models.append((random_search.best_estimator_, samp, f1))


# Sort results by f1 score and display
results_df = results_df.sort_values(by='f1', ascending=False)
print(results_df.round(4))

for i, (model, samp, f1) in enumerate(best_models):
    joblib.dump(model, f"best_model_{i+1}.joblib")
    joblib.dump(samp, f"best_sampling_{i+1}.joblib")

Fitting:  RandomForestClassifier(criterion='entropy', random_state=42)
Fitting:  BaggingClassifier(n_jobs=-1, random_state=42)
Fitting:  DecisionTreeClassifier(random_state=42)
Fitting:  GradientBoostingClassifier(random_state=42)
                        model      f1     auc  precision  recall  \
0      RandomForestClassifier  0.9281  0.9082     0.9502  0.9082   
0           BaggingClassifier  0.9234  0.8963     0.9548  0.8963   
0  GradientBoostingClassifier  0.9225  0.9078     0.9385  0.9078   
0      RandomForestClassifier  0.9144  0.9071     0.9219  0.9071   
0  GradientBoostingClassifier  0.9043  0.8949     0.9142  0.8949   
0           BaggingClassifier  0.9040  0.9063     0.9016  0.9063   
0           BaggingClassifier  0.9012  0.9175     0.8862  0.9175   
0      RandomForestClassifier  0.8976  0.9115     0.8847  0.9115   
0      DecisionTreeClassifier  0.8954  0.8885     0.9025  0.8885   
0  GradientBoostingClassifier  0.8928  0.9111     0.8762  0.9111   
0  GradientBoostingCl

The best results were achieved with various models and the sampling method TomekLinks.

In [13]:
from sklearn.ensemble import StackingClassifier
from sklearn.model_selection import cross_val_score

# Define models
RdFo = RandomForestClassifier(random_state=42, criterion='entropy')
BBC = BaggingClassifier(random_state=42, n_jobs=-1)
DTC = DecisionTreeClassifier(random_state=42)
GBC = GradientBoostingClassifier(random_state=42)

# Create NearestNeighbors object with n_jobs set
nn = NearestNeighbors(n_jobs=-1)

# Define sampling methods with NearestNeighbors object
OverSamp_1 = RandomOverSampler(random_state=42)
OverSamp_2 = SMOTE(random_state=42, k_neighbors=nn)
OverSamp_3 = BorderlineSMOTE(random_state=42, k_neighbors=nn)
UnderSamp_1 = ClusterCentroids(random_state=42)
UnderSamp_2 = TomekLinks(n_jobs=-1)
UnderSamp_3 = NearMiss(version=3, n_jobs=-1)
Samp_7 = SMOTETomek()

# Combine over- and undersampling methods into a list
Samp_list = [OverSamp_1, OverSamp_2, OverSamp_3, UnderSamp_1, UnderSamp_2, UnderSamp_3, Samp_7]

# Initialize results DataFrame
results_df = pd.DataFrame()

# Placeholder for models
best_models = []

# Loop through each model and each sampling method
for model in [RdFo, BBC, DTC, GBC]:
    print("Fitting: ", model)
    if isinstance(model, RandomForestClassifier):
        grid_param = {
            'n_estimators': np.arange(10, 300, 10),
            'max_depth': np.arange(10, 100, 10),
            'min_samples_split': [2, 5, 10]
        }
    elif isinstance(model, BaggingClassifier):
        grid_param = {
            'n_estimators': np.arange(10, 160, 10)
        }
    elif isinstance(model, DecisionTreeClassifier):
        grid_param = {
            'max_depth': np.arange(1, 50, 5),
            'min_samples_split': [2, 5, 10],
            'min_samples_leaf': [1, 2, 4]
        }
    elif isinstance(model, GradientBoostingClassifier):
        grid_param = {
            'n_estimators': np.arange(10, 300, 10),
            'learning_rate': np.logspace(-3, 0, 4),
            'max_depth': np.arange(1, 10, 2),
            'min_samples_split': [2, 5, 10]
        }
                      
    for samp in Samp_list:
        # Resample the training data
        X_train_resampled, y_train_resampled = samp.fit_resample(X_train, y_train)

        # Perform Randomized Search with cross-validation
        random_search = RandomizedSearchCV(model, grid_param, cv=3, n_jobs=-1, scoring='f1_macro', refit='f1_macro', random_state=42)
        random_search.fit(X_train_resampled, y_train_resampled)
        y_pred = random_search.predict(X_test)
        
        # Calculate metrics
        f1 = f1_score(y_test, y_pred, average="macro")
        auc = roc_auc_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred, average="macro")
        recall = recall_score(y_test, y_pred, average="macro")
        
        # Create DataFrame for results
        score_df = pd.DataFrame({
            'model': [str(model).split('(')[0]],
            'f1': [f1],
            'auc': [auc],
            'precision': [precision],
            'recall': [recall],
            'sampling_method': [str(samp).split('(')[0]]
        })
        
        # Append best parameters
        best_params = random_search.best_params_
        for param in best_params:
            score_df[param] = best_params[param]
        
        results_df = pd.concat([results_df, score_df])

# Sort results by f1 score and display
results_df = results_df.sort_values(by='f1', ascending=False)
print(results_df.round(4))

# Define a Stacking Classifier with the best models
estimators = [(f'best_model_{i+1}', best_models[i][0]) for i in range(len(best_models))]
stacking_clf = StackingClassifier(estimators=estimators, final_estimator=RandomForestClassifier(random_state=42))

# Train the Stacking Classifier
stacking_clf.fit(X_train, y_train)

# Evaluate the Stacking Classifier
y_pred_stacking = stacking_clf.predict(X_test)
f1_stacking = f1_score(y_test, y_pred_stacking, average="macro")
auc_stacking = roc_auc_score(y_test, y_pred_stacking)
precision_stacking = precision_score(y_test, y_pred_stacking, average="macro")
recall_stacking = recall_score(y_test, y_pred_stacking, average="macro")

print(f"Stacking Model - F1: {f1_stacking}, AUC: {auc_stacking}, Precision: {precision_stacking}, Recall: {recall_stacking}")

# Save Stacking Classifier Performance
stacking_performance = pd.DataFrame({
    'model': ['StackingClassifier'],
    'f1': [f1_stacking],
    'auc': [auc_stacking],
    'precision': [precision_stacking],
    'recall': [recall_stacking]
})

results_df = pd.concat([results_df, stacking_performance])
print(results_df.round(4))



Fitting:  RandomForestClassifier(criterion='entropy', random_state=42)
Fitting:  BaggingClassifier(n_jobs=-1, random_state=42)
Fitting:  DecisionTreeClassifier(random_state=42)
Fitting:  GradientBoostingClassifier(random_state=42)
                        model      f1     auc  precision  recall  \
0      RandomForestClassifier  0.9281  0.9082     0.9502  0.9082   
0           BaggingClassifier  0.9234  0.8963     0.9548  0.8963   
0  GradientBoostingClassifier  0.9225  0.9078     0.9385  0.9078   
0      RandomForestClassifier  0.9144  0.9071     0.9219  0.9071   
0  GradientBoostingClassifier  0.9043  0.8949     0.9142  0.8949   
0           BaggingClassifier  0.9040  0.9063     0.9016  0.9063   
0           BaggingClassifier  0.9012  0.9175     0.8862  0.9175   
0      RandomForestClassifier  0.8976  0.9115     0.8847  0.9115   
0      DecisionTreeClassifier  0.8954  0.8885     0.9025  0.8885   
0  GradientBoostingClassifier  0.8928  0.9111     0.8762  0.9111   
0  GradientBoostingCl

The best models and their results demonstrate strong performance across various metrics. The RandomForestClassifier with the hyperparameters n_estimators=60, min_samples_split=5, and max_depth=30, combined with the sampling method TomekLinks, achieved the highest F1 score (0.9281) and AUC (0.9082). These results indicate that this model strikes a very good balance between precision and recall, delivering robust classification outcomes.

The BaggingClassifier with n_estimators=100, also using TomekLinks, reached an F1 score of 0.9234 and an AUC of 0.8963. Despite a slightly lower AUC, the model shows a high precision of 0.9548, meaning it is very effective at correctly identifying the positive classes.

The GradientBoostingClassifier with the hyperparameters n_estimators=140, min_samples_split=10, and max_depth=9, combined with the RandomOverSampler sampling method, achieved an F1 score of 0.9225 and an AUC of 0.9078. This combination shows that Gradient Boosting is also very effective, particularly in improving recall values through the RandomOverSampler method, leading to balanced and accurate classification.

### Pickle Library

The Pickle library in Python serializes and deserializes Python objects, known as pickling and unpickling. This allows saving objects to a file and loading them later while preserving their state. It's commonly used for saving machine learning models, data preprocessing steps, and complex data structures between sessions.

In [27]:
import pickle

# Define the best models and sampling methods
best_model_1 = RandomForestClassifier(n_estimators=60, min_samples_split=5, max_depth=30, criterion='entropy', random_state=42)
best_sampling_1 = TomekLinks(n_jobs=-1)

best_model_2 = BaggingClassifier(n_estimators=100, n_jobs=-1, random_state=42)
best_sampling_2 = TomekLinks(n_jobs=-1)

best_model_3 = GradientBoostingClassifier(n_estimators=140, min_samples_split=10, max_depth=9, random_state=42)
best_sampling_3 = RandomOverSampler(random_state=42)

# Store the best models and sampling methods in a dictionary
best_models_sampling = {
    "best_model_1": best_model_1,
    "best_sampling_1": best_sampling_1,
    "best_model_2": best_model_2,
    "best_sampling_2": best_sampling_2,
    "best_model_3": best_model_3,
    "best_sampling_3": best_sampling_3
}

# Save the dictionary to a pickle file
path_to_pickle_file = ".././best_models_sampling.pkl"
with open(path_to_pickle_file, "wb") as file:
    pickle.dump(best_models_sampling, file)

print(f"The best models and sampling methods have been saved to {path_to_pickle_file}.")


The best models and sampling methods have been saved to .././best_models_sampling.pkl.
