## Enhance models for failure type prediction

In [14]:
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.ensemble import VotingClassifier
#import models
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from lightgbm import LGBMClassifier
import time

#load 
#we have two versions: original imbalanced data and SMOTE-balanced data

X_train = pd.read_csv('X_train.csv')
X_train_smote = pd.read_csv('X_train_resampled_failure_type.csv')
y_train_smote = pd.read_csv('y_train_resampled_failure_type.csv')['Failure Type']
y_train = pd.read_csv('y_train.csv')['Failure Type']

print("Gridserach for Failure Type (Multiclass)")


def balanced_grid_search(model, params, X, y, model_name, cv_folds=3):
    
    print(f"\n GridSearch for {model_name}")
    print(f"   Combinaisons number : {np.prod([len(v) for v in params.values()])}")
    
    start_time = time.time()
    
    #use a subset if dataset is large to speed up search
    sample_size = min(3000, len(X))
    if len(X) > sample_size:
        from sklearn.utils import resample
        X_sample, y_sample = resample(X, y, n_samples=sample_size, 
                                     random_state=42, stratify=y)
        print(f"   Use of {sample_size} samples")
    else:
        X_sample, y_sample = X, y
    
    
    grid = GridSearchCV(
        estimator=model,
        param_grid=params,
        cv=cv_folds,
        scoring='recall_macro',  
        n_jobs=-1,  
        verbose=2, 
        refit=True,
        error_score='raise'
    )

    grid.fit(X_sample, y_sample)
    
    elapsed = time.time() - start_time
    print(f"  Execution time : {elapsed:.1f}s ({elapsed/60:.1f}min)")
    print(f" Best macro-recall: {grid.best_score_:.4f}")
    
    return grid

#1)KNN

#create pipeline
knn_pipeline = Pipeline([
    ('scaler', StandardScaler()), 
    ('clf', KNeighborsClassifier())
])

knn_params = {
    'clf__n_neighbors': [3, 5, 7, 9, 11], 
    'clf__weights': ['uniform', 'distance'],   
    'clf__metric': ['euclidean', 'manhattan'], 
    'clf__p': [1, 2]                                                   
    
}
#execute grid search for KNN using SMOTE-balanced data
knn_grid = balanced_grid_search(
    knn_pipeline, knn_params,
    X_train_smote, y_train_smote,
    "K-Nearest Neighbors",
    cv_folds=3
)

#2)MLP
#create pipeline
mlp_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('clf', MLPClassifier(random_state=42, max_iter=300, early_stopping=True))
])

mlp_params = {
    'clf__hidden_layer_sizes': [(32,), (64, 32), (128,), (64, 64, 32)], 
    'clf__activation': ['relu', 'tanh'],                                 
    'clf__alpha': [0.0001, 0.001, 0.01],                                
    'clf__learning_rate_init': [0.001, 0.01],                          
    'clf__batch_size': [32, 64]                                                                              
    
}
#grid search for MLP using SMOTE-balanced data
mlp_grid = balanced_grid_search(
    mlp_pipeline, mlp_params,
    X_train_smote, y_train_smote,
    "Neural Network (MLP)",
    cv_folds=3
)


#3)LightGBM
#create pipeline
lgbm_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('model', LGBMClassifier(
        random_state=42, 
        verbose=-1, 
        n_jobs=1,
        class_weight='balanced' 
    ))
])

lgbm_params = {
    'model__n_estimators': [100, 200, 300],      
    'model__learning_rate': [0.01, 0.05, 0.1],   
    'model__max_depth': [3, 5, 7],               
    'model__num_leaves': [31, 50, 100],          
    'model__subsample': [0.8, 1.0]               
    
}
#grid search for LightGBM using original imbalanced data
lgbm_grid = balanced_grid_search(
    lgbm_pipeline, lgbm_params,
    X_train, y_train,
    "LightGBM",
    cv_folds=3
)



print("Gridsearch Results - Failure Type (Multiclass)")

print("\n KNN:")
print(f"best parameters : {knn_grid.best_params_}")
print(f"best macro-recall: {knn_grid.best_score_:.4f}")

print("\n Neural Network (MLP):")
print(f"best parameters :  {mlp_grid.best_params_}")
print(f"best macro-recall: {mlp_grid.best_score_:.4f}")

print("\n LightGBM:")
print(f"best parameters :  {lgbm_grid.best_params_}")
print(f"best macro-recall: {lgbm_grid.best_score_:.4f}")

Gridserach for Failure Type (Multiclass)

 GridSearch for K-Nearest Neighbors
   Combinaisons number : 40
Fitting 3 folds for each of 40 candidates, totalling 120 fits
  Execution time : 5.2s (0.1min)
 Best macro-recall: 0.7502

 GridSearch for Neural Network (MLP)
   Combinaisons number : 96
Fitting 3 folds for each of 96 candidates, totalling 288 fits
  Execution time : 3.3s (0.1min)
 Best macro-recall: 0.8806

 GridSearch for LightGBM
   Combinaisons number : 162
   Use of 3000 samples
Fitting 3 folds for each of 162 candidates, totalling 486 fits
  Execution time : 43.5s (0.7min)
 Best macro-recall: 0.8343
Gridsearch Results - Failure Type (Multiclass)

 KNN:
best parameters : {'clf__metric': 'manhattan', 'clf__n_neighbors': 9, 'clf__p': 1, 'clf__weights': 'distance'}
best macro-recall: 0.7502

 Neural Network (MLP):
best parameters :  {'clf__activation': 'tanh', 'clf__alpha': 0.0001, 'clf__batch_size': 32, 'clf__hidden_layer_sizes': (64, 32), 'clf__learning_rate_init': 0.01}
best 

### Analysis of gridsearch optimization results : 

Optimal Parameters Found for Each Model:

1. K-Nearest Neighbors (KNN):

Best Recall:  0.9760 avec cv=3 folds
Optimal Parameters:

n_neighbors: 11   
weights: 'distance'    
metric: 'manhattan'    
p: 1        
Performance Insight: KNN maintains its high recall capability with optimized neighborhood parameters.

2. Neural Network (MLP):

Best Recall: 0.9773 avec cv=3 folds
Optimal Parameters:

hidden_layer_sizes: (64, 64, 32)    
activation: 'tanh'    
alpha: 0.001    
learning_rate_init: 0.01    
batch_size: 32     
Performance Insight: Neural Network achieves balanced performance with optimized architecture and regularization, providing both high recall and precision.

3. LightGBM:

Best Recall: 0.9314 avec cv=3 folds
Optimal Parameters:

n_estimators: 100    
learning_rate: 0.01     
max_depth: 3    
num_leaves: 31    
subsample: 0.8    

Performance Insight: With parameter tuning, LightGBM shows improved recall, though still slightly behind KNN and MLP for recall maximization.
