## Enhance models for Target prediction

In [33]:
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.ensemble import VotingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from lightgbm import LGBMClassifier
import time

# Chargement des données
X_train = pd.read_csv('X_train.csv')
X_train_smote = pd.read_csv('X_train_resampled_target.csv')
y_train_smote = pd.read_csv('y_train_resampled_target.csv')['Target']
y_train = pd.read_csv('y_train.csv')['Target']

print("GRIDSEARCH - TARGET")


def balanced_grid_search(model, params, X, y, model_name, cv_folds=3):
    """
    GridSearch
    """
    print(f"\n GridSearch pour {model_name}")
    print(f"   Nombre de combinaisons: {np.prod([len(v) for v in params.values()])}")
    
    start_time = time.time()
    
    # Utiliser un échantillon si trop de données
    sample_size = min(3000, len(X))
    if len(X) > sample_size:
        from sklearn.utils import resample
        X_sample, y_sample = resample(X, y, n_samples=sample_size, 
                                     random_state=42, stratify=y)
        print(f"   Utilisation de {sample_size} échantillons")
    else:
        X_sample, y_sample = X, y
    
    grid = GridSearchCV(
        estimator=model,
        param_grid=params,
        cv=cv_folds,
        scoring='recall',
        n_jobs=-1,  # parallélisation pour la vitesse
        verbose=2,  # affichage modéré
        refit=True,
        error_score='raise'
    )
    
    grid.fit(X_sample, y_sample)
    
    elapsed = time.time() - start_time
    print(f"  Temps d'exécution: {elapsed:.1f}s ({elapsed/60:.1f}min)")
    print(f" Meilleur recall: {grid.best_score_:.4f}")
    
    return grid

# 1. KNN - GRIDSEARCH 

knn_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('clf', KNeighborsClassifier())
])

knn_params = {
    'clf__n_neighbors': [3, 5, 7, 9, 11],      # 5 valeurs
    'clf__weights': ['uniform', 'distance'],   # 2 valeurs
    'clf__metric': ['euclidean', 'manhattan'], # 2 valeurs
    'clf__p': [1, 2]                           # 2 valeurs
    # Total: 5 × 2 × 2 × 2 = 40 combinaisons
}

knn_grid = balanced_grid_search(
    knn_pipeline, knn_params,
    X_train_smote, y_train_smote,
    "K-Nearest Neighbors",
    cv_folds=3
)

# 2. NEURAL NETWORK (MLP) - GRIDSEARCH 

mlp_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('clf', MLPClassifier(random_state=42, max_iter=300, early_stopping=True))
])

mlp_params = {
    'clf__hidden_layer_sizes': [(32,), (64, 32), (128,), (64, 64, 32)],  # 4 valeurs
    'clf__activation': ['relu', 'tanh'],                                 # 2 valeurs
    'clf__alpha': [0.0001, 0.001, 0.01],                                 # 3 valeurs
    'clf__learning_rate_init': [0.001, 0.01],                            # 2 valeurs
    'clf__batch_size': [32, 64]                                          # 2 valeurs
    # Total: 4 × 2 × 3 × 2 × 2 = 96 combinaisons
}

mlp_grid = balanced_grid_search(
    mlp_pipeline, mlp_params,
    X_train_smote, y_train_smote,
    "Neural Network (MLP)",
    cv_folds=3
)


# 3. LIGHTGBM - GRIDSEARCH 

lgbm_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('model', LGBMClassifier(
        random_state=42, 
        verbose=-1, 
        n_jobs=1,
        class_weight='balanced'
    ))
])

lgbm_params = {
    'model__n_estimators': [100, 200, 300],      # 3 valeurs
    'model__learning_rate': [0.01, 0.05, 0.1],   # 3 valeurs
    'model__max_depth': [3, 5, 7],               # 3 valeurs
    'model__num_leaves': [31, 50, 100],          # 3 valeurs
    'model__subsample': [0.8, 1.0]               # 2 valeurs
    # Total: 3 × 3 × 3 × 3 × 2 = 162 combinaisons
}

lgbm_grid = balanced_grid_search(
    lgbm_pipeline, lgbm_params,
    X_train, y_train,
    "LightGBM",
    cv_folds=3
)


print("RÉSULTATS DU GRIDSEARCH COMPLET")


# Afficher les meilleurs résultats
print("\n KNN:")
print(f"  Best Parameters : {knn_grid.best_params_}")
print(f"  Best recall: {knn_grid.best_score_:.4f}")

print("\n Neural Network (MLP):")
print(f"  Best Parameters :  {mlp_grid.best_params_}")
print(f"  Best recall: {mlp_grid.best_score_:.4f}")

print("\n LightGBM:")
print(f"  Best Parameters :  {lgbm_grid.best_params_}")
print(f"  Best recall: {lgbm_grid.best_score_:.4f}")

GRIDSEARCH - TARGET

 GridSearch pour K-Nearest Neighbors
   Nombre de combinaisons: 40
   Utilisation de 3000 échantillons
Fitting 3 folds for each of 40 candidates, totalling 120 fits
[CV] END clf__metric=euclidean, clf__n_neighbors=3, clf__p=1, clf__weights=distance; total time=   0.0s
[CV] END clf__metric=euclidean, clf__n_neighbors=3, clf__p=2, clf__weights=uniform; total time=   0.0s
[CV] END clf__metric=euclidean, clf__n_neighbors=5, clf__p=1, clf__weights=uniform; total time=   0.0s
[CV] END clf__metric=euclidean, clf__n_neighbors=7, clf__p=2, clf__weights=uniform; total time=   0.0s
[CV] END clf__metric=euclidean, clf__n_neighbors=9, clf__p=1, clf__weights=distance; total time=   0.0s
[CV] END clf__metric=euclidean, clf__n_neighbors=11, clf__p=1, clf__weights=distance; total time=   0.0s
[CV] END clf__metric=euclidean, clf__n_neighbors=11, clf__p=1, clf__weights=distance; total time=   0.0s
[CV] END clf__metric=manhattan, clf__n_neighbors=3, clf__p=2, clf__weights=uniform; tot

### Analysis of gridsearch optimization results : 

Optimal Parameters Found for Each Model:

1. K-Nearest Neighbors (KNN):

Best Recall:  0.9760 avec cv=3 folds
Optimal Parameters:

n_neighbors: 11   
weights: 'distance'    
metric: 'manhattan'    
p: 1        
Performance Insight: KNN maintains its high recall capability with optimized neighborhood parameters.

2. Neural Network (MLP):

Best Recall: 0.9773 avec cv=3 folds
Optimal Parameters:

hidden_layer_sizes: (64, 64, 32)    
activation: 'tanh'    
alpha: 0.001    
learning_rate_init: 0.01    
batch_size: 32     
Performance Insight: Neural Network achieves balanced performance with optimized architecture and regularization, providing both high recall and precision.
3. LightGBM:

Best Recall: 0.9314 avec cv=3 folds
Optimal Parameters:

n_estimators: 100    
learning_rate: 0.01     
max_depth: 3    
num_leaves: 31    
subsample: 0.8    

Performance Insight: With parameter tuning, LightGBM shows improved recall, though still slightly behind KNN and MLP for recall maximization.
