### PART 2

#### Test and compare 3 to 4 different non-linear models (more can be tried, but only 3 to 4 go in the report); avoid picking 3 models from the same 'family'.

In [None]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import root_mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from sklearn.inspection import permutation_importance 
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsRegressor

In [None]:
# Root folders of the labeled and unlabeled datasets
labeled_data_path = '../data/data_labeled/'
unlabeled_data_path = '../data/data_unlabeled/'

# Image folders
# NOTE(review): the two `*_unlabeled_path` names below point inside the
# *labeled* data folder — the names look misleading; confirm before renaming,
# since later cells may rely on them.
img_test_unlabeled_path = '../data/data_labeled/Img_test/'
img_train_unlabeled_path = '../data/data_labeled/Img_train/'
X_img_path = '../data/data_unlabeled/Img/'

# Tabular features (first CSV row is used as the header)
X_train = pd.read_csv(f'{labeled_data_path}X_train.csv')
X_test = pd.read_csv(f'{labeled_data_path}X_test.csv')
X_unlabeled = pd.read_csv(f'{unlabeled_data_path}X.csv')

# Targets are read with header=None, i.e. every row is treated as data
y_train = pd.read_csv(f'{labeled_data_path}y_train.csv', header=None)
y_test = pd.read_csv(f'{labeled_data_path}y_test.csv', header=None)

X_test.head()

Unnamed: 0,age,blood pressure,calcium,cholesterol,hemoglobin,height,potassium,profession,sarsaparilla,smurfberry liquor,smurfin donuts,vitamin D,weight,img_filename
0,85,106.92,2.25,120.85,14.9,7.41,3.26,resource extraction,Low,High,Moderate,36.27,93.2,heart_0.png
1,130,94.73,2.61,121.74,16.88,7.53,4.74,manufacturing,Very high,Very high,Moderate,25.97,105.24,heart_6.png
2,180,110.31,2.47,75.35,15.52,8.02,4.29,manufacturing,High,Moderate,Very low,37.53,100.61,heart_7.png
3,78,95.46,2.28,125.3,11.68,7.75,4.17,services,Very low,Low,Moderate,28.19,103.94,heart_10.png
4,116,106.13,2.11,89.98,14.83,7.89,4.73,resource extraction,Low,Very high,Low,30.07,82.53,heart_11.png


In [None]:

# Keep untouched copies of the raw frames.
# NOTE: despite the `_clean` suffix, these hold the data *before* any cleaning
# (img_filename and the text categories are still present).
X_train_clean = X_train.copy()
X_test_clean = X_test.copy()
X_unlabeled_clean = X_unlabeled.copy()

# The image file name is not used as a tabular feature
X_train = X_train.drop(columns=['img_filename'])
X_test = X_test.drop(columns=['img_filename'])
X_unlabeled = X_unlabeled.drop(columns=['img_filename'])

# One-hot encoding of the profession
X_train = pd.get_dummies(X_train, columns=['profession'])
X_test = pd.get_dummies(X_test, columns=['profession'])
X_unlabeled = pd.get_dummies(X_unlabeled, columns=['profession'])

# get_dummies only creates columns for the categories present in each frame.
# Align test/unlabeled on the training columns so every frame has the same
# features in the same order: a profession missing from a frame becomes an
# all-False column, and a profession unseen in training is dropped.
X_test = X_test.reindex(columns=X_train.columns, fill_value=False)
X_unlabeled = X_unlabeled.reindex(columns=X_train.columns, fill_value=False)

# Ordinal encoding of consumption features
consumption_map = {
    'Very low': 1,
    'Low': 2,
    'Moderate': 3,
    'High': 4,
    'Very high': 5
}

# Values absent from consumption_map are turned into NaN by .map()
for col in ['sarsaparilla', 'smurfberry liquor', 'smurfin donuts']:
    X_train[col] = X_train[col].map(consumption_map)
    X_test[col] = X_test[col].map(consumption_map)
    X_unlabeled[col] = X_unlabeled[col].map(consumption_map)

# Split training data into train and validation sets
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42
)

print("Train subset:", X_train.shape)
print("Validation set:", X_val.shape)


# Standardize numerical features
numerical_features = [
    'age', 'blood pressure', 'calcium', 'cholesterol',
    'hemoglobin', 'height', 'potassium',
    'vitamin D', 'weight'
]

# Unscaled snapshots of every split, taken right before standardization
X_train_original = X_train.copy()
X_val_original = X_val.copy()
X_test_original = X_test.copy()
X_unlabeled_original = X_unlabeled.copy()

# The scaler is fitted on the training subset only, then applied unchanged to
# the validation, test and unlabeled frames.
scaler = StandardScaler()
X_train[numerical_features] = scaler.fit_transform(X_train[numerical_features])
X_val[numerical_features] = scaler.transform(X_val[numerical_features])
X_test[numerical_features] = scaler.transform(X_test[numerical_features])
X_unlabeled[numerical_features] = scaler.transform(X_unlabeled[numerical_features])
X_test.head()


Train subset: (800, 18)
Validation set: (200, 18)


Unnamed: 0,age,blood pressure,calcium,cholesterol,hemoglobin,height,potassium,sarsaparilla,smurfberry liquor,smurfin donuts,vitamin D,weight,profession_administration and governance,profession_craftsmanship,profession_food production,profession_manufacturing,profession_resource extraction,profession_services
0,-1.088731,-0.153842,-0.9658,0.382095,0.328558,-0.463442,-2.107149,2,4,3,1.202304,-0.726562,False,False,False,False,True,False
1,0.404898,-1.246817,0.777981,0.42291,1.290066,-0.161914,0.864484,5,5,3,-0.849324,-0.021621,False,False,False,True,False,False
2,2.064486,0.15011,0.099844,-1.704512,0.629636,1.069325,-0.039053,4,3,1,1.45328,-0.292707,False,False,False,True,False,False
3,-1.321073,-1.181364,-0.820485,0.58617,-1.235107,0.390887,-0.279996,1,2,3,-0.407129,-0.097736,False,False,False,False,False,True
4,-0.059787,-0.224675,-1.643937,-1.033588,0.294565,0.74267,0.844406,2,5,2,-0.032657,-1.351289,False,False,False,False,True,False


In [None]:
def compute_rmse(y_pred, y_true):
    """Return the root mean squared error between predictions and targets.

    Note the argument order: predictions come first, ground truth second.
    """
    rmse = root_mean_squared_error(y_true=y_true, y_pred=y_pred)
    return rmse

def optimize_model(model, params, X_train, y_train, scoring_metric='neg_root_mean_squared_error', cv=5):
    """
    Tune a model's hyperparameters with a cross-validated grid search.

    Parameters
    ----------
    model : estimator
        Unfitted scikit-learn estimator to tune.
    params : dict
        Parameter grid, passed to GridSearchCV as ``param_grid``.
    X_train : array-like
        Training features.
    y_train : array-like
        Training targets. A single-column 2-D target (e.g. a one-column
        DataFrame) is flattened to 1-D before fitting.
    scoring_metric : str, default 'neg_root_mean_squared_error'
        Scoring name passed to GridSearchCV. The returned score is the
        sign-flipped best score, so it is only a positive error value when a
        negated ("neg_*") error metric is used.
    cv : int, default 5
        Number of cross-validation folds.

    Returns
    -------
    tuple
        ``(best_estimator, best_score)``: the best estimator found by the
        search and the sign-flipped mean cross-validation score of the best
        parameter combination.
    """
    # Flatten column-vector targets: several regressors emit a
    # DataConversionWarning on every single fit when y has shape (n, 1).
    y_fit = np.asarray(y_train)
    if y_fit.ndim == 2 and y_fit.shape[1] == 1:
        y_fit = y_fit.ravel()

    # GridSearchCV uses cross-validation to find the best parameter combination
    grid_search = GridSearchCV(
        estimator=model,
        param_grid=params,
        scoring=scoring_metric,
        cv=cv,
        n_jobs=-1,
        verbose=1
    )

    # Fit every parameter combination on every fold
    grid_search.fit(X_train, y_fit)

    # The score is negated because GridSearchCV maximizes a negative error
    best_score = -grid_search.best_score_
    print(f"Meilleurs paramètres trouvés: {grid_search.best_params_}")
    print(f"RMSE de Cross-Validation (Moyenne): {best_score:.4f}")

    return grid_search.best_estimator_, best_score



In [None]:
print("\n--- Optimisation du Random Forest ---")
rf_params = {
    'n_estimators': [100, 200],
    'max_depth': [5, 10, None],  # None = no depth limit on the trees
    'min_samples_split': [2, 5]
}

# Tune on the scaled training subset built in the preprocessing cell.
# (The previous names X_train_full_scaled_df / y_train_full / X_test_scaled_df
# are not defined anywhere in the notebook and raise NameError on a fresh run.)
# y_train is a one-column DataFrame, so it is flattened to 1-D for fitting.
best_rf_model, cv_rmse_rf = optimize_model(
    RandomForestRegressor(random_state=42),
    rf_params,
    X_train,
    np.ravel(y_train),
)

# Final evaluation on the held-out test set
rf_test_pred = best_rf_model.predict(X_test)
rmse_test_rf = compute_rmse(rf_test_pred, y_test)
print(f"Random Forest RMSE sur le jeu de TEST: {rmse_test_rf:.4f}")

In [None]:
print("\n--- Optimisation du K-NN ---")
knn_params = {
    'n_neighbors': [3, 5, 7, 9],
    'weights': ['uniform', 'distance'],  # uniform weights, or weighted by inverse distance
    'p': [1, 2]  # 1 = Manhattan distance, 2 = Euclidean distance
}

# Tune on the scaled training subset built in the preprocessing cell.
# (The previous names X_train_full_scaled_df / y_train_full / X_test_scaled_df
# are not defined anywhere in the notebook and raise NameError on a fresh run.)
# y_train is a one-column DataFrame, so it is flattened to 1-D for fitting.
best_knn_model, cv_rmse_knn = optimize_model(
    KNeighborsRegressor(),
    knn_params,
    X_train,
    np.ravel(y_train),
)

# Final evaluation on the held-out test set
knn_test_pred = best_knn_model.predict(X_test)
rmse_test_knn = compute_rmse(knn_test_pred, y_test)
print(f"K-NN RMSE sur le jeu de TEST: {rmse_test_knn:.4f}")

In [None]:
print("\n--- Optimisation du SVR ---")
svr_params = {
    'C': [0.1, 1, 10],  # regularization parameter
    'gamma': ['scale', 'auto'],  # kernel coefficient
    'kernel': ['rbf']
}

# Tune on the scaled training subset built in the preprocessing cell.
# (The previous names X_train_full_scaled_df / y_train_full / X_test_scaled_df
# are not defined anywhere in the notebook and raise NameError on a fresh run.)
# y_train is a one-column DataFrame, so it is flattened to 1-D for fitting.
best_svr_model, cv_rmse_svr = optimize_model(
    SVR(),
    svr_params,
    X_train,
    np.ravel(y_train),
)

# Final evaluation on the held-out test set
svr_test_pred = best_svr_model.predict(X_test)
rmse_test_svr = compute_rmse(svr_test_pred, y_test)
print(f"SVR RMSE sur le jeu de TEST: {rmse_test_svr:.4f}")

In [None]:
print("\n--- Optimisation du MLP (Simple) ---")
mlp_params = {
    'hidden_layer_sizes': [(50,), (100,), (50, 25)],
    'activation': ['relu', 'tanh'],
    'alpha': [0.0001, 0.001],  # L2 regularization term
    'max_iter': [300]
}

# The MLP benefits from scaled inputs (already done in the preprocessing cell)
# and is sensitive to weight initialization, hence the fixed random_state.
# Tune on the scaled training subset built in the preprocessing cell.
# (The previous names X_train_full_scaled_df / y_train_full / X_test_scaled_df
# are not defined anywhere in the notebook and raise NameError on a fresh run.)
# y_train is a one-column DataFrame, so it is flattened to 1-D for fitting.
best_mlp_model, cv_rmse_mlp = optimize_model(
    MLPRegressor(random_state=42),
    mlp_params,
    X_train,
    np.ravel(y_train),
)

# Final evaluation on the held-out test set
mlp_test_pred = best_mlp_model.predict(X_test)
rmse_test_mlp = compute_rmse(mlp_test_pred, y_test)
print(f"MLP RMSE sur le jeu de TEST: {rmse_test_mlp:.4f}")

In [None]:
# Gather the tuned models with their cross-validation and test scores
results = {
    "Random Forest": {'model': best_rf_model, 'cv_rmse': cv_rmse_rf, 'rmse': rmse_test_rf},
    "K-NN": {'model': best_knn_model, 'cv_rmse': cv_rmse_knn, 'rmse': rmse_test_knn},
    "SVR": {'model': best_svr_model, 'cv_rmse': cv_rmse_svr, 'rmse': rmse_test_svr},
    "MLP": {'model': best_mlp_model, 'cv_rmse': cv_rmse_mlp, 'rmse': rmse_test_mlp},
}

# Select the best model on the cross-validation RMSE, not on the test RMSE:
# choosing the winner on the test set would make the reported test score
# optimistically biased. The test RMSE is only reported for the chosen model.
best_model_name = min(results, key=lambda k: results[k]['cv_rmse'])
best_model = results[best_model_name]['model']

print("\n=======================================================")
print(
    f"Le meilleur modèle non linéaire global est: {best_model_name} "
    f"avec RMSE CV = {results[best_model_name]['cv_rmse']:.4f} "
    f"(RMSE test = {results[best_model_name]['rmse']:.4f})"
)
print("=======================================================")

# --- PERMUTATION IMPORTANCE ANALYSIS ---
# Applied to the best overall model, on the scaled test set from the
# preprocessing cell. (The previous name X_test_scaled_df is not defined
# anywhere in the notebook and raises NameError on a fresh run.)
print(f"\n--- Importance par Permutation pour le meilleur modèle: {best_model_name} ---")

result = permutation_importance(
    best_model,
    X_test,
    np.ravel(y_test),  # y_test is a one-column DataFrame; flatten to 1-D
    n_repeats=30,  # more repetitions for more stable estimates
    random_state=42,
    n_jobs=-1
)

# Build a DataFrame sorted by decreasing mean importance
sorted_idx = result.importances_mean.argsort()[::-1]
feature_importance_df = pd.DataFrame({
    'Feature': X_test.columns[sorted_idx],
    'Importance_Mean': result.importances_mean[sorted_idx],
    'Importance_Std': result.importances_std[sorted_idx]
})

print(feature_importance_df)

#