# 01/04/2023  LOOCV

Voici des explications pour chaque section de code:

1. Function read_and_split_data
- lire le fichier de données
- diviser l'ensemble de données en ensembles train et de test par LOOCV
- prétraiter la variable cible pour Convertir multi-classe en bi-classe(remplaçant 1 et 3 par 2, et 2 par 1).

2. Funtion evaluate_model_performance
- évaluer les performances du modèle sur l'ensemble de test, 
- compris la précision (accuracy), la matrice de confusion (confusion matrix), l'AUC et la valeur F1.

3. Pipeline:
   Définissez l'objet Pipeline, qui contient le classificateur (DecisionTreeClassifier, Random Forest, SVC, Logistic Regression).

4. Param_grid: 
   Définissez l'espace de paramètres hyperparamétriques param_grid.

5. GridSearchCV:
   Utilisez l'objet GridSearchCV pour rechercher la meilleure combinaison de paramètres hyperparamétriques, en utilisant une validation croisée LOOCV et une recherche en grille pour évaluer la précision.

## Les résultats de l'exécution des quatre modèles sont les suivants :

# Logistic Regression

In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, LeaveOneOut
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score, f1_score

# Define a function to read and split the dataset
def read_and_split_data_(file_path, test_size=0.2, random_state=40):
    data = pd.read_csv(file_path)
    # Copy target variable
    y = data['session'].copy()
    # Replace 1 and 3 with 2, and 2 with 1 in y
    y.replace({1: 2, 2: 1, 3: 2}, inplace=True)
    # Split the dataset into a training set and a test set
    X_train, X_test, y_train, y_test = train_test_split(data.iloc[:, :-1], y, test_size=test_size, 
                                                        stratify=y, random_state=random_state)
    return X_train, X_test, y_train, y_test

def read_and_split_data(file_path, test_size=0.2, random_state=40):
    data = pd.read_csv(file_path)

    y_binary = data['session'].copy()
    y_binary.replace({1: 2, 2: 1, 3: 2}, inplace=True)

    # Divided with same IDs split together
    unique_ids = data['ID'].unique()
    test_ids = set(pd.Series(unique_ids).sample(frac=0.2, random_state=40))
    train_data = data[data['ID'].apply(lambda x: x not in test_ids)]
    test_data = data[data['ID'].apply(lambda x: x in test_ids)]
    X_train, y_train = train_data.iloc[:, :-1], y_binary[train_data.index]
    X_test, y_test = test_data.iloc[:, :-1], y_binary[test_data.index]

    return X_train, X_test, y_train, y_test


# Define a function to evaluate model performance
def evaluate_model_performance(model, X_test, y_test):
    y_pred = model.predict(X_test)
    f1 = f1_score(y_test, y_pred, average='weighted', labels=[1, 2])

    print('Accuracy: ', accuracy_score(y_test, y_pred))
    print('Accuracy for class Stress: {:.3f}'.format(accuracy_score(y_test[y_test==1], y_pred[y_test==1])))
    print('Accuracy for class Relax: {:.3f}'.format(accuracy_score(y_test[y_test==2], y_pred[y_test==2])))
    print('\nConfusion Matrix: \n', confusion_matrix(y_test, y_pred, normalize='true'))
    print('\nAUC: ', roc_auc_score(y_test, y_pred))
    print('\nF1-score for class Stress: {:.3f}'.format(f1_score(y_test[y_test==1], y_pred[y_test==1], average='weighted')))
    print('F1-score for class Relax: {:.3f}'.format(f1_score(y_test[y_test==2], y_pred[y_test==2], average='weighted')))
    print('F1-score:', f1_score(y_test, y_pred))


# Define a pipeline to link data preprocessing and model training and evaluation
pipeline = Pipeline([
    ('classifier', LogisticRegression(random_state=40))
])

# Define hyperparameter space
param_grid = {
    'classifier__penalty': ['l1', 'l2', 'elasticnet', 'none'],
    'classifier__C': [0.001, 0.01, 0.1, 1, 10, 100],
    'classifier__solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']
}

# Use GridSearchCV to find the best hyperparameters
grid_search = GridSearchCV(
    pipeline,
    param_grid=param_grid,
    cv=LeaveOneOut(), # LeaveOneOut
    scoring='accuracy',
    n_jobs=-1,
)

# Read and split the dataset
X_train, X_test, y_train, y_test = read_and_split_data('HRV_ECG_step60.csv')

# Fit GridSearchCV object on the training dataset
grid_search.fit(X_train, y_train)

# Print best hyperparameters
print('Best parameters:', grid_search.best_params_)

# Fit LogisticRegression object with best hyperparameters on the training dataset
best_model = grid_search.best_estimator_
best_model.fit(X_train, y_train)

# Evaluate model performance
evaluate_model_performance(best_model, X_test, y_test) 

83430 fits failed out of a total of 185400.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
9270 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\huxua\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\huxua\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\pipeline.py", line 405, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "c:\Users\huxua\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\linear_model\_logistic.py", line 1162, in fit
    solver = _check_solver(self.solver, self.penalty, self.du

Best parameters: {'classifier__C': 1, 'classifier__penalty': 'l1', 'classifier__solver': 'liblinear'}
Accuracy:  0.6764705882352942
Accuracy for class Stress: 0.573
Accuracy for class Relax: 0.757

Confusion Matrix: 
 [[0.57303371 0.42696629]
 [0.24347826 0.75652174]]

AUC:  0.6647777234978016

F1-score for class Stress: 0.729
F1-score for class Relax: 0.861
F1-score: 0.6071428571428571




# Decision Tree

In [1]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV, LeaveOneOut
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score, f1_score

# Define a function to read and split the dataset
def read_and_split_data(file_path, test_size=0.2, random_state=40):
    data = pd.read_csv(file_path)

    y_binary = data['session'].copy()
    y_binary.replace({1: 2, 2: 1, 3: 2}, inplace=True)

    # Divided with same IDs split together
    unique_ids = data['ID'].unique()
    test_ids = set(pd.Series(unique_ids).sample(frac=0.2, random_state=40))
    train_data = data[data['ID'].apply(lambda x: x not in test_ids)]
    test_data = data[data['ID'].apply(lambda x: x in test_ids)]
    X_train, y_train = train_data.iloc[:, :-1], y_binary[train_data.index]
    X_test, y_test = test_data.iloc[:, :-1], y_binary[test_data.index]

    return X_train, X_test, y_train, y_test


# Define a function to evaluate model performance
def evaluate_model_performance(model, X_test, y_test):
    y_pred = model.predict(X_test)
    f1 = f1_score(y_test, y_pred, average='weighted', labels=[1, 2])

    print('Accuracy: ', accuracy_score(y_test, y_pred))
    print('Accuracy for class Stress: {:.3f}'.format(accuracy_score(y_test[y_test==1], y_pred[y_test==1])))
    print('Accuracy for class Relax: {:.3f}'.format(accuracy_score(y_test[y_test==2], y_pred[y_test==2])))
    print('\nConfusion Matrix: \n', confusion_matrix(y_test, y_pred, normalize='true'))
    print('\nAUC: ', roc_auc_score(y_test, y_pred))
    print('\nF1-score for class Stress: {:.3f}'.format(f1_score(y_test[y_test==1], y_pred[y_test==1], average='weighted')))
    print('F1-score for class Relax: {:.3f}'.format(f1_score(y_test[y_test==2], y_pred[y_test==2], average='weighted')))
    print('F1-score:', f1_score(y_test, y_pred))


# Define a pipeline to link data preprocessing and model training and evaluation
pipeline = Pipeline([
    ('classifier', DecisionTreeClassifier(random_state=40))
])

# Define hyperparameter space
param_grid = {
    'classifier__max_depth': [2, 5, 10, 15],
    'classifier__min_samples_split': [2, 5, 10],
    'classifier__min_samples_leaf': [1, 2, 4]
}

# Use GridSearchCV to find the best hyperparameters
grid_search = GridSearchCV(
    pipeline,
    param_grid=param_grid,
    cv=LeaveOneOut(), # LeaveOneOut
    scoring='accuracy',
    n_jobs=-1,
)

# Read and split the dataset
X_train, X_test, y_train, y_test = read_and_split_data('HRV_ECG_step60.csv')

# Fit GridSearchCV object on the training dataset
grid_search.fit(X_train, y_train)

# Print best hyperparameters
print('Best parameters:', grid_search.best_params_)

# Fit DecisionTreeClassifier object with best hyperparameters on the training dataset
best_model = grid_search.best_estimator_
best_model.fit(X_train, y_train)

# Evaluate model performance
evaluate_model_performance(best_model, X_test, y_test)

Best parameters: {'classifier__max_depth': 15, 'classifier__min_samples_leaf': 1, 'classifier__min_samples_split': 2}
Accuracy:  0.6372549019607843
Accuracy for class Stress: 0.629
Accuracy for class Relax: 0.643

Confusion Matrix: 
 [[0.62921348 0.37078652]
 [0.35652174 0.64347826]]

AUC:  0.6363458720078162

F1-score for class Stress: 0.772
F1-score for class Relax: 0.783
F1-score: 0.6021505376344086


# SVC

In [2]:
import pandas as pd
import numpy as np
from sklearn.svm import SVC
from sklearn.model_selection import RandomizedSearchCV, LeaveOneOut
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score, f1_score
from sklearn.preprocessing import StandardScaler


# Define a function to read and split the dataset
def read_and_split_data_(file_path, test_size=0.2, random_state=40):
    data = pd.read_csv(file_path)
    # Copy target variable
    y = data['session'].copy()
    # Replace 1 and 3 with 2, and 2 with 1 in y
    y.replace({1: 2, 2: 1, 3: 2}, inplace=True)
    # Split the dataset into a training set and a test set
    X_train, X_test, y_train, y_test = train_test_split(data.iloc[:, :-1], y, test_size=test_size, 
                                                        stratify=y, random_state=random_state)
    return X_train, X_test, y_train, y_test


def read_and_split_data(file_path, test_size=0.2, random_state=40):
    data = pd.read_csv(file_path)

    y_binary = data['session'].copy()
    y_binary.replace({1: 2, 2: 1, 3: 2}, inplace=True)

    # Divided with same IDs split together
    unique_ids = data['ID'].unique()
    test_ids = set(pd.Series(unique_ids).sample(frac=0.2, random_state=40))
    train_data = data[data['ID'].apply(lambda x: x not in test_ids)]
    test_data = data[data['ID'].apply(lambda x: x in test_ids)]
    X_train, y_train = train_data.iloc[:, :-1], y_binary[train_data.index]
    X_test, y_test = test_data.iloc[:, :-1], y_binary[test_data.index]

    return X_train, X_test, y_train, y_test


# Define a function to evaluate model performance
def evaluate_model_performance(model, X_test, y_test):
    y_pred = model.predict(X_test)
    f1 = f1_score(y_test, y_pred, average='weighted', labels=[1, 2])

    print('Accuracy: ', accuracy_score(y_test, y_pred))
    print('Accuracy for class Stress: {:.3f}'.format(accuracy_score(y_test[y_test==1], y_pred[y_test==1])))
    print('Accuracy for class Relax: {:.3f}'.format(accuracy_score(y_test[y_test==2], y_pred[y_test==2])))
    print('\nConfusion Matrix: \n', confusion_matrix(y_test, y_pred, normalize='true'))
    print('\nAUC: ', roc_auc_score(y_test, y_pred))
    print('\nF1-score for class Stress: {:.3f}'.format(f1_score(y_test[y_test==1], y_pred[y_test==1], average='weighted')))
    print('F1-score for class Relax: {:.3f}'.format(f1_score(y_test[y_test==2], y_pred[y_test==2], average='weighted')))
    print('F1-score:', f1_score(y_test, y_pred))


# Define a pipeline to link data preprocessing and model training and evaluation
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('classifier', SVC(random_state=40))
])

# Define hyperparameter distributions for random search
param_distributions = {
    'classifier__C': np.logspace(-3, 3, 7),
    'classifier__kernel': ['linear', 'rbf'],
    'classifier__gamma': ['scale', 'auto'] + list(np.logspace(-3, 3, 7)),
}

# Use RandomizedSearchCV to find the best hyperparameters
random_search = RandomizedSearchCV(
    pipeline,
    param_distributions=param_distributions,
    cv=LeaveOneOut(), # LeaveOneOut
    scoring='accuracy',
    n_jobs=-1,
    n_iter=50, # number of random search iterations
    random_state=40,
)

# Read and split the dataset
X_train, X_test, y_train, y_test = read_and_split_data('HRV_ECG_step60.csv')

# Fit RandomizedSearchCV object on the training dataset
random_search.fit(X_train, y_train)

# Print best hyperparameters
print('Best parameters:', random_search.best_params_)

# Fit SVM object with best hyperparameters on the training dataset
best_model = random_search.best_estimator_
best_model.fit(X_train, y_train)

# Evaluate model performance
evaluate_model_performance(best_model, X_test, y_test)

Best parameters: {'classifier__kernel': 'rbf', 'classifier__gamma': 0.1, 'classifier__C': 10.0}
Accuracy:  0.5686274509803921
Accuracy for class Stress: 0.534
Accuracy for class Relax: 0.596

Confusion Matrix: 
 [[0.53370787 0.46629213]
 [0.40434783 0.59565217]]

AUC:  0.5646800195407915

F1-score for class Stress: 0.696
F1-score for class Relax: 0.747
F1-score: 0.5191256830601093


# Random forest

In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, LeaveOneOut
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score, f1_score

# Define a function to read and split the dataset
def read_and_split_data_(file_path, test_size=0.2, random_state=40):
    data = pd.read_csv(file_path)
    # Copy target variable
    y = data['session'].copy()
    # Replace 1 and 3 with 2, and 2 with 1 in y
    y.replace({1: 2, 2: 1, 3: 2}, inplace=True)
    # Split the dataset into a training set and a test set
    X_train, X_test, y_train, y_test = train_test_split(data.iloc[:, :-1], y, test_size=test_size, 
                                                        stratify=y, random_state=random_state)
    return X_train, X_test, y_train, y_test

def read_and_split_data(file_path, test_size=0.2, random_state=40):
    data = pd.read_csv(file_path)

    y_binary = data['session'].copy()
    y_binary.replace({1: 2, 2: 1, 3: 2}, inplace=True)

    # Divided with same IDs split together
    unique_ids = data['ID'].unique()
    test_ids = set(pd.Series(unique_ids).sample(frac=0.2, random_state=40))
    train_data = data[data['ID'].apply(lambda x: x not in test_ids)]
    test_data = data[data['ID'].apply(lambda x: x in test_ids)]
    X_train, y_train = train_data.iloc[:, :-1], y_binary[train_data.index]
    X_test, y_test = test_data.iloc[:, :-1], y_binary[test_data.index]

    return X_train, X_test, y_train, y_test


# Define a function to evaluate model performance
def evaluate_model_performance(model, X_test, y_test):
    y_pred = model.predict(X_test)
    f1 = f1_score(y_test, y_pred, average='weighted', labels=[1, 2])

    print('Accuracy: ', accuracy_score(y_test, y_pred))
    print('Accuracy for class Stress: {:.3f}'.format(accuracy_score(y_test[y_test==1], y_pred[y_test==1])))
    print('Accuracy for class Relax: {:.3f}'.format(accuracy_score(y_test[y_test==2], y_pred[y_test==2])))
    print('\nConfusion Matrix: \n', confusion_matrix(y_test, y_pred, normalize='true'))
    print('\nAUC: ', roc_auc_score(y_test, y_pred))
    print('\nF1-score for class Stress: {:.3f}'.format(f1_score(y_test[y_test==1], y_pred[y_test==1], average='weighted')))
    print('F1-score for class Relax: {:.3f}'.format(f1_score(y_test[y_test==2], y_pred[y_test==2], average='weighted')))
    print('F1-score:', f1_score(y_test, y_pred))


# Define a pipeline to link data preprocessing and model training and evaluation
pipeline = Pipeline([
    ('classifier', RandomForestClassifier(random_state=40))
])

# Define hyperparameter space
param_grid = {
    'classifier__n_estimators': [100, 300, 500],
    'classifier__max_depth': [2, 5, 10, 15],
    'classifier__min_samples_split': [2, 5, 10],
    'classifier__min_samples_leaf': [1, 2, 4]
}

# Use GridSearchCV to find the best hyperparameters
grid_search = GridSearchCV(
    pipeline,
    param_grid=param_grid,
    cv=LeaveOneOut(), # LeaveOneOut
    scoring='accuracy',
    n_jobs=-1,
)

# Read and split the dataset
X_train, X_test, y_train, y_test = read_and_split_data('HRV_ECG_step60.csv')

# Fit GridSearchCV object on the training dataset
grid_search.fit(X_train, y_train)

# Print best hyperparameters
print('Best parameters:', grid_search.best_params_)

# Fit RandomForestClassifier object with best hyperparameters on the training dataset
best_model = grid_search.best_estimator_
best_model.fit(X_train, y_train)

# Evaluate model performance
evaluate_model_performance(best_model, X_test, y_test)

Best parameters: {'classifier__max_depth': 15, 'classifier__min_samples_leaf': 1, 'classifier__min_samples_split': 2, 'classifier__n_estimators': 500}
Accuracy:  0.6348039215686274
Accuracy for class Stress: 0.629
Accuracy for class Relax: 0.639

Confusion Matrix: 
 [[0.62921348 0.37078652]
 [0.36086957 0.63913043]]

AUC:  0.6341719589643381

F1-score for class Stress: 0.772
F1-score for class Relax: 0.780
F1-score: 0.6005361930294906
