# 24/03/2023

Voici des explications pour chaque section de code:

1. Function read_and_split_data
- lire le fichier de données
- diviser l'ensemble de données en ensembles train et de test
- prétraiter la variable cible pour Convertir multi-classe en bi-classe(remplaçant 1 et 3 par 2, et 2 par 1).

2. Funtion evaluate_model_performance
- évaluer les performances du modèle sur l'ensemble de test, 
- compris la précision (accuracy), la matrice de confusion (confusion matrix), l'AUC et la valeur F1.

3. Pipeline:
   Définissez l'objet Pipeline, qui contient le classificateur (DecisionTreeClassifier, Random Forest, SVC, Logistic Regression).

4. Param_grid: 
   Définissez l'espace de paramètres hyperparamétriques param_grid.

5. GridSearchCV:
   Utilisez l'objet GridSearchCV pour rechercher la meilleure combinaison de paramètres hyperparamétriques, en utilisant une validation croisée à 5 plis et une recherche en grille pour évaluer la précision.

## Les résultats de l'exécution des quatre modèles sont les suivants :

# Logistic Regression

In [1]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score, f1_score

# Define a function to read and split the dataset
def read_and_split_data(file_path, test_size=0.2, random_state=40):
    data = pd.read_csv(file_path)

    y_binary = data['session'].copy()
    y_binary.replace({1: 2, 2: 1, 3: 2}, inplace=True)

    # Divided with same IDs split together
    unique_ids = data['ID'].unique()
    test_ids = set(pd.Series(unique_ids).sample(frac=0.2, random_state=40))
    train_data = data[data['ID'].apply(lambda x: x not in test_ids)]
    test_data = data[data['ID'].apply(lambda x: x in test_ids)]
    X_train, y_train = train_data.iloc[:, :-1], y_binary[train_data.index]
    X_test, y_test = test_data.iloc[:, :-1], y_binary[test_data.index]

    return X_train, X_test, y_train, y_test

# Define a function to evaluate model performance
def evaluate_model_performance(model, X_test, y_test):
    y_pred = model.predict(X_test)
    f1 = f1_score(y_test, y_pred, average='weighted', labels=[1, 2])

    print('Accuracy: ', accuracy_score(y_test, y_pred))
    print('Accuracy for class Stress: {:.3f}'.format(accuracy_score(y_test[y_test==1], y_pred[y_test==1])))
    print('Accuracy for class Relax: {:.3f}'.format(accuracy_score(y_test[y_test==2], y_pred[y_test==2])))
    print('\nConfusion Matrix: \n', confusion_matrix(y_test, y_pred, normalize='true'))
    print('\nAUC: ', roc_auc_score(y_test, y_pred))
    print('\nF1-score for class Stress: {:.3f}'.format(f1_score(y_test[y_test==1], y_pred[y_test==1], average='weighted')))
    print('F1-score for class Relax: {:.3f}'.format(f1_score(y_test[y_test==2], y_pred[y_test==2], average='weighted')))
    print('F1-score:', f1_score(y_test, y_pred))


# Define a pipeline to link data preprocessing and model training and evaluation
pipeline = Pipeline([
    ('classifier', DecisionTreeClassifier(random_state=40))
])

# Define hyperparameter space
param_grid = {
    'classifier__max_depth': [2, 5, 10, 15],
    'classifier__min_samples_split': [2, 5, 10],
    'classifier__min_samples_leaf': [1, 2, 4]
}

# Use GridSearchCV to find the best hyperparameters
grid_search = GridSearchCV(
    pipeline,
    param_grid=param_grid,
    cv=5, # 5-fold cross-validation
    scoring='accuracy',
    n_jobs=-1,
)

# Read and split the dataset
X_train, X_test, y_train, y_test = read_and_split_data('HRV_ECG_step60.csv')

# Fit GridSearchCV object on the training dataset
grid_search.fit(X_train, y_train)

# Print best hyperparameters
print('Best parameters:', grid_search.best_params_)

# Fit DecisionTreeClassifier object with best hyperparameters on the training dataset
best_model = grid_search.best_estimator_
best_model.fit(X_train, y_train)

# Evaluate model performance
evaluate_model_performance(best_model, X_test, y_test)

Best parameters: {'classifier__max_depth': 10, 'classifier__min_samples_leaf': 2, 'classifier__min_samples_split': 5}
Accuracy:  0.5857843137254902
Accuracy for class Stress: 0.556
Accuracy for class Relax: 0.609

Confusion Matrix: 
 [[0.55617978 0.44382022]
 [0.39130435 0.60869565]]

AUC:  0.582437713727406

F1-score for class Stress: 0.715
F1-score for class Relax: 0.757
F1-score: 0.5395095367847412


# Decision Tree

In [2]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score, f1_score

# Define a function to read and split the dataset
def read_and_split_data(file_path, test_size=0.2, random_state=40):
    data = pd.read_csv(file_path)

    y_binary = data['session'].copy()
    y_binary.replace({1: 2, 2: 1, 3: 2}, inplace=True)

    # Divided with same IDs split together
    unique_ids = data['ID'].unique()
    test_ids = set(pd.Series(unique_ids).sample(frac=0.2, random_state=40))
    train_data = data[data['ID'].apply(lambda x: x not in test_ids)]
    test_data = data[data['ID'].apply(lambda x: x in test_ids)]
    X_train, y_train = train_data.iloc[:, :-1], y_binary[train_data.index]
    X_test, y_test = test_data.iloc[:, :-1], y_binary[test_data.index]

    return X_train, X_test, y_train, y_test


# Define a function to evaluate model performance
def evaluate_model_performance(model, X_test, y_test):
    y_pred = model.predict(X_test)
    f1 = f1_score(y_test, y_pred, average='weighted', labels=[1, 2])

    print('Accuracy: ', accuracy_score(y_test, y_pred))
    print('Accuracy for class Stress: {:.3f}'.format(accuracy_score(y_test[y_test==1], y_pred[y_test==1])))
    print('Accuracy for class Relax: {:.3f}'.format(accuracy_score(y_test[y_test==2], y_pred[y_test==2])))
    print('\nConfusion Matrix: \n', confusion_matrix(y_test, y_pred, normalize='true'))
    print('\nAUC: ', roc_auc_score(y_test, y_pred))
    print('\nF1-score for class Stress: {:.3f}'.format(f1_score(y_test[y_test==1], y_pred[y_test==1], average='weighted')))
    print('F1-score for class Relax: {:.3f}'.format(f1_score(y_test[y_test==2], y_pred[y_test==2], average='weighted')))
    print('F1-score:', f1_score(y_test, y_pred))


# Define a pipeline to link data preprocessing and model training and evaluation
pipeline = Pipeline([
    ('classifier', DecisionTreeClassifier(random_state=40))
])

# Define hyperparameter space
param_grid = {
    'classifier__max_depth': [2, 5, 10, 15],
    'classifier__min_samples_split': [2, 5, 10],
    'classifier__min_samples_leaf': [1, 2, 4]
}

# Use GridSearchCV to find the best hyperparameters
grid_search = GridSearchCV(
    pipeline,
    param_grid=param_grid,
    cv=5, # 5-fold cross-validation
    scoring='accuracy',
    n_jobs=-1,
)

# Read and split the dataset
X_train, X_test, y_train, y_test = read_and_split_data('HRV_ECG_step60.csv')

# Fit GridSearchCV object on the training dataset
grid_search.fit(X_train, y_train)

# Print best hyperparameters
print('Best parameters:', grid_search.best_params_)

# Fit DecisionTreeClassifier object with best hyperparameters on the training dataset
best_model = grid_search.best_estimator_
best_model.fit(X_train, y_train)

# Evaluate model performance
evaluate_model_performance(best_model, X_test, y_test)

Best parameters: {'classifier__max_depth': 10, 'classifier__min_samples_leaf': 2, 'classifier__min_samples_split': 5}
Accuracy:  0.5857843137254902
Accuracy for class Stress: 0.556
Accuracy for class Relax: 0.609

Confusion Matrix: 
 [[0.55617978 0.44382022]
 [0.39130435 0.60869565]]

AUC:  0.582437713727406

F1-score for class Stress: 0.715
F1-score for class Relax: 0.757
F1-score: 0.5395095367847412


# SVC

In [3]:
import pandas as pd
import numpy as np
from sklearn.svm import SVC
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score, f1_score
from sklearn.preprocessing import StandardScaler


# Define a function to read and split the dataset
def read_and_split_data(file_path, test_size=0.2, random_state=40):
    data = pd.read_csv(file_path)

    y_binary = data['session'].copy()
    y_binary.replace({1: 2, 2: 1, 3: 2}, inplace=True)

    # Divided with same IDs split together
    unique_ids = data['ID'].unique()
    test_ids = set(pd.Series(unique_ids).sample(frac=0.2, random_state=40))
    train_data = data[data['ID'].apply(lambda x: x not in test_ids)]
    test_data = data[data['ID'].apply(lambda x: x in test_ids)]
    X_train, y_train = train_data.iloc[:, :-1], y_binary[train_data.index]
    X_test, y_test = test_data.iloc[:, :-1], y_binary[test_data.index]

    return X_train, X_test, y_train, y_test

# Define a function to evaluate model performance
def evaluate_model_performance(model, X_test, y_test):
    y_pred = model.predict(X_test)
    f1 = f1_score(y_test, y_pred, average='weighted', labels=[1, 2])

    print('Accuracy: ', accuracy_score(y_test, y_pred))
    print('Accuracy for class Stress: {:.3f}'.format(accuracy_score(y_test[y_test==1], y_pred[y_test==1])))
    print('Accuracy for class Relax: {:.3f}'.format(accuracy_score(y_test[y_test==2], y_pred[y_test==2])))
    print('\nConfusion Matrix: \n', confusion_matrix(y_test, y_pred, normalize='true'))
    print('\nAUC: ', roc_auc_score(y_test, y_pred))
    print('\nF1-score for class Stress: {:.3f}'.format(f1_score(y_test[y_test==1], y_pred[y_test==1], average='weighted')))
    print('F1-score for class Relax: {:.3f}'.format(f1_score(y_test[y_test==2], y_pred[y_test==2], average='weighted')))
    print('F1-score:', f1_score(y_test, y_pred))


# Define a pipeline to link data preprocessing and model training and evaluation
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('classifier', SVC(random_state=40))
])

# Define hyperparameter distributions for random search
param_distributions = {
    'classifier__C': np.logspace(-3, 3, 7),
    'classifier__kernel': ['linear', 'rbf'],
    'classifier__gamma': ['scale', 'auto'] + list(np.logspace(-3, 3, 7)),
}

# Use RandomizedSearchCV to find the best hyperparameters
random_search = RandomizedSearchCV(
    pipeline,
    param_distributions=param_distributions,
    cv=5, # 5-fold cross-validation
    scoring='accuracy',
    n_jobs=-1,
    n_iter=50, # number of random search iterations
    random_state=40,
)

# Read and split the dataset
X_train, X_test, y_train, y_test = read_and_split_data('HRV_ECG_step60.csv')

# Fit RandomizedSearchCV object on the training dataset
random_search.fit(X_train, y_train)

# Print best hyperparameters
print('Best parameters:', random_search.best_params_)

# Fit SVM object with best hyperparameters on the training dataset
best_model = random_search.best_estimator_
best_model.fit(X_train, y_train)

# Evaluate model performance
evaluate_model_performance(best_model, X_test, y_test)

Best parameters: {'classifier__kernel': 'rbf', 'classifier__gamma': 0.001, 'classifier__C': 10.0}
Accuracy:  0.6421568627450981
Accuracy for class Stress: 0.663
Accuracy for class Relax: 0.626

Confusion Matrix: 
 [[0.66292135 0.33707865]
 [0.37391304 0.62608696]]

AUC:  0.6445041524181729

F1-score for class Stress: 0.797
F1-score for class Relax: 0.770
F1-score: 0.6178010471204188


# Random forest

In [4]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score, f1_score

# Define a function to read and split the dataset
def read_and_split_data(file_path, test_size=0.2, random_state=40):
    data = pd.read_csv(file_path)

    y_binary = data['session'].copy()
    y_binary.replace({1: 2, 2: 1, 3: 2}, inplace=True)

    # Divided with same IDs split together
    unique_ids = data['ID'].unique()
    test_ids = set(pd.Series(unique_ids).sample(frac=0.2, random_state=40))
    train_data = data[data['ID'].apply(lambda x: x not in test_ids)]
    test_data = data[data['ID'].apply(lambda x: x in test_ids)]
    X_train, y_train = train_data.iloc[:, :-1], y_binary[train_data.index]
    X_test, y_test = test_data.iloc[:, :-1], y_binary[test_data.index]

    return X_train, X_test, y_train, y_test

# Define a function to evaluate model performance
def evaluate_model_performance(model, X_test, y_test):
    y_pred = model.predict(X_test)
    f1 = f1_score(y_test, y_pred, average='weighted', labels=[1, 2])

    print('Accuracy: ', accuracy_score(y_test, y_pred))
    print('Accuracy for class Stress: {:.3f}'.format(accuracy_score(y_test[y_test==1], y_pred[y_test==1])))
    print('Accuracy for class Relax: {:.3f}'.format(accuracy_score(y_test[y_test==2], y_pred[y_test==2])))
    print('\nConfusion Matrix: \n', confusion_matrix(y_test, y_pred, normalize='true'))
    print('\nAUC: ', roc_auc_score(y_test, y_pred))
    print('\nF1-score for class Stress: {:.3f}'.format(f1_score(y_test[y_test==1], y_pred[y_test==1], average='weighted')))
    print('F1-score for class Relax: {:.3f}'.format(f1_score(y_test[y_test==2], y_pred[y_test==2], average='weighted')))
    print('F1-score:', f1_score(y_test, y_pred))


# Define a pipeline to link data preprocessing and model training and evaluation
pipeline = Pipeline([
    ('classifier', RandomForestClassifier(random_state=40))
])

# Define hyperparameter space
param_grid = {
    'classifier__n_estimators': [100, 300, 500],
    'classifier__max_depth': [2, 5, 10, 15],
    'classifier__min_samples_split': [2, 5, 10],
    'classifier__min_samples_leaf': [1, 2, 4]
}

# Use GridSearchCV to find the best hyperparameters
grid_search = GridSearchCV(
    pipeline,
    param_grid=param_grid,
    cv=5, # 5-fold cross-validation
    scoring='accuracy',
    n_jobs=-1,
)

# Read and split the dataset
X_train, X_test, y_train, y_test = read_and_split_data('HRV_ECG_step60.csv')

# Fit GridSearchCV object on the training dataset
grid_search.fit(X_train, y_train)

# Print best hyperparameters
print('Best parameters:', grid_search.best_params_)

# Fit RandomForestClassifier object with best hyperparameters on the training dataset
best_model = grid_search.best_estimator_
best_model.fit(X_train, y_train)

# Evaluate model performance
evaluate_model_performance(best_model, X_test, y_test)

Best parameters: {'classifier__max_depth': 5, 'classifier__min_samples_leaf': 2, 'classifier__min_samples_split': 5, 'classifier__n_estimators': 300}
Accuracy:  0.6127450980392157
Accuracy for class Stress: 0.607
Accuracy for class Relax: 0.617

Confusion Matrix: 
 [[0.60674157 0.39325843]
 [0.3826087  0.6173913 ]]

AUC:  0.6120664386907669

F1-score for class Stress: 0.755
F1-score for class Relax: 0.763
F1-score: 0.5775401069518716
