# 24/03/2023

Voici des explications pour chaque section de code:

1. Function read_and_split_data
- lire le fichier de données
- diviser l'ensemble de données en ensembles train et de test
- prétraiter la variable cible pour Convertir multi-classe en bi-classe(remplaçant 1 et 3 par 2, et 2 par 1).

2. Funtion evaluate_model_performance
- évaluer les performances du modèle sur l'ensemble de test, 
- compris la précision (accuracy), la matrice de confusion (confusion matrix), l'AUC et la valeur F1.

3. Pipeline:
   Définissez l'objet Pipeline, qui contient le classificateur (DecisionTreeClassifier, Random Forest, SVC, Logistic Regression).

4. Param_grid: 
   Définissez l'espace de paramètres hyperparamétriques param_grid.

5. GridSearchCV:
   Utilisez l'objet GridSearchCV pour rechercher la meilleure combinaison de paramètres hyperparamétriques, en utilisant une validation croisée à 5 plis et une recherche en grille pour évaluer la précision.

## Les résultats de l'exécution des quatre modèles sont les suivants :

# Logistic Regression

In [1]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score, f1_score

# Define a function to read and split the dataset
def read_and_split_data(file_path, test_size=0.2, random_state=40):
    data = pd.read_csv(file_path)
    # Copy target variable
    y = data['session'].copy()
    # Replace 1 and 3 with 2, and 2 with 1 in y
    y.replace({1: 2, 2: 1, 3: 2}, inplace=True)
    # Split the dataset into a training set and a test set
    X_train, X_test, y_train, y_test = train_test_split(data.iloc[:, :-1], y, test_size=test_size, 
                                                        stratify=y, random_state=random_state)
    return X_train, X_test, y_train, y_test

# Define a function to evaluate model performance
def evaluate_model_performance(model, X_test, y_test):
    y_pred = model.predict(X_test)
    print('Accuracy: ', accuracy_score(y_test, y_pred))
    print('Confusion Matrix: \n', confusion_matrix(y_test, y_pred, normalize='true'))
    print('AUC: ', roc_auc_score(y_test, y_pred))
    print('F1-score: ', f1_score(y_test, y_pred))

# Define a pipeline to link data preprocessing and model training and evaluation
pipeline = Pipeline([
    ('classifier', DecisionTreeClassifier(random_state=40))
])

# Define hyperparameter space
param_grid = {
    'classifier__max_depth': [2, 5, 10, 15],
    'classifier__min_samples_split': [2, 5, 10],
    'classifier__min_samples_leaf': [1, 2, 4]
}

# Use GridSearchCV to find the best hyperparameters
grid_search = GridSearchCV(
    pipeline,
    param_grid=param_grid,
    cv=5, # 5-fold cross-validation
    scoring='accuracy',
    n_jobs=-1,
)

# Read and split the dataset
X_train, X_test, y_train, y_test = read_and_split_data('HRV_ECG_step60.csv')

# Fit GridSearchCV object on the training dataset
grid_search.fit(X_train, y_train)

# Print best hyperparameters
print('Best parameters:', grid_search.best_params_)

# Fit DecisionTreeClassifier object with best hyperparameters on the training dataset
best_model = grid_search.best_estimator_
best_model.fit(X_train, y_train)

# Evaluate model performance
evaluate_model_performance(best_model, X_test, y_test)

Best parameters: {'classifier__max_depth': 15, 'classifier__min_samples_leaf': 1, 'classifier__min_samples_split': 2}
Accuracy:  0.7953964194373402
Confusion Matrix: 
 [[0.8128655  0.1871345 ]
 [0.21818182 0.78181818]]
AUC:  0.7973418394471026
F1-score:  0.7765363128491621


# Decision Tree

In [2]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score, f1_score

# Define a function to read and split the dataset
def read_and_split_data(file_path, test_size=0.2, random_state=40):
    data = pd.read_csv(file_path)
    # Copy target variable
    y = data['session'].copy()
    # Replace 1 and 3 with 2, and 2 with 1 in y
    y.replace({1: 2, 2: 1, 3: 2}, inplace=True)
    # Split the dataset into a training set and a test set
    X_train, X_test, y_train, y_test = train_test_split(data.iloc[:, :-1], y, test_size=test_size, 
                                                        stratify=y, random_state=random_state)
    return X_train, X_test, y_train, y_test

# Define a function to evaluate model performance
def evaluate_model_performance(model, X_test, y_test):
    y_pred = model.predict(X_test)
    print('Accuracy: ', accuracy_score(y_test, y_pred))
    print('Confusion Matrix: \n', confusion_matrix(y_test, y_pred, normalize='true'))
    print('AUC: ', roc_auc_score(y_test, y_pred))
    print('F1-score: ', f1_score(y_test, y_pred))

# Define a pipeline to link data preprocessing and model training and evaluation
pipeline = Pipeline([
    ('classifier', DecisionTreeClassifier(random_state=40))
])

# Define hyperparameter space
param_grid = {
    'classifier__max_depth': [2, 5, 10, 15],
    'classifier__min_samples_split': [2, 5, 10],
    'classifier__min_samples_leaf': [1, 2, 4]
}

# Use GridSearchCV to find the best hyperparameters
grid_search = GridSearchCV(
    pipeline,
    param_grid=param_grid,
    cv=5, # 5-fold cross-validation
    scoring='accuracy',
    n_jobs=-1,
)

# Read and split the dataset
X_train, X_test, y_train, y_test = read_and_split_data('HRV_ECG_step60.csv')

# Fit GridSearchCV object on the training dataset
grid_search.fit(X_train, y_train)

# Print best hyperparameters
print('Best parameters:', grid_search.best_params_)

# Fit DecisionTreeClassifier object with best hyperparameters on the training dataset
best_model = grid_search.best_estimator_
best_model.fit(X_train, y_train)

# Evaluate model performance
evaluate_model_performance(best_model, X_test, y_test)

Best parameters: {'classifier__max_depth': 15, 'classifier__min_samples_leaf': 1, 'classifier__min_samples_split': 2}
Accuracy:  0.7953964194373402
Confusion Matrix: 
 [[0.8128655  0.1871345 ]
 [0.21818182 0.78181818]]
AUC:  0.7973418394471026
F1-score:  0.7765363128491621


# SVC

In [3]:
import pandas as pd
import numpy as np
from sklearn.svm import SVC
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score, f1_score
from sklearn.preprocessing import StandardScaler


# Define a function to read and split the dataset
def read_and_split_data(file_path, test_size=0.2, random_state=40):
    data = pd.read_csv(file_path)
    # Copy target variable
    y = data['session'].copy()
    # Replace 1 and 3 with 2, and 2 with 1 in y
    y.replace({1: 2, 2: 1, 3: 2}, inplace=True)
    # Split the dataset into a training set and a test set
    X_train, X_test, y_train, y_test = train_test_split(data.iloc[:, :-1], y, test_size=test_size, 
                                                        stratify=y, random_state=random_state)
    return X_train, X_test, y_train, y_test

# Define a function to evaluate model performance
def evaluate_model_performance(model, X_test, y_test):
    y_pred = model.predict(X_test)
    print('Accuracy: ', accuracy_score(y_test, y_pred))
    print('Confusion Matrix: \n', confusion_matrix(y_test, y_pred))
    print('AUC: ', roc_auc_score(y_test, y_pred))
    print('F1-score: ', f1_score(y_test, y_pred))

# Define a pipeline to link data preprocessing and model training and evaluation
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('classifier', SVC(random_state=40))
])

# Define hyperparameter distributions for random search
param_distributions = {
    'classifier__C': np.logspace(-3, 3, 7),
    'classifier__kernel': ['linear', 'rbf'],
    'classifier__gamma': ['scale', 'auto'] + list(np.logspace(-3, 3, 7)),
}

# Use RandomizedSearchCV to find the best hyperparameters
random_search = RandomizedSearchCV(
    pipeline,
    param_distributions=param_distributions,
    cv=5, # 5-fold cross-validation
    scoring='accuracy',
    n_jobs=-1,
    n_iter=50, # number of random search iterations
    random_state=40,
)

# Read and split the dataset
X_train, X_test, y_train, y_test = read_and_split_data('HRV_ECG_step60.csv')

# Fit RandomizedSearchCV object on the training dataset
random_search.fit(X_train, y_train)

# Print best hyperparameters
print('Best parameters:', random_search.best_params_)

# Fit SVM object with best hyperparameters on the training dataset
best_model = random_search.best_estimator_
best_model.fit(X_train, y_train)

# Evaluate model performance
evaluate_model_performance(best_model, X_test, y_test)

Best parameters: {'classifier__kernel': 'rbf', 'classifier__gamma': 0.1, 'classifier__C': 1000.0}
Accuracy:  0.8567774936061381
Confusion Matrix: 
 [[139  32]
 [ 24 196]]
AUC:  0.8518872939925571
F1-score:  0.8323353293413173


# Random forest

In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score, f1_score

# Define a function to read and split the dataset
def read_and_split_data(file_path, test_size=0.2, random_state=40):
    data = pd.read_csv(file_path)
    # Copy target variable
    y = data['session'].copy()
    # Replace 1 and 3 with 2, and 2 with 1 in y
    y.replace({1: 2, 2: 1, 3: 2}, inplace=True)
    # Split the dataset into a training set and a test set
    X_train, X_test, y_train, y_test = train_test_split(data.iloc[:, :-1], y, test_size=test_size, 
                                                        stratify=y, random_state=random_state)
    return X_train, X_test, y_train, y_test

# Define a function to evaluate model performance
def evaluate_model_performance(model, X_test, y_test):
    y_pred = model.predict(X_test)
    print('Accuracy: ', accuracy_score(y_test, y_pred))
    print('Confusion Matrix: \n', confusion_matrix(y_test, y_pred, normalize='true'))
    print('AUC: ', roc_auc_score(y_test, y_pred))
    print('F1-score: ', f1_score(y_test, y_pred))

# Define a pipeline to link data preprocessing and model training and evaluation
pipeline = Pipeline([
    ('classifier', RandomForestClassifier(random_state=40))
])

# Define hyperparameter space
param_grid = {
    'classifier__n_estimators': [100, 300, 500],
    'classifier__max_depth': [2, 5, 10, 15],
    'classifier__min_samples_split': [2, 5, 10],
    'classifier__min_samples_leaf': [1, 2, 4]
}

# Use GridSearchCV to find the best hyperparameters
grid_search = GridSearchCV(
    pipeline,
    param_grid=param_grid,
    cv=5, # 5-fold cross-validation
    scoring='accuracy',
    n_jobs=-1,
)

# Read and split the dataset
X_train, X_test, y_train, y_test = read_and_split_data('HRV_ECG_step60.csv')

# Fit GridSearchCV object on the training dataset
grid_search.fit(X_train, y_train)

# Print best hyperparameters
print('Best parameters:', grid_search.best_params_)

# Fit RandomForestClassifier object with best hyperparameters on the training dataset
best_model = grid_search.best_estimator_
best_model.fit(X_train, y_train)

# Evaluate model performance
evaluate_model_performance(best_model, X_test, y_test)

Best parameters: {'classifier__max_depth': 15, 'classifier__min_samples_leaf': 1, 'classifier__min_samples_split': 2, 'classifier__n_estimators': 300}
Accuracy:  0.8900255754475703
Confusion Matrix: 
 [[0.86549708 0.13450292]
 [0.09090909 0.90909091]]
AUC:  0.8872939925571504
F1-score:  0.8731563421828908
