In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.preprocessing import RobustScaler
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier

In [2]:
# Location of the cleaned feature table, relative to this notebook.
DATA_PATH = '../Datasets/data_cleaned.csv'

# Load the table into a DataFrame; later cells derive X and y from it.
df = pd.read_csv(DATA_PATH)

In [3]:
# Preview the first five rows to sanity-check the columns that were loaded.
df.head(n=5)

Unnamed: 0,P_Id,PPG_Rate_Mean,HRV_MeanNN,HRV_SDNN,HRV_RMSSD,HRV_SDSD,HRV_CVNN,HRV_CVSD,HRV_MedianNN,HRV_MadNN,...,HRV_ShanEn,HRV_FuzzyEn,HRV_MSEn,HRV_CMSEn,HRV_RCMSEn,HRV_CD,HRV_HFD,HRV_KFD,HRV_LZC,anxiety_meter
0,101,72.160656,838.787356,161.144697,216.334225,216.962029,0.192116,0.257913,853.5,106.0059,...,7.103751,1.291662,1.229793,1.392336,1.912396,1.691472,1.965384,4.166548,1.197715,7.0
1,102,94.333514,648.155039,196.751139,269.605729,270.131772,0.303556,0.415959,636.5,152.7078,...,7.555224,1.371911,1.174007,1.288954,1.839586,1.707419,1.967041,2.788036,1.117846,16.0
2,103,91.269287,673.426396,240.470162,313.700844,314.500938,0.357085,0.465828,609.0,203.1162,...,7.31614,1.471038,1.387296,1.315389,1.876066,1.833765,1.939231,4.690344,1.122028,10.0
3,104,76.862836,800.455621,295.652405,419.562698,420.816766,0.369355,0.524155,744.0,182.3598,...,7.143425,1.156025,1.196983,1.302091,1.612782,1.572197,1.974733,2.787734,1.051012,8.0
4,105,68.40994,918.483221,655.450522,920.36067,923.485198,0.713623,1.002044,776.0,324.6894,...,7.08494,1.133012,1.458232,1.30275,1.426864,1.427651,1.984928,2.193969,1.162819,15.0


In [4]:
# (rows, columns) of the loaded table.
df.shape

(101, 86)

In [5]:
# Feature matrix: every column except the raw target ('anxiety_meter') and
# 'P_Id' (presumably a participant identifier - confirm against the dataset docs).
# `columns=` already names the axis, so the redundant `axis=1` is omitted.
X = df.drop(columns=['anxiety_meter', 'P_Id'])

# Raw anxiety score as stored in the table; it is binarised in a later cell.
y = df['anxiety_meter']

In [6]:
# Display the feature matrix (pandas abbreviates long frames in the rendered output).
X

Unnamed: 0,PPG_Rate_Mean,HRV_MeanNN,HRV_SDNN,HRV_RMSSD,HRV_SDSD,HRV_CVNN,HRV_CVSD,HRV_MedianNN,HRV_MadNN,HRV_MCVNN,...,HRV_SampEn,HRV_ShanEn,HRV_FuzzyEn,HRV_MSEn,HRV_CMSEn,HRV_RCMSEn,HRV_CD,HRV_HFD,HRV_KFD,HRV_LZC
0,72.160656,838.787356,161.144697,216.334225,216.962029,0.192116,0.257913,853.5,106.0059,0.124201,...,1.662548,7.103751,1.291662,1.229793,1.392336,1.912396,1.691472,1.965384,4.166548,1.197715
1,94.333514,648.155039,196.751139,269.605729,270.131772,0.303556,0.415959,636.5,152.7078,0.239918,...,1.868132,7.555224,1.371911,1.174007,1.288954,1.839586,1.707419,1.967041,2.788036,1.117846
2,91.269287,673.426396,240.470162,313.700844,314.500938,0.357085,0.465828,609.0,203.1162,0.333524,...,1.746639,7.316140,1.471038,1.387296,1.315389,1.876066,1.833765,1.939231,4.690344,1.122028
3,76.862836,800.455621,295.652405,419.562698,420.816766,0.369355,0.524155,744.0,182.3598,0.245107,...,1.377711,7.143425,1.156025,1.196983,1.302091,1.612782,1.572197,1.974733,2.787734,1.051012
4,68.409940,918.483221,655.450522,920.360670,923.485198,0.713623,1.002044,776.0,324.6894,0.418414,...,1.668343,7.084940,1.133012,1.458232,1.302750,1.426864,1.427651,1.984928,2.193969,1.162819
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
96,106.643368,570.586957,199.166962,245.539186,246.212727,0.349056,0.430327,501.5,64.4931,0.128600,...,0.678190,6.976664,0.903258,1.056822,1.177499,1.092981,1.148055,1.907768,2.744730,0.736001
97,88.979798,691.214286,429.343966,515.791973,517.634875,0.621145,0.746211,550.5,123.7971,0.224881,...,0.770202,6.889927,0.857505,0.837755,1.038622,0.956516,0.913066,1.897684,1.796485,1.018469
98,109.566313,553.231834,155.993383,196.165914,196.496027,0.281967,0.354582,500.0,50.4084,0.100817,...,0.805586,7.184780,0.775767,0.978753,1.191947,1.138375,1.145786,1.932554,2.855720,0.763747
99,126.073719,478.073171,112.120465,133.252377,133.476196,0.234526,0.278728,456.0,34.8411,0.076406,...,0.744972,6.726446,0.600827,0.474042,1.009792,0.961145,0.984105,1.917123,1.941572,0.839453


In [7]:
# Scores at or above this cut-off are labelled 1, everything else 0.
ANXIETY_THRESHOLD = 16

# Derive the binary target from the raw column rather than from `y` itself:
# thresholding `y` in place makes the cell non-idempotent, because a second run
# would compare the 0/1 labels against the cut-off and turn every label into 0.
y = (df['anxiety_meter'] >= ANXIETY_THRESHOLD).astype(int)
y.name = 'IsAnxious'

In [8]:
# Display the binarised target to check the 0/1 encoding.
y

0      0
1      1
2      0
3      0
4      0
      ..
96     1
97     1
98     1
99     1
100    0
Name: IsAnxious, Length: 101, dtype: int64

In [9]:
# 5-fold stratified splitter; shuffling is seeded so the fold assignment is repeatable.
skf = StratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=42,
)

In [10]:
def create_pipeline(classifier, imputer=None, k=5):
    """Build the impute -> scale -> select -> classify pipeline for one estimator.

    Parameters
    ----------
    classifier : estimator
        Final step. It is registered under the name 'classifier', so grid keys
        of the form 'classifier__<param>' reach its hyper-parameters.
    imputer : transformer, optional
        First step, used to fill missing values. Defaults to
        ``KNNImputer(n_neighbors=5)`` when omitted.
    k : int, default 5
        Number of features kept by ``SelectKBest``.

    Returns
    -------
    sklearn.pipeline.Pipeline
        Unfitted pipeline with steps 'imputer', 'scaler', 'kbest', 'classifier'.
    """
    if imputer is None:
        # Build the default per call so pipelines never share an imputer instance.
        imputer = KNNImputer(n_neighbors=5)
    return Pipeline([
        ('imputer', imputer),  # Fill missing values (default: from the 5 nearest neighbours, not the column mean)
        ('scaler', RobustScaler()),  # Scale features using RobustScaler
        ('kbest', SelectKBest(score_func=f_classif, k=k)),  # Keep the k best features by ANOVA F-score
        ('classifier', classifier),  # Classifier
    ])

# Define the parameter grid for GridSearchCV
# Each entry maps a model label to its search space. Keys of the form
# 'classifier__<param>' address the pipeline step named 'classifier'.
# The optional 'classifier' key is not a hyper-parameter: it holds a one-element
# list with the estimator instance, which the search loop below pulls out of the
# grid before handing the remaining keys to GridSearchCV.
param_grids = {
    # No 'classifier' key here: the search loop falls back to SVC(random_state=42).
    # NOTE(review): 'gamma' is also varied for the 'linear' kernel; scikit-learn
    # documents gamma as a coefficient for 'rbf'/'poly'/'sigmoid' only, so those
    # linear candidates are presumably duplicates - confirm.
    'svc': {
        'classifier__kernel': ['linear', 'rbf'],
        'classifier__C': [0.1, 1.0, 10.0],
        'classifier__gamma': ['scale', 'auto']
    },
    'rfc': {
        'classifier': [RandomForestClassifier(random_state=42)],
        'classifier__n_estimators': [50, 100, 200],
        'classifier__max_depth': [None, 10, 20, 30],
        'classifier__min_samples_split': [2, 5, 10],
        'classifier__min_samples_leaf': [1, 2, 4],
        'classifier__max_features': ['sqrt', 'log2']
    },
    'logreg': {
        'classifier': [LogisticRegression(random_state=42)],
        'classifier__solver': ['lbfgs', 'liblinear'],
        'classifier__C': [0.1, 1.0, 10.0]
    },
    'xgboost': {
        'classifier': [XGBClassifier(random_state=42)],
        'classifier__learning_rate': [0.1, 0.01],
        'classifier__n_estimators': [100, 200, 300],
        'classifier__max_depth': [3, 5, 7]
    },
    'dtc': {
        'classifier': [DecisionTreeClassifier(random_state=42)],
        'classifier__max_depth': [None, 10, 20, 30],
        'classifier__min_samples_split': [2, 5, 10],
        'classifier__min_samples_leaf': [1, 2, 4],
        'classifier__max_features': ['sqrt', 'log2']
    },
    # Only the estimator is given, so the search evaluates a single candidate.
    'nbayes': {
        'classifier': [GaussianNB()]
    },
    # NOTE(review): this ensemble has a single member (an SVC with default
    # settings); with one voter the result presumably reduces to that SVC's own
    # prediction - confirm whether more estimators were intended.
    'voting-algorithm': {
        'classifier': [VotingClassifier(estimators=[('svc', SVC(random_state=42))])]
    }
}

# Create StratifiedKFold for cross-validation (seeded so folds are repeatable)
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Perform GridSearchCV for each classifier.
# `results` maps model label -> best params, best mean CV accuracy, and the
# standard deviation of the fold scores for that best candidate.
results = {}
for model_name, param_grid in param_grids.items():
    # An entry may carry its estimator under the 'classifier' key; entries
    # without one are searched with a plain SVC.
    if 'classifier' in param_grid:
        classifier = param_grid['classifier'][0]
    else:
        classifier = SVC(random_state=42)

    # Build a filtered copy instead of deleting the key from `param_grids`.
    # Mutating the shared dict made this loop non-repeatable: on a second run
    # every model lost its estimator and was silently searched as an SVC.
    search_space = {
        param: values
        for param, values in param_grid.items()
        if param != 'classifier'
    }

    pipeline = create_pipeline(classifier)
    grid_search = GridSearchCV(pipeline, search_space, cv=skf, scoring='accuracy', verbose=1, n_jobs=-1)
    grid_search.fit(X, y)  # X: feature matrix, y: binary target built in earlier cells
    results[model_name] = {
        'best_params': grid_search.best_params_,
        'best_score': grid_search.best_score_,
        'best_std_deviation': grid_search.cv_results_['std_test_score'][grid_search.best_index_]
    }

# Report the winning configuration of every model, one delimited section each.
report_fields = [
    ("Best parameters:", 'best_params'),
    ("Best cross-validation score:", 'best_score'),
    ("Best standard deviation:", 'best_std_deviation'),
]
for model_name, result in results.items():
    print(f"Model: {model_name}")
    for label, key in report_fields:
        print(label, result[key])
    print("-" * 50)

Fitting 5 folds for each of 12 candidates, totalling 60 fits


Fitting 5 folds for each of 216 candidates, totalling 1080 fits
Fitting 5 folds for each of 6 candidates, totalling 30 fits
Fitting 5 folds for each of 18 candidates, totalling 90 fits
Fitting 5 folds for each of 72 candidates, totalling 360 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Model: svc
Best parameters: {'classifier__C': 0.1, 'classifier__gamma': 'scale', 'classifier__kernel': 'rbf'}
Best cross-validation score: 0.5742857142857142
Best standard deviation: 0.022406267344797285
--------------------------------------------------
Model: rfc
Best parameters: {'classifier__max_depth': None, 'classifier__max_features': 'sqrt', 'classifier__min_samples_leaf': 2, 'classifier__min_samples_split': 5, 'classifier__n_estimators': 50}
Best cross-validation score: 0.4152380952380953
Best standard deviation: 0.06268012592473823
--------------------------------------------------
Model: logreg
Best parameters: {'clas

In [11]:
def create_pipeline(classifier):
    """Assemble the mean-impute -> scale -> select -> classify pipeline.

    Parameters
    ----------
    classifier : estimator
        Final step, registered under the name 'classifier' so grid keys of the
        form 'classifier__<param>' reach its hyper-parameters.

    Returns
    -------
    sklearn.pipeline.Pipeline
        Unfitted pipeline with steps 'imputer', 'scaler', 'kbest', 'classifier'.
    """
    steps = [
        # Fill each missing value with its column mean.
        ('imputer', SimpleImputer(strategy='mean')),
        # Rescale the features with RobustScaler.
        ('scaler', RobustScaler()),
        # Keep the 5 features with the highest ANOVA F-score.
        ('kbest', SelectKBest(score_func=f_classif, k=5)),
        # Estimator under evaluation.
        ('classifier', classifier),
    ]
    return Pipeline(steps=steps)
    

# Define the parameter grid for GridSearchCV
# One search space per model label. 'classifier__<param>' keys address the
# pipeline step named 'classifier'; the optional 'classifier' key carries the
# estimator instance (as a one-element list) and is pulled out by the search
# loop below before the grid reaches GridSearchCV.
param_grids = {
    # No 'classifier' key: the search loop uses SVC(random_state=42) for this entry.
    'svc': {
        'classifier__kernel': ['linear', 'rbf'],
        'classifier__C': [0.1, 1.0, 10.0],
        'classifier__gamma': ['scale', 'auto']
    },
    'rfc': {
        'classifier': [RandomForestClassifier(random_state=42)],
        'classifier__n_estimators': [50, 100, 200],
        'classifier__max_depth': [None, 10, 20, 30],
        'classifier__min_samples_split': [2, 5, 10],
        'classifier__min_samples_leaf': [1, 2, 4],
        'classifier__max_features': ['sqrt', 'log2']
    },
    'logreg': {
        'classifier': [LogisticRegression(random_state=42)],
        'classifier__solver': ['lbfgs', 'liblinear'],
        'classifier__C': [0.1, 1.0, 10.0]
    },
    'xgboost': {
        'classifier': [XGBClassifier(random_state=42)],
        'classifier__learning_rate': [0.1, 0.01],
        'classifier__n_estimators': [100, 200, 300],
        'classifier__max_depth': [3, 5, 7]
    },
    'dtc': {
        'classifier': [DecisionTreeClassifier(random_state=42)],
        'classifier__max_depth': [None, 10, 20, 30],
        'classifier__min_samples_split': [2, 5, 10],
        'classifier__min_samples_leaf': [1, 2, 4],
        'classifier__max_features': ['sqrt', 'log2']
    },
    # Estimator only, no hyper-parameters: a single candidate is evaluated.
    'nbayes': {
        'classifier': [GaussianNB()]
    },
    # NOTE(review): single-member ensemble (one default SVC) - presumably behaves
    # like that SVC alone; confirm whether additional voters were intended.
    'voting-algorithm': {
        'classifier': [VotingClassifier(estimators=[('svc', SVC(random_state=42))])]
    }
}

# Create StratifiedKFold for cross-validation (seeded so folds are repeatable)
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Perform GridSearchCV for each classifier.
# `results` collects, per model label, the best parameters, the best mean CV
# accuracy and the fold-score standard deviation of that best candidate.
results = {}
for model_name, param_grid in param_grids.items():
    # Split the entry into its estimator and its hyper-parameter grid without
    # touching `param_grids`. The previous `del param_grid['classifier']`
    # altered the shared dict, so re-running this loop searched every model as
    # an SVC with grid keys that SVC does not accept.
    search_space = dict(param_grid)
    candidates = search_space.pop('classifier', None)
    if candidates is not None:
        classifier = candidates[0]
    else:
        # Entries without an explicit estimator are searched with a plain SVC.
        classifier = SVC(random_state=42)
    pipeline = create_pipeline(classifier)

    grid_search = GridSearchCV(pipeline, search_space, cv=skf, scoring='accuracy', verbose=1, n_jobs=-1)
    grid_search.fit(X, y)  # X: feature matrix, y: binary target built in earlier cells
    results[model_name] = {
        'best_params': grid_search.best_params_,
        'best_score': grid_search.best_score_,
        'best_std_deviation': grid_search.cv_results_['std_test_score'][grid_search.best_index_]
    }

# Summarise each model's best configuration, separated by a 50-dash rule.
separator = "-" * 50
for model_name, result in results.items():
    best_params = result['best_params']
    best_score = result['best_score']
    best_std = result['best_std_deviation']

    print(f"Model: {model_name}")
    print("Best parameters:", best_params)
    print("Best cross-validation score:", best_score)
    print("Best standard deviation:", best_std)
    print(separator)

Fitting 5 folds for each of 12 candidates, totalling 60 fits
Fitting 5 folds for each of 216 candidates, totalling 1080 fits
Fitting 5 folds for each of 6 candidates, totalling 30 fits
Fitting 5 folds for each of 18 candidates, totalling 90 fits
Fitting 5 folds for each of 72 candidates, totalling 360 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Model: svc
Best parameters: {'classifier__C': 0.1, 'classifier__gamma': 'scale', 'classifier__kernel': 'rbf'}
Best cross-validation score: 0.5742857142857142
Best standard deviation: 0.022406267344797285
--------------------------------------------------
Model: rfc
Best parameters: {'classifier__max_depth': None, 'classifier__max_features': 'sqrt', 'classifier__min_samples_leaf': 4, 'classifier__min_samples_split': 10, 'classifier__n_estimators': 50}
Best cross-validation score: 0.41571428571428576
Best standard deviation: 0.049156144383100724
------------------------