In [15]:
# import os
# os.chdir('../')
# %pwd

'/Users/uvaishnav/stress_detection'

In [16]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.model_selection import train_test_split

Preprocess Features

In [17]:
# Shared StandardScaler instance; it is fitted further down when the selected
# feature columns are normalized.
scaler = StandardScaler()

In [18]:
def remove_null_columns(df: pd.DataFrame) -> pd.DataFrame:
    """Return ``df`` without the columns whose values are all missing.

    A column is kept as soon as it holds at least one non-null value.
    The input frame itself is not modified.
    """
    cleaned = df.dropna(axis='columns', how='all')
    return cleaned

def remove_highly_correlated_features(df: pd.DataFrame) -> pd.DataFrame:
    """Return ``df`` without the hard-coded set of highly correlated columns.

    Listed columns that are absent from ``df`` are silently skipped
    (``errors='ignore'``). The input frame itself is not modified.
    """
    # A list (not a tuple): pandas would treat a tuple as one single label.
    redundant_columns = [
        'BVP_psd_std',
        'EDA_SCL_mean',
        'EDA_SCR_amplitude_mean',
        'HRV_CVNN',
        'HRV_CVSD',
        'HRV_IQRNN',
        'HRV_MCVNN',
        'HRV_MaxNN',
        'HRV_RMSSD',
        'HRV_SDSD',
        'HRV_pNN20',
    ]
    reduced = df.drop(columns=redundant_columns, errors='ignore')
    return reduced

def get_important_features(df: pd.DataFrame) -> pd.DataFrame:
    """Return only the hard-coded subset of feature columns, in a fixed order.

    Raises ``KeyError`` if any of the expected columns is missing from ``df``.
    """
    selected_columns = [
        'ACC_Z_mean',
        'ACC_Y_mean',
        'ACC_X_mean',
        'ACC_magnitude_std',
        'EDA_mean',
        'TEMP_mean',
    ]
    subset = df[selected_columns]
    return subset






In [19]:
# Load the extracted feature table. The path is relative, so it is resolved
# against the kernel's current working directory.
data  = pd.read_csv("data/features/extracted_features.csv")

In [20]:
# Preview the first rows to sanity-check the loaded columns and values.
data.head()

Unnamed: 0,BVP_mean,BVP_std,ACC_X_mean,ACC_Y_mean,ACC_Z_mean,TEMP_mean,EDA_mean,EDA_std,BVP_psd_mean,BVP_psd_std,...,HRV_pNN20,HRV_MinNN,HRV_MaxNN,HRV_HTI,HRV_TINN,EDA_SCL_mean,EDA_SCR_mean,EDA_SCR_peaks,EDA_SCR_amplitude_mean,stress_label
0,0.013829,2.291471,27.861877,-25.862279,19.613106,35.460832,1.217864,0.078524,0.010645,0.070833,...,90.47619,546.875,9515.625,14.0,281.25,1.221463,-0.0023,4,2e-05,0
1,0.000731,0.994065,33.813407,-7.297195,24.827901,35.516166,1.195514,0.08525,0.008269,0.059856,...,78.125,437.5,2640.625,10.666667,328.125,1.194588,0.000375,6,2.1e-05,0
2,0.001173,0.46547,43.838196,-1.755778,35.592263,35.519333,1.115099,0.030581,0.005122,0.044267,...,74.324324,500.0,1281.25,6.166667,250.0,1.115679,-0.000198,5,1e-05,0
3,-2.3e-05,0.294084,44.846485,-11.244263,42.561256,35.546331,1.064092,0.062517,0.003477,0.042426,...,77.777778,609.375,1015.625,6.545455,156.25,1.063762,0.000854,5,1.3e-05,0
4,0.000576,0.897162,44.01637,-9.123131,40.483478,35.669996,1.020672,0.071941,0.014656,0.089303,...,82.857143,531.25,1265.625,8.75,234.375,1.020715,-0.000169,5,3.1e-05,0


In [21]:
# Separate features and target variable
X = data.drop(columns=['stress_label'])
y = data['stress_label']

# Remove columns with all null values
X = remove_null_columns(X)

# Remove highly correlated features
X = remove_highly_correlated_features(X)

# Get the most important features
X = get_important_features(X)

# Normalize the features
X = scaler.fit_transform(X)

# Convert back to DataFrame to add the target variable
preprocessed_features = pd.DataFrame(X, columns=get_important_features(data).columns)
preprocessed_features['stress_label'] = y.values

In [23]:
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split

# Pull the target column out of the preprocessed frame; everything else is a predictor
y = preprocessed_features['stress_label']
X = preprocessed_features.drop(columns=['stress_label'])

# Hold out 20% of the rows for testing, stratified on the label
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Oversample with SMOTE on the training split only; the test split is left as is
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Report how many samples each class has after resampling
resampled_counts = pd.Series(y_train_resampled).value_counts()
print("Class distribution after resampling:")
print(resampled_counts)

Class distribution after resampling:
stress_label
0    2044
1    2044
Name: count, dtype: int64


In [24]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix

# Define a function to train and evaluate models
def train_and_evaluate_model(model, X_train, y_train, X_test, y_test,
                             target_names=('Non-Stress', 'Stress')):
    """Fit ``model`` on the training data and print its test-set metrics.

    Parameters
    ----------
    model : estimator exposing ``fit(X, y)`` and ``predict(X)``
        Fitted in place by this call.
    X_train, y_train
        Training features and labels passed to ``model.fit``.
    X_test, y_test
        Held-out features passed to ``model.predict`` and the true labels the
        predictions are compared against.
    target_names : sequence of str, optional
        Display names for the classes in the classification report. Defaults
        to ``('Non-Stress', 'Stress')``, the labels this function previously
        hard-coded, so existing calls print exactly what they did before.

    Returns
    -------
    None
        The model name, confusion matrix and classification report are
        printed to stdout.
    """
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(f"Model: {model.__class__.__name__}")
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, y_pred))
    print("Classification Report:")
    # list(...) so a tuple default (immutable, safe as a default argument)
    # is handed over in the same form the original code used.
    print(classification_report(y_test, y_pred, target_names=list(target_names)))
    print("\n")

# Candidate classifiers to compare, each created with the same fixed random_state
models = []
models.append(RandomForestClassifier(n_estimators=100, random_state=42))
models.append(LogisticRegression(random_state=42))
models.append(SVC(random_state=42))

# Fit every candidate on the resampled training data and report its metrics
# on the untouched test split
for model in models:
    train_and_evaluate_model(
        model,
        X_train_resampled,
        y_train_resampled,
        X_test,
        y_test,
    )

Model: RandomForestClassifier
Confusion Matrix:
[[506   6]
 [  8  58]]
Classification Report:
              precision    recall  f1-score   support

  Non-Stress       0.98      0.99      0.99       512
      Stress       0.91      0.88      0.89        66

    accuracy                           0.98       578
   macro avg       0.95      0.93      0.94       578
weighted avg       0.98      0.98      0.98       578



Model: LogisticRegression
Confusion Matrix:
[[355 157]
 [ 23  43]]
Classification Report:
              precision    recall  f1-score   support

  Non-Stress       0.94      0.69      0.80       512
      Stress       0.21      0.65      0.32        66

    accuracy                           0.69       578
   macro avg       0.58      0.67      0.56       578
weighted avg       0.86      0.69      0.74       578



Model: SVC
Confusion Matrix:
[[469  43]
 [  7  59]]
Classification Report:
              precision    recall  f1-score   support

  Non-Stress       0.99     