In [57]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_val_predict
from sklearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedKFold
from sklearn.svm import SVC  # Support Vector Classifier for SVM
from sklearn.neighbors import KNeighborsClassifier  # for KNN
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV



# 1. Data Loading and Preliminary Analysis

In [58]:
# Load the dataset
file_path = 'filtered_data_newfeatures.csv'
data = pd.read_csv(file_path, delimiter='\t')

In [59]:
data.shape

(38333, 195)

In [60]:
data.head()

Unnamed: 0,Q1A,Q1I,Q1E,Q2A,Q2I,Q2E,Q3A,Q3I,Q3E,Q4A,...,Agreeableness,Conscientiousness,Emotional_Stability,Openness,Depression_Score,Anxiety_Score,Stress_Score,Depression_Severity,Anxiety_Severity,Stress_Severity
0,3,28,3890,3,25,2122,1,16,1944,3,...,5.0,5.0,1.0,7.0,27,34,40,Severe,Extremely Severe,Extremely Severe
1,3,2,8118,0,36,2890,1,35,4777,2,...,5.0,2.5,1.0,4.0,24,17,27,Severe,Severe,Severe
2,2,7,5784,0,33,4373,3,41,3242,0,...,4.0,2.5,4.5,5.5,39,12,17,Extremely Severe,Moderate,Mild
3,1,23,5081,2,11,6837,1,37,5521,0,...,6.5,7.0,5.0,6.5,16,17,16,Moderate,Severe,Mild
4,1,36,3215,1,13,7731,2,5,4156,3,...,4.0,2.5,2.5,5.0,32,40,29,Extremely Severe,Extremely Severe,Severe


In [61]:
missing_value_counts = data.isnull().sum()
columns_with_missing_values = missing_value_counts[missing_value_counts > 0]
columns_with_missing_values

country                    2
TIPI1                    473
TIPI2                    555
TIPI3                    575
TIPI4                    453
TIPI5                    504
TIPI6                    480
TIPI7                    587
TIPI8                    645
TIPI9                    416
TIPI10                   543
education                485
urban                    361
gender                    64
engnat                    50
hand                     166
religion                 343
orientation             3039
voted                    310
married                  192
familysize              1074
major                  10823
education_label          485
urban_label              361
gender_label              64
engnat_label              50
hand_label               166
religion_label           343
orientation_label       3039
voted_label              310
married_label            192
Extraversion             635
Agreeableness            734
Conscientiousness        827
Emotional_Stab

In [62]:
data.drop("major", axis=1, inplace=True)
data.dropna(inplace=True)

In [63]:
data.head()

Unnamed: 0,Q1A,Q1I,Q1E,Q2A,Q2I,Q2E,Q3A,Q3I,Q3E,Q4A,...,Agreeableness,Conscientiousness,Emotional_Stability,Openness,Depression_Score,Anxiety_Score,Stress_Score,Depression_Severity,Anxiety_Severity,Stress_Severity
0,3,28,3890,3,25,2122,1,16,1944,3,...,5.0,5.0,1.0,7.0,27,34,40,Severe,Extremely Severe,Extremely Severe
2,2,7,5784,0,33,4373,3,41,3242,0,...,4.0,2.5,4.5,5.5,39,12,17,Extremely Severe,Moderate,Mild
3,1,23,5081,2,11,6837,1,37,5521,0,...,6.5,7.0,5.0,6.5,16,17,16,Moderate,Severe,Mild
4,1,36,3215,1,13,7731,2,5,4156,3,...,4.0,2.5,2.5,5.0,32,40,29,Extremely Severe,Extremely Severe,Severe
5,0,18,6116,0,28,3193,1,2,12542,0,...,7.0,6.0,6.5,4.0,13,6,12,Mild,Normal,Normal


In [64]:
data.head()

Unnamed: 0,Q1A,Q1I,Q1E,Q2A,Q2I,Q2E,Q3A,Q3I,Q3E,Q4A,...,Agreeableness,Conscientiousness,Emotional_Stability,Openness,Depression_Score,Anxiety_Score,Stress_Score,Depression_Severity,Anxiety_Severity,Stress_Severity
0,3,28,3890,3,25,2122,1,16,1944,3,...,5.0,5.0,1.0,7.0,27,34,40,Severe,Extremely Severe,Extremely Severe
2,2,7,5784,0,33,4373,3,41,3242,0,...,4.0,2.5,4.5,5.5,39,12,17,Extremely Severe,Moderate,Mild
3,1,23,5081,2,11,6837,1,37,5521,0,...,6.5,7.0,5.0,6.5,16,17,16,Moderate,Severe,Mild
4,1,36,3215,1,13,7731,2,5,4156,3,...,4.0,2.5,2.5,5.0,32,40,29,Extremely Severe,Extremely Severe,Severe
5,0,18,6116,0,28,3193,1,2,12542,0,...,7.0,6.0,6.5,4.0,13,6,12,Mild,Normal,Normal


In [65]:
data.drop(["Depression_Severity", "Anxiety_Severity", "Stress_Severity"], axis=1, inplace=True)

In [66]:
columns_to_drop = [f'Q{i}{suffix}' for i in range(1, 43) for suffix in [
    'E', 'I', 'A']]
data.drop(columns_to_drop, axis=1, inplace=True)
data.drop(["VCL1", "VCL2", "VCL3", "VCL4", "VCL5", "VCL6", "VCL7", "VCL8", "VCL9",
          "VCL10", "VCL11", "VCL12", "VCL13", "VCL14", "VCL15", "VCL16"], axis=1, inplace=True)
other_columns_to_drop = [
    'urban_label',
    'gender_label', 'engnat_label', 'hand_label', 'religion_label',
    'orientation_label', 'race_label', 'voted_label', 'married_label', 'Extraversion',
    'Agreeableness', 'Conscientiousness', 'Emotional_Stability', 'Openness', 'education_label', "country", 'introelapse', 'testelapse', 'surveyelapse', 'mean_response_time',
    'std_deviation_dass', 'source', 'screensize', "uniquenetworklocation"
]
data.drop(other_columns_to_drop, axis=1, inplace=True)

In [67]:
data.columns

Index(['TIPI1', 'TIPI2', 'TIPI3', 'TIPI4', 'TIPI5', 'TIPI6', 'TIPI7', 'TIPI8',
       'TIPI9', 'TIPI10', 'education', 'urban', 'gender', 'engnat', 'age',
       'hand', 'religion', 'orientation', 'race', 'voted', 'married',
       'familysize', 'Depression_Score', 'Anxiety_Score', 'Stress_Score'],
      dtype='object')

In [68]:
data.head()

Unnamed: 0,TIPI1,TIPI2,TIPI3,TIPI4,TIPI5,TIPI6,TIPI7,TIPI8,TIPI9,TIPI10,...,hand,religion,orientation,race,voted,married,familysize,Depression_Score,Anxiety_Score,Stress_Score
0,1.0,3.0,7.0,1.0,7.0,1.0,7.0,3.0,1.0,7.0,...,1.0,12.0,1.0,10,2.0,1.0,2.0,27,34,40
2,2.0,3.0,2.0,6.0,5.0,2.0,5.0,3.0,3.0,6.0,...,1.0,4.0,3.0,60,1.0,1.0,3.0,39,12,17
3,1.0,7.0,7.0,4.0,6.0,4.0,6.0,7.0,6.0,7.0,...,2.0,4.0,5.0,70,2.0,1.0,5.0,16,17,16
4,2.0,3.0,3.0,2.0,5.0,3.0,5.0,2.0,3.0,5.0,...,3.0,10.0,1.0,10,2.0,1.0,4.0,32,40,29
5,2.0,7.0,6.0,7.0,7.0,1.0,7.0,6.0,6.0,1.0,...,1.0,4.0,1.0,70,2.0,1.0,4.0,13,6,12


In [69]:
# Apply the function to each of the score columns
data['Depression_Score'] = data['Depression_Score'].apply(lambda x: 0 if x <= 13 else 1)
data['Anxiety_Score'] = data['Anxiety_Score'].apply(lambda x: 0 if x <= 9 else 1)
data['Stress_Score'] = data['Stress_Score'].apply(lambda x: 0 if x <= 18 else 1)

In [70]:
X_depression = data.drop(['Anxiety_Score', 'Stress_Score'], axis=1)
X_anxiety = data.drop(['Depression_Score', 'Stress_Score'], axis=1)
X_stress = data.drop(['Depression_Score', 'Anxiety_Score'], axis=1)

# 2. Data Splitting

## 2.1.1 Splitting into depression train and test sets

In [71]:
y_depression = X_depression['Depression_Score']
X_depression = X_depression.drop('Depression_Score', axis=1)

## 2.1.2 Splitting into stress train and test sets

In [72]:
y_stress = X_stress['Stress_Score']
X_stress = X_stress.drop('Stress_Score', axis=1)

## 2.1.1 Splitting into anxiety train and test sets

In [73]:
y_anxiety = X_anxiety['Anxiety_Score']
X_anxiety = X_anxiety.drop('Anxiety_Score', axis=1)

## 3. Model Training

In this chapter we will train the model on the training data and evaluate it on the test data. We will use the following models:
- Logistic Regression
- Random Forest
- Decision Tree
- Gradient Boosting

## 3.1 Training depression model

In [74]:
# Identify numerical and categorical columns
numerical_cols = X_depression.select_dtypes(include=['int64', 'float64']).columns
categorical_cols = X_depression.select_dtypes(include=['object']).columns

In [75]:
categorical_cols

Index([], dtype='object')

In [76]:
# Create the preprocessing pipelines for both numerical and categorical data
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_cols),
    ])

In [77]:

rf_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier())
])

rf_params = {
    'classifier__n_estimators': [10, 20, 50, 100, 200],
    'classifier__max_depth': [2, 3, 4, 5, 6, 7, 10, 20],
    'classifier__min_samples_leaf': [1, 2, 4],
    'classifier__min_samples_split': [2, 5, 10],
    'classifier__bootstrap': [True, False],
}

rf_random_search = RandomizedSearchCV(
    rf_pipeline, rf_params, n_iter=100, cv=5, scoring='accuracy', random_state=42)

In [78]:
gb_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', GradientBoostingClassifier())
])

gb_params = {
    'classifier__n_estimators': [100, 200, 300],
    'classifier__learning_rate': [0.01, 0.1, 0.2],
    'classifier__max_depth': [2, 3, 4, 5]
}

gb_random_search = RandomizedSearchCV(gb_pipeline, gb_params, n_iter=100, cv=5, scoring='accuracy', random_state=42)


In [79]:
svm_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', SVC())
])

svm_params = {
    'classifier__C': [0.1, 1, 10, 100, 1000],
    'classifier__kernel': ['rbf', 'linear', 'sigmoid'],
    'classifier__gamma': [1, 0.1, 0.01, 0.001, 0.0001, 'scale']
}

svm_random_search = RandomizedSearchCV(
    svm_pipeline, svm_params, n_iter=100, cv=5, scoring='accuracy', random_state=42)

In [80]:
# Example for RandomForestClassifier
rf_random_search.fit(X_depression, y_depression)
print("Best Parameters for RandomForestClassifier:", rf_random_search.best_params_)
print("Best Score for RandomForestClassifier:", rf_random_search.best_score_)


Best Parameters for RandomForestClassifier: {'classifier__n_estimators': 200, 'classifier__min_samples_split': 10, 'classifier__min_samples_leaf': 2, 'classifier__max_depth': 20, 'classifier__bootstrap': True}
Best Score for RandomForestClassifier: 0.7713833126852124


In [81]:
# Example for GradientBoostingClassifier
gb_random_search.fit(X_depression, y_depression)
print("Best Parameters for GradientBoostingClassifier:", gb_random_search.best_params_)
print("Best Score for GradientBoostingClassifier:", gb_random_search.best_score_)



Best Parameters for GradientBoostingClassifier: {'classifier__n_estimators': 200, 'classifier__max_depth': 3, 'classifier__learning_rate': 0.1}
Best Score for GradientBoostingClassifier: 0.7727841587132043


In [82]:
# # Example for Logistic Regression
# svm_random_search.fit(X_depression, y_depression)
# print("Best Parameters for GradientBoostingClassifier:", svm_random_search.best_params_)
# print("Best Score for GradientBoostingClassifier:", svm_random_search.best_score_)

In [83]:
lr_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression())
])

lr_params = {
    'classifier__C': [0.001, 0.01, 0.1, 1, 10],
    'classifier__solver': ['liblinear', 'lbfgs', "newton-cg", "sag", "saga"]
}

lr_grid_search = GridSearchCV(lr_pipeline, lr_params, cv=5, scoring='accuracy')


In [84]:
dt_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', DecisionTreeClassifier())
])

dt_params = {
    'classifier__max_depth': [5, 6, 7, 8, 10, 20, None],
    'classifier__min_samples_split': [2, 5, 10],
    'classifier__max_features': [1, 2, 3, 5, 10, 20, 25, 30, 'sqrt', 'log2'],
}

dt_grid_search = GridSearchCV(dt_pipeline, dt_params, cv=5, scoring='accuracy')

In [85]:
knn_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', KNeighborsClassifier())
])

knn_params = {
    'classifier__n_neighbors': [3, 5, 7, 10, 20, 30, 50],
    'classifier__weights': ['uniform', 'distance']
}

knn_grid_search = GridSearchCV(knn_pipeline, knn_params, cv=5, scoring='accuracy')


In [86]:
# Example for Logistic Regression
lr_grid_search.fit(X_depression, y_depression)
print("Best Parameters for Logistic Regression:", lr_grid_search.best_params_)
print("Best Score for Logistic Regression:", lr_grid_search.best_score_)


Best Parameters for Logistic Regression: {'classifier__C': 0.01, 'classifier__solver': 'liblinear'}
Best Score for Logistic Regression: 0.7636955122732189


In [87]:
# Example for DecisionTreeClassifier
dt_grid_search.fit(X_depression, y_depression)
print("Best Parameters for DecisionTreeClassifier:", dt_grid_search.best_params_)
print("Best Score for DecisionTreeClassifier:", dt_grid_search.best_score_)


Best Parameters for DecisionTreeClassifier: {'classifier__max_depth': 6, 'classifier__max_features': 20, 'classifier__min_samples_split': 5}
Best Score for DecisionTreeClassifier: 0.7594623398862356


In [88]:
# Example for KNeighborsClassifier
knn_grid_search.fit(X_depression, y_depression)
print("Best Parameters for KNeighborsClassifier:", knn_grid_search.best_params_)
print("Best Score for KNeighborsClassifier:", knn_grid_search.best_score_)


Best Parameters for KNeighborsClassifier: {'classifier__n_neighbors': 50, 'classifier__weights': 'uniform'}
Best Score for KNeighborsClassifier: 0.7613298504010337
