In [1]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the dataset
df = pd.read_csv('/kaggle/input/mental-health/mental_health.csv')

In [13]:
label_encoder = LabelEncoder()
categorical_columns = ['Gender', 'City', 'Program', 'Residence Type', 'Academic Pressure',
                       'Does the end of the semester affect sense of the academic pressure?', 
                       'How often you feel Academic Pressure in a running semester?', 
                       'Are you satisfied with your results each semester?', 
                       'How much academic help do you get from your friends?', 
                       'Did you receive a waiver or scholarship at your university?', 
                       'How does the lack of financial aid affect your academic performance?', 
                       'Chronic_Illness', 'Sleep Quality', 'Financial Stress', 
                       'Have you ever had suicidal thoughts ?', 'Overall Student mental Health']

In [16]:
# Apply label encoding to categorical columns
for col in categorical_columns:
    df[col] = label_encoder.fit_transform(df[col])

In [17]:
df

Unnamed: 0,Gender,Age,City,CGPA,Program,Residence Type,Academic Pressure,Does the end of the semester affect sense of the academic pressure?,How often you feel Academic Pressure in a running semester?,Are you satisfied with your results each semester?,How much academic help do you get from your friends?,Did you receive a waiver or scholarship at your university?,How does the lack of financial aid affect your academic performance?,Chronic_Illness,Sleep Duration(hours),Sleep Quality,Financial Stress,Have you ever had suicidal thoughts ?,Overall Student mental Health
0,1,-0.985495,4,0.870459,5,1,2,0,1,2,0,1,2,0,-0.723097,2,0,0,1
1,0,-0.985495,4,-0.913487,5,1,0,0,0,1,1,0,2,0,-0.723097,2,1,0,0
2,1,-1.440604,3,0.247153,1,1,1,1,0,1,2,1,0,1,-0.723097,0,0,0,2
3,0,-1.440604,5,-0.891993,3,1,0,0,2,1,2,0,2,0,-0.723097,0,0,0,2
4,0,-0.985495,6,0.032220,5,2,0,1,2,1,1,1,1,0,1.276128,1,1,1,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1053,0,-0.530387,10,0.354620,14,2,0,0,2,2,2,1,2,0,-0.723097,0,0,0,1
1054,1,-0.075278,6,1.300326,21,1,0,0,0,2,0,1,2,0,-1.389506,0,0,0,1
1055,0,-0.530387,0,1.300326,3,1,1,0,1,2,1,1,0,1,0.609720,1,1,0,2
1056,1,-0.075278,11,0.870459,10,1,0,1,2,2,2,1,1,0,1.276128,1,1,0,2


In [19]:
# Handling numerical features
numerical_features = ['Age', 'CGPA', 'Sleep Duration(hours)']

In [21]:
# Scaling the numerical features
scaler = StandardScaler()
df[numerical_features] = scaler.fit_transform(df[numerical_features])

In [22]:
# Splitting the data into features (X) and target (y)
X = df.drop('Overall Student mental Health', axis=1)  # Drop target variable from features
y = df['Overall Student mental Health']  # Target variable

In [23]:
# Splitting the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [24]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Train Logistic Regression
log_reg_model = LogisticRegression()
log_reg_model.fit(X_train, y_train)

# Predict and evaluate
y_pred_log_reg = log_reg_model.predict(X_test)
accuracy_log_reg = accuracy_score(y_test, y_pred_log_reg)

# Output Results
print("Logistic Regression Accuracy:", accuracy_log_reg)
print(confusion_matrix(y_test, y_pred_log_reg))
print(classification_report(y_test, y_pred_log_reg))

Logistic Regression Accuracy: 0.8066037735849056
[[44  0  7]
 [ 0 54 16]
 [ 8 10 73]]
              precision    recall  f1-score   support

           0       0.85      0.86      0.85        51
           1       0.84      0.77      0.81        70
           2       0.76      0.80      0.78        91

    accuracy                           0.81       212
   macro avg       0.82      0.81      0.81       212
weighted avg       0.81      0.81      0.81       212



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [25]:
from sklearn.ensemble import RandomForestClassifier

# Train Random Forest
rf_model = RandomForestClassifier()
rf_model.fit(X_train, y_train)

# Predict and evaluate
y_pred_rf = rf_model.predict(X_test)
accuracy_rf = accuracy_score(y_test, y_pred_rf)

# Output Results
print("Random Forest Accuracy:", accuracy_rf)
print(confusion_matrix(y_test, y_pred_rf))
print(classification_report(y_test, y_pred_rf))


Random Forest Accuracy: 0.9952830188679245
[[51  0  0]
 [ 0 70  0]
 [ 0  1 90]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        51
           1       0.99      1.00      0.99        70
           2       1.00      0.99      0.99        91

    accuracy                           1.00       212
   macro avg       1.00      1.00      1.00       212
weighted avg       1.00      1.00      1.00       212



In [26]:
from sklearn.svm import SVC

# Train Support Vector Machine
svm_model = SVC()
svm_model.fit(X_train, y_train)

# Predict and evaluate
y_pred_svm = svm_model.predict(X_test)
accuracy_svm = accuracy_score(y_test, y_pred_svm)

# Output Results
print("SVM Accuracy:", accuracy_svm)
print(confusion_matrix(y_test, y_pred_svm))
print(classification_report(y_test, y_pred_svm))

SVM Accuracy: 0.7735849056603774
[[41  0 10]
 [ 0 51 19]
 [ 8 11 72]]
              precision    recall  f1-score   support

           0       0.84      0.80      0.82        51
           1       0.82      0.73      0.77        70
           2       0.71      0.79      0.75        91

    accuracy                           0.77       212
   macro avg       0.79      0.77      0.78       212
weighted avg       0.78      0.77      0.77       212



In [27]:
from sklearn.ensemble import GradientBoostingClassifier

# Train Gradient Boosting
gb_model = GradientBoostingClassifier()
gb_model.fit(X_train, y_train)

# Predict and evaluate
y_pred_gb = gb_model.predict(X_test)
accuracy_gb = accuracy_score(y_test, y_pred_gb)

# Output Results
print("Gradient Boosting Accuracy:", accuracy_gb)
print(confusion_matrix(y_test, y_pred_gb))
print(classification_report(y_test, y_pred_gb))


Gradient Boosting Accuracy: 0.9952830188679245
[[51  0  0]
 [ 0 70  0]
 [ 0  1 90]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        51
           1       0.99      1.00      0.99        70
           2       1.00      0.99      0.99        91

    accuracy                           1.00       212
   macro avg       1.00      1.00      1.00       212
weighted avg       1.00      1.00      1.00       212



In [28]:
from sklearn.neighbors import KNeighborsClassifier

# Train K-Nearest Neighbors
knn_model = KNeighborsClassifier()
knn_model.fit(X_train, y_train)

# Predict and evaluate
y_pred_knn = knn_model.predict(X_test)
accuracy_knn = accuracy_score(y_test, y_pred_knn)

# Output Results
print("KNN Accuracy:", accuracy_knn)
print(confusion_matrix(y_test, y_pred_knn))
print(classification_report(y_test, y_pred_knn))


KNN Accuracy: 0.7547169811320755
[[42  3  6]
 [ 3 53 14]
 [12 14 65]]
              precision    recall  f1-score   support

           0       0.74      0.82      0.78        51
           1       0.76      0.76      0.76        70
           2       0.76      0.71      0.74        91

    accuracy                           0.75       212
   macro avg       0.75      0.76      0.76       212
weighted avg       0.76      0.75      0.75       212



In [29]:
from sklearn.tree import DecisionTreeClassifier

# Train Decision Tree
dt_model = DecisionTreeClassifier()
dt_model.fit(X_train, y_train)

# Predict and evaluate
y_pred_dt = dt_model.predict(X_test)
accuracy_dt = accuracy_score(y_test, y_pred_dt)

# Output Results
print("Decision Tree Accuracy:", accuracy_dt)
print(confusion_matrix(y_test, y_pred_dt))
print(classification_report(y_test, y_pred_dt))


Decision Tree Accuracy: 0.9905660377358491
[[51  0  0]
 [ 0 70  0]
 [ 0  2 89]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        51
           1       0.97      1.00      0.99        70
           2       1.00      0.98      0.99        91

    accuracy                           0.99       212
   macro avg       0.99      0.99      0.99       212
weighted avg       0.99      0.99      0.99       212



In [30]:
from sklearn.ensemble import AdaBoostClassifier

# Train AdaBoost
ada_model = AdaBoostClassifier()
ada_model.fit(X_train, y_train)

# Predict and evaluate
y_pred_ada = ada_model.predict(X_test)
accuracy_ada = accuracy_score(y_test, y_pred_ada)

# Output Results
print("AdaBoost Accuracy:", accuracy_ada)
print(confusion_matrix(y_test, y_pred_ada))
print(classification_report(y_test, y_pred_ada))


AdaBoost Accuracy: 0.7075471698113207
[[37  2 12]
 [ 0 47 23]
 [ 1 24 66]]
              precision    recall  f1-score   support

           0       0.97      0.73      0.83        51
           1       0.64      0.67      0.66        70
           2       0.65      0.73      0.69        91

    accuracy                           0.71       212
   macro avg       0.76      0.71      0.73       212
weighted avg       0.73      0.71      0.71       212



In [31]:
import xgboost as xgb

# Train XGBoost
xgb_model = xgb.XGBClassifier()
xgb_model.fit(X_train, y_train)

# Predict and evaluate
y_pred_xgb = xgb_model.predict(X_test)
accuracy_xgb = accuracy_score(y_test, y_pred_xgb)

# Output Results
print("XGBoost Accuracy:", accuracy_xgb)
print(confusion_matrix(y_test, y_pred_xgb))
print(classification_report(y_test, y_pred_xgb))


XGBoost Accuracy: 0.9905660377358491
[[51  0  0]
 [ 0 70  0]
 [ 0  2 89]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        51
           1       0.97      1.00      0.99        70
           2       1.00      0.98      0.99        91

    accuracy                           0.99       212
   macro avg       0.99      0.99      0.99       212
weighted avg       0.99      0.99      0.99       212



In [32]:
import lightgbm as lgb

# Train LightGBM
lgb_model = lgb.LGBMClassifier()
lgb_model.fit(X_train, y_train)

# Predict and evaluate
y_pred_lgb = lgb_model.predict(X_test)
accuracy_lgb = accuracy_score(y_test, y_pred_lgb)

# Output Results
print("LightGBM Accuracy:", accuracy_lgb)
print(confusion_matrix(y_test, y_pred_lgb))
print(classification_report(y_test, y_pred_lgb))


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003241 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 160
[LightGBM] [Info] Number of data points in the train set: 846, number of used features: 18
[LightGBM] [Info] Start training from score -1.328873
[LightGBM] [Info] Start training from score -1.074093
[LightGBM] [Info] Start training from score -0.932377
LightGBM Accuracy: 0.9952830188679245
[[51  0  0]
 [ 0 70  0]
 [ 0  1 90]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        51
           1       0.99      1.00      0.99        70
           2       1.00      0.99      0.99        91

    accuracy                           1.00       212
   macro avg       1.00      1.00      1.00       212
weighted avg       1.00      1.00      1.00       212



In [33]:
from sklearn.naive_bayes import GaussianNB

# Train Naive Bayes
nb_model = GaussianNB()
nb_model.fit(X_train, y_train)

# Predict and evaluate
y_pred_nb = nb_model.predict(X_test)
accuracy_nb = accuracy_score(y_test, y_pred_nb)

# Output Results
print("Naive Bayes Accuracy:", accuracy_nb)
print(confusion_matrix(y_test, y_pred_nb))
print(classification_report(y_test, y_pred_nb))


Naive Bayes Accuracy: 0.7924528301886793
[[41  2  8]
 [ 0 61  9]
 [11 14 66]]
              precision    recall  f1-score   support

           0       0.79      0.80      0.80        51
           1       0.79      0.87      0.83        70
           2       0.80      0.73      0.76        91

    accuracy                           0.79       212
   macro avg       0.79      0.80      0.79       212
weighted avg       0.79      0.79      0.79       212

