MODEL SELECTION

Student depression prediction is a classification problem, and require a supervised machine learning algorithm for doing such task.

Things to consider when choosing a machine learning algorithm:
- Final goal: accuracy, speed, scalability
- Data nature: outliters, size, quality, characteristic
- Our data is the combination of both categorical data (Ex: Gender, Sleep Duration,...) and numerical data (CGPA, Work Hour,...)
- Constraints: such as computational limitations



There are serveral supervised machine learning algorithm that can be used for our task, such as:
- Logistic Regression:
- Decision tree
- Random forest
- Support Vector Machine
- Naive Bayes


As this is a classification problem, we use Accuracy, Recall, Precision, F1 Score to measure our model



In [1]:
import os
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


In [4]:
def evaluate_model(y_true, y_pred):
    return {
        "accuracy": accuracy_score(y_true, y_pred),
        "precision": precision_score(y_true, y_pred, average='weighted', zero_division=0),
        "recall": recall_score(y_true, y_pred, average='weighted', zero_division=0),
        "f1_score": f1_score(y_true, y_pred, average='weighted', zero_division=0),
    }

def train_and_evaluate_all_models(base_filename, target_column, n_folds=5):
    models = {
        "Logistic Regression": LogisticRegression(max_iter=1000),
        "Decision Tree": DecisionTreeClassifier(),
        "Random Forest": RandomForestClassifier(n_estimators=100),
        "Support Vector Machine": SVC(),
        "Naive Bayes": GaussianNB()
    }

    results = {name: [] for name in models}

    for fold in range(1, n_folds + 1):
        print(f"\n📁 Fold {fold}")
        train_path = f"../datasets/train/train_fold{fold}_{base_filename}"
        test_path = f"../datasets/test/test_fold{fold}_{base_filename}"

        train_df = pd.read_csv(train_path)
        test_df = pd.read_csv(test_path)

        X_train = train_df.drop(columns=[target_column])
        y_train = train_df[target_column]
        X_test = test_df.drop(columns=[target_column])
        y_test = test_df[target_column]

        for model_name, model in models.items():
            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)
            metrics = evaluate_model(y_test, y_pred)
            results[model_name].append(metrics)

            print(f"🔹 {model_name}")
            print(f"  Accuracy:  {metrics['accuracy']:.4f}")
            print(f"  Precision: {metrics['precision']:.4f}")
            print(f"  Recall:    {metrics['recall']:.4f}")
            print(f"  F1 Score:  {metrics['f1_score']:.4f}")

    # Calculate average metrics
    print("\n📊 Average Performance Across Folds:")
    for model_name in models:
        avg_metrics = {
            metric: sum(r[metric] for r in results[model_name]) / n_folds
            for metric in ["accuracy", "precision", "recall", "f1_score"]
        }
        print(f"\n🔹 {model_name}")
        for metric, score in avg_metrics.items():
            print(f"  {metric.capitalize()}: {score:.4f}")

In [None]:
train_and_evaluate_all_models(
    base_filename="preprocessed_student_depression.csv",
    target_column="Depression",  # change to your actual label column
    n_folds=5
)


📁 Fold 1


KeyError: "['label'] not found in axis"