In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import KFold, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer, make_column_selector as selector
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, f1_score, classification_report, make_scorer

# Classifiers
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import (
    RandomForestClassifier, ExtraTreesClassifier,
    GradientBoostingClassifier, AdaBoostClassifier
)
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier

In [2]:
df = pd.read_csv('/content/dataset_after_feature_selection.csv')

# Drop Age (if present)
if "Age" in df.columns:
    df = df.drop(columns=["Age"])

X = df.drop(columns=["Depression"])
y = df["Depression"]

In [22]:
X

Unnamed: 0,Gender,Academic Pressure,Study Satisfaction,Sleep Duration,Dietary Habits,Degree,Study Hours,Financial Stress,Age Category
0,Male,Yes,No,6,Healthy,B.Pharm,3.0,No,30+
1,Female,No,Yes,6,Moderate,BSc,3.0,No,19-25
2,Male,No,Yes,5,Healthy,BA,9.0,No,30+
3,Female,No,No,7,Moderate,BCA,4.0,Yes,26-30
4,Female,Yes,No,6,Moderate,M.Tech,1.0,No,19-25
...,...,...,...,...,...,...,...,...,...
27878,Female,Yes,Yes,6,Unhealthy,'Class 12',7.0,No,26-30
27879,Male,No,No,5,Healthy,MSc,0.0,No,26-30
27880,Male,No,Yes,6,Unhealthy,MD,12.0,No,30+
27881,Female,Yes,No,5,Healthy,'Class 12',10.0,Yes,0-18


In [23]:
y

Unnamed: 0,Depression
0,1
1,0
2,0
3,1
4,0
...,...
27878,0
27879,0
27880,0
27881,1


In [3]:
numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    # NOTE: in latest sklearn use sparse_output=False (not sparse=False)
    ("encoder", OneHotEncoder(handle_unknown="ignore", sparse_output=False))
])

In [4]:
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, selector(dtype_include=np.number)),
        ("cat", categorical_transformer, selector(dtype_exclude=np.number)),
    ],
    remainder="drop",  # avoid private remainder classes
    verbose_feature_names_out=False,
)

In [5]:
models = {
    "LogisticRegression": LogisticRegression(max_iter=1000, random_state=42),
    "DecisionTree": DecisionTreeClassifier(random_state=42),
    "RandomForest": RandomForestClassifier(n_estimators=300, random_state=42, n_jobs=-1),
    "ExtraTrees": ExtraTreesClassifier(n_estimators=300, random_state=42, n_jobs=-1),
    "GradientBoosting": GradientBoostingClassifier(random_state=42),
    "AdaBoost": AdaBoostClassifier(random_state=42),
    "SVC": SVC(probability=True, random_state=42),
    "MLPClassifier": MLPClassifier(hidden_layer_sizes=(100,), max_iter=600, random_state=42),
}

In [6]:
kf = KFold(n_splits=5, shuffle=True, random_state=42)

results = []
for name, model in models.items():
    pipe = Pipeline(steps=[("prep", preprocessor), ("model", model)])

    # Accuracy
    cv_acc = cross_val_score(pipe, X, y, cv=kf, scoring="accuracy", n_jobs=-1)
    # F1-score (macro average for multi-class/balanced eval)
    cv_f1 = cross_val_score(pipe, X, y, cv=kf, scoring="f1_macro", n_jobs=-1)

    results.append({
        "Model": name,
        "Accuracy": np.mean(cv_acc),
        "F1_macro": np.mean(cv_f1)
    })

results_df = pd.DataFrame(results).sort_values(by="Accuracy", ascending=False).reset_index(drop=True)
print(results_df)



                Model  Accuracy  F1_macro
0  LogisticRegression  0.765628  0.756610
1    GradientBoosting  0.764696  0.755805
2            AdaBoost  0.764409  0.755239
3                 SVC  0.758312  0.748197
4       MLPClassifier  0.731377  0.721192
5        RandomForest  0.724779  0.715294
6          ExtraTrees  0.704372  0.695092
7        DecisionTree  0.674533  0.666180


In [7]:
from sklearn.model_selection import GridSearchCV, train_test_split

In [8]:
numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(handle_unknown="ignore", sparse_output=False))
])

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, selector(dtype_include=np.number)),
        ("cat", categorical_transformer, selector(dtype_exclude=np.number)),
    ],
    remainder="drop",
    verbose_feature_names_out=False,
)


In [29]:
import joblib
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.pipeline import Pipeline

# Train/validation split
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

clf = GradientBoostingClassifier(random_state=42)

pipe = Pipeline(steps=[
    ("prep", preprocessor),   # your ColumnTransformer
    ("model", clf)
])

param_grid = {
    "model__n_estimators": [100, 200, 300],
    "model__max_depth": [3, 5, 10],     # ⚠️ GradientBoostingClassifier does not accept None for max_depth
    "model__min_samples_split": [2, 5, 10],
}

grid_search = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    cv=5,
    scoring="accuracy",
    n_jobs=-1,
    verbose=2
)

grid_search.fit(X_train, y_train)

print("Best parameters found:", grid_search.best_params_)
print("Best CV accuracy:", grid_search.best_score_)

# Clean up best params
best_params = grid_search.best_params_
cleaned_params = {key.replace("model__", ""): value for key, value in best_params.items()}

# Build final pipeline with best params
final_pipeline = Pipeline(steps=[
    ("prep", preprocessor),
    ("model", GradientBoostingClassifier(**cleaned_params, random_state=42))
])

# ⚡ Train on all data
final_pipeline.fit(X, y)

# ✅ Save the pipeline for Streamlit
joblib.dump(final_pipeline, "pipeline.pkl")


Fitting 5 folds for each of 27 candidates, totalling 135 fits
Best parameters found: {'model__max_depth': 3, 'model__min_samples_split': 2, 'model__n_estimators': 300}
Best CV accuracy: 0.7631130035686544


['pipeline.pkl']

In [15]:
import joblib

In [17]:
data = [['Male', 'Yes', 'No', '6', 'Healthy', 'B.Pharm', np.float64(3.0),
       'No', '30+']]

In [18]:
columns = ['Gender', 'Academic Pressure', 'Study Satisfaction', 'Sleep Duration',
       'Dietary Habits', 'Degree', 'Study Hours', 'Financial Stress',
       'Age Category']

In [19]:
one_df = pd.DataFrame(data, columns=columns)

one_df

Unnamed: 0,Gender,Academic Pressure,Study Satisfaction,Sleep Duration,Dietary Habits,Degree,Study Hours,Financial Stress,Age Category
0,Male,Yes,No,6,Healthy,B.Pharm,3.0,No,30+


In [21]:
final_pipeline.predict(one_df)

array([1])

In [24]:
joblib.dump(X, "df.pkl")

['df.pkl']