# Mental Health Survey CRISP-DM Notebook

Use this notebook on Colab or locally to run the full pipeline: data download, EDA, preprocessing, clustering, outlier detection, and happiness regression.

**Checklist before running:**
- Upload the Kaggle dataset CSV to a path you set in `DATA_PATH` or configure the Kaggle API cell.
- Ensure runtime has Python 3.10+.
- If running locally, `pip install -r requirements.txt` first.


In [None]:
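# Optional: install dependencies (e.g. on Colab); safe to skip if already installed.
# If pip fails (for example, requirements.txt is missing), the fallback echo prints a reminder.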
!pip -q install -r "requirements.txt" || echo "Install locally as needed"


In [None]:
import json
import os
import pathlib
import random
from typing import Dict, List

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.base import clone
from sklearn.cluster import KMeans
from sklearn.compose import ColumnTransformer
from sklearn.dummy import DummyRegressor
from sklearn.ensemble import GradientBoostingRegressor, IsolationForest, RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.inspection import permutation_importance
from sklearn.linear_model import Lasso, LinearRegression, Ridge
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Reproducibility: one seed shared by Python's and NumPy's global RNGs and by
# every split/estimator in this notebook that is given ``random_state``.
RANDOM_STATE = 42
random.seed(RANDOM_STATE)
np.random.seed(RANDOM_STATE)

# Display settings: show up to 120 columns and allow 160-character-wide output.
pd.options.display.max_columns = 120
pd.options.display.width = 160


## Configure data location
- Set `DATA_PATH` to your Kaggle CSV.
- Optional: uncomment the Kaggle API cell if you want to download programmatically.


In [None]:
# Path to the mental health CSV (update this for your environment).
# The load cell stops with an error if no file exists at this location.
DATA_PATH = pathlib.Path("./Mental_Health_and_Social_Media_Balance_Dataset.csv")
# Regression target; auto-detection leaves it out of the feature columns.
TARGET_COL = "Happiness_Index(1-10)"  # target present in the provided dataset

# Example: configure Kaggle API if running on Colab (requires kaggle.json upload)
# from kaggle.api.kaggle_api_extended import KaggleApi
# api = KaggleApi(); api.authenticate()
# api.dataset_download_files("USERNAME/DATASET-NAME", path="./data", unzip=True)
# DATA_PATH = pathlib.Path("./data/your_file.csv")


In [None]:
# Load data
# Fail fast with an actionable message when the CSV is missing. An explicit
# raise is used instead of ``assert`` so the check still runs under ``python -O``.
if not DATA_PATH.exists():
    raise FileNotFoundError(f"CSV not found at {DATA_PATH}. Please update DATA_PATH.")
df = pd.read_csv(DATA_PATH)
print(df.shape)  # (n_rows, n_columns)
df.head()


In [None]:
# Quick info: per-column dtypes first, then summary statistics.
# include="all" makes describe() cover non-numeric columns as well as numeric ones.
display(df.dtypes)
display(df.describe(include="all"))

def missingness_table(frame: pd.DataFrame) -> pd.DataFrame:
    """Summarize missing values per column.

    Args:
        frame: Data to inspect.

    Returns:
        A one-column frame (``missing_pct``) indexed by column name, holding
        the percentage of missing entries, sorted from most to least missing.
        Columns without any missing value are left out.
    """
    share_missing = frame.isna().mean()
    ranked_pct = share_missing.sort_values(ascending=False) * 100
    table = ranked_pct.to_frame(name="missing_pct")
    return table.loc[table["missing_pct"] > 0]

# Show at most 20 columns that have missing values, highest missing percentage first.
missingness_table(df).head(20)


## Column typing
- Update `CAT_COLS` and `NUM_COLS` if you want to override the automatic detection (it only runs when both lists are left empty).
- The target column is removed from predictors automatically.


In [None]:
# Manually set these if auto-detection is not correct for your dataset.
# Auto-detection only runs when BOTH lists are left empty.
CAT_COLS: List[str] = []  # names of categorical feature columns
NUM_COLS: List[str] = []  # names of numeric feature columns

def auto_columns(frame: pd.DataFrame, target: str) -> Dict[str, List[str]]:
    """Infer categorical and numeric feature columns from the frame's dtypes.

    Args:
        frame: Data to inspect.
        target: Column name to leave out of both lists.

    Returns:
        ``{"cat": [...], "num": [...]}`` where ``cat`` lists the object,
        category and bool columns and ``num`` the numeric columns, each in
        the frame's column order and without ``target``.
    """
    dtype_groups = {
        "cat": ["object", "category", "bool"],
        "num": ["number"],
    }
    return {
        group: [
            name
            for name in frame.select_dtypes(include=dtypes).columns
            if name != target
        ]
        for group, dtypes in dtype_groups.items()
    }

# Validate the target first so a wrong TARGET_COL fails before any feature list
# is derived. An explicit raise (not ``assert``) keeps the check under ``python -O``.
if TARGET_COL not in df.columns:
    raise ValueError("Target column not found. Update TARGET_COL.")

# Auto-detect only when neither list was filled in manually.
if not CAT_COLS and not NUM_COLS:
    columns = auto_columns(df, TARGET_COL)
    CAT_COLS, NUM_COLS = columns["cat"], columns["num"]

# auto_columns already drops the target, but manually supplied lists are used
# as given. Filter here as well so the target can never end up among the
# predictors (which would leak the label into the features).
CAT_COLS = [name for name in CAT_COLS if name != TARGET_COL]
NUM_COLS = [name for name in NUM_COLS if name != TARGET_COL]

# Feature order: categorical columns first, then numeric columns.
FEATURE_COLS = CAT_COLS + NUM_COLS
print("Categorical: ", CAT_COLS)
print("Numeric: ", NUM_COLS)


## EDA: distributions and correlations
- Run a small set of plots to understand distributions.


In [None]:
# Numeric distributions.
# Skipped when there are no numeric features: DataFrame.hist raises on a frame
# without numeric columns. The correlation cell uses the same guard.
if NUM_COLS:
    # At most 5000 rows are sampled (all rows when the frame is smaller).
    num_sample = df[NUM_COLS].sample(min(len(df), 5000), random_state=RANDOM_STATE)
    num_sample.hist(figsize=(14, 10), bins=30)
    plt.tight_layout()
    plt.show()

# Categorical distributions: the first 8 categorical columns, showing the 15
# most frequent values of each (missing values are counted as their own bar).
for col in CAT_COLS[:8]:
    plt.figure(figsize=(8, 4))
    df[col].value_counts(dropna=False).head(15).plot(kind="bar")
    plt.title(col)
    plt.tight_layout()
    plt.show()


In [None]:
# Pairwise correlations between the numeric features and the target, drawn as a
# heatmap with the colour scale centred on zero. Skipped without numeric features.
if NUM_COLS:
    corr = df[[*NUM_COLS, TARGET_COL]].corr(numeric_only=True)
    plt.figure(figsize=(10, 8))
    sns.heatmap(data=corr, center=0, cmap="coolwarm", annot=False)
    plt.gca().set_title("Correlation Heatmap")
    plt.tight_layout()
    plt.show()


## Train/validation/test split and preprocessing
- Uses a 70/15/15 train/validation/test split; preprocessing applies simple imputation, one-hot encoding, and scaling.


In [None]:
# Features and target as independent copies of the source frame.
X = df.loc[:, FEATURE_COLS].copy()
y = df.loc[:, TARGET_COL].copy()

# 70/15/15 split: hold out 30% first, then halve that hold-out into
# validation and test parts.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.30, random_state=RANDOM_STATE
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.50, random_state=RANDOM_STATE
)

# Numeric columns: fill gaps with the median, then standardize.
numeric_transformer = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ]
)

# Categorical columns: fill gaps with the most frequent value, then one-hot
# encode; categories not seen during fit are ignored at transform time.
categorical_transformer = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ]
)

# Route each column group through its own transformer.
preprocess = ColumnTransformer(
    [
        ("categorical", categorical_transformer, CAT_COLS),
        ("numeric", numeric_transformer, NUM_COLS),
    ]
)


## Model training and validation
- Baseline and lightweight models are compared using MAE, RMSE, and R^2 on the validation set; the model with the lowest validation MAE is then scored on the test set.


In [None]:
def regression_metrics(y_true, y_pred) -> Dict[str, float]:
    """Score regression predictions against the true values.

    Args:
        y_true: Observed target values.
        y_pred: Predicted target values.

    Returns:
        A dict with ``mae`` (mean absolute error), ``rmse`` (square root of
        the mean squared error) and ``r2`` (coefficient of determination).
    """
    scores: Dict[str, float] = {}
    scores["mae"] = mean_absolute_error(y_true, y_pred)
    mse = mean_squared_error(y_true, y_pred)
    scores["rmse"] = mse ** 0.5
    scores["r2"] = r2_score(y_true, y_pred)
    return scores

# Candidate regressors: a mean-predicting baseline, three linear models and two
# tree ensembles.
models = {
    "dummy_mean": DummyRegressor(strategy="mean"),
    "linear": LinearRegression(),
    "ridge": Ridge(alpha=1.0, random_state=RANDOM_STATE),
    "lasso": Lasso(alpha=0.001, random_state=RANDOM_STATE, max_iter=5000),
    "rf": RandomForestRegressor(
        n_estimators=200, max_depth=8, random_state=RANDOM_STATE, n_jobs=-1
    ),
    "gbr": GradientBoostingRegressor(random_state=RANDOM_STATE),
}

results = []  # one metrics row per model
trained_models = {}  # model name -> fitted pipeline

for name, model in models.items():
    # Give every pipeline its own copy of the preprocessor. Pipeline.fit fits
    # its steps in place, so passing the shared ``preprocess`` object would
    # leave all entries of ``trained_models`` holding the same transformer
    # instance, and refitting any one pipeline would change the others too.
    pipe = Pipeline(steps=[("preprocess", clone(preprocess)), ("model", model)])
    pipe.fit(X_train, y_train)
    preds = pipe.predict(X_val)
    metrics = regression_metrics(y_val, preds)
    row = {"model": name, **metrics}
    results.append(row)
    trained_models[name] = pipe

# Lowest validation MAE first; the next cell takes the first row as the winner.
results_df = pd.DataFrame(results).sort_values("mae")
results_df


In [None]:
# Score the validation winner on the held-out test split.
# results_df is sorted by MAE ascending, so its first row is the lowest-MAE model.
best_model_name = results_df["model"].iloc[0]
best_model = trained_models[best_model_name]
test_preds = best_model.predict(X_test)
test_metrics = regression_metrics(y_test, test_preds)
print(f"Best model: {best_model_name}", json.dumps(test_metrics, indent=2), sep="\n")


In [None]:
# Permutation importance of every input column for the best model, measured on
# the validation split with 5 shuffles per column.
perm = permutation_importance(
    estimator=best_model,
    X=X_val,
    y=y_val,
    n_repeats=5,
    random_state=RANDOM_STATE,
    n_jobs=-1,
)

# The original columns of X_val are permuted (not the one-hot encoded ones), so
# the scores line up one-to-one with FEATURE_COLS.
feature_names = FEATURE_COLS

importance_df = pd.DataFrame(
    {
        "feature": feature_names,
        "importance_mean": perm.importances_mean,
        "importance_std": perm.importances_std,
    }
)
importance_df = importance_df.sort_values("importance_mean", ascending=False)

importance_df.head(20)


## Clustering (k-means) for respondent segments
- Uses median-imputed, standardized numeric features only; categorical columns are not included in the clustering.


In [None]:
# Numeric-only matrix for clustering: fill gaps with each column's median,
# then standardize.
numeric_view = df[NUM_COLS].fillna(df[NUM_COLS].median())
scaled_numeric = StandardScaler().fit_transform(numeric_view)

# Three k-means segments; 10 initializations with a fixed seed.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=RANDOM_STATE)
kmeans.fit(scaled_numeric)
clusters = kmeans.labels_

# Attach the segment label to a copy of the data, leaving df itself untouched.
df_clusters = df.assign(cluster=clusters)

# Mean and median of the target within each segment.
cluster_summary = df_clusters.groupby("cluster").agg(
    {TARGET_COL: ["mean", "median"]}
)
cluster_summary


## Outlier detection
- Isolation Forest on the same scaled numeric features; `contamination` is fixed at 0.02, so roughly 2% of rows are flagged as outliers.


In [None]:
# Isolation Forest over the scaled numeric matrix built in the clustering cell.
iso = IsolationForest(random_state=RANDOM_STATE, contamination=0.02)
outlier_flags = iso.fit_predict(scaled_numeric)

# Rows labelled -1 are treated as outliers; store that as a boolean column on
# a copy of the data.
df_outliers = df.assign(outlier=outlier_flags == -1)
print(df_outliers["outlier"].value_counts())
df_outliers.loc[df_outliers["outlier"]].head()


## Next steps
- Tune the models with a hyperparameter search (grid or randomized) if compute allows.
- Add fairness checks for demographic subgroups where applicable.
- Refresh metrics when new survey vintages arrive and monitor for label drift.
