In [None]:
# Uncomment to install the Cliff's delta package imported in the "Stat Tests" section:
# %pip install cliffs-delta

In [None]:
# === Import
# import pandas as pd
import sys
import json
from pathlib import Path
# from collections import Counter

# === Make the project's auxiliary modules (../src) importable ===
ROOT = Path.cwd().parent
SRC = ROOT.joinpath("src").resolve()

# Prepend only once so that re-running this cell does not stack duplicate entries
if sys.path.count(str(SRC)) == 0:
    sys.path.insert(0, str(SRC))

import importlib
import analysis.restructure as restr

In [None]:
# === Locate the interim data directory and load the three JSON inputs ===
HOME = Path.home()
# strict=True makes resolve() raise FileNotFoundError right here if the directory is missing
DATA_DIR = (HOME / "My Drive" / "_VectorData" / "projects" / "identifying_depression_with_rst" / "data").resolve(strict=True)


def _read_json(path):
    """Parse and return the JSON document stored at `path`."""
    # Pin UTF-8 so the platform's default locale encoding cannot change how the text is decoded
    with open(path, "r", encoding="utf-8") as handle:
        return json.load(handle)


corpus_path = DATA_DIR / "interim"
# NOTE(review): the file name is spelled with three "s" — confirm it matches the file on disk
corpus_file = corpus_path / "preprocesssed_corpora.json"
corpora = _read_json(corpus_file)

diagnoses_path = DATA_DIR / "interim"
diagnoses_file = diagnoses_path / "all_diagnoses.json"
diagnoses = _read_json(diagnoses_file)

rst_data_path = DATA_DIR / "interim"
rst_data_file = rst_data_path / "rst_data.json"
rst_data = _read_json(rst_data_file)

In [None]:
"""
rst_data.setdefault(CORPUS_NAME_1, {})["all_features"] = all_features_1
rst_data.setdefault(CORPUS_NAME_1, {})["all_features_neg"] = all_features_neg_1
rst_data.setdefault(CORPUS_NAME_1, {})["all_features_pos"] = all_features_pos_1
rst_data.setdefault(CORPUS_NAME_1, {})["relations_pos"] = relations_pos_1
rst_data.setdefault(CORPUS_NAME_1, {})["relations_neg"] = relations_neg_1
rst_data.setdefault(CORPUS_NAME_1, {})["all_relations"] = list(all_relations_1)

rst_data.setdefault(CORPUS_NAME_2, {})["all_features"] = all_features_2
rst_data.setdefault(CORPUS_NAME_2, {})["all_features_neg"] = all_features_neg_2
rst_data.setdefault(CORPUS_NAME_2, {})["all_features_pos"] = all_features_pos_2
rst_data.setdefault(CORPUS_NAME_2, {})["relations_pos"] = relations_pos_2
rst_data.setdefault(CORPUS_NAME_2, {})["relations_neg"] = relations_neg_2
rst_data.setdefault(CORPUS_NAME_2, {})["all_relations"] = list(all_relations_2)
"""

# Display the "all_relations" entry stored for the "ked" corpus.
# (The string literal above is not executed; it lists the keys written per corpus.)
rst_data["ked"]["all_relations"]

## Restructure the data so it's conducive to stat tests and ML pipelines

In [None]:
# In case we need to reload the module
# importlib.reload() re-executes the module's code and returns the module object.
# NOTE(review): the result is bound to `rst`, but the visible cells below call
# through `restr` — confirm whether `restr` was the intended target.
rst = importlib.reload(restr)

In [None]:
# Data vectors for the "ked" corpus, built from its "all_features_pos" entries
data_cols_pos_1 = restr.get_data_vectors(
    rst_data["ked"]["all_relations"],
    rst_data["ked"]["all_features_pos"],
)

In [None]:
# Length of the "Sequence" vector — presumably the number of samples in this group (TODO confirm)
len(data_cols_pos_1["Sequence"])

In [None]:
# Data vectors for the "ked" corpus, built from its "all_features_neg" entries
data_cols_neg_1 = restr.get_data_vectors(
    rst_data["ked"]["all_relations"],
    rst_data["ked"]["all_features_neg"],
)

In [None]:
# Display the result for inspection.
# NOTE(review): this renders the whole structure — consider showing a small sample if it is large.
data_cols_neg_1

In [None]:
# Length of the "Sequence" vector — presumably the number of samples in this group (TODO confirm)
len(data_cols_neg_1["Sequence"])

## Stat Tests

In [None]:
from typing import Dict, List, Optional, Iterable, Tuple
import numpy as np
import pandas as pd
from scipy.stats import mannwhitneyu, iqr
from cliffs_delta import cliffs_delta

def test_features(
    depressed: Dict[str, List[float]],
    healthy: Dict[str, List[float]],
    features: Optional[Iterable[str]] = None,  # if None: test all overlapping keys
    exclude: Optional[Iterable[str]] = None,   # keys to skip
    correction: Optional[str] = "fdr_bh",      # "fdr_bh", "bonferroni", or None
    decimals: int = 3,
    min_n: int = 2,                            # min per-group samples to test a feature
) -> pd.DataFrame:
    """
    Mann–Whitney + Cliff's delta for each overlapping feature (column) in the two dicts.
    Works with the output from `get_data_vectors(...)`.
    """

    # --- choose candidate features
    keys_dep = set(depressed.keys())
    keys_hlt = set(healthy.keys())
    candidates = set(features) if features is not None else (keys_dep & keys_hlt)
    if exclude:
        candidates -= set(exclude)

    rows: List[Dict] = []

    for feat in sorted(candidates):
        dep_vals = np.asarray(depressed.get(feat, []), dtype=float)
        hlt_vals = np.asarray(healthy.get(feat, []), dtype=float)

        # drop NaNs/Infs
        dep = dep_vals[np.isfinite(dep_vals)]
        hlt = hlt_vals[np.isfinite(hlt_vals)]

        if len(dep) < min_n or len(hlt) < min_n:
            continue

        # Mann–Whitney (two-sided)
        try:
            u_stat, p_raw = mannwhitneyu(dep, hlt, alternative="two-sided")
        except ValueError:
            # identical distributions or all ties may raise in older SciPy—fallback
            u_stat, p_raw = np.nan, 1.0

        # Cliff's delta
        try:
            delta, size = cliffs_delta(dep.tolist(), hlt.tolist())
        except Exception:
            delta, size = np.nan, "negligible"

        rows.append({
            "Feature": feat,
            "Median_Depressed": float(np.median(dep)),
            "Median_Healthy":  float(np.median(hlt)),
            "IQR_Depressed":   float(iqr(dep)) if len(dep) > 1 else 0.0,
            "IQR_Healthy":     float(iqr(hlt)) if len(hlt) > 1 else 0.0,
            "Min_Depressed":   float(np.min(dep)),
            "Min_Healthy":     float(np.min(hlt)),
            "Max_Depressed":   float(np.max(dep)),
            "Max_Healthy":     float(np.max(hlt)),
            "MannWhitney_U":   float(u_stat),
            "p_raw":           float(p_raw),
            "Cliffs_Delta":    float(delta),
            "Effect_Size":     size,
            "n_dep":           int(len(dep)),
            "n_healthy":       int(len(hlt)),
        })

    if not rows:
        return pd.DataFrame(columns=[
            "Feature","Median_Depressed","Median_Healthy","IQR_Depressed","IQR_Healthy",
            "Min_Depressed","Min_Healthy","Max_Depressed","Max_Healthy",
            "MannWhitney_U","p_raw","p_adj","reject","Cliffs_Delta","Effect_Size",
            "n_dep","n_healthy"
        ])

    df = pd.DataFrame(rows)

    # multiple comparison correction
    if correction is not None:
        try:
            from statsmodels.stats.multitest import multipletests
            reject, p_adj, _, _ = multipletests(df["p_raw"].values, method=correction)
        except Exception:
            if correction.lower() == "bonferroni":
                m = len(df)
                p_adj = np.minimum(df["p_raw"].values * m, 1.0)
                reject = p_adj < 0.05
            else:
                # no statsmodels → leave raw p-values
                p_adj = df["p_raw"].values
                reject = p_adj < 0.05
        df["p_adj"] = p_adj
        df["reject"] = reject
    else:
        df["p_adj"] = df["p_raw"].values
        df["reject"] = df["p_adj"] < 0.05

    # pretty rounding
    num_cols = [
        "Median_Depressed","Median_Healthy","IQR_Depressed","IQR_Healthy",
        "Min_Depressed","Min_Healthy","Max_Depressed","Max_Healthy",
        "MannWhitney_U","p_raw","p_adj","Cliffs_Delta"
    ]
    for c in num_cols:
        if c in df.columns:
            df[c] = df[c].astype(float).round(decimals)

    # sort by adjusted p then |delta|
    df["abs_delta"] = df["Cliffs_Delta"].abs()
    df = df.sort_values(["p_adj", "abs_delta"], ascending=[True, False]).drop(columns=["abs_delta"]).reset_index(drop=True)

    return df

In [None]:
# Run the group comparison on every shared feature, correcting with "fdr_bh"
df_stats = test_features(
    depressed=data_cols_pos_1,
    healthy=data_cols_neg_1,
    correction="fdr_bh",
)

In [None]:
# Display the per-feature test results
df_stats