In [None]:
import sys
import json
import pandas as pd
from pathlib import Path
# from collections import Counter

# === Define the path to the auxiliary modules ===
# ROOT is the parent of the current working directory, so this assumes the kernel
# is started from a sub-folder that sits next to src/ -- TODO confirm.
ROOT = Path.cwd().parent
SRC = (ROOT / "src").resolve()

# Put src/ at the front of sys.path (only once, so re-running the cell does not
# add duplicates) so that the `analysis` package imported below can be found.
if str(SRC) not in sys.path:
    sys.path.insert(0, str(SRC))

import importlib
import analysis.restructure as restr
import analysis.pipelines as plns

In [None]:
# Locate the project data folder under the user's home directory.
# resolve(strict=True) raises FileNotFoundError right here if the folder is missing,
# instead of failing later with a less obvious error when a file is opened.
HOME = Path.home()
DATA_DIR = (HOME / "My Drive" / "_VectorData" / "projects" / "identifying_depression_with_rst" / "data").resolve(strict=True)

corpus_path = DATA_DIR / "interim"
corpus_file = corpus_path / "preprocesssed_corpora.json"

# JSON is read as UTF-8 explicitly: without `encoding`, open() falls back to the
# platform locale, which can mis-decode non-ASCII text on some systems.
with open(corpus_file, "r", encoding="utf-8") as file:
    corpora = json.load(file)

rst_data_path = DATA_DIR / "interim"
rst_data_file = rst_data_path / "rst_data_gumrrg.json"
# rst_data_file = rst_data_path / "rst_data_rstreebank.json"

with open(rst_data_file, "r", encoding="utf-8") as file:
    rst_data = json.load(file)

In [None]:
# Top-level keys of the loaded corpora JSON; the corpus names below index into it
corpora.keys()

In [None]:
# Keys used throughout the notebook to select the two corpora in `corpora` and `rst_data`
CORPUS_NAME_1 = "ked"
CORPUS_NAME_2 = "kldl"

In [None]:
# Inspect the preprocessed content stored for the first corpus
corpora[CORPUS_NAME_1]

In [None]:
# Inspect the preprocessed content stored for the second corpus
corpora[CORPUS_NAME_2]

In [None]:
# The structure of the data in rst_data follows the structure of the dictionary that was used to save the data to a json file.
# The string below is not executed as code; it only records how that dictionary was filled.
"""
rst_data.setdefault(CORPUS_NAME_1, {})["all_features"] = all_features_1
rst_data.setdefault(CORPUS_NAME_1, {})["all_features_neg"] = all_features_neg_1
rst_data.setdefault(CORPUS_NAME_1, {})["all_features_pos"] = all_features_pos_1
rst_data.setdefault(CORPUS_NAME_1, {})["relations_pos"] = relations_pos_1
rst_data.setdefault(CORPUS_NAME_1, {})["relations_neg"] = relations_neg_1
rst_data.setdefault(CORPUS_NAME_1, {})["all_relations"] = list(all_relations_1)

rst_data.setdefault(CORPUS_NAME_2, {})["all_features"] = all_features_2
rst_data.setdefault(CORPUS_NAME_2, {})["all_features_neg"] = all_features_neg_2
rst_data.setdefault(CORPUS_NAME_2, {})["all_features_pos"] = all_features_pos_2
rst_data.setdefault(CORPUS_NAME_2, {})["relations_pos"] = relations_pos_2
rst_data.setdefault(CORPUS_NAME_2, {})["relations_neg"] = relations_neg_2
rst_data.setdefault(CORPUS_NAME_2, {})["all_relations"] = list(all_relations_2)
"""

# Example: the "relations_pos" entry of the first corpus
rst_data[CORPUS_NAME_1]["relations_pos"]

In [None]:
# Number of entries under "all_features_pos" for the second corpus
len(rst_data[CORPUS_NAME_2]["all_features_pos"])

In [None]:
# Number of entries under "all_features_neg" for the second corpus
len(rst_data[CORPUS_NAME_2]["all_features_neg"])

## Reload the modules if necessary

In [None]:
# In case we need to reload the module (e.g. after editing files under src/analysis).
# importlib.reload returns the reloaded module, so the aliases are rebound to it.
restr = importlib.reload(restr)
plns = importlib.reload(plns)

## Xy with all the features
(apart from raw nuclearity pattern counts -- they are removed manually)

### A first corpus to Xy

In [None]:
# Build the inputs for the negative and positive documents of the first corpus from
# its relation list and the corresponding feature entries.
neg_data = restr.get_data_vectors(rst_data[CORPUS_NAME_1]["all_relations"], rst_data[CORPUS_NAME_1]["all_features_neg"])
pos_data = restr.get_data_vectors(rst_data[CORPUS_NAME_1]["all_relations"], rst_data[CORPUS_NAME_1]["all_features_pos"])

# features=None presumably means "keep every available feature" -- verify in restr.build_feature_matrix
Xy_1 = restr.build_feature_matrix(pos_data, neg_data, features=None)


Xy_1

In [None]:
# Number of columns in the first corpus' matrix
len(Xy_1.columns)

### A second corpus to Xy

In [None]:
# Same steps as for the first corpus; note that neg_data / pos_data are overwritten here.
neg_data = restr.get_data_vectors(rst_data[CORPUS_NAME_2]["all_relations"], rst_data[CORPUS_NAME_2]["all_features_neg"])
pos_data = restr.get_data_vectors(rst_data[CORPUS_NAME_2]["all_relations"], rst_data[CORPUS_NAME_2]["all_features_pos"])

# features=None presumably means "keep every available feature" -- verify in restr.build_feature_matrix
Xy_2 = restr.build_feature_matrix(pos_data, neg_data, features=None)

Xy_2


In [None]:
# Number of columns in the second corpus' matrix (compare with the first corpus)
len(Xy_2.columns)

In [None]:
# Xy to be used in the pipelines below:
# Default: stack both corpora row-wise (first corpus rows, then second corpus rows)
# with a fresh 0..n-1 index. Switch to one of the commented lines to use a single corpus.

Xy = pd.concat([Xy_1, Xy_2], ignore_index=True)
# Xy = Xy_2
# Xy = Xy_1

Xy

### Dropping raw counts for nuclearity patterns (to keep only relational)

In [None]:
# errors="ignore" skips names that are not present, so this cell can be re-run safely.
cols_to_drop = ["nucl_NN", "nucl_NS", "nucl_SN", "nucl_pattern"]  # last one just in case (if for some reason it is still there)
Xy = Xy.drop(columns=cols_to_drop, errors="ignore")

Xy

In [None]:
# Columns that remain after dropping the raw nuclearity counts
Xy.columns

### Regular LogReg

In [None]:
# Xy is the DataFrame from build_feature_matrix(...), with label already 0/1 as ints
# L2

# X_train_s / X_test_s are presumably the scaled counterparts of X_train / X_test
# (`scaler` being the fitted scaler) -- confirm in plns.prep_train_test.
X_train_s, X_test_s, y_train, y_test, scaler, X_train, X_test = plns.prep_train_test(Xy)

clf = plns.train_logreg(X_train_s, y_train)
report_df, cm, coef_df = plns.evaluate_classifier(
    clf,
    X_test_s,
    y_test,
    feature_names=list(X_train.columns),  # <- critical: order matches training
)


# Report rounded to 3 decimals, then `cm`, then the first 15 rows of the coefficient table
print(report_df.round(3))
print(cm)
print(coef_df.head(15))

### Lasso

In [None]:
# Lasso/L1
# Same flow as the L2 cell, only the training function differs.
# The split is recomputed here, overwriting the variables of the previous cell.

X_train_s, X_test_s, y_train, y_test, scaler, X_train, X_test = plns.prep_train_test(Xy)

clf = plns.train_logreg_l1(X_train_s, y_train)
report_df, cm, coef_df = plns.evaluate_classifier(
    clf,
    X_test_s,
    y_test,
    feature_names=list(X_train.columns),  # column order of the training frame
)


print(report_df.round(3))
print(cm)
print(coef_df.head(15))

### LogReg with Cross Validation

In [None]:
# K-fold CV with L2 (default)
# The third return value is discarded here (return_models is not requested).
folds, summary, _ = plns.cross_validate_logreg(Xy, k=5, use_l1=False)
print("L2 Results:")
print(folds.round(3))
print(summary.round(3))
print("")

# K-fold CV with L1
# return_models=True: the third return value is kept in `models`
# (presumably the fitted per-fold models -- confirm in plns.cross_validate_logreg).
folds_l1, summary_l1, models = plns.cross_validate_logreg(Xy, k=5, use_l1=True, C=1.0, return_models=True)
print("L1 Results:")
print(folds_l1.round(3))
print(summary_l1.round(3))

### HistGradientBoosting

In [None]:
# Tabular split: four outputs only (no scaler / scaled copies, unlike plns.prep_train_test).
# Note: this overwrites X_train / X_test / y_train / y_test from the LogReg cells.
X_train, X_test, y_train, y_test = plns.prep_train_test_tabular(Xy, label_col="label")

hgb = plns.train_hgb(X_train, y_train, max_depth=None, learning_rate=0.06, max_iter=400)

report_df, cm, imp_df, metrics = plns.evaluate_hgb(hgb, X_test, y_test, feature_names=list(X_train.columns))

print(report_df.round(3))
print("")
print(cm)
print("")
print(metrics)            # {'roc_auc': ..., 'pr_auc': ...}
print("")
print(imp_df.head(15))    # top features by permutation importance

## Xy_aug: Run on augmented/refined features

### Prep Xy_aug

In [None]:
# For a first corpus

# All raw feature entries of the corpus in one list: positive entries first, negative ones after.
rst_pos = rst_data[CORPUS_NAME_1]["all_features_pos"]
rst_neg = rst_data[CORPUS_NAME_1]["all_features_neg"]
rst_docs_all_1 = [*rst_pos, *rst_neg]

In [None]:
# For a second corpus

# All raw feature entries of the corpus in one list: positive entries first, negative ones after.
rst_pos = rst_data[CORPUS_NAME_2]["all_features_pos"]
rst_neg = rst_data[CORPUS_NAME_2]["all_features_neg"]
rst_docs_all_2 = [*rst_pos, *rst_neg]

In [None]:
# A small helper to calculate the extra features and concat them to the existing Xy

def augment_rst_features(docs_all, Xy):
    """Return ``Xy`` with the engineered RST features appended as extra columns.

    Parameters
    ----------
    docs_all : sequence
        Raw RST entries, one per row of ``Xy`` and in the same row order.
        Rows are matched purely by position, so the caller is responsible
        for the ordering.
    Xy : pandas.DataFrame
        Feature matrix to extend. It is not modified.

    Returns
    -------
    pandas.DataFrame
        ``Xy`` with a fresh 0..n-1 index and the six engineered columns
        (depth_per_edu, rel_entropy, rel_top2_dom, edu_len_mean, edu_len_std,
        edu_len_p90) added on the right.

    Raises
    ------
    AssertionError
        If ``docs_all`` and ``Xy`` differ in length, if the extracted frame does
        not have exactly the six expected columns or one row per document, or if
        ``Xy`` already contains one of the engineered columns.
    """
    expected_cols = {
        "depth_per_edu", "rel_entropy", "rel_top2_dom", "edu_len_mean", "edu_len_std", "edu_len_p90"
    }

    # Check the inputs first so that a mismatch fails before any feature extraction runs.
    assert len(docs_all) == len(Xy), "row count mismatch (ordering/alignment problem)"

    already_present = expected_cols.intersection(Xy.columns)
    assert not already_present, (
        f"Xy already contains engineered columns: {sorted(already_present)}"
    )

    # Turning raw RST into the 6 "engineered" features
    df_extra = restr.extra_rst_features_from_raw(docs_all)

    assert set(df_extra.columns) == expected_cols, (
        f"unexpected engineered columns: {sorted(df_extra.columns)}"
    )
    # pd.concat(axis=1) aligns on the index and pads with NaN, so a short or long
    # df_extra would otherwise go unnoticed.
    assert len(df_extra) == len(Xy), "engineered features do not have one row per document"

    Xy_aug = pd.concat([Xy.reset_index(drop=True), df_extra.reset_index(drop=True)], axis=1)

    return Xy_aug

In [None]:
# The documents passed in must match the rows of Xy one-to-one and in the same order.
# Xy is currently the concatenation of both corpora (first corpus rows, then second
# corpus rows), so the documents of both corpora are passed in that same order.
# Passing only one corpus' documents here would trip the row-count check in the helper.
Xy_aug = augment_rst_features(rst_docs_all_1 + rst_docs_all_2, Xy)

# If Xy was built from just one corpus (Xy = Xy_1 or Xy = Xy_2), use the matching line instead:
# Xy_aug = augment_rst_features(rst_docs_all_1, Xy)
# Xy_aug = augment_rst_features(rst_docs_all_2, Xy)

In [None]:
# Display the augmented matrix (original columns plus the engineered ones)
Xy_aug

### Regular LogReg

In [None]:
# Same L2 LogReg flow as earlier in the notebook, now on the augmented matrix.
# The split variables from earlier cells are overwritten.
X_train_s, X_test_s, y_train, y_test, scaler, X_train, X_test = plns.prep_train_test(Xy_aug)
clf = plns.train_logreg(X_train_s, y_train)
# feature_names follows the column order of the training frame
report_df, cm, coef_df = plns.evaluate_classifier(clf, X_test_s, y_test, feature_names=list(X_train.columns))

print(report_df.round(3))
print(cm)
print(coef_df.head(15))

### LogReg + Cross Validation

In [None]:
# K-fold CV with L2 (default), on the augmented matrix
# The third return value is discarded here (return_models is not requested).
folds, summary, _ = plns.cross_validate_logreg(Xy_aug, k=5, use_l1=False)
print("L2 Results")
print(folds.round(3))
print(summary.round(3))
print("")

# K-fold CV with L1 (sparser)
# return_models=True: the third return value is kept in `models`.
folds_l1, summary_l1, models = plns.cross_validate_logreg(Xy_aug, k=5, use_l1=True, C=1.0, return_models=True)
print("L1 Results")
print(folds_l1.round(3))
print(summary_l1.round(3))

### HistGradientBoosting

In [None]:
# Same HistGradientBoosting flow and hyper-parameters as before, on the augmented matrix.
# Overwrites X_train / X_test / y_train / y_test and `hgb` from earlier cells.
X_train, X_test, y_train, y_test = plns.prep_train_test_tabular(Xy_aug, label_col="label")

hgb = plns.train_hgb(X_train, y_train, max_depth=None, learning_rate=0.06, max_iter=400)

report_df, cm, imp_df, metrics = plns.evaluate_hgb(hgb, X_test, y_test, feature_names=list(X_train.columns))

print(report_df.round(3))
print("")
print(cm)
print("")
print(metrics)            # {'roc_auc': ..., 'pr_auc': ...}
print("")
print(imp_df.head(15))    # top features by permutation importance

## Xy_final: augmented, rare relations removed (This is just a test / proof of concept for now)
(For the data used here, removing the potential "noise" does not seem to bring any added benefit.)

### Prep Xy_final

In [None]:
# Columns that are NOT per-relation features; everything else in Xy_aug is treated
# as a relation column and is a candidate for collapsing.
NON_REL = {
    "label",
    # structure / sizes
    "tree_depth","num_edus","depth_per_edu",
    # nuclearity
    "nucl_NN","nucl_NS","nucl_SN",
    "nucl_NN_relprop","nucl_NS_relprop","nucl_SN_relprop",
    # engineered stats
    "rel_entropy","rel_top2_dom",
    "edu_len_mean","edu_len_std","edu_len_p90",
    # if present already, don't let collapse touch it:
    "rel_OTHER",
}

# Preserves the column order of Xy_aug
REL_COLS = [c for c in Xy_aug.columns if c not in NON_REL]

# This is just to preview what it WOULD look like.
# The preview is computed on the full data (not on a training split); the model
# cells below fit the collapsing on the training split only.
Xy_final = restr.collapse_rare_relations_df(
    Xy_aug, REL_COLS, avg_prop_min=0.01, other_col="rel_OTHER"
)

Xy_final

### Regular LogReg

In [None]:
# Using the CollapseRareRels class to collapse rare relations BASED on the training set ONLY

rare = plns.CollapseRareRels(min_docs=100, other_col="rel_OTHER", rel_cols=REL_COLS)

X_tr, X_te, y_tr, y_te = plns.prep_train_test_tabular(Xy_aug, label_col="label") # scale later separately

rare.fit(X_tr)          # fit on TRAIN ONLY
X_tr2 = rare.transform(X_tr)
X_te2 = rare.transform(X_te)

# Train and test must end up with the same columns in the same order
assert list(X_tr2.columns) == list(X_te2.columns)
print("Cols/rels that were kept:", rare.keep_cols_)            # kept relations
print("Cols/rels that were removed:", rare.collapsed_cols_)  # collapsed relations
print("")

# Keep only the columns whose variance on the TRAIN split exceeds 1e-12
# (i.e. drop constant columns) and apply the same selection to the test split.
nz = X_tr2.loc[:, X_tr2.var(axis=0) > 1e-12]
X_te2 = X_te2[nz.columns]
X_tr2 = nz

# The scaler is fitted on the training split only and then applied to both splits
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler().fit(X_tr2)
X_tr2s = scaler.transform(X_tr2)
X_te2s = scaler.transform(X_te2)

# The rest is business as usual
clf = plns.train_logreg(X_tr2s, y_tr)

report_df, cm, coef_df = plns.evaluate_classifier(
    clf,
    X_te2s,
    y_te,
    feature_names=list(X_tr2.columns),  # column order of the (filtered) training frame
)

print(report_df.round(3))
print("")
print(cm)
print("")
# 15 rows with the largest "odds_ratio" values
print(coef_df.sort_values("odds_ratio", ascending=False).head(15))

### HistGradientBoosting

In [None]:
# Same train-only collapsing of rare relations as in the LogReg cell, with its own
# objects and *_hgb variable names so the LogReg results are not overwritten.
rare_hgb = plns.CollapseRareRels(min_docs=100, other_col="rel_OTHER", rel_cols=REL_COLS)

X_tr_hgb, X_te_hgb, y_tr_hgb, y_te_hgb = plns.prep_train_test_tabular(Xy_aug, label_col="label")

rare_hgb.fit(X_tr_hgb)          # fit on TRAIN ONLY
X_tr2_hgb = rare_hgb.transform(X_tr_hgb)
X_te2_hgb = rare_hgb.transform(X_te_hgb)

# Train and test must end up with the same columns in the same order
assert list(X_tr2_hgb.columns) == list(X_te2_hgb.columns)
print("Cols/rels that were kept:", rare_hgb.keep_cols_)            # kept relations
print("Cols/rels that were removed:", rare_hgb.collapsed_cols_)  # collapsed relations
print("")

# Force every column to numeric: values that cannot be parsed become NaN, and all NaN are replaced by 0.0
X_tr2_hgb = X_tr2_hgb.apply(pd.to_numeric, errors="coerce").fillna(0.0)
X_te2_hgb = X_te2_hgb.apply(pd.to_numeric, errors="coerce").fillna(0.0)

# Keep only the columns whose variance on the TRAIN split exceeds 1e-12; same selection for the test split
kept_cols = list(X_tr2_hgb.columns[X_tr2_hgb.var(axis=0) > 1e-12])
X_tr2_hgb = X_tr2_hgb[kept_cols]
X_te2_hgb = X_te2_hgb[kept_cols]

hgb_fin = plns.train_hgb(X_tr2_hgb, y_tr_hgb, max_depth=None, learning_rate=0.06, max_iter=400)

report_df, cm, imp_df, metrics = plns.evaluate_hgb(hgb_fin, X_te2_hgb, y_te_hgb, feature_names=list(X_tr2_hgb.columns))

print(report_df.round(3))
print(cm)
print(metrics)
print(imp_df.head(15))