In [1]:
from truthnet import truthnet
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.stats.api as sms
from tqdm.notebook import tqdm
import tikzplotlib as tpl
from datetime import datetime
from IPython.display import display, HTML
import glob
from zedstat import zedstat

from sklearn.linear_model import LogisticRegression
import lightgbm as lgb
import xgboost as xgb
from sklearn.ensemble import (
    ExtraTreesClassifier,
    RandomForestClassifier,
)
from sklearn.metrics import (
    PrecisionRecallDisplay,
    RocCurveDisplay,
    auc,
    mean_squared_error,
    precision_recall_curve,
    r2_score,
    roc_curve,
    roc_auc_score,
)
from sklearn.model_selection import train_test_split
import optuna

# optuna.logging.set_verbosity(optuna.logging.WARNING)

In [2]:
from quasinet.qsampling import qsample
from scipy.stats import entropy
import random
from quasinet.utils import sample_from_dict


def _get_qnet(df):
    """Fit a quasinet ``Qnet`` on all columns of ``df`` and return the fitted model.

    The frame is converted to a fixed-width unicode (``<U21``) array before
    fitting, so every cell is handed to the Qnet as a text label. Column names
    become the model's ``feature_names``.
    """
    from quasinet import qnet

    qnet_settings = dict(
        feature_names=df.columns.values,
        min_samples_split=2,
        alpha=0.05,
        max_depth=-1,
        max_feats=-1,
        early_stopping=False,
        verbose=0,
        random_state=None,  # unseeded: repeated fits on the same data may differ
        n_jobs=-1,
    )
    model = qnet.Qnet(**qnet_settings)

    training_matrix = df.to_numpy(dtype="<U21")
    model.fit(training_matrix)

    return model


def _get_tnets(
    df,
    df_pos=None,
    df_neg=None,
    diss_file=None,
    suspects=None,
):
    """Fit Qnets on the full, "positive" and "negative" data and return them.

    Two modes:

    * ``df_pos`` and ``df_neg`` both given: the caller supplies the split. If
      ``suspects`` is also given, truthnet models are fitted on the full and
      positive data and their suspect lists are written to CSV first.
    * otherwise: a truthnet model fitted on ``df`` is used to pick "core"
      samples; those rows become the positive set and all other rows the
      negative set.

    Parameters
    ----------
    df, df_pos, df_neg : pandas.DataFrame
        Response tables; NaN marks a missing answer.
    diss_file : str or None
        Path of a pre-computed dissonance matrix CSV. When None the matrix is
        computed via ``getDissonance``.
    suspects : str or None
        File-name prefix for the suspect CSVs (written for alpha 0.01, 0.05
        and 0.1). When None no suspect files are written.

    Returns
    -------
    dict
        ``"all"``, ``"pos"``, ``"neg"`` -> fitted Qnets (from ``_get_qnet``);
        ``"data"`` -> ``df`` restricted to the retained columns, with missing
        values rendered as empty strings.

    Side effects: writes temporary CSV / joblib files into the working
    directory and (when dissonance is computed) into ``data/``.
    """
    if df_pos is not None and df_neg is not None:
        # Keep only columns that have at least one non-null value in each of
        # the three frames.
        non_null_cols = (
            (df.isna().sum() < len(df))
            & (df_pos.isna().sum() < len(df_pos))
            & (df_neg.isna().sum() < len(df_neg))
        )

        # NaN -> -9 -> int cast -> "" : -9 is a temporary sentinel so the cast
        # to int succeeds; missing cells end up as empty strings.
        # NOTE(review): a genuine -9 in the data would also be blanked.
        df_fmtd = df.loc[:, non_null_cols].fillna(-9).astype(int).replace(-9, "")
        df_pos_fmtd = (
            df_pos.loc[:, non_null_cols].fillna(-9).astype(int).replace(-9, "")
        )
        df_neg_fmtd = (
            df_neg.loc[:, non_null_cols].fillna(-9).astype(int).replace(-9, "")
        )
        # truthnet loads its data from a path, hence the temporary CSVs.
        df_fmtd.to_csv("df_tmp.csv", index=False)
        df_pos_fmtd.to_csv("df_pos_tmp.csv", index=False)
        df_neg_fmtd.to_csv("df_neg_tmp.csv", index=False)

        if suspects is not None:
            Tr = truthnet()
            Tr_pos = truthnet()
            features, samples = Tr.load_data(datapath="df_tmp.csv")
            features_pos, samples_pos = Tr_pos.load_data(datapath="df_pos_tmp.csv")
            Tr.fit(modelpath="tmp_Qnet.joblib")
            Tr_pos.fit(modelpath="tmp_Qnet_pos.joblib")
            if diss_file is not None:
                # NOTE(review): only the full model receives a dissonance
                # matrix here; Tr_pos gets none before getSuspects is called
                # below -- confirm truthnet computes it on demand.
                Tr.dissonance = pd.read_csv(diss_file)
            else:
                Tr.getDissonance(outfile="data/tmp_dissonance_matrix.csv")
                Tr_pos.getDissonance(outfile="data/tmp_dissonance_matrix_pos.csv")
            for alpha in [0.01, 0.05, 0.1]:
                Tr.getSuspects(alpha=alpha).to_csv(f"{suspects}_full_model_{alpha}.csv")
                Tr_pos.getSuspects(alpha=alpha).to_csv(
                    f"{suspects}_pos_model_{alpha}.csv"
                )

        # From here on Tr / Tr_pos are rebound to plain Qnets; the truthnet
        # objects above were only needed for the suspect CSVs.
        Tr = _get_qnet(df_fmtd)
        Tr_pos = _get_qnet(df_pos_fmtd)
        Tr_neg = _get_qnet(df_neg_fmtd)
    else:
        Tr = truthnet()
        # Number of columns of df that are not entirely null.
        length = sum(df.isna().sum() < len(df))
        # Unlike the branch above, all columns of df are written here.
        df.fillna(-9).astype(int).replace(-9, "").to_csv("tmpfile.csv", index=False)
        features, samples = Tr.load_data(datapath="tmpfile.csv")
        Tr.fit(modelpath="tmp_Qnet.joblib")
        if diss_file is not None:
            Tr.dissonance = pd.read_csv(diss_file)
        else:
            Tr.getDissonance(outfile="data/tmp_dissonance_matrix.csv")
        if suspects is not None:
            for alpha in [0.01, 0.05, 0.1]:
                Tr.getSuspects(alpha=alpha).to_csv(f"{suspects}_{alpha}.csv")
        # presumably the rows truthnet considers non-suspect -- TODO confirm
        # against the truthnet documentation.
        coresamples = Tr.getCoresamples(alpha=0.01, steps=length)
        if len(coresamples) == len(df):
            # Every row was returned as a core sample, which would leave the
            # negative set empty; fall back to the rows whose mean dissonance
            # is below the 99th percentile.
            mean_dissonance = pd.DataFrame(
                data=Tr.dissonance.mean(axis=1), columns=["mean_dissonance"]
            )
            coresamples = mean_dissonance.query(
                "mean_dissonance < mean_dissonance.quantile(0.99)"
            )
        # Core rows form the positive set, everything else the negative set.
        df_pos = df.loc[coresamples.index.values]
        df_neg = df.loc[~df.index.isin(coresamples.index.values)]

        # Same column filter as in the first branch, applied after the split.
        non_null_cols = (
            (df.isna().sum() < len(df))
            & (df_pos.isna().sum() < len(df_pos))
            & (df_neg.isna().sum() < len(df_neg))
        )

        Tr = _get_qnet(df.loc[:, non_null_cols].fillna(-9).astype(int).replace(-9, ""))
        Tr_neg = _get_qnet(
            df_neg.loc[:, non_null_cols].fillna(-9).astype(int).replace(-9, "")
        )
        Tr_pos = _get_qnet(
            df_pos.loc[:, non_null_cols].fillna(-9).astype(int).replace(-9, "")
        )

    return {
        "all": Tr,
        "pos": Tr_pos,
        "neg": Tr_neg,
        "data": df.loc[:, non_null_cols].fillna(-9).astype(int).replace(-9, ""),
    }


def _diss_linear(s, qnet, missing_value=0):
    diss = list()
    Ds = qnet.predict_distributions(s)

    for i in range(len(s)):
        if s[i] != "":
            if s[i] in Ds[i].keys():
                diss.append(1 - Ds[i][s[i]] / np.max(list(Ds[i].values())))
            elif s[i] == "missing":
                diss.append(missing_value)
            else:
                diss.append(1)

    return np.array(diss)


def _diss_log(s, qnet, missing_value=0):
    diss = list()
    Ds = qnet.predict_distributions(s)

    for i in range(len(s)):
        if s[i] != "":
            if s[i] in Ds[i].keys():
                diss.append(-np.log(Ds[i][s[i]]))
            elif s[i] == "missing":
                diss.append(missing_value)
            else:
                diss.append(np.inf)
        # else:
        # diss.append(missing_value)

    return np.array(diss)


def _actual_sample_dissonance(
    data_sample, diss_models, diss_fcn, order, length, missing_value=0
):
    if order is None:
        order = range(length)

    sample = np.full(length, "", dtype="<U21")

    diss = [list() for model in diss_models]

    # print(data_sample)

    for i in order:
        if data_sample[i] == "":
            sample[i] = "missing"
        else:
            sample[i] = data_sample[i]
        # [print(diss_fcn(sample, model)) for d, model in zip(diss, diss_models)]
        for d, model in zip(diss, diss_models):
            d.append(diss_fcn(sample, model, missing_value))
        # [d.append(diss_fcn(sample, model)) for d, model in zip(diss, diss_models)]

    return sample, diss


def _all_actual_samples_dissonance(
    data_samples, diss_models, diss_fcn, order, length, missing_value=0
):
    """Run ``_actual_sample_dissonance`` on every row of ``data_samples``.

    Shows a tqdm progress bar while iterating. Returns two parallel lists:
    the revealed samples and their per-model dissonance histories, in the
    same order as ``data_samples``.
    """
    results = [
        _actual_sample_dissonance(
            row, diss_models, diss_fcn, order, length, missing_value
        )
        for row in tqdm(data_samples)
    ]

    samples = [revealed for revealed, _ in results]
    dissonances = [history for _, history in results]

    return samples, dissonances


def _sample_with_dissonance(
    sample_model,
    length,
    diss_models,
    diss_fcn=_diss_linear,
    order=None,
    data_samples=None,
):
    """Generate one synthetic sample position by position, scoring it after each draw.

    Parameters
    ----------
    sample_model : object or None
        Model exposing ``predict_distribution(sample, i)``; each position is
        drawn from the returned probability dict. When None, each position is
        drawn uniformly from the distinct non-empty values observed in the
        corresponding column of ``data_samples``.
    length : int
        Number of positions in a sample.
    diss_models : sequence
        Models the partially filled sample is scored against after every draw.
    diss_fcn : callable
        Called as ``diss_fcn(sample, model)``.
    order : iterable of int or None
        Positions to fill, in order; ``0 .. length-1`` when None.
    data_samples : array-like or None
        Observed samples; only used for uniform sampling.

    Returns
    -------
    (sample, diss)
        ``sample`` is the generated ``<U21`` array; ``diss`` holds one list per
        model with one ``diss_fcn`` result per filled position.

    Raises
    ------
    ValueError
        If both ``sample_model`` and ``data_samples`` are None, since there is
        then nothing to draw values from.
    """
    if sample_model is None and data_samples is None:
        raise ValueError(
            "data_samples is required when sample_model is None (uniform sampling)"
        )

    if order is None:
        order = range(length)

    candidate_values = None
    if sample_model is None:
        data_samples_df = pd.DataFrame(data_samples)
        # One list of candidate values per column, kept in column order so it
        # can be indexed by position regardless of the column labels.
        candidate_values = [
            [x for x in data_samples_df[col].unique() if x != ""]
            for col in data_samples_df
        ]

    # Explicit fixed-width dtype: np.full(length, "") would infer "<U1" and
    # silently truncate every sampled value to its first character. "<U21"
    # matches the buffer used by _actual_sample_dissonance.
    sample = np.full(length, "", dtype="<U21")

    diss = [[] for _ in diss_models]

    for i in order:
        if sample_model is not None:
            prob_dict = sample_model.predict_distribution(sample, i)
            sample[i] = sample_from_dict(prob_dict)
        else:
            sample[i] = random.choice(candidate_values[i])
        for d, model in zip(diss, diss_models):
            d.append(diss_fcn(sample, model))

    return sample, diss


def _nsamples_with_dissonance(
    n_samples,
    sample_model,
    length,
    diss_models,
    diss_fcn=_diss_linear,
    order=None,
    data_samples=None,
):
    """Generate ``n_samples`` synthetic samples via ``_sample_with_dissonance``.

    Shows a tqdm progress bar. Returns two parallel lists: the generated
    samples and their per-model dissonance histories.
    """
    draws = [
        _sample_with_dissonance(
            sample_model,
            length,
            diss_models,
            diss_fcn,
            order,
            data_samples,
        )
        for _ in tqdm(range(n_samples))
    ]

    samples = [generated for generated, _ in draws]
    dissonances = [history for _, history in draws]

    return samples, dissonances


def _dissonance_data_at_question(dissonances, questions_asked):
    return np.array(
        [np.hstack([d[questions_asked - 1] for d in diss]) for diss in dissonances]
    )


# generate samples under the given models and compute dissonances under specified diss_models
def _sampling_scenario(
    n_qsamples,
    qsample_model,
    n_m2_samples,
    m2_model,
    diss_models,
    length,
    n_runif_samples=None,
    diss_fcn=_diss_linear,
    order=None,
    data_samples=None,
    missing_value=0,
):
    """Generate several groups of samples and score each under ``diss_models``.

    Groups produced (keys of both returned dicts):

    * ``"actual"``   -- the observed rows of ``data_samples`` (only if given)
    * ``"qsampled"`` -- ``n_qsamples`` samples drawn from ``qsample_model``
    * ``"m2"``       -- ``n_m2_samples`` samples drawn from ``m2_model``
    * ``"runif"``    -- ``n_runif_samples`` samples drawn uniformly from the
      observed values (only if ``n_runif_samples`` is not None)

    Parameters
    ----------
    order : None, "entropy", "random", or sequence of int
        Order in which positions are filled. ``"entropy"`` sorts positions by
        increasing entropy of ``qsample_model``'s distributions for an
        all-empty sample; ``"random"`` shuffles the positions once, and that
        one order is shared by every group; None means ``0 .. length-1``.
    data_samples : pandas.DataFrame or None
        Observed samples (converted with ``to_numpy()`` for the "actual" group).
    missing_value : number
        Forwarded to ``diss_fcn`` for the "actual" group only.

    Returns
    -------
    (samples, dissonances) : tuple of dict

    Raises
    ------
    ValueError
        If ``order`` is a string other than ``"entropy"`` or ``"random"``.
    """
    samples = {}
    dissonances = {}

    # Resolve named orderings. The isinstance guard matters: comparing an
    # array-like order to a string is element-wise and cannot be used as an
    # `if` condition -- which is exactly what happened right after "entropy"
    # had been replaced by a pandas Index.
    if isinstance(order, str):
        if order == "entropy":
            Ds = qsample_model.predict_distributions(np.full(length, ""))
            entropies = [
                entropy(np.fromiter(dist.values(), dtype=float)) for dist in Ds
            ]
            order = pd.Series(entropies).sort_values().index.tolist()
        elif order == "random":
            order = list(range(length))
            random.shuffle(order)
        else:
            raise ValueError(
                f'order must be None, "entropy", "random" or a sequence of '
                f"positions, got {order!r}"
            )

    if data_samples is not None:
        samples["actual"], dissonances["actual"] = _all_actual_samples_dissonance(
            data_samples.to_numpy(), diss_models, diss_fcn, order, length, missing_value
        )

    samples["qsampled"], dissonances["qsampled"] = _nsamples_with_dissonance(
        n_qsamples, qsample_model, length, diss_models, diss_fcn, order
    )

    samples["m2"], dissonances["m2"] = _nsamples_with_dissonance(
        n_m2_samples, m2_model, length, diss_models, diss_fcn, order
    )

    if n_runif_samples is not None:
        # sample_model=None selects uniform sampling from the observed values.
        samples["runif"], dissonances["runif"] = _nsamples_with_dissonance(
            n_runif_samples,
            None,
            length,
            diss_models,
            diss_fcn,
            order,
            data_samples,
        )

    return samples, dissonances


def _diss_dataset(dissonances, questions_asked, groups=["qsampled", "m2"]):
    """Stack the per-group dissonance features into one labelled DataFrame.

    For each name in ``groups`` the feature matrix from
    ``_dissonance_data_at_question`` is wrapped in a DataFrame and given a
    column ``y`` holding the group name; the frames are then concatenated
    (original row indices are kept, so they repeat across groups).
    """
    labelled_frames = []
    for group in groups:
        features = _dissonance_data_at_question(dissonances[group], questions_asked)
        labelled_frames.append(pd.DataFrame(features).assign(y=group))

    return pd.concat(labelled_frames)


def _save_sampling_scenarios(
    iter,
    name,
    df,
    n_qsamples,
    n_m2_samples,
    df_pos=None,
    df_neg=None,
    diss_file=None,
    diss_fcn=_diss_linear,
    order=None,
    n_runif_samples=None,
    invert_pos=False,
    missing_value=0,
):
    """Run a full fit-and-sample scenario once per element of ``iter`` and pickle the results.

    For every run id in ``iter`` (note: the parameter name shadows the builtin)
    the models are refitted with ``_get_tnets``, samples and dissonances are
    produced with ``_sampling_scenario``, and both dicts are written to
    ``{name}_{order}_order_samples_{id}.pkl`` and
    ``{name}_{order}_order_disson_{id}.pkl``.

    ``invert_pos`` swaps the roles of the "pos" and "neg" models; it only takes
    effect when it is exactly ``True``. Dissonances are always computed under
    the models in the order [full, neg, pos], and the "m2" group is sampled
    from the (possibly swapped) pos model.
    """
    for run_id in tqdm(iter):
        nets = _get_tnets(
            df=df,
            df_pos=df_pos,
            df_neg=df_neg,
            diss_file=diss_file,
            suspects=f"{name}_{order}_order_suspects_{run_id}_",
        )

        full_model = nets["all"]
        if invert_pos is True:
            pos_model, neg_model = nets["neg"], nets["pos"]
        else:
            pos_model, neg_model = nets["pos"], nets["neg"]

        scenario_samples, scenario_dissonances = _sampling_scenario(
            n_qsamples=n_qsamples,
            qsample_model=full_model,
            n_m2_samples=n_m2_samples,
            m2_model=pos_model,
            diss_models=[full_model, neg_model, pos_model],
            length=len(full_model.feature_names),
            n_runif_samples=n_runif_samples,
            diss_fcn=diss_fcn,
            order=order,
            data_samples=nets["data"],
            missing_value=missing_value,
        )

        pd.to_pickle(scenario_samples, f"{name}_{order}_order_samples_{run_id}.pkl")
        pd.to_pickle(scenario_dissonances, f"{name}_{order}_order_disson_{run_id}.pkl")

In [15]:
# Load the PTSD test table and, judging by the file names, its positive-only
# and negative-only subsets (presumably split by diagnosis -- TODO confirm).
df = pd.read_csv("data/ptsd/PTSD_cognet_test_processed.csv")
df_pos = pd.read_csv("data/ptsd/PTSD_cognet_test_processed_pos_only.csv")
df_neg = pd.read_csv("data/ptsd/PTSD_cognet_test_processed_neg_only.csv")

# Refit 10 times. Each call writes suspect CSVs whose names embed the current
# datetime, which is how the individual runs are told apart later on.
# The list of returned dicts is not stored; as the cell's last expression it
# becomes the cell output.
[
    _get_tnets(
        df=df,
        df_pos=df_pos,
        df_neg=df_neg,
        diss_file=None,
        suspects=f"pstd_suspects_dx_{datetime.now()}",
    )
    for i in tqdm(range(10))
]

In [10]:
# Distinct run timestamps found among the suspect files. Characters 53..67 of
# each matched path are the time-of-day part (HH:MM:SS.ffffff) of the datetime
# embedded in the file name: 42 characters of directory + prefix, then an
# 11-character "YYYY-MM-DD " date.
times = list(
    {s[53:68] for s in glob.glob("suspects-dx-vs-pos-model/pstd_suspects_dx_*.csv")}
)
times

['16:52:11.916375',
 '19:08:04.014408',
 '17:14:40.156313',
 '17:36:45.335135',
 '17:59:09.378503',
 '18:21:55.947332',
 '18:44:59.039389',
 '16:29:07.774240',
 '19:30:39.248756',
 '19:51:44.614126']

In [11]:
# Distinct run timestamps: characters 53..67 of each matched path are the
# time-of-day part of the datetime embedded in the file name.
times = list(
    {s[53:68] for s in glob.glob("suspects-dx-vs-pos-model/pstd_suspects_dx_*.csv")}
)

# For every significance level, compare the suspects flagged by the full model
# with those flagged by the positive-only model, run by run.
for alpha in [0.01, 0.05, 0.1]:
    suspect_frames = []
    for run, t in enumerate(times):
        for model in ["pos_model", "full_model"]:
            suspect_frames.append(
                pd.read_csv(
                    f"suspects-dx-vs-pos-model/pstd_suspects_dx_2023-09-27 {t}_{model}_{alpha}.csv"
                )
                .assign(model=model, run=run)
                .set_axis(
                    ["subject", "mean_dissonance", "model", "run"], axis="columns"
                )
            )

    dfs = pd.concat(suspect_frames)
    # Unique suspect ids per (run, model).
    dfsu = dfs.groupby(["run", "model"])["subject"].unique()

    # Iterate over the runs actually found on disk rather than a fixed 10, so
    # the cell neither fails nor silently drops runs when the count differs.
    display(
        [
            np.intersect1d(dfsu[run]["full_model"], dfsu[run]["pos_model"])
            for run in range(len(times))
        ]
    )

[array([], dtype=int64),
 array([], dtype=int64),
 array([], dtype=int64),
 array([], dtype=int64),
 array([], dtype=int64),
 array([], dtype=int64),
 array([], dtype=int64),
 array([], dtype=int64),
 array([], dtype=int64),
 array([], dtype=int64)]

[array([], dtype=int64),
 array([], dtype=int64),
 array([], dtype=int64),
 array([], dtype=int64),
 array([], dtype=int64),
 array([], dtype=int64),
 array([], dtype=int64),
 array([], dtype=int64),
 array([], dtype=int64),
 array([], dtype=int64)]

[array([], dtype=int64),
 array([], dtype=int64),
 array([], dtype=int64),
 array([], dtype=int64),
 array([], dtype=int64),
 array([], dtype=int64),
 array([], dtype=int64),
 array([], dtype=int64),
 array([], dtype=int64),
 array([], dtype=int64)]

In [None]:
# No pos/neg frames are passed, so _get_tnets derives the split itself from
# truthnet core samples and computes the dissonance matrix from scratch.
# NOTE(review): `df` is whatever an earlier cell last bound -- in file order
# that is the PTSD table; confirm before running out of order.
tnp_cr = _get_tnets(df=df, suspects="tmp_pstd_suspects_core_")

In [4]:
# Load the table under data/gibbons_global. Note that this rebinds the
# notebook-wide name `df`, which other cells use for the PTSD data.
df = pd.read_csv("data/gibbons_global/gibbons_global.csv")

# df_pos / df_neg are omitted, so _get_tnets derives the pos/neg split itself;
# the dissonance matrix is read from the given CSV instead of being recomputed.
# The suspect file prefix embeds the current datetime.
tng = _get_tnets(
    df=df,
    diss_file="mpi_tmp/global_dissonance1.csv",
    suspects=f"tmp_global_suspects_{datetime.now()}",
)

QUEUEING TASKS | :   0%|          | 0/1236 [00:00<?, ?it/s]

PROCESSING TASKS | :   0%|          | 0/1236 [00:00<?, ?it/s]

COLLECTING RESULTS | :   0%|          | 0/1236 [00:00<?, ?it/s]

In [7]:
# Number of features the "all" Qnet was fitted on, i.e. the columns that
# survived the non-null filter in _get_tnets.
len(tng["all"].feature_names)

362

In [8]:
# Same check for the "pos" Qnet; it is fitted on the same column subset, so
# the count should match the "all" model.
len(tng["pos"].feature_names)

362

In [9]:
# Formatted table returned by _get_tnets: retained columns only, with missing
# answers rendered as empty strings.
tng["data"]

Unnamed: 0,2,3,5,6,7,8,9,10,11,12,...,4590,4591,4592,4593,4594,4596,4631,4632,4634,4635
0,,,,,,,,,,,...,,,,,,,,,,
1,,,,,,1,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,3,,,,,3,,,,
3,,,,1,,,,1,,,...,,,,,,3,,,,
4,,,1,1,,,,,,,...,1,,,,,1,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1231,,,,2,,,,,,,...,3,,,,,5,,,,
1232,,,1,,,,1,,,,...,2,,,,,3,,,,
1233,,,,,,,,,1,,...,1,,,,,4,,,,
1234,,,2,,,,2,,,,...,,,,,,2,,,2,


In [11]:
# Exploratory introspection: list the attributes and methods available on the
# fitted "pos" Qnet object.
dir(tng["pos"])

['__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__slotnames__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_check_input_size',
 '_check_is_fitted',
 '_combine_distributions',
 '_map_col_to_non_leaf_nodes',
 '_parallel_fit_tree',
 '_predict_distributions_numba',
 '_predict_proba',
 'alpha',
 'clear_attributes',
 'early_stopping',
 'estimators_',
 'feature_names',
 'fit',
 'max_depth',
 'max_feats',
 'min_samples_split',
 'mix',
 'mixed',
 'n_jobs',
 'predict_distribution',
 'predict_distributions',
 'random_state',
 'verbose']