# Generate network hypotheses (Local Marginal Regulation/LOMAR coefficients)

In [None]:
import glob
import os
from datetime import datetime

import numpy as np
import pandas as pd
import qbiome.data_formatter
import qbiome.forecaster
import qbiome.hypothesis
import qbiome.network
import qbiome.qnet_orchestrator
import qbiome.quantizer
import qbiome.qutil
import statsmodels.stats.api as sms
from quasinet import qnet
from tqdm.notebook import tqdm

In [2]:
# Shared quantizer for the notebook: 26 levels, with its state loaded from
# the "quantizer.pkl" file. It is later handed to get_model_hyps as `quant`.
qnt = qbiome.quantizer.Quantizer(num_levels=26)
qnt.load_quantizer_states("quantizer.pkl")

In [3]:
def gen_hyp(
    quant, mod_path_dat, t_start, t_end, causal_cons, no_slf_lps, pref, hyp_dir
):
    """Generate hypotheses for one time window and write them to `hyp_dir`.

    A ``qbiome.hypothesis.Hypothesis`` is built from the tree files under
    `mod_path_dat`, configured with `causal_cons` / `no_slf_lps`, and
    evaluated between `t_start` and `t_end`. Three files are then written,
    all sharing the prefix
    ``<hyp_dir>/hyp_<pref><t_start>_<t_end>_<causal_cons>``:

    * ``<prefix>.dot``                     -- output of ``to_dot``
    * ``<prefix><timestamp>_netwk.csv``    -- output of ``to_csv``
    * ``<prefix><timestamp>_netwk.png``    -- ``outfile`` given to
      ``qbiome.network.Network``

    Args:
        quant: quantizer passed to ``Hypothesis``.
        mod_path_dat: value passed as ``model_path`` to ``Hypothesis``.
        t_start: start of the time window (``time_start``).
        t_end: end of the time window (``time_end``).
        causal_cons: value assigned to ``causal_constraint``.
        no_slf_lps: value assigned to ``no_self_loops``.
        pref: string inserted in the file names right after ``hyp_``.
        hyp_dir: output directory; created (with parents) if missing.

    Returns:
        Tuple ``(hyp_dat, ntwk_dat_img)``: the ``Hypothesis`` object and the
        path given to ``Network`` as its output image.
    """
    import pathlib

    pathlib.Path(hyp_dir).mkdir(parents=True, exist_ok=True)

    hyp_dat = qbiome.hypothesis.Hypothesis(
        quantizer=quant, model_path=mod_path_dat, detailed_labels=True
    )
    hyp_dat.causal_constraint = causal_cons
    hyp_dat.no_self_loops = no_slf_lps
    hyp_dat.get(time_start=t_start, time_end=t_end)

    # Common path prefix of every artefact written by this call.
    stem = f"{hyp_dir}/hyp_{pref}{t_start}_{t_end}_{causal_cons}"
    # Taken once so the CSV and PNG of the same call carry the same
    # timestamp and can be matched up by file name.
    stamp = str(datetime.now())

    h_dat_dotfile = stem + ".dot"
    hyp_dat.to_dot(h_dat_dotfile)
    hyp_dat.to_csv(stem + stamp + "_netwk.csv")

    ntwk_dat_img = stem + stamp + "_netwk.png"
    network_dat = qbiome.network.Network(h_dat_dotfile, outfile=ntwk_dat_img)
    network_dat.get()

    return hyp_dat, ntwk_dat_img

In [4]:
def _export_trees(qn, qns, tree_dir):
    """Export every per-feature tree of two qnets as graphviz ``.dot`` files.

    Trees of `qn` are written to ``<tree_dir>/ahcg/`` and trees of `qns` to
    ``<tree_dir>/shcg/``; each file is named ``<feature_name>.dot``. Both
    directories are created (with parents) before any export starts.
    """
    import pathlib

    targets = (
        (qn, tree_dir + "/ahcg/"),
        (qns, tree_dir + "/shcg/"),
    )

    for _, out_dir in targets:
        pathlib.Path(out_dir).mkdir(parents=True, exist_ok=True)

    for model, out_dir in targets:
        for position, name in enumerate(model.feature_names):
            dot_path = os.path.join(out_dir, "{}.dot".format(name))
            qnet.export_qnet_tree(
                model,
                position,
                dot_path,
                outformat="graphviz",
                detailed_output=True,
            )

In [5]:
def get_model_hyps(
    times,
    causal_cons,
    no_slf_lps,
    appr_df=None,
    sub_df=None,
    quantized=False,
    quant=None,
    remove_n=5,
):
    """Train one qnet per cohort on a random subsample and emit hypotheses.

    For each of `appr_df` and `sub_df`, `remove_n` randomly chosen rows are
    dropped, a qnet is trained on the remainder, and the trees of both
    models are exported under ``trees/<timestamp>/{ahcg,shcg}/``. For every
    ``(start, end)`` pair in `times`, ``gen_hyp`` is then run once per model,
    writing into ``hyps/<timestamp>/``.

    Args:
        times: iterable of ``(start, end)`` pairs; element 0 and 1 of each
            pair are forwarded to ``gen_hyp`` as ``t_start`` / ``t_end``.
        causal_cons: forwarded to ``gen_hyp``.
        no_slf_lps: forwarded to ``gen_hyp``.
        appr_df: data frame used to train the "ahcg" model. Required.
        sub_df: data frame used to train the "shcg" model. Required.
        quantized: if truthy, the frames are taken to be already quantized
            and in long format (columns ``subject_id``, ``variable``,
            ``week``, ``value``) and are only pivoted to wide; otherwise
            they are passed through ``quantizer.quantize_df`` first.
        quant: quantizer to use; if ``None`` a 26-level one is built from
            the concatenation of both frames.
        remove_n: number of rows dropped (without replacement) from each
            frame before training.

    Raises:
        ValueError: if `appr_df` or `sub_df` is ``None``.
    """

    def _get_labels(num_lvls):
        """Map ``num_lvls`` labels (A..Z, AA..ZZ, AAA.., ...) to 0-based indices."""
        import string

        letters = list(string.ascii_uppercase)
        lbls = list(letters)
        for repeat in range(2, int(np.ceil(num_lvls / 26)) + 1):
            lbls += [char * repeat for char in letters]
        return {lbl: idx for idx, lbl in enumerate(lbls[:num_lvls])}

    def _get_quantizer(data, num_levels=26):
        """Construct a quantizer from provided `data`."""
        quantizer = qbiome.quantizer.Quantizer(num_levels=num_levels)
        if num_levels > 26:
            quantizer.labels = _get_labels(num_levels)
        # The returned quantized frame is not needed here.
        # NOTE(review): presumably this call is what fits the quantizer's
        # internal state -- confirm against the qbiome documentation.
        quantizer.quantize_df(data)
        return quantizer

    def _long_to_wide(df):
        """Pivot a long frame to one row per subject, one column per ``<variable>_<week>``."""
        df_ = pd.concat(
            [df.subject_id, df.variable + "_" + df.week.astype(str), df.value],
            axis=1,
        ).rename(columns={0: "variable"})
        return df_.pivot(index="subject_id", columns="variable")[
            "value"
        ].reset_index()

    def _get_qnet(
        data=None,
        quantized=False,
        num_levels=26,
        quantizer=None,
        alpha=0.3,
        min_samples_split=2,
    ):
        """Compute qnet from provided data."""
        # The previous guard read a local that was not yet assigned and so
        # raised UnboundLocalError instead of a meaningful error.
        if data is None:
            raise ValueError("data must be provided.")
        if quantizer is None:
            quantizer = qbiome.quantizer.Quantizer(num_levels=num_levels)
        if num_levels > 26:
            quantizer.labels = _get_labels(num_levels)
        orchestrator = qbiome.qnet_orchestrator.QnetOrchestrator(quantizer)
        if quantized:
            data_quantized = _long_to_wide(data)
        else:
            # Previously this path left `data_quantized` unassigned and
            # crashed on the next line.
            # NOTE(review): quantize_df may update the (shared) quantizer's
            # internal state -- confirm this is acceptable for raw input.
            data_quantized = quantizer.quantize_df(data)
        features, label_matrix = quantizer.get_qnet_inputs(data_quantized)
        orchestrator.train_qnet(
            features,
            label_matrix,
            alpha=alpha,
            min_samples_split=min_samples_split,
        )
        return orchestrator.model

    if appr_df is None or sub_df is None:
        raise ValueError("Both appr_df and sub_df must be provided.")

    if quant is None:
        quant = _get_quantizer(pd.concat([appr_df, sub_df]))

    # Random row subsample: drop `remove_n` index labels from each cohort.
    drop_indices_a = np.random.choice(appr_df.index, remove_n, replace=False)
    drop_indices_s = np.random.choice(sub_df.index, remove_n, replace=False)

    ahcg = _get_qnet(
        data=appr_df.drop(drop_indices_a),
        quantized=quantized,
        quantizer=quant,
    )
    shcg = _get_qnet(
        data=sub_df.drop(drop_indices_s),
        quantized=quantized,
        quantizer=quant,
    )

    # One timestamp per call keeps the tree and hypothesis directories of
    # the same run under matching names.
    run_stamp = str(datetime.now())
    tree_dir = "trees/" + run_stamp
    hyp_dir = "hyps/" + run_stamp
    _export_trees(ahcg, shcg, tree_dir)

    for t in times:
        for cohort in ("ahcg", "shcg"):
            gen_hyp(
                quant,
                tree_dir + "/" + cohort + "/",
                t[0],
                t[1],
                causal_cons,
                no_slf_lps,
                cohort,
                hyp_dir,
            )

# Generate hypotheses

In [6]:
# Settings forwarded to get_model_hyps (and from there to gen_hyp).
causal_constraint = -1
no_self_loops = False
# (start, end) pairs; each pair is processed once per cohort model per run.
times = [(27, 29), (30, 32)]
# Passed as remove_n: number of randomly chosen rows dropped from each
# cohort frame before training in every run.
drop_n = 15
# Number of repeated calls to get_model_hyps; also reused by the
# summarise_df call in a later cell.
n_runs = 2

# Cohort tables; they are passed with quantized=True below, i.e. treated as
# already quantized long-format data.
appr_df = pd.read_csv("uchicago_appropriate_cohort_quantized_lng.csv")
sub_df = pd.read_csv("uchicago_suboptimal_cohort_quantized_lng.csv")

# Each iteration draws a fresh random subsample inside get_model_hyps and
# writes its output under new timestamped trees/ and hyps/ directories.
for i in tqdm(range(n_runs)):
    get_model_hyps(times, causal_constraint, no_self_loops, appr_df=appr_df, sub_df=sub_df, quantized=True, quant=qnt, remove_n=drop_n)

  0%|          | 0/2 [00:00<?, ?it/s]

100%|██████████| 91/91 [00:26<00:00,  3.48it/s]
100%|██████████| 91/91 [00:23<00:00,  3.93it/s]
100%|██████████| 91/91 [00:23<00:00,  3.82it/s]
100%|██████████| 91/91 [00:27<00:00,  3.34it/s]
100%|██████████| 91/91 [00:26<00:00,  3.47it/s]
100%|██████████| 91/91 [00:22<00:00,  3.96it/s]
100%|██████████| 91/91 [00:23<00:00,  3.82it/s]
100%|██████████| 91/91 [00:27<00:00,  3.33it/s]


# Read in hypotheses; aggregate/summarize

In [7]:
# For each cohort/time-window tag, stack every matching hypothesis CSV found
# anywhere under "hyps" (recursive glob) into a single frame.
df_a27, df_a30, df_s27, df_s30 = (
    pd.concat(
        [
            pd.read_csv(csv_path)
            for csv_path in glob.glob(
                "hyps" + "/**/*{}*.csv".format(tag), recursive=True
            )
        ]
    )
    for tag in ("ahcg27_29", "ahcg30_32", "shcg27_29", "shcg30_32")
)

def summarise_df(df, n_runs, conf=0.95):
    """Aggregate the ``lomar`` column per ``(src, tgt, time_tgt)`` group.

    Within each group the values are renumbered from 0 and reindexed to
    exactly ``range(n_runs)`` before any statistic is taken: positions with
    no value are filled with 0, and values beyond the first `n_runs` are
    dropped.

    Returns a frame with the group keys plus the columns ``mean``, ``var``,
    ``median`` and ``ci``, where ``ci`` is the result of
    ``DescrStatsW(...).tconfint_mean(alpha=1 - conf)``.
    """

    def _per_run(values):
        # Force exactly n_runs slots, zero-filling the missing ones.
        renumbered = values.reset_index(drop=True)
        return renumbered.reindex(range(n_runs), fill_value=0)

    grouped = df.groupby(["src", "tgt", "time_tgt"], as_index=False)
    return grouped.agg(
        mean=pd.NamedAgg(
            column="lomar",
            aggfunc=lambda s: _per_run(s).mean(),
        ),
        var=pd.NamedAgg(
            column="lomar",
            aggfunc=lambda s: _per_run(s).var(),
        ),
        median=pd.NamedAgg(
            column="lomar",
            aggfunc=lambda s: _per_run(s).median(),
        ),
        ci=pd.NamedAgg(
            column="lomar",
            aggfunc=lambda s: sms.DescrStatsW(_per_run(s)).tconfint_mean(
                alpha=1 - conf
            ),
        ),
    )

In [8]:
# Summarise the frame loaded from the "ahcg27_29" hypothesis files and show
# the groups ordered by their mean value, largest first.
summarise_df(df_a27, n_runs).sort_values(by="mean", ascending=False)

Unnamed: 0,src,tgt,time_tgt,mean,var,median,ci
60,Coriobacteriia,Clostridia,28.0,86.258645,1.488111e+04,86.258645,"(-1009.7613627689717, 1182.2786534714637)"
143,unclassified_Proteobacteria,Clostridia,28.0,60.212059,7.250984e+03,60.212059,"(-704.8546951529498, 825.2788139895696)"
58,Coriobacteriia,Actinobacteria,28.0,8.194095,1.342864e+02,8.194095,"(-95.92175236366484, 112.30994213539108)"
139,unclassified_Proteobacteria,Actinobacteria,28.0,5.743401,6.597330e+01,5.743401,"(-67.23342523380732, 78.72022676301059)"
103,Negativicutes,Gammaproteobacteria,28.0,2.846680,2.138135e-05,2.846680,"(2.8051348654683985, 2.8882247904175924)"
...,...,...,...,...,...,...,...
84,Gammaproteobacteria,unclassified_Bacteria,27.0,-0.111976,6.045262e-07,-0.111976,"(-0.11896199484139601, -0.1049906431832192)"
70,Gammaproteobacteria,Alphaproteobacteria,27.0,-0.116076,9.167810e-07,-0.116076,"(-0.1246781975815983, -0.10747284200047681)"
77,Gammaproteobacteria,Clostridia,29.0,-0.343115,3.166840e-05,-0.343115,"(-0.3936760597603646, -0.2925544682805387)"
102,Negativicutes,Bacteroidia,29.0,-0.683070,3.509337e-04,-0.683070,"(-0.8513811236205734, -0.5147584114498466)"
