Load Jupyter (IPython) extensions

In [3]:
# autoreload: re-import edited project modules before each cell executes
# (mode 2 applies this to all imported modules).
%load_ext autoreload
%autoreload 2
# rpy2.ipython: registers the R cell/line magics so R code can be run from this notebook.
%load_ext rpy2.ipython

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
The rpy2.ipython extension is already loaded. To reload it, use:
  %reload_ext rpy2.ipython


Load Python modules

In [4]:
# Make sure the working directory is correct
import os
os.chdir('d:\\microbial_network\\microbial_network_explore')
import numpy as np
from sklearn.metrics import average_precision_score, roc_auc_score
import pandas as pd
from utils.transformation import clr_transform, alr_transform
from matplotlib import pyplot as plt
from itertools import product
from tqdm import tqdm
import pandas as pd
import seaborn as sns
from utils import simulation
import rpy2.robjects as robjects
import seaborn as sns
from utils.generalized_lotka_volterra import GeneralizedLotkaVolterra
from utils.compositional_lotka_volterra import CompositionalLotkaVolterra
from scipy.stats import ttest_rel
import utils.evaluations as ev
from utils.evaluations import correlation_score, spearman_score, precision_matrix_score, clv_score, glv_score, pcor_score, pspe_score, sparcc_score, speic_score, cclasso_score, baseline_score
from typing import List
import seaborn as sns
from utils.transformation import clr_transform, alr_transform


In [5]:
# Define function for evaluation
def evaluation(adj, abundance, evaluation_func, metrics=average_precision_score, verbose=False, f=None):
    """Score each inference method against the ground-truth network.

    Parameters
    ----------
    adj
        Ground-truth adjacency; forwarded unchanged to every scoring function.
    abundance
        Abundance data; forwarded unchanged as the first positional argument.
    evaluation_func
        Iterable of scoring callables. Each one is called as
        ``func(abundance, adj, metrics=metrics, verbose=verbose)``, must expose
        a ``_method`` attribute (used as the row label) and must return an
        iterable holding one score per metric.
    metrics
        A single metric callable or a list/tuple of metric callables. Their
        ``__name__`` values become the score column names.
    verbose
        Forwarded unchanged to every scoring function.
    f
        Optional writable file-like object. When given, the method label is
        written and flushed after each method finishes, as a progress log.

    Returns
    -------
    pandas.DataFrame
        One row per method with a ``Method`` column followed by one column
        per metric.

    Notes
    -----
    A scoring function that raises does not abort the run: the error is
    printed and that method is recorded with a score of 0 for every metric.
    Keep this in mind when averaging results, since failures look like zeros.
    """
    # Normalise once so both the column names and the failure fallback work
    # for a single callable as well as for a list/tuple of callables.
    # (The previous fallback called len() on the raw argument, which raises
    # TypeError for a bare function such as the default metric.)
    metric_list = list(metrics) if isinstance(metrics, (list, tuple)) else [metrics]
    columns = ['Method', *(metric.__name__ for metric in metric_list)]

    scores = []
    for func in evaluation_func:
        try:
            # `metrics` is forwarded exactly as received so the scoring
            # functions keep seeing the same argument type as before.
            scores.append([func._method, *func(abundance, adj, metrics=metrics, verbose=verbose)])
        except Exception as e:
            scores.append([func._method, *[0] * len(metric_list)])
            print(f"Error: {e}")
        if f:
            f.write(f"{func._method}\n")
            f.flush()

    return pd.DataFrame(scores, columns=columns)

Set up the parameters for the first simulation

In [6]:
# Set up parameters for the first simulation.
# NOTE(review): the simulation cells that follow pass literals for downsample (1),
# noise_var (0) and max_interaction_strength (1) instead of these variables, so
# changing those three values here does not alter the visible runs.
n_vertices = 50  # number of taxa (graph vertices)
avg_degree = 2  # forwarded to the graph/simulation helpers together with n_vertices
network_type = 'random'
interaction_type = 'random'
max_interaction_strength = 1
time_points = 500  # forwarded to the simulator as `time_points`
time_step = 0.01  # forwarded to the simulator as `time_step`
downsample = 1
noise_var = 1e-3

# Full method list, kept for reference:
# evaluation_func = [correlation_score, spearman_score, clv_score, glv_score, pcor_score, pspe_score, sparcc_score, speic_score, ev.dl_score, baseline_score]
# Scoring callables handed to evaluation(); each must expose a `_method` label.
evaluation_func = [correlation_score, ev.dl_score, baseline_score]
# Metric callables; their __name__ values become the score column names.
metrics = [average_precision_score, roc_auc_score]

In [7]:
%%capture
# Column layout of the results table (fixed so downstream cells see a stable schema).
result_columns = ["Method", "run", "average_precision_score", "roc_auc_score", "abs_rel"]
# Start from an empty table so result_df exists even if no run is executed.
result_df = pd.DataFrame(columns=result_columns)
repeat = 50
# Setup the random seed generator
rng = np.random.default_rng(42)

# Per-run score tables are collected here and concatenated once at the end.
# DataFrame.append was removed in pandas 2.0 and re-copied the whole table on
# every call, so growing result_df inside the loop is avoided.
score_frames = []

# The context manager guarantees the progress log is flushed and closed even
# when a simulation or scoring step raises.
with open("data\\results\\log", "w") as f:
    for i in range(repeat):
        seed = rng.integers(0, 2**32 - 1)
        adj, M = simulation.gen_graph(n_vertices, avg_degree, network_type, interaction_type, max_interaction_strength=1, seed=seed)
        z, x, abd, _, _ = simulation.simulate_glv(
            time_points=time_points,
            time_step=time_step,
            downsample=1,
            noise_var=0,
            adj=adj,
            M=M,
        )
        f.write(f'iter:{i}\n')

        # Score the same simulated run twice: on `z` (labelled "Absolute")
        # and on `abd` (labelled "Relative"), in that order.
        for data, label in ((z, "Absolute"), (abd, "Relative")):
            score_df = evaluation(adj, data, evaluation_func, metrics=metrics, verbose=False, f=f)
            score_df["run"] = i
            score_df["abs_rel"] = label
            score_frames.append(score_df)

if score_frames:
    # reindex restores the declared column order (evaluation() puts the
    # metric columns before "run").
    result_df = pd.concat(score_frames, ignore_index=True).reindex(columns=result_columns)

# result_df.to_csv("data\\results\\simulation_results_fixed_interaction.csv", index=False)
# result_df = pd.read_csv("data\\temp_results\\simulation_results.csv")

In [8]:
# Print the mean ROC-AUC, then the mean average precision, for each method.
# The mean is taken over every row of result_df carrying that method label.
# To report another method (e.g. "gLV"), add its label to the tuple below.
for method_label in ("Attention", "Baseline"):
    method_rows = result_df["Method"] == method_label
    for score_column in ("roc_auc_score", "average_precision_score"):
        print(result_df.loc[method_rows, score_column].mean())

0.5012054468085105
0.042884843356203836
0.4927114893617021
0.042441699746670045


In [9]:
from scipy.stats import wilcoxon

# Paired Wilcoxon signed-rank test of Baseline against Attention, first on
# ROC-AUC and then on average precision. Pairing relies on both methods'
# rows appearing in the same order in result_df.
# To compare against another method (e.g. "gLV"), select its rows the same way.
baseline_rows = result_df.loc[result_df["Method"] == "Baseline"]
attention_rows = result_df.loc[result_df["Method"] == "Attention"]
for score_column in ("roc_auc_score", "average_precision_score"):
    print(wilcoxon(baseline_rows[score_column], attention_rows[score_column]))

WilcoxonResult(statistic=1910.0, pvalue=0.03446663167697859)
WilcoxonResult(statistic=2369.0, pvalue=0.5916968660875195)


In [6]:
%%capture
# Column layout of the results table (fixed so the CSV schema is stable).
result_columns = ["Method", "run", "average_precision_score", "roc_auc_score"]
# Start from an empty table so result_df exists even if no run is executed.
result_df = pd.DataFrame(columns=result_columns)
repeat = 50
# Setup the random seed generator
rng = np.random.default_rng(42)

# Collect one score table per run and concatenate once after the loop;
# DataFrame.append was removed in pandas 2.0 and copied the table on every call.
score_frames = []

for i in range(repeat):
    seed = rng.integers(0, 2**32 - 1)
    abd, adj, M = simulation.simulate_noiseless_glv(
        num_taxa=n_vertices,
        avg_degree=avg_degree,
        time_points=time_points,
        downsample=1,
        seed=seed,
    )
    score_df = evaluation(adj, abd, evaluation_func, metrics=metrics, verbose=False)
    score_df["run"] = i
    score_frames.append(score_df)

if score_frames:
    # reindex restores the declared column order.
    result_df = pd.concat(score_frames, ignore_index=True).reindex(columns=result_columns)
result_df.to_csv("data\\results\\noiseless_results.csv", index=False)

Test time points

In [13]:
# Set up parameters for the time-points sweep.
# NOTE(review): the sweep cell that follows passes only num_taxa, avg_degree,
# time_points, time_step, seed and the literals downsample=1 / noise_var=0 to the
# simulator; network_type, interaction_type, max_interaction_strength, downsample
# and noise_var defined here are not forwarded.
n_vertices = 5
avg_degree = 2
network_type = 'random'
interaction_type = 'random'
max_interaction_strength = 1
# Swept values: here time_points is a list (it is a scalar in other sections).
time_points = [100, 200, 500, 1000, 2000, 3000, 5000]
time_step = 0.01
downsample = 1
noise_var = 1e-3

# Full method list, kept for reference:
# evaluation_func = [correlation_score, spearman_score, clv_score, glv_score, pcor_score, pspe_score, sparcc_score, speic_score, baseline_score]
# Scoring callables handed to evaluation(); each must expose a `_method` label.
evaluation_func = [correlation_score, spearman_score, clv_score, glv_score, baseline_score]
# Metric callables; their __name__ values become the score column names.
metrics = [average_precision_score, roc_auc_score]

In [14]:
%%capture
# Column layout of the results table (fixed so the CSV schema is stable).
result_columns = ["Method", "run", "average_precision_score", "roc_auc_score", "time_points"]
# Initialize an empty DataFrame so result_df exists even if nothing is simulated.
result_df = pd.DataFrame(columns=result_columns)
# Set the number of times the simulation will be run.
repeat = 50
# Setup the random seed generator, initializing it with seed 42.
rng = np.random.default_rng(42)

# Collect one score table per run; DataFrame.append was removed in pandas 2.0
# and copied the whole table on every call.
score_frames = []

for time_point in time_points:
    for i in range(repeat):
        seed = rng.integers(0, 2**32 - 1)
        z, x, abd, adj, M = simulation.simulate_glv(
            num_taxa=n_vertices,
            avg_degree=avg_degree,
            time_points=time_point,
            time_step=time_step,
            downsample=1,
            noise_var=0,
            seed=seed,
        )

        score_df = evaluation(adj, z, evaluation_func, metrics=metrics, verbose=False)
        score_df["run"] = i
        score_df["time_points"] = time_point
        score_frames.append(score_df)

    # Checkpoint after every swept value so partial results survive a crash.
    if score_frames:
        result_df = pd.concat(score_frames, ignore_index=True).reindex(columns=result_columns)
    result_df.to_csv("data\\temp_results\\vary_timepoints.csv", index=False)

Test time interval

In [11]:
# Set up parameters for the time-interval sweep.
# NOTE(review): the sweep cell that follows derives the number of time points
# from total_time / step, so the scalar time_points below is not used by it.
# network_type, interaction_type, max_interaction_strength, downsample and
# noise_var are likewise not forwarded (the cell passes downsample=1, noise_var=0).
n_vertices = 5
avg_degree = 2
network_type = 'random'
interaction_type = 'random'
max_interaction_strength = 1
time_points = 500
total_time = 50  # held fixed; the number of time points is total_time / step
# time_steps = [0.0001, 0.001, 0.01, 0.1, 1]
time_steps = [0.001, 0.01, 0.1, 1]  # swept step sizes
downsample = 1
noise_var = 1e-3

# Full method list, kept for reference:
# evaluation_func = [correlation_score, spearman_score, clv_score, glv_score, pcor_score, pspe_score, sparcc_score, speic_score, baseline_score]
# Scoring callables handed to evaluation(); each must expose a `_method` label.
evaluation_func = [correlation_score, spearman_score, clv_score, glv_score, baseline_score]
# Metric callables; their __name__ values become the score column names.
metrics = [average_precision_score, roc_auc_score]

In [12]:
%%capture
# Column layout of the results table (fixed so the CSV schema is stable).
result_columns = ["Method", "run", "average_precision_score", "roc_auc_score", "time_interval"]
# Start from an empty table so result_df exists even if nothing is simulated.
result_df = pd.DataFrame(columns=result_columns)
repeat = 1
# Setup the random seed generator
rng = np.random.default_rng(42)

# Collect one score table per run; DataFrame.append was removed in pandas 2.0
# and copied the whole table on every call.
score_frames = []

for i in range(repeat):
    for intv in time_steps:
        seed = rng.integers(0, 2**32 - 1)
        z, x, abd, adj, M = simulation.simulate_glv(
            num_taxa=n_vertices,
            avg_degree=avg_degree,
            # Round before converting: a quotient such as 49.999999999999996
            # would otherwise be truncated and lose one time point.
            time_points=int(round(total_time / intv)),
            time_step=intv,
            downsample=1,
            noise_var=0,
            seed=seed,
        )

        score_df = evaluation(adj, z, evaluation_func, metrics=metrics, verbose=False)
        score_df["run"] = i
        score_df["time_interval"] = intv
        score_frames.append(score_df)

    # Checkpoint after every repetition so partial results survive a crash.
    if score_frames:
        result_df = pd.concat(score_frames, ignore_index=True).reindex(columns=result_columns)
    result_df.to_csv("data\\results\\vary_time_interval_230615.csv", index=False)

Test number of taxa

In [18]:
# Set up parameters for the number-of-taxa sweep.
# NOTE(review): network_type, interaction_type, max_interaction_strength,
# downsample and noise_var are not forwarded by the sweep cell that follows
# (it passes the literals downsample=1 and noise_var=0).
# Swept values: here n_vertices is a list (it is a scalar in other sections).
n_vertices = [5, 10, 20, 50, 100]
avg_degree = 2
network_type = 'random'
interaction_type = 'random'
max_interaction_strength = 1
time_points = 500
time_step = 0.01
downsample = 1
noise_var = 1e-3

# Full method list, kept for reference:
# evaluation_func = [correlation_score, spearman_score, clv_score, glv_score, pcor_score, pspe_score, sparcc_score, speic_score, baseline_score]
# Scoring callables handed to evaluation(); each must expose a `_method` label.
evaluation_func = [correlation_score, spearman_score, clv_score, glv_score, baseline_score]
# Metric callables; their __name__ values become the score column names.
metrics = [average_precision_score, roc_auc_score]

In [20]:
%%capture
# Column layout of the results table (fixed so the CSV schema is stable).
result_columns = ["Method", "run", "average_precision_score", "roc_auc_score", "num_taxa"]
# Start from an empty table so result_df exists even if nothing is simulated.
result_df = pd.DataFrame(columns=result_columns)
repeat = 50
# Setup the random seed generator
rng = np.random.default_rng(42)

# Collect one score table per run; DataFrame.append was removed in pandas 2.0
# and copied the whole table on every call.
score_frames = []

for i in range(repeat):
    for n in n_vertices:
        seed = rng.integers(0, 2**32 - 1)
        z, x, abd, adj, M = simulation.simulate_glv(
            num_taxa=n,
            avg_degree=avg_degree,
            time_points=time_points,
            # Use the configured step. The previous code read `intv`, a loop
            # variable leaked from the time-interval cell: undefined on a
            # fresh kernel, and otherwise silently the last swept interval.
            time_step=time_step,
            downsample=1,
            noise_var=0,
            seed=seed,
        )

        score_df = evaluation(adj, z, evaluation_func, metrics=metrics, verbose=False)
        score_df["run"] = i
        score_df["num_taxa"] = n
        score_frames.append(score_df)

    # Checkpoint after every repetition so partial results survive a crash.
    if score_frames:
        result_df = pd.concat(score_frames, ignore_index=True).reindex(columns=result_columns)
    result_df.to_csv("data\\temp_results\\vary_num_taxa.csv", index=False)

Test average degree

In [None]:
# Set up parameters for the average-degree sweep.
# NOTE(review): network_type, interaction_type, max_interaction_strength,
# downsample and noise_var are not forwarded by the sweep cell that follows
# (it passes the literals downsample=1 and noise_var=0).
n_vertices = 50
# Swept values: here avg_degree is a list (it is a scalar in other sections).
avg_degree = [2, 5, 10]
network_type = 'random'
interaction_type = 'random'
max_interaction_strength = 1
time_points = 500
time_step = 0.01
downsample = 1
noise_var = 1e-3

# Full method list, kept for reference:
# evaluation_func = [correlation_score, spearman_score, clv_score, glv_score, pcor_score, pspe_score, sparcc_score, speic_score, baseline_score]
# Scoring callables handed to evaluation(); each must expose a `_method` label.
evaluation_func = [correlation_score, spearman_score, clv_score, glv_score, baseline_score]
# Metric callables; their __name__ values become the score column names.
metrics = [average_precision_score, roc_auc_score]

In [None]:
%%capture
# Column layout of the results table (fixed so the CSV schema is stable).
result_columns = ["Method", "run", "average_precision_score", "roc_auc_score", "avg_degree"]
# Start from an empty table so result_df exists even if nothing is simulated.
result_df = pd.DataFrame(columns=result_columns)
repeat = 50
# Setup the random seed generator
rng = np.random.default_rng(42)

# Collect one score table per run; DataFrame.append was removed in pandas 2.0
# and copied the whole table on every call.
score_frames = []

for i in range(repeat):
    # Sweep the average degree. The previous code iterated over n_vertices,
    # which is a plain int in this section, and passed the whole avg_degree
    # list to the simulator, so the cell could not run.
    for degree in avg_degree:
        seed = rng.integers(0, 2**32 - 1)
        z, x, abd, adj, M = simulation.simulate_glv(
            num_taxa=n_vertices,
            avg_degree=degree,
            time_points=time_points,
            # Use the configured step rather than `intv`, a loop variable
            # leaked from the time-interval cell.
            time_step=time_step,
            downsample=1,
            noise_var=0,
            seed=seed,
        )

        score_df = evaluation(adj, z, evaluation_func, metrics=metrics, verbose=False)
        score_df["run"] = i
        # Record the swept value under the column declared in result_columns
        # (it was previously written to a stray "num_taxa" column).
        score_df["avg_degree"] = degree
        score_frames.append(score_df)

    # Checkpoint after every repetition so partial results survive a crash.
    if score_frames:
        result_df = pd.concat(score_frames, ignore_index=True).reindex(columns=result_columns)
    result_df.to_csv("data\\temp_results\\vary_avg_degree.csv", index=False)

Test type of network

In [12]:
# Set up parameters for the network-type sweep.
# NOTE(review): interaction_type, max_interaction_strength, downsample and
# noise_var are not forwarded by the sweep cell that follows (it passes the
# literals downsample=1 and noise_var=0).
n_vertices = 50
avg_degree = 5
# Alternative setting, kept for reference:
# n_vertices = 50
# avg_degree = 25
# Swept values: here network_type is a list (it is a scalar in other sections).
network_type = ['random', 'small-world', 'scale-free']
interaction_type = 'random'
max_interaction_strength = 1
time_points = 500
time_step = 0.01
downsample = 1
noise_var = 1e-3

# Full method list, kept for reference:
# evaluation_func = [correlation_score, spearman_score, clv_score, glv_score, pcor_score, pspe_score, sparcc_score, speic_score, baseline_score]
# Scoring callables handed to evaluation(); each must expose a `_method` label.
evaluation_func = [correlation_score, spearman_score, clv_score, glv_score, baseline_score]
# Metric callables; their __name__ values become the score column names.
metrics = [average_precision_score, roc_auc_score]

In [13]:
%%capture
# Column layout of the results table (fixed so the CSV schema is stable).
result_columns = ["Method", "run", "average_precision_score", "roc_auc_score", "network_type"]
# Start from an empty table so result_df exists even if nothing is simulated.
result_df = pd.DataFrame(columns=result_columns)
repeat = 50
# Setup the random seed generator
rng = np.random.default_rng(42)

# Collect one score table per run; DataFrame.append was removed in pandas 2.0
# and copied the whole table on every call.
score_frames = []

for i in range(repeat):
    for n in network_type:
        seed = rng.integers(0, 2**32 - 1)
        z, x, abd, adj, M = simulation.simulate_glv(
            num_taxa=n_vertices,
            avg_degree=avg_degree,
            time_points=time_points,
            time_step=time_step,
            downsample=1,
            noise_var=0,
            seed=seed,
            network_type=n,
        )

        score_df = evaluation(adj, z, evaluation_func, metrics=metrics, verbose=False)
        score_df["run"] = i
        score_df["network_type"] = n
        score_frames.append(score_df)

    # Checkpoint after every repetition so partial results survive a crash.
    if score_frames:
        result_df = pd.concat(score_frames, ignore_index=True).reindex(columns=result_columns)
    result_df.to_csv("data\\temp_results\\vary_network_type.csv", index=False)