In [1]:
import seaborn as sns
import matplotlib.pyplot as plt
from pathlib import Path
import numpy as np
import pandas as pd
from Bio.Phylo._io import parse
from Bio.Phylo.BaseTree import Tree as BioTree
from scipy import stats
import plotly.express as px
import plotly.io as pio

from src.datasets.load_trees import load_trees_from_file
from src.utils.tree_utils import get_taxa_names
from src.utils.tree_utils import get_tree_height
from src.distribution_analysis.process_tree import get_observed_nodes, get_clade_split_df

from scipy.stats import ks_2samp

from collections import Counter

In [2]:
import warnings

# Silence every warning category for the rest of the session.
warnings.simplefilter("ignore")

# Default theme applied to all plotly figures created below.
pio.templates.default = "plotly_white"

In [3]:
# Both locations live under the same relative data root.
DATA_ROOT = Path("data")
# Directory scanned for the per-model posterior ``*.log`` files.
POSTERIOR_DIR = DATA_ROOT / "hpd_validation"
# Directory the rendered PNG figures are written to.
GRAPHS_DIR = DATA_ROOT / "hpd_validation_analysis"

In [8]:
# For every posterior log: locate the true tree's posterior within the
# posteriors of the remaining rows (as a percentile), and save a line plot of
# those remaining posteriors ranked from highest to lowest.
dict_true_tree_percentiles = {
    "model_name": [],
    "true_tree_percentile": [],
}

# write_image does not create missing directories, so make sure the target
# exists before the first figure is saved.
GRAPHS_DIR.mkdir(parents=True, exist_ok=True)

# Directory listing order is arbitrary. Sorting keeps the row order of the
# summary frame stable between runs, and also fixes which file's
# `posterior_df` / `true_posterior` remain defined after the loop (the cells
# further down read those two names).
for posterior_file in sorted(POSTERIOR_DIR.glob("*.log")):
    file_name_wo_ext = posterior_file.name.removesuffix(".log")
    # The stem must contain exactly one underscore ("<dataset>_<model>");
    # anything else raises ValueError on unpacking.
    dataset_name, model_name = file_name_wo_ext.split("_")

    posterior_df = pd.read_csv(posterior_file)

    # The first row has to be the reference ("true") tree; it is split off and
    # the rest of the frame is what it gets compared against.
    assert posterior_df.iloc[0]["tree"] == "true", (
        f"{posterior_file}: first row is not the true tree"
    )
    true_posterior = posterior_df.iloc[0]["posterior"]
    posterior_df = posterior_df.drop(posterior_df.index[0])

    dict_true_tree_percentiles["model_name"].append(model_name)
    dict_true_tree_percentiles["true_tree_percentile"].append(
        stats.percentileofscore(posterior_df["posterior"], true_posterior)
    )

    # Posteriors in descending order, re-indexed 0..n-1 so the x-axis is the rank.
    ranked_posteriors = (
        posterior_df["posterior"]
        .sort_values(ascending=False)
        .reset_index(drop=True)
    )
    fig = px.line(ranked_posteriors, title=model_name)
    fig.write_image(GRAPHS_DIR / f"{model_name}.png", scale=4)

df_true_tree_percentiles = pd.DataFrame(dict_true_tree_percentiles)

In [5]:
# One bar per row of the summary frame: model on x, true-tree percentile on y.
percentile_axis_labels = {
    "model_name": "Model",
    "true_tree_percentile": "Percentile of True Tree Posterior",
}
fig = px.bar(
    df_true_tree_percentiles,
    x="model_name",
    y="true_tree_percentile",
    labels=percentile_axis_labels,
    range_y=(70, 90),  # fixed y-axis window; note it does not start at zero
)

percentiles_png = GRAPHS_DIR / "true-tree-percentiles.png"
fig.write_image(percentiles_png, scale=4)

In [6]:
# Share of rows whose posterior is strictly greater than `true_posterior`.
# Both names are the ones left behind by the last iteration of the loop cell.
is_above_true = posterior_df["posterior"].gt(true_posterior)
sum(is_above_true) / len(is_above_true)

0.1659

In [7]:
# Posteriors from highest to lowest, re-indexed so the x-axis is the rank.
ranked_posteriors = (
    posterior_df["posterior"]
    .sort_values(ascending=False)
    .reset_index(drop=True)
)
px.bar(ranked_posteriors)