In [None]:
import os
from glob import glob 
import tqdm
import re
from itertools import product 
from pathlib import Path

In [None]:
# Root directory for every input/output of this notebook, taken from the
# TALLIS environment variable. Fail early with a readable message instead of
# the opaque TypeError that Path(None) raises when the variable is unset.
_tallis_root = os.getenv("TALLIS")
if _tallis_root is None:
    raise RuntimeError(
        "Environment variable TALLIS is not set; it must point to the data root."
    )
TALLIS = Path(_tallis_root)
PCH = TALLIS / 'PCH-ASTRAL'
# NOTE(review): later cells build their paths from `OMP`, which is never
# defined in this notebook, so a fresh "Restart & Run All" fails with
# NameError. It is presumably the former name of `PCH` -- confirm, then rename
# the usages and drop this alias.
OMP = PCH

## Validate All Simulated Data

In [None]:
# --- Simulation grid --------------------------------------------------------
# Each list is one axis of the grid; the loops below take their cartesian
# product and look for a "{poly}_{hf}_{ef}_{cf}" directory per combination.
E_FACTOR = [0.8]
H_FACTOR = ["0.1"]  # kept as a string; it is interpolated into directory names
C_FACTOR = [0.25, 0.5, 1, 3]
POLYMORPHISM = ["no", "low", "high"]

# Name of the directory that holds the per-configuration results.
OUTPUT_FOLDER = 'sim_outputs'

# Expected number of result files per method is TREES * REPLICAS.
TREES = 16
REPLICAS = 4

# Methods whose outputs are validated and parsed.
# Alternative selection (currently disabled):
#   ["GA", "MP4", "COV", "ASTRAL4(9)", "ASTRAL4(12)"]
METHODS = [
    "ASTRAL(11,5)",
    "ASTRAL(10,5)",
    "GA",
    "MP4",
]

In [None]:
# Maps a method's on-disk directory name to the label shown in figures.
# Methods without an entry keep their directory name (the parse cell looks
# them up with METHOD_NAMES.get(method, method)).
METHOD_NAMES = {
    # PCH-ASTRAL variants
    "ASTRAL(10,5)": "PCH-ASTRAL+K",
    "ASTRAL(11,5)": "PCH-ASTRAL-K",
    # EVANS variants
    "ASTRAL4(11)":  "ASTRAL4(EVANS-ALL)-K",
    "ASTRAL(9,5)":  "ASTRAL(EVANS-ONE)+K",
    "ASTRAL(12,5)": "ASTRAL(EVANS-ONE)",
    "ASTRAL4(12)":  "ASTRAL4(EVANS-ONE)-K",
    # Remaining methods
    "ASTRAL(13,5)": "ASTRAL(OMP-K)",
    "MP4":          "MP",
}

In [None]:
from glob import glob
# Sanity-check the on-disk layout of every (configuration, method) pair:
#   * <method_root>/allscores.txt must exist (it is what the parse cell reads),
#   * <method_root>/trees must be a directory holding TREES * REPLICAS entries.
# Problems are only printed, never raised, so one run lists everything missing.
for (ef, hf, cf, poly, method) in product(
    E_FACTOR, H_FACTOR, C_FACTOR, POLYMORPHISM, METHODS
):
    method_root = OMP / OUTPUT_FOLDER / f"{poly}_{hf}_{ef}_{cf}" / method
    path = method_root / "trees"
    score_path = method_root / "allscores.txt"
    if not score_path.is_file():
        print(f"{score_path} is not a file")
    if not path.is_dir():
        print(f"{path} is not a dir")
        # Nothing to count; skip so the same problem is not reported twice.
        continue
    num_subfiles = len(list(path.glob("*")))
    if num_subfiles != TREES * REPLICAS:
        print(f"{path} has {num_subfiles} subfiles")

In [None]:

def extract_last_two_numbers(path: str) -> tuple[int, int]:
    """Return the final two integers embedded in a file's base name.

    Only the last path component is inspected, and its final extension is
    removed first, so digits in parent directories or in the extension are
    ignored.

    Args:
        path: File path whose base name contains at least two digit runs.

    Returns:
        ``(second_to_last, last)`` digit runs converted to ``int``.

    Raises:
        ValueError: If the base name holds fewer than two digit runs.
    """
    stem, _extension = os.path.splitext(os.path.basename(path))
    digit_runs = re.findall(r'\d+', stem)

    if len(digit_runs) >= 2:
        second_to_last, last = digit_runs[-2:]
        return int(second_to_last), int(last)

    raise ValueError("Filename does not contain at least two numbers.")

# Generate figures 

Parse each method's `allscores.txt` into one flat list of records.

In [None]:
# Parse every <method_root>/allscores.txt into flat records collected in `data`.
#
# The file is read as alternating line pairs: a path line followed by a score
# line holding two whitespace-separated numbers. A line that is not an existing
# path is skipped one line at a time, which lets the scan resynchronise on the
# next valid pair.
import pandas as pd 
data = []
for (ef, hf, cf, poly, method) in product(
    E_FACTOR, H_FACTOR, C_FACTOR, POLYMORPHISM, METHODS
):
    method_root = OMP / OUTPUT_FOLDER / f"{poly}_{hf}_{ef}_{cf}" / method
    scores_path = method_root / "allscores.txt"
    if not scores_path.exists():
        # Built from a plain variable: nesting the same quote type inside an
        # f-string is a SyntaxError before Python 3.12.
        print(f"{scores_path} does not exist")
        continue

    # Read everything up front; the handle is not needed while parsing.
    with open(scores_path, "r") as scores_file:
        lines = scores_file.readlines()

    N = len(lines)
    i = 0
    got = 0
    while i < N - 1:
        try:
            file_text = lines[i].strip()
            score_text = lines[i + 1].strip()
            if not os.path.exists(file_text):
                # Not a path line: advance by one and try to resync.
                i += 1
                continue

            # Replica label: file name minus its first 8 and last 5 characters.
            # NOTE(review): presumably strips a fixed prefix and extension -- confirm.
            replica = file_text.split('/')[-1][8:-5]

            tree, repnum = extract_last_two_numbers(file_text)
            if tree > TREES or repnum > REPLICAS:
                # Outside the configured range: skip the whole pair.
                i += 2
                continue

            # Exactly two fields are required (unpacking raises otherwise).
            # Only the first is kept, as a percentage. The second is named
            # after the original `fp`; its meaning is not visible here.
            fn_text, _fp_text = score_text.split()
            data.append((
                float(ef),
                float(hf),
                float(cf),
                poly,
                replica,
                METHOD_NAMES.get(method, method),
                float(fn_text) * 100,
            ))
            i += 2
            got += 1
        except Exception as e:
            # Best effort: report the malformed entry and continue one line on.
            print(e, method_root)
            i += 1

    # Flag partially populated score files; completely empty ones are ignored.
    if got != 0 and got != TREES * REPLICAS:
        print(f"GOT {got} for {method_root=}")



In [None]:
# Column labels, in the same order as the fields of each tuple in `data`.
_record_columns = [
    "evofactor",
    "homoplasyfactor",
    "chrfactor",
    "polymorphism",
    "replica",
    "method",
    "fn",
]

# One row per parsed score entry.
df = pd.DataFrame.from_records(data=data, columns=_record_columns)

In [None]:
# Compare the number of parsed rows with the size of the full grid:
# one row per (configuration, method, tree, replica).
_n_grid_cells = len(list(product(
    E_FACTOR, H_FACTOR, C_FACTOR, POLYMORPHISM, METHODS
)))
_expected_rows = _n_grid_cells * TREES * REPLICAS
print(f"Expected length is {_expected_rows}")
print(f"Actual length is {len(df)}")

In [None]:
import seaborn as sns 
import matplotlib.pyplot as plt

# Global matplotlib defaults for every figure produced below.
plt.rcParams.update({
    'figure.dpi': 150,  # better resolution when showing...
    "savefig.dpi": 150,  # ...and when saving
    'font.size': 10,
    'lines.markeredgewidth': 1,
})

In [None]:
# Export the rows with chrfactor == 1 and homoplasyfactor == 0.05 to a CSV
# under the data root. Uses the TALLIS path from the config cell instead of
# re-reading the environment variable.
# NOTE(review): H_FACTOR is currently ["0.1"], so the homoplasyfactor == 0.05
# filter matches no rows and the CSV holds only a header -- confirm whether
# this export is stale or H_FACTOR should include 0.05.
_export_mask = (df['chrfactor'] == 1) & (df['homoplasyfactor'] == 0.05)
df[_export_mask].to_csv(TALLIS / "3state-0.05.csv")

In [None]:
import seaborn as sns 

In [None]:
from matplotlib import lines
from seaborn._base import unique_dashes
# Fix one colour and one dash pattern per method so a method looks the same
# in every figure, whichever subset of methods a figure shows.
DISPLAY_METHODS = df["method"].unique()

# Ask for exactly as many colours as there are methods. A hard-coded count
# would make zip() silently drop every method beyond that count, leaving it
# without a palette entry.
hue_palette = dict(zip(
    DISPLAY_METHODS,
    sns.color_palette(n_colors=len(DISPLAY_METHODS)),
))
dash_dict = dict(zip(
    DISPLAY_METHODS,
    unique_dashes(len(DISPLAY_METHODS)),
))

print(hue_palette)
print(dash_dict)

In [None]:
import numpy as np
def plot_by_given_methods(BY_POLY_METHODS, chars_per_unit=320):
    """Plot FN error against number of characters, one panel per polymorphism level.

    Reads the module-level ``df``, ``hue_palette``, ``dash_dict``, ``C_FACTOR``
    and ``OMP``. The figure is saved to
    ``OMP / "figs" / "by-methods-<joined method names>.png"``; the directory is
    created if it does not exist. ``df`` itself is left unmodified.

    Args:
        BY_POLY_METHODS: Display names (values of ``df["method"]``) to include.
        chars_per_unit: Multiplier that turns ``chrfactor`` into the character
            count shown on the x axis. Defaults to 320.

    Returns:
        None.
    """
    # Derive the plotting columns on a filtered copy, so the shared frame is
    # not mutated and x comes from exactly the rows being plotted.
    plot_df = df[df["method"].isin(BY_POLY_METHODS)].assign(
        Polymorphism=lambda d: d["polymorphism"].map(lambda x: x.capitalize()),
        num_characters=lambda d: (d["chrfactor"] * chars_per_unit).astype(int),
    )

    fg = sns.relplot(
        kind='line',
        data=plot_df,
        col='Polymorphism',
        x='num_characters',
        y='fn',
        hue='method',
        style='method',
        err_style='bars',
        errorbar='se',
        err_kws={'capsize': 5},
        palette=hue_palette,
        dashes=dash_dict,
    )

    fg.set(ylim=(0, 15))
    fg.set_axis_labels(
        x_var="Number of characters",
        y_var="FN Error Rate (%)"
    )
    fg.set_titles(
        col_template="{col_name} Polymorphism"
    )

    # Put ticks only at the character counts that were simulated.
    tick_positions = np.array(C_FACTOR) * chars_per_unit
    for ax in fg.axes.flat:
        ax.set_xticks(tick_positions)
    fg.tight_layout()

    figs_dir = OMP / "figs"
    figs_dir.mkdir(parents=True, exist_ok=True)
    # Join outside the f-string: nesting the same quote type inside an
    # f-string is a SyntaxError before Python 3.12.
    methods_tag = "-".join(BY_POLY_METHODS)
    fg.figure.savefig(figs_dir / f"by-methods-{methods_tag}.png")

In [None]:
# Render one figure per method group, all under seaborn's "darkgrid" style.
with sns.axes_style("darkgrid"):
    for method_group in (
        ["MP", "GA", "PCH-ASTRAL-K"],
        ["PCH-ASTRAL-K", "PCH-ASTRAL+K"],
    ):
        plot_by_given_methods(method_group)