In [7]:
from pathlib import Path
import pandas as pd

# Input files. `csv_path` is the table read by the iTOL cell; `fasta_path` is
# not referenced by any of the cells visible here.
fasta_path = Path("..", "data", "3FTx", "3FTx_mature.fasta")
csv_path = Path("..", "data", "protspace_after_dan", "3FTx.csv")

# IQ-TREE tree: the file that is read and the relabelled copy that is written.
iqtree_in = Path(
    "..", "data", "phylo_tree", "iqtree", "3FTx_resubmission_DSD_v2.fasta.contree"
)
iqtree_out = Path("..", "data", "phylo_tree", "iqtree", "iqtree_rename.nexus")

# DALI tree: the relabelled copy is written next to the input file.
dali_in = Path("..", "data", "phylo_tree", "dali", "DALI_unrooted.newick")
dali_out = dali_in.parent / "DALI_rename.newick"

# The iTOL colour file lives three directory levels above the DALI tree,
# which resolves to ../data/iTOL.txt.
itol_out = dali_in.parent.parent.parent / "iTOL.txt"

## Rename DALI file

In [6]:
import re

# Remove every occurrence of "five uppercase letters + space" from the DALI
# tree text and write the result to `dali_out`.
# NOTE(review): presumably this strips an ID prefix in front of each leaf
# label -- confirm against the input file. The pattern is not anchored, so it
# also matches the last five letters of any longer uppercase run before a space.
#
# The whole file is read rather than only its first line: Newick allows line
# breaks inside a tree, and `readline()` would silently drop everything after
# the first newline.
with open(dali_in, "r") as handle:
    data = handle.read()
data = re.sub(r"[A-Z]{5} ", "", data)
with open(dali_out, "w") as handle:
    handle.write(data)

## Rename IQ-TREE file

In [13]:
import re

# Rewrite labels of the form
#     <None|NCBI|TR|SP>_<accession>_<Genus_species>
# into the pipe-separated form
#     <None|NCBI|TR|SP>|<accession>|<Genus_species>
# and write the result to `iqtree_out`.
#
# The whole file is read rather than only its first line, so a tree that is
# wrapped over several lines is not silently truncated.
# NOTE(review): no format conversion happens here -- the output keeps the
# input's tree format even though `iqtree_out` ends in ".nexus".
with open(iqtree_in, "r") as handle:
    data = handle.read()
data = re.sub(
    r"(None|NCBI|TR|SP)_([A-Za-z0-9_\-\.]+)_([A-Z][a-z]+_[a-z]+)",
    r"\1|\2|\3",
    data,
)
with open(iqtree_out, "w") as handle:
    handle.write(data)

## Create iTOL coloring file

In [14]:
from distinctipy import distinctipy

# Columns of the CSV that hold the tree-leaf identifier and the group label.
id_name = "identifier"
group_name = "cysteine_group"

df = pd.read_csv(csv_path)

# Distinct group labels in order of first appearance; computed once and reused
# for the colour count, the colour mapping and the output loop.
groups = df[group_name].unique()

# number of colours to generate
N = len(groups)
seed = 42  # fixed seed so the same colours are generated on every run

colorblind_type = "Normal"
colors = distinctipy.get_colors(N, colorblind_type=colorblind_type, rng=seed)
# Convert each colour's channel values (scaled by 255) to a "#RRGGBB" string.
colors = ["#" + "".join(f"{int(c * 255):02X}" for c in color) for color in colors]
color_group = dict(zip(groups, colors))

# Write an iTOL TREE_COLORS annotation file: a three-line header followed by
# one tab-separated "<id> range <colour> <group>" line per row, grouped by
# group label. Rows whose group is missing (NaN) match no label and are skipped.
with open(itol_out, "w") as handle:
    handle.write("TREE_COLORS\n")
    handle.write("SEPARATOR TAB\n")
    handle.write("DATA\n")
    for group, color in color_group.items():
        # Select the identifier column directly instead of using iterrows(),
        # which builds a Series per row and may upcast values across dtypes.
        for uid in df.loc[df[group_name] == group, id_name]:
            handle.write(f"{uid}\trange\t{color}\t{group}\n")