In [1]:
# Load the jupyter_black extension for this kernel session.
# NOTE(review): per the package's documentation this auto-formats code cells
# with Black when they are executed; it is tooling only and is not used by
# any of the analysis cells below.
import jupyter_black

jupyter_black.load()

In [2]:
from pathlib import Path
import pandas as pd

# --- Input data (paths are relative to the notebook's working directory) ---
fasta_path = Path("../data/3FTx/3FTx_mature.fasta")
csv_path = Path("../data/protspace_after_dan/3FTx.csv")

# --- Phylogenetic tree inputs and the files this notebook writes -----------
phylo_tree_base = Path("../data/phylo_tree")

# IQ-TREE: original tree and its relabelled copy.
iqtree_in = phylo_tree_base.joinpath("iqtree/3FTx_resubmission_DSD_v2.fasta.contree")
iqtree_out = phylo_tree_base.joinpath("iqtree/iqtree_rename.nexus")

# DALI: original tree; the relabelled copy is written next to it.
dali_in = phylo_tree_base.joinpath("dali/DALI_unrooted.newick")
dali_out = dali_in.parent / "DALI_rename.newick"

# iTOL colour annotation file.
itol_out = phylo_tree_base.joinpath("iTOL.txt")

## Rename labels in the DALI tree file

In [3]:
import re

# Write a copy of the DALI tree with the label prefixes stripped.
#
# Read the whole file, not just its first line: Newick permits line breaks
# inside a tree, and `readline()` would silently truncate such a file and
# produce an unbalanced (invalid) tree.
with open(dali_in, "r") as handle:
    data = handle.read()

# Delete every run of five uppercase letters that is followed by a space
# (presumably a DALI-specific code prepended to each label — TODO confirm).
data = re.sub(r"[A-Z]{5} ", "", data)

with open(dali_out, "w") as handle:
    handle.write(data)

## Rename labels in the IQ-TREE tree file

In [4]:
import re

# Write a copy of the IQ-TREE tree whose labels use "|" as field separator.
#
# Read the whole file, not just its first line: Newick permits line breaks
# inside a tree, and `readline()` would silently truncate such a file.
with open(iqtree_in, "r") as handle:
    data = handle.read()

# Labels of the form  <source>_<accession>_<Genus_species>  (source being one
# of None/NCBI/TR/SP) are rewritten as  <source>|<accession>|<Genus_species>.
# The accession part is matched greedily and may itself contain "_", "-", ".".
data = re.sub(
    r"(None|NCBI|TR|SP)_([A-Za-z0-9_\-\.]+)_([A-Z][a-z]+_[a-z]+)", r"\1|\2|\3", data
)

with open(iqtree_out, "w") as handle:
    handle.write(data)

## Create iTOL coloring file

In [8]:
# Hand-picked palette: one "<RRGGBB> <group name>" entry per line.
# Group names may themselves contain spaces, so only the first space separates
# the hex code from the name.
color_txt = """00A79D frog Ly6
B1E0E1 GPIHBP1
E6D6D6 Ly6 unique
E083B6 Ly6D
ED2024 Ly6E
9A4C9D Ly6H
ECB9B9 Ly6K
902323 Ly6L
D6B9D9 LYNX1
3D5BA9 LYPD2
968AC2 PSCA
F15A29 Reptilian Ly6 group 6
F47745 Reptilian Ly6 group 7
6B2C8C Non-human mammalian Ly6
BD522A Reptilian Ly6 group 1
BD522A Reptilian Ly6 group 1-2
F8AC4F Reptilian Ly6 group 2
F7921E Reptilian Ly6 group 3
FFC919 Reptilian Ly6 group 5
488BCA SLURP1
E83895 SLURP2
F7EC13 Reptilian Ly6 group 4
788E42 Non-standard
67BD45 Plesiotypic
63CBE5 Short-chain
24638F Long-chain"""

# Map each group name to its colour as a "#RRGGBB" string.
# NOTE(review): a later cell in this notebook rebinds `color_group` to an
# automatically generated palette, so this mapping is not the one used when
# the notebook is run top to bottom — confirm which palette is intended.
color_group = {
    label: f"#{hex_code}"
    for hex_code, label in (
        entry.split(" ", maxsplit=1) for entry in color_txt.splitlines()
    )
}
color_group

{'frog Ly6': '#00A79D',
 'GPIHBP1': '#B1E0E1',
 'Ly6 unique': '#E6D6D6',
 'Ly6D': '#E083B6',
 'Ly6E': '#ED2024',
 'Ly6H': '#9A4C9D',
 'Ly6K': '#ECB9B9',
 'Ly6L': '#902323',
 'LYNX1': '#D6B9D9',
 'LYPD2': '#3D5BA9',
 'PSCA': '#968AC2',
 'Reptilian Ly6 group 6': '#F15A29',
 'Reptilian Ly6 group 7': '#F47745',
 'Non-human mammalian Ly6': '#6B2C8C',
 'Reptilian Ly6 group 1': '#BD522A',
 'Reptilian Ly6 group 1-2': '#BD522A',
 'Reptilian Ly6 group 2': '#F8AC4F',
 'Reptilian Ly6 group 3': '#F7921E',
 'Reptilian Ly6 group 5': '#FFC919',
 'SLURP1': '#488BCA',
 'SLURP2': '#E83895',
 'Reptilian Ly6 group 4': '#F7EC13',
 'Non-standard': '#788E42',
 'Plesiotypic': '#67BD45',
 'Short-chain': '#63CBE5',
 'Long-chain': '#24638F'}

In [7]:
# Display the group -> colour mapping currently bound to `color_group`.
# NOTE(review): the saved output below was produced at execution count 7,
# after the cell with count 6 had rebuilt `color_group`; a fresh top-to-bottom
# run would display the hand-written palette from the previous cell instead.
color_group

{'Ly-6': '#00FF00',
 'pre-3FTx': '#FF00FF',
 'Plesiotypic': '#007FFF',
 'Non-standard': '#FF7F00',
 'Short-chain': '#7FBF7F',
 'Long-chain': '#5E06A4',
 nan: '#E90138'}

In [6]:
from distinctipy import distinctipy

# Build an iTOL TREE_COLORS annotation file: one "range" entry per sequence,
# coloured by the sequence's group.
#
# NOTE(review): this cell rebinds `color_group`, replacing the hand-written
# palette defined earlier in the notebook — confirm which palette is intended.

id_name = "identifier"
group_name = "cysteine_group"

df = pd.read_csv(csv_path)

# Distinct group labels in order of first appearance. `unique()` keeps a
# missing value (NaN) as its own entry, so unlabelled rows get a colour too.
groups = df[group_name].unique()

# number of colours to generate
N = len(groups)
seed = 42

colorblind_type = "Normal"
colors = distinctipy.get_colors(N, colorblind_type=colorblind_type, rng=seed)
# Convert each colour's float components to an uppercase "#RRGGBB" string
# (each component is scaled by 255 and truncated to an int).
colors = ["#" + "".join([f"{int(c * 255):02X}" for c in color]) for color in colors]
color_group = {g: c for g, c in zip(groups, colors)}

with open(itol_out, "w") as handle:
    handle.write("TREE_COLORS\n")
    handle.write("SEPARATOR TAB\n")
    handle.write("DATA\n")
    for group, color in color_group.items():
        # NaN never compares equal to itself, so `df[group_name] == group`
        # selects nothing for the missing-value group and those rows used to
        # be dropped silently. Select them with `isna()` instead; their label
        # is written as "nan", matching the key in `color_group`.
        if pd.isna(group):
            members = df[group_name].isna()
        else:
            members = df[group_name] == group
        for uid in df.loc[members, id_name]:
            handle.write(f"{uid}\trange\t{color}\t{group}\n")