## Get phylum mapping for iTOL

In [7]:
# Import Python packages
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import biom
from biom import load_table
import numpy as np
from scipy.stats import pearsonr
from matplotlib_venn import venn2
import matplotlib.patches as mpatches
from scipy.stats import mannwhitneyu
from biom import Table
from gemelli.rpca import rpca
from scipy.spatial.distance import euclidean
import re
import matplotlib.colors as mcolors
import qiime2 as q2


In [8]:
# Load the reference ASV taxonomy QIIME 2 artifact and view it as a DataFrame
# (indexed by Feature ID, with 'Taxon' and 'Confidence' columns).
taxonomy_artifact = q2.Artifact.load('../Reference/2022.10.taxonomy.asv.tsv.qza')
gg_taxonomy = taxonomy_artifact.view(pd.DataFrame)
gg_taxonomy

Unnamed: 0_level_0,Taxon,Confidence
Feature ID,Unnamed: 1_level_1,Unnamed: 2_level_1
GB-GCA-003568775.1-MWMI01000008.1,d__Bacteria; p__; c__; o__; f__; g__; s__,0.3
GB-GCA-001552015.1-CP010514.1,d__Bacteria; p__; c__; o__; f__; g__; s__,0.3
GB-GCA-000008085.1-AE017199.1,d__Bacteria; p__; c__; o__; f__; g__; s__,0.3
TAGCCGCACCCCAAGTGGTAGTCATTATTATTGGGCTTAAAGTGTTCGTAGCCGGGCCTGAAAGTCCGCTGTGAAATCCAAGCGCTCAAC,d__Bacteria; p__; c__; o__; f__; g__; s__,0.3
TACCCGCGCCACGAGTGGTGATCGCGATTATTGGGCCTAAAGGGTTCGTAGCCGGTTTGGCAAGTTCCTGGTGAAATCTTTCAGCTAACTGAAAGGCGTG,d__Bacteria; p__; c__; o__; f__; g__; s__,0.3
...,...,...
TACCCGCGCTGCGAGTGGTCACCACGATTATTGGGCTTAGAGCGTTCGTAGCCGGCTTTGCAAGTCCCCGGTGAAATCATCTGGCAAACC,d__Archaea; p__; c__; o__; f__; g__; s__,0.3
TACCCGCGCCACGAGTGGTGATCGCGATTATTGGGCCTAAAGGGTTCGTAGCCGGCTCGGCAAGTTCCTGGTGAAATCTTCCAGCTAACTGGAAGGCGTG,d__Archaea; p__; c__; o__; f__; g__; s__,0.3
CACTGGCAGTTCGGGTGGCAGTCGGTTCTATTGAGCCTAAAGCGTCCGTAGCCGGTTTGATCAGTCCTCGGTGAAATCTTTGGGCCTAACTCAAAGGCTT,d__Archaea; p__; c__; o__; f__; g__; s__,0.3
TACCCGCGCTGCAAGTGGTCACCACGATTATTGGGCTTAGAGCGTTCGTAGCCGGTTTTGGTAGTCCTCGGTGAAATCGCCTGGTTGACC,d__Archaea; p__; c__; o__; f__; g__; s__,0.3


In [9]:
# Split the ';'-delimited Taxon string into one column per taxonomic level.
taxonomy_levels = ['Kingdom', 'Phylum', 'Class', 'Order', 'Family', 'Genus', 'Species']
split_levels = gg_taxonomy['Taxon'].str.split(';', expand=True)
# Taxon strings are separated by '; ', so every level after the first would
# otherwise keep a leading space (' p__...'); strip each level so the values
# can be compared and used as lookup keys directly.
split_levels = split_levels.apply(lambda level: level.str.strip())
gg_taxonomy[taxonomy_levels] = split_levels
gg_taxonomy

Unnamed: 0_level_0,Taxon,Confidence,Kingdom,Phylum,Class,Order,Family,Genus,Species
Feature ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
GB-GCA-003568775.1-MWMI01000008.1,d__Bacteria; p__; c__; o__; f__; g__; s__,0.3,d__Bacteria,p__,c__,o__,f__,g__,s__
GB-GCA-001552015.1-CP010514.1,d__Bacteria; p__; c__; o__; f__; g__; s__,0.3,d__Bacteria,p__,c__,o__,f__,g__,s__
GB-GCA-000008085.1-AE017199.1,d__Bacteria; p__; c__; o__; f__; g__; s__,0.3,d__Bacteria,p__,c__,o__,f__,g__,s__
TAGCCGCACCCCAAGTGGTAGTCATTATTATTGGGCTTAAAGTGTTCGTAGCCGGGCCTGAAAGTCCGCTGTGAAATCCAAGCGCTCAAC,d__Bacteria; p__; c__; o__; f__; g__; s__,0.3,d__Bacteria,p__,c__,o__,f__,g__,s__
TACCCGCGCCACGAGTGGTGATCGCGATTATTGGGCCTAAAGGGTTCGTAGCCGGTTTGGCAAGTTCCTGGTGAAATCTTTCAGCTAACTGAAAGGCGTG,d__Bacteria; p__; c__; o__; f__; g__; s__,0.3,d__Bacteria,p__,c__,o__,f__,g__,s__
...,...,...,...,...,...,...,...,...,...
TACCCGCGCTGCGAGTGGTCACCACGATTATTGGGCTTAGAGCGTTCGTAGCCGGCTTTGCAAGTCCCCGGTGAAATCATCTGGCAAACC,d__Archaea; p__; c__; o__; f__; g__; s__,0.3,d__Archaea,p__,c__,o__,f__,g__,s__
TACCCGCGCCACGAGTGGTGATCGCGATTATTGGGCCTAAAGGGTTCGTAGCCGGCTCGGCAAGTTCCTGGTGAAATCTTCCAGCTAACTGGAAGGCGTG,d__Archaea; p__; c__; o__; f__; g__; s__,0.3,d__Archaea,p__,c__,o__,f__,g__,s__
CACTGGCAGTTCGGGTGGCAGTCGGTTCTATTGAGCCTAAAGCGTCCGTAGCCGGTTTGATCAGTCCTCGGTGAAATCTTTGGGCCTAACTCAAAGGCTT,d__Archaea; p__; c__; o__; f__; g__; s__,0.3,d__Archaea,p__,c__,o__,f__,g__,s__
TACCCGCGCTGCAAGTGGTCACCACGATTATTGGGCTTAGAGCGTTCGTAGCCGGTTTTGGTAGTCCTCGGTGAAATCGCCTGGTTGACC,d__Archaea; p__; c__; o__; f__; g__; s__,0.3,d__Archaea,p__,c__,o__,f__,g__,s__


In [24]:
# Read the tab-separated iTOL colour table (no header row, so columns are 0, 1, 2).
# To process the AD file instead, point the path at
# '../Data/Trees/itol_AD_asv_group_colors_edit.txt'.
itol_colors_path = '../Data/Trees/itol_H_asv_group_colors_edit.txt'
txt_file = pd.read_csv(itol_colors_path, sep='\t', header=None)

txt_file

FileNotFoundError: [Errno 2] No such file or directory: '../Data/Trees/itol_H_asv_group_colors_edit.txt'

In [22]:
# Reduce each label in column 0 to its taxonomy name by dropping the
# trailing '_ASV-<number>' suffix (labels without the suffix are unchanged).
asv_suffix_pattern = r'_ASV-\d+$'
taxon_labels = txt_file[0]
txt_file[0] = taxon_labels.str.replace(pat=asv_suffix_pattern, repl='', regex=True)
txt_file


Unnamed: 0,0,1,2
0,g__Acinetobacter,#947fff,AD_skin_only
1,g__Leptotrichia_A_993758,#947fff,AD_skin_only
2,g__Alistipes_A_871400,#947fff,AD_skin_only
3,g__Sphingomonas_L_486704,#947fff,AD_skin_only
4,g__Clostridium_T,#947fff,AD_skin_only
...,...,...,...
784,g__Streptococcus,#e7b9d8,AD_shared
785,g__,#e7b9d8,AD_shared
786,g__Berryella,#e7b9d8,AD_shared
787,g__Prevotella,#e7b9d8,AD_shared


In [23]:
# Create a mapping from genus to phylum
# Built with vectorized pandas operations instead of iterating over every
# taxonomy row in Python. Whitespace left over from splitting the Taxon string
# is stripped, and rows where either level is missing (None/NaN) or empty are
# skipped rather than raising on .strip().
genus_phylum = gg_taxonomy[['Genus', 'Phylum']].apply(lambda level: level.str.strip())
genus_phylum = genus_phylum.dropna()
genus_phylum = genus_phylum[(genus_phylum['Genus'] != '') & (genus_phylum['Phylum'] != '')]
# If the same genus label occurs under more than one phylum, the last row in
# gg_taxonomy wins (same result as repeatedly overwriting a dict entry).
# NOTE(review): placeholder labels such as 'g__' are non-empty, so they are
# mapped too and receive whatever phylum their last row carries — confirm this
# is the intended handling for unclassified genera.
genus_phylum = genus_phylum.drop_duplicates(subset='Genus', keep='last')
genus_to_phylum = dict(zip(genus_phylum['Genus'], genus_phylum['Phylum']))

# Map genera to phyla and update txt_file
# Column 2 is overwritten with the phylum where the genus is known; rows whose
# genus is not in the mapping keep their existing column-2 value. Using
# Series.map aligns on the index, so this does not depend on txt_file having a
# default 0..n-1 index.
mapped_phylum = txt_file[0].map(genus_to_phylum)
txt_file[2] = mapped_phylum.fillna(txt_file[2])

txt_file

Unnamed: 0,0,1,2
0,g__Acinetobacter,#947fff,p__Proteobacteria
1,g__Leptotrichia_A_993758,#947fff,p__Fusobacteriota
2,g__Alistipes_A_871400,#947fff,p__Bacteroidota
3,g__Sphingomonas_L_486704,#947fff,p__Proteobacteria
4,g__Clostridium_T,#947fff,p__Firmicutes_A
...,...,...,...
784,g__Streptococcus,#e7b9d8,p__Firmicutes_D
785,g__,#e7b9d8,p__
786,g__Berryella,#e7b9d8,p__Actinobacteriota
787,g__Prevotella,#e7b9d8,p__Bacteroidota
