In [13]:
import pandas as pd

In [14]:
# Locations of the two taxonomic profiles being compared; both live under the
# same benchmark results directory.
RESULTS_DIR = "/shared/projects/mudis4ls_is4_benchmark/test_results"
path_to_mpa_output = f"{RESULTS_DIR}/metaphlan/metaphlan_output_gtdb.txt"
path_to_sylph_output = f"{RESULTS_DIR}/sylph/ERS12377136.fastq.gz.sylphmpa"

In [15]:
# Load both profiles into DataFrames. The files are tab-separated and the
# first line of each is skipped, so the second line supplies the column names.
tsv_options = {"sep": "\t", "skiprows": 1}
df_metaphlan = pd.read_csv(path_to_mpa_output, **tsv_options)
df_sylph = pd.read_csv(path_to_sylph_output, **tsv_options)

In [16]:
# Clean up the name of the first column by removing its leading "#"
# (presumably "#clade_name" in the MetaPhlAn header -- confirm against the file).
# lstrip("#") is used instead of dropping the first character unconditionally
# so that this cell is idempotent: re-running it leaves an already-clean name
# such as "clade_name" untouched instead of truncating it to "lade_name".
first_column = df_metaphlan.columns[0]
df_metaphlan = df_metaphlan.rename(columns={first_column: first_column.lstrip("#")})

print(df_metaphlan.head())

                      clade_name  relative_abundance
0                    d__Bacteria           100.00000
1    d__Bacteria;p__Bacteroidota            52.20608
2    d__Bacteria;p__Firmicutes_A            43.38615
3    d__Bacteria;p__Firmicutes_C             3.35772
4  d__Bacteria;p__Proteobacteria             1.05005


In [17]:
# Give each tool's "relative_abundance" column a tool-specific name so the two
# columns stay distinguishable once the tables are combined.
df_metaphlan = df_metaphlan.rename(columns={"relative_abundance": "abundance_metaphlan"})
df_sylph = df_sylph.rename(columns={"relative_abundance": "abundance_sylph"})


In [18]:
# Replace legacy phylum names with their current equivalents inside the
# MetaPhlAn clade paths. Replacements are applied in the order listed and match
# substrings, so suffixed names (e.g. "Firmicutes_A") are renamed as well.
phylum_renames = {
    r"Firmicutes": "Bacillota",
    r"Proteobacteria": "Pseudomonadota",
}
clade_names = df_metaphlan["clade_name"]
for legacy_name, current_name in phylum_renames.items():
    clade_names = clade_names.str.replace(legacy_name, current_name, regex=True)
df_metaphlan["clade_name"] = clade_names

In [19]:
# Use "|" instead of ";" between taxonomic ranks in the MetaPhlAn clade paths
# (plain literal replacement, no regex involved).
metaphlan_clades = df_metaphlan["clade_name"]
df_metaphlan["clade_name"] = metaphlan_clades.str.replace(";", "|", regex=False)


In [20]:
# Outer join on the clade path: clades reported by only one tool are kept,
# with NaN in the other tool's columns.
df_merged = pd.merge(df_metaphlan, df_sylph, how="outer", on="clade_name")


In [21]:
# Display the merged table (MetaPhlAn and sylph columns side by side, one row per clade).
df_merged

Unnamed: 0,clade_name,abundance_metaphlan,abundance_sylph,sequence_abundance,ANI (if strain-level),Coverage (if strain-level)
0,d__Bacteria,100.00000,100.0000,100.0001,,
1,d__Bacteria|p__Bacillota_A,43.38615,41.4963,36.1829,,
2,d__Bacteria|p__Bacillota_A|c__Clostridia,43.38615,41.4963,36.1829,,
3,d__Bacteria|p__Bacillota_A|c__Clostridia|o__La...,23.22263,24.1255,22.7149,,
4,d__Bacteria|p__Bacillota_A|c__Clostridia|o__La...,23.22263,24.1255,22.7149,,
...,...,...,...,...,...,...
96,d__Bacteria|p__Pseudomonadota|c__Gammaproteoba...,,1.3124,0.9629,,
97,d__Bacteria|p__Pseudomonadota|c__Gammaproteoba...,,1.3124,0.9629,,
98,d__Bacteria|p__Pseudomonadota|c__Gammaproteoba...,,1.3124,0.9629,,
99,d__Bacteria|p__Pseudomonadota|c__Gammaproteoba...,,1.3124,0.9629,,


In [22]:
# Per-clade abundance difference, MetaPhlAn minus sylph. The result is NaN for
# clades where either tool has no abundance value.
df_merged = df_merged.assign(
    diff_abundance=lambda merged: merged["abundance_metaphlan"] - merged["abundance_sylph"]
)


In [23]:
# Display the merged table again to inspect the diff_abundance column.
df_merged

Unnamed: 0,clade_name,abundance_metaphlan,abundance_sylph,sequence_abundance,ANI (if strain-level),Coverage (if strain-level),diff_abundance
0,d__Bacteria,100.00000,100.0000,100.0001,,,0.00000
1,d__Bacteria|p__Bacillota_A,43.38615,41.4963,36.1829,,,1.88985
2,d__Bacteria|p__Bacillota_A|c__Clostridia,43.38615,41.4963,36.1829,,,1.88985
3,d__Bacteria|p__Bacillota_A|c__Clostridia|o__La...,23.22263,24.1255,22.7149,,,-0.90287
4,d__Bacteria|p__Bacillota_A|c__Clostridia|o__La...,23.22263,24.1255,22.7149,,,-0.90287
...,...,...,...,...,...,...,...
96,d__Bacteria|p__Pseudomonadota|c__Gammaproteoba...,,1.3124,0.9629,,,
97,d__Bacteria|p__Pseudomonadota|c__Gammaproteoba...,,1.3124,0.9629,,,
98,d__Bacteria|p__Pseudomonadota|c__Gammaproteoba...,,1.3124,0.9629,,,
99,d__Bacteria|p__Pseudomonadota|c__Gammaproteoba...,,1.3124,0.9629,,,


In [24]:
# Keep only the clade path and the three abundance columns; the remaining
# sylph-specific columns are dropped from the comparison table.
comparison_columns = ["clade_name", "abundance_metaphlan", "abundance_sylph", "diff_abundance"]
df_merged = df_merged.loc[:, comparison_columns]


In [25]:
# Display the comparison table restricted to clade_name and the abundance columns.
df_merged

Unnamed: 0,clade_name,abundance_metaphlan,abundance_sylph,diff_abundance
0,d__Bacteria,100.00000,100.0000,0.00000
1,d__Bacteria|p__Bacillota_A,43.38615,41.4963,1.88985
2,d__Bacteria|p__Bacillota_A|c__Clostridia,43.38615,41.4963,1.88985
3,d__Bacteria|p__Bacillota_A|c__Clostridia|o__La...,23.22263,24.1255,-0.90287
4,d__Bacteria|p__Bacillota_A|c__Clostridia|o__La...,23.22263,24.1255,-0.90287
...,...,...,...,...
96,d__Bacteria|p__Pseudomonadota|c__Gammaproteoba...,,1.3124,
97,d__Bacteria|p__Pseudomonadota|c__Gammaproteoba...,,1.3124,
98,d__Bacteria|p__Pseudomonadota|c__Gammaproteoba...,,1.3124,
99,d__Bacteria|p__Pseudomonadota|c__Gammaproteoba...,,1.3124,


In [11]:
# Pseudomonadota – formerly Proteobacteria
# Bacillota – formerly Firmicutes
# See what happens if we change only the assignment of these 2 clades

In [26]:
import seaborn as sns
import matplotlib.pyplot as plt

ModuleNotFoundError: No module named 'seaborn'

In [12]:
# Heatmap of the per-clade abundance columns, one row per clade and one column
# per numeric field. `df_merged` is the comparison table built in this notebook;
# no variable named `df` is ever defined, so referencing it raises NameError on
# a fresh kernel. Indexing by clade_name labels the rows with the clade paths.
sns.heatmap(df_merged.set_index("clade_name"), annot=True, cmap="coolwarm", linewidths=0.5)
plt.title("Comparaison des abondances par clade et outil")
plt.show()
