In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pyreadr  # Load directly the rds file
import seaborn as sns

In [None]:
# Toggle for the export cells below: the CSV tables are written only when this is True.
save_files = False

In [None]:
# How much of the abundance is accounted for by the AGORA2 models?
# Load the OTU abundance table straight from its R serialization (.rds).
data = pyreadr.read_r("../../data/processed_files/otumat.rds")
# The file holds a single object, which pyreadr exposes under the key None.
otu_table = data[None]

# Taxonomy table (comma-separated, which is the read_csv default).
# The path is a plain string: the previous f-string had no placeholders.
full_tax = pd.read_csv("../../data/processed_files/taxonomy.csv")

### Identify the organisms which we can map to a model

In [None]:
def write_org_name(s: str) -> str:
    """
    Parse the string containing the organism name.

    The input is split on "-" and its first three fields are returned
    joined by single spaces (presumably Family, Genus and Species, to match
    the "Organism" names built from the taxonomy table -- TODO confirm).

    As soon as one field is empty or missing, that field and every field
    after it are reported as "unknown", e.g. "A--C" -> "A unknown unknown".
    Fields beyond the third are ignored.

    Strings with fewer than three fields are padded instead of raising
    IndexError, and non-string input (e.g. NaN from pandas) yields
    "unknown unknown unknown" instead of raising AttributeError.
    """
    unknown = "unknown"
    if not isinstance(s, str):
        return " ".join([unknown] * 3)

    # Pad with empty fields so that short inputs behave like "A--".
    ranks = (s.split("-") + ["", "", ""])[:3]

    parts = []
    still_known = True
    for rank in ranks:
        # Once a rank is missing, all finer ranks are treated as unknown too.
        still_known = still_known and len(rank) > 0
        parts.append(rank if still_known else unknown)
    return " ".join(parts)

In [None]:
# Display the taxonomy table as loaded.
full_tax

Single out the organisms that we can identify at least at the Family level.

In [None]:
# Keep only the three ranks used to build organism names and label missing
# ranks "unknown". fillna is chained instead of applied in place: the column
# selection is a derived frame, and mutating it in place can raise
# SettingWithCopyWarning.
full_tax = full_tax[["Family", "Genus", "Species"]].fillna("unknown")
# Displayed only (the result is not assigned): the distinct annotations.
full_tax.drop_duplicates()

In [None]:
# Generate list of the 457 identified OTUs
if save_files:
    # De-duplicate with a fresh 0..n-1 index, then discard the row labelled 0.
    # NOTE(review): presumably row 0 is the all-"unknown" annotation -- confirm.
    full_tax.drop_duplicates(ignore_index=True).drop(index=0).to_csv(
        "../../data/tables/Supp_tabl_457_identified_OTUs.csv"
    )

There are many duplicates: let us match OTUs to their associated known taxonomy

In [None]:
# Detect which rows correspond to which organism
organisms = pd.DataFrame(
    {"Organism": full_tax["Family"] + " " + full_tax["Genus"] + " " + full_tax["Species"]}
)
# Store row *positions* rather than index labels: these values are later fed
# to `.iloc`, which is positional. The two coincide for a default RangeIndex.
organisms["Index"] = np.arange(len(organisms))
# One row per organism name, holding the list of taxonomy rows that carry it.
# Aggregating the selected column avoids `groupby.apply` operating on the
# grouping column, which recent pandas releases deprecate.
organisms_grouped = organisms.groupby("Organism")["Index"].agg(list).to_frame("Matches")
organisms_grouped

# Compute the percentage of abundances that can be mapped to a model

We compute the reaction abundances by associating each OTU characterized at least at the Family level to models of the AGORA2 collection. We are not able to match all characterized OTUs, but we can look at which fraction of the abundance can be mapped.
To do so, we retrieve the mapping obtained from the MATLAB script microbiome.m: an OTU is mapped only to models with the same taxonomy.

In [None]:
# NOTE(review): a `mapped_species2` value of 0 presumably means that this
# taxonomic annotation was not mapped to any model (only rows with a value > 0
# are kept further down) -- confirm against the MATLAB script.
matlab_mapping = pd.read_csv("../../data/processed_files/mapped_species_matlab.csv")
matlab_mapping

In [None]:
# Convert each "-"-separated taxonomy string into a space-separated organism name.
matlab_mapping["Organism"] = [
    write_org_name(raw_name) for raw_name in matlab_mapping["mapped_species1"]
]
matlab_mapping

In [None]:
# Keep only the annotations whose `mapped_species2` value is strictly positive.
is_mapped = matlab_mapping["mapped_species2"].gt(0)
matlab_mapped = matlab_mapping.loc[is_mapped]
matlab_mapped

In [None]:
# Attach to every mapped annotation the list of taxonomy rows ("Matches") that
# share its organism name. The inner join drops annotations whose name does
# not occur in `organisms_grouped`.
known_organisms_grouped = matlab_mapped.join(
    organisms_grouped, on="Organism", how="inner"
)

In [None]:
# Get the row positions corresponding to the mapped species.
# explode() flattens the per-organism lists; drop_duplicates() keeps the first
# occurrence so that a row reached through several mapping entries is not
# counted twice. Unlike summing the lists, this also yields an empty list
# (not 0) when nothing was mapped.
indices = (
    known_organisms_grouped["Matches"].explode().drop_duplicates().astype(int).tolist()
)
# Compute the relative abundances: each column is divided by its own total.
otu_rel = otu_table / otu_table.sum()
# Filter the OTU abundances for the mapped abundances
otu_mapped = otu_rel.iloc[indices, :]

In [None]:
# Distribution of the per-column totals of the mapped relative abundances.
ax = sns.violinplot(otu_mapped.sum())
ax.set_title("Overall mapping of species abundance to model")

In [None]:
# Concatenate the OTU table with the metadata to obtain the mapping per model

# Load the metadata table; the first CSV column becomes the row index.
metadata = pd.read_csv("../../data/processed_files/metadata.csv", index_col=0)
metadata

In [None]:
# Concatenate the metadata to the OTU abundance using the #SampleID

# Transpose the mapped OTU table so its columns become rows, then match each
# row to the metadata entry with the same "#SampleID".
otu_abund_meta = metadata.merge(
    otu_mapped.transpose(), left_on="#SampleID", right_index=True
)
# merge() places the metadata columns first; every column after them is an OTU
# abundance. The offset is derived from the metadata instead of the previously
# hard-coded width of 48, so the sum stays correct if the metadata changes.
n_metadata_columns = metadata.shape[1]
otu_abund_meta["Mapped abundance"] = otu_abund_meta.iloc[:, n_metadata_columns:].sum(
    axis=1
)
otu_abund_meta

In [None]:
# Box plot of the mapped abundance, one box per value of "Model".
ax = sns.boxplot(data=otu_abund_meta, y="Model", x="Mapped abundance")
ax.set_title("Mapped abundance for each microbiome model")

In [None]:
# Make each datapoint appear
# Both layers share the same data and axis mapping.
plot_kwargs = dict(data=otu_abund_meta, y="Model", x="Mapped abundance")

ax = sns.boxplot(**plot_kwargs)
# Overlay the individual observations on the axes holding the boxes.
sns.stripplot(ax=ax, **plot_kwargs)

ax.set_title("Mapped abundance for each microbiome model")

#### Table used to generate Extended Figure 6d

In [None]:
# Table used to generate Extended Figure 6d
# Written only when the `save_files` toggle is enabled.
if save_files:
    otu_abund_meta.to_csv(
        path_or_buf="../../data/tables/Supp_figure_mapped_microbial_abundance.csv"
    )

# Having a look at the normalized OTU abundances


In [None]:
# Filter for humans, SPF, Wild
otu_of_int = otu_abund_meta[otu_abund_meta["Model"].isin(["Wild", "SPF", "Human"])]
# Filter the metadata out: the metadata columns come first, so skip as many
# columns as the metadata table has (previously hard-coded as 48), then remove
# the derived total. drop() is chained rather than applied in place so that a
# derived frame is never mutated (avoids SettingWithCopyWarning).
otu_of_int_abund = otu_of_int.iloc[:, metadata.shape[1]:].drop(
    columns="Mapped abundance"
)
otu_of_int_abund

In [None]:
# Fraction of nonzero entries
# Count the strictly positive cells and divide by the total number of cells.
positive_cells = (otu_of_int_abund > 0).sum().sum()
positive_cells / (otu_of_int_abund.shape[0] * otu_of_int_abund.shape[1])

In [None]:
# Mean and median of all OTU abundances
# Flatten the table once and reuse the 1-D copy for both statistics.
all_abundances = otu_of_int_abund.to_numpy().flatten()
print("Mean", np.mean(all_abundances))
print("Median", np.median(all_abundances))

In [None]:
# Mean and median of nonzero OTU abundances
array = np.array(otu_of_int_abund)
# A boolean mask selects the same entries, in the same order, as np.nonzero.
nonzero_of_int = array[array != 0]
for label, statistic in (("Mean", np.mean), ("Median", np.median)):
    print(label, statistic(nonzero_of_int))

In [None]:
# Distribution of the nonzero abundances on a log10 scale.
sns.boxenplot(np.log10(nonzero_of_int))

In [None]:
import pandas as pd

# Reload a mapped-abundance table from disk.
# NOTE(review): this reads from data/manuscript/, whereas the export cell of
# this notebook writes the same file name to data/tables/ -- confirm which
# copy is intended.
test = pd.read_csv("../../data/manuscript/Supp_figure_mapped_microbial_abundance.csv")
test

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Same box plot as above, drawn from the table reloaded from disk.
ax = sns.boxplot(data=test, y="Model", x="Mapped abundance")
ax.set_title("Mapped abundance for each microbiome model")

In [None]:
# FIXME: unfinished scratch cell. The original statement `test2 = ` had no
# right-hand side and raised a SyntaxError; assign a value here or delete the cell.