In [24]:
import pandas as pd
import numpy as np
import biom
from biom import load_table
from biom.util import biom_open
import qiime2 as q2
import csv
import matplotlib.pyplot as plt
import matplotlib.patches as patches
from matplotlib.patches import FancyBboxPatch
import matplotlib.gridspec as gridspec
from matplotlib.colorbar import ColorbarBase
from matplotlib.colors import Normalize
import seaborn as sns
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [25]:
# Load the genus-collapsed BIOM table.
# To switch reference database, swap which of the two biom_path lines is active.
# biom_path = '../tables/tables_rs210/per_genome/Genus_collapsed.biom'
biom_path = '../tables/tables_woltka/per_genome/Genus_collapsed.biom'

biom_table = load_table(biom_path)
# Transpose so that each row is a sample and each column is a genus (feature)
df = pd.DataFrame(biom_table.to_dataframe()).T
df.head()

Unnamed: 0,g__Abiotrophia,g__Acinetobacter,g__Actinomyces,g__Aerococcus,g__Alloiococcus,g__Alloprevotella,g__Anaerococcus,g__Anaeroglobus,g__Bacillus_A,g__Bifidobacterium,...,g__Staphylococcus,g__Stomatobaculum,g__Streptobacillus,g__Streptococcus,g__Tepidiphilus,g__Thermaerobacter,g__Thiobacillus,g__Varibaculum,g__Veillonella,g__Xanthomonas
15443.209.S78.L005,0.0,4.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,25.0,0.0,0.0,0.0,0.0,3.0,0.0
15443.136.S29.L005,16.0,1.0,83.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,57.0,1.0,1.0,1.0,0.0,1.0,0.0
15443.167.S39.L005,0.0,1.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,...,1.0,2.0,0.0,51.0,0.0,1.0,0.0,0.0,0.0,0.0
15443.160.S33.L005,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
15443.192.S61.L005,1.0,0.0,5.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,30.0,0.0,0.0,0.0,0.0,0.0,0.0


In [26]:
# Find samples (rows) whose counts add up to zero across all genera and report them
rows_with_sum_zero = df.sum(axis=1).eq(0)
df_zero_sum = df.loc[rows_with_sum_zero]
print(f"Rows where the sum of values is 0: {df_zero_sum.index}")

# Remove those empty samples from the working table
df = df.drop(index=df_zero_sum.index)

Rows where the sum of values is 0: Index(['15443.178.S48.L005', '15443.179.S49.L005'], dtype='object')


In [27]:
# Shorten the sample names: cast every index label to str, then keep only the
# second dot-separated field (the text between the first and second periods),
# e.g. '15443.209.S78.L005' -> '209'.
# The shortened labels are not guaranteed to be unique.
df.index = [full_name.split('.')[1] for full_name in df.index.astype(str)]
df.head()

Unnamed: 0,g__Abiotrophia,g__Acinetobacter,g__Actinomyces,g__Aerococcus,g__Alloiococcus,g__Alloprevotella,g__Anaerococcus,g__Anaeroglobus,g__Bacillus_A,g__Bifidobacterium,...,g__Staphylococcus,g__Stomatobaculum,g__Streptobacillus,g__Streptococcus,g__Tepidiphilus,g__Thermaerobacter,g__Thiobacillus,g__Varibaculum,g__Veillonella,g__Xanthomonas
209,0.0,4.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,25.0,0.0,0.0,0.0,0.0,3.0,0.0
136,16.0,1.0,83.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,57.0,1.0,1.0,1.0,0.0,1.0,0.0
167,0.0,1.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,...,1.0,2.0,0.0,51.0,0.0,1.0,0.0,0.0,0.0,0.0
160,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
192,1.0,0.0,5.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,30.0,0.0,0.0,0.0,0.0,0.0,0.0


In [28]:
# Inspect the shortened sample IDs. In the listing below '186' appears twice,
# i.e. shortening the names produced a duplicated label.
df.index

Index(['209', '136', '167', '160', '192', '189', '78', '195', '162', '163',
       '193', '191', '72', '190', '188', '211', '205', '204', '207', '203',
       '202', '206', '73', '114', '200', '201', '199', '91', '198', '196',
       '197', '157', '176', '174', '170', '172', '173', '169', '187', '216',
       '210', '186', '93', '186', '161', '177', '182', '101', '102', '100',
       '112', '184', '87', '110', '181', '168', '166', '165', '194', '76',
       '208', '96', '85', '88', '90', '82', '84', '135', '123', '117', '124',
       '115', '120', '151', '159', '218', '214', '217', '215', '185', '212',
       '213', '79', '81', '75'],
      dtype='object')

In [29]:
# Make the sample labels unique: wherever a shortened ID occurs more than once,
# keep the first row carrying it and discard the later ones
is_repeat = df.index.duplicated(keep='first')
df = df.loc[~is_repeat]
df


Unnamed: 0,g__Abiotrophia,g__Acinetobacter,g__Actinomyces,g__Aerococcus,g__Alloiococcus,g__Alloprevotella,g__Anaerococcus,g__Anaeroglobus,g__Bacillus_A,g__Bifidobacterium,...,g__Staphylococcus,g__Stomatobaculum,g__Streptobacillus,g__Streptococcus,g__Tepidiphilus,g__Thermaerobacter,g__Thiobacillus,g__Varibaculum,g__Veillonella,g__Xanthomonas
209,0.0,4.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,25.0,0.0,0.0,0.0,0.0,3.0,0.0
136,16.0,1.0,83.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,57.0,1.0,1.0,1.0,0.0,1.0,0.0
167,0.0,1.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,...,1.0,2.0,0.0,51.0,0.0,1.0,0.0,0.0,0.0,0.0
160,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
192,1.0,0.0,5.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,30.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
212,1.0,162.0,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,9.0,0.0,0.0,0.0,0.0,2.0,2.0
213,30.0,68.0,57.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,48.0,0.0,0.0,0.0,0.0,37.0,3.0
79,0.0,2.0,6.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,...,20.0,0.0,0.0,20.0,0.0,2.0,0.0,0.0,0.0,3.0
81,1.0,2.0,0.0,0.0,0.0,0.0,6.0,0.0,0.0,0.0,...,19.0,0.0,0.0,141.0,0.0,0.0,0.0,0.0,5.0,0.0


In [30]:
# Re-inspect the sample IDs after de-duplication: '186' is now listed only once
df.index

Index(['209', '136', '167', '160', '192', '189', '78', '195', '162', '163',
       '193', '191', '72', '190', '188', '211', '205', '204', '207', '203',
       '202', '206', '73', '114', '200', '201', '199', '91', '198', '196',
       '197', '157', '176', '174', '170', '172', '173', '169', '187', '216',
       '210', '186', '93', '161', '177', '182', '101', '102', '100', '112',
       '184', '87', '110', '181', '168', '166', '165', '194', '76', '208',
       '96', '85', '88', '90', '82', '84', '135', '123', '117', '124', '115',
       '120', '151', '159', '218', '214', '217', '215', '185', '212', '213',
       '79', '81', '75'],
      dtype='object')

In [31]:
# Load the sample metadata. sample_ID is cast to str so that it can be compared
# against string sample labels.
metadata_path = '../metadata/metadata.csv'
md = pd.read_csv(metadata_path).astype({'sample_ID': str})
md.head()

Unnamed: 0,patient_number,sample_ID,subject_ID,microbiome_type,body_site,AD_status,les_or_nonles,age_months,age_years,sex,skin_group
0,1,72,1,human_skin,forehead,Normal,NL,5,0,M,Normal
1,1,73,1,human_skin,nose,Normal,NL,5,0,M,Normal
2,1,75,1,human_skin,knee_pit,Normal,NL,5,0,M,Normal
3,2,76,2,human_skin,elbow_pit,Normal,NL,2,0,F,Normal
4,2,78,2,human_skin,forehead,Normal,NL,2,0,F,Normal


In [32]:
# Keep only the human_oral samples (every other microbiome_type is removed).
# NOTE(review): the `skin_*` variable names look like leftovers from a skin
# analysis -- in this cell they hold the ORAL samples. Names are kept unchanged
# in case other cells refer to them.
skin_md = md.loc[md['microbiome_type'].eq('human_oral')]
skin_sample_ids = skin_md['sample_ID'].values
# Retain the count rows whose sample label is one of the selected sample IDs
filtered_df = df[df.index.isin(skin_sample_ids)]
filtered_df.head()

Unnamed: 0,g__Abiotrophia,g__Acinetobacter,g__Actinomyces,g__Aerococcus,g__Alloiococcus,g__Alloprevotella,g__Anaerococcus,g__Anaeroglobus,g__Bacillus_A,g__Bifidobacterium,...,g__Staphylococcus,g__Stomatobaculum,g__Streptobacillus,g__Streptococcus,g__Tepidiphilus,g__Thermaerobacter,g__Thiobacillus,g__Varibaculum,g__Veillonella,g__Xanthomonas
195,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,46.0,0.0,0.0,0.0,0.0,1.0,0.0
190,11.0,0.0,211.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,61.0,0.0,0.0,0.0,0.0,75.0,0.0
197,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,46.0,0.0,0.0,0.0,0.0,17.0,0.0
210,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,65.0,0.0,0.0,0.0,0.0,4.0,0.0
186,1.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,6.0,2.0,97.0,0.0,0.0,0.0,0.0,9.0,0.0


In [33]:
# Collapse the table to the 10 most abundant genera plus one 'Other' row.
# Genera are ranked by their total count summed over all retained samples.
top_genera = filtered_df.sum().sort_values(ascending=False).head(10).index.tolist()

# Take an explicit copy: assigning a new column onto a bare slice of filtered_df
# is what triggered pandas' SettingWithCopyWarning.
df_top_genera = filtered_df[top_genera].copy()

# Pool every genus outside the top 10 into a single per-sample total.
# The label text (leading space, 's__' prefix) is kept verbatim because it is
# used as-is wherever this table's labels are displayed.
df_top_genera[' s__Other'] = filtered_df.drop(columns=top_genera).sum(axis=1)

# Transpose so that genera are rows and samples are columns
df_top_genera = df_top_genera.transpose()
df_top_genera.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_top_genera[' s__Other'] = filtered_df.drop(columns=top_genera).sum(axis=1)


Unnamed: 0,195,190,197,210,186,217,213
g__Prevotella,2.0,16.0,0.0,267.0,240.0,21.0,30.0
g__Streptococcus,46.0,61.0,46.0,65.0,97.0,39.0,48.0
g__Lautropia,0.0,47.0,0.0,0.0,0.0,82.0,169.0
g__Actinomyces,0.0,211.0,0.0,0.0,1.0,15.0,57.0
g__Granulicatella,95.0,33.0,2.0,9.0,52.0,56.0,9.0


In [34]:
# Sample ordering: sorting the samples by Cutibacterium relative abundance is
# currently DISABLED; the original code is kept below for reference.
# NOTE(review): the genus labels shown in the table output above have no leading
# space, so ' g__Cutibacterium' would presumably need adjusting before this is
# re-enabled -- confirm against the actual index.
# cutibacterium_values = df_top_genera.loc[' g__Cutibacterium']
# sorted_columns = cutibacterium_values.sort_values(ascending=False).index
# df_top_genera_Cuti_sorted = df_top_genera[sorted_columns]
# df_top_genera_Cuti_sorted.head()
# Pass the table through unsorted. This is a plain alias (not a copy) bound to
# the `_Cuti_sorted` name.
df_top_genera_Cuti_sorted = df_top_genera

In [35]:
# Convert counts to relative abundances: divide every sample (column) by its
# own column total, so each column sums to 1. All rows of the table are
# included in that total and all are kept in the result.
total_counts_per_sample = df_top_genera_Cuti_sorted.sum(axis=0)
relative_abundance_df = df_top_genera_Cuti_sorted / total_counts_per_sample
relative_abundance_df.head()

Unnamed: 0,195,190,197,210,186,217,213
g__Prevotella,0.013245,0.02735,0.0,0.710106,0.467836,0.059829,0.04886
g__Streptococcus,0.304636,0.104274,0.666667,0.172872,0.189084,0.111111,0.078176
g__Lautropia,0.0,0.080342,0.0,0.0,0.0,0.233618,0.275244
g__Actinomyces,0.0,0.360684,0.0,0.0,0.001949,0.042735,0.092834
g__Granulicatella,0.629139,0.05641,0.028986,0.023936,0.101365,0.159544,0.014658


In [36]:
# Relabel each sample column as "#<sample_ID>_<skin_group>" using the metadata.
# Columns whose name is not a sample_ID in `md` are left unchanged, so running
# this cell again on already-relabelled columns is a no-op.
#
# Other label schemes that were tried (swap the f-string in the rename below):
#   f"{age_months} mo"   |   f"{AD_status}"   |   f"{subject_ID}_{body_site}"

# Build the sample_ID -> skin_group lookup once instead of scanning the whole
# metadata table for every column. If a sample_ID is repeated in the metadata,
# its first row wins.
sample_to_group = (
    md.drop_duplicates(subset='sample_ID', keep='first')
      .set_index('sample_ID')['skin_group']
      .to_dict()
)

# Single rename that returns a new frame (no repeated inplace mutation in a loop)
relative_abundance_df = relative_abundance_df.rename(
    columns=lambda col: f"#{col}_{sample_to_group[col]}" if col in sample_to_group else col
)

In [37]:
# Pick the columns whose label mentions a face body site.
# NOTE(review): this is a substring match on the column label, so it only finds
# columns when the labels contain the body-site name. With labels of the form
# '#<sample_ID>_<skin_group>' nothing appears to match -- confirm this is intended.
face_sites = ('forehead', 'cheek', 'nose')
face_columns = [label for label in relative_abundance_df.columns
                if any(site in label for site in face_sites)]

In [38]:
# Relative-abundance table restricted to the face-site columns selected above
face_df = relative_abundance_df.loc[:, face_columns]
face_df.head()

g__Prevotella
g__Streptococcus
g__Lautropia
g__Actinomyces
g__Granulicatella


In [39]:
# Pick the columns whose label mentions a "pit" body site and subset the table.
# NOTE(review): this is a substring match on the column label, so it only finds
# columns when the labels contain the body-site name. With labels of the form
# '#<sample_ID>_<skin_group>' nothing appears to match -- confirm this is intended.
pit_sites = ('elbow_pit', 'knee_pit')
pits_columns = [label for label in relative_abundance_df.columns
                if any(site in label for site in pit_sites)]
pits_df = relative_abundance_df.loc[:, pits_columns]
pits_df.head()

g__Prevotella
g__Streptococcus
g__Lautropia
g__Actinomyces
g__Granulicatella


In [40]:
# Draw the relative-abundance table as a heatmap of rounded tiles
# (one row per genus, one column per sample) and save it as a PNG.

# Reverse the row order: tiles are drawn upwards from y=0, so reversing puts the
# first row of relative_abundance_df at the top of the figure.
relative_abundance_df_rev = relative_abundance_df.iloc[::-1]
data = relative_abundance_df_rev.to_numpy()

num_rows, num_cols = data.shape

fig, ax = plt.subplots(figsize=(10, 12))

# Map the observed value range onto the colormap.
# "mako_r" is a seaborn colormap, so it relies on seaborn having been imported.
norm = plt.Normalize(data.min(), data.max())
cmap = plt.get_cmap("mako_r")

# One rounded square per table cell; the negative pad leaves a thin gap between tiles
for (i, j), value in np.ndenumerate(data):
    rect = patches.FancyBboxPatch((j, i), 1, 1,
                                  boxstyle="round,pad=-0.02,rounding_size=0.2",
                                  edgecolor="white", facecolor=cmap(norm(value)))
    ax.add_patch(rect)

# Axis limits, and tick labels centred on each tile
ax.set_xlim(0, num_cols)
ax.set_ylim(0, num_rows)
ax.set_xticks(np.arange(num_cols) + 0.5)
ax.set_yticks(np.arange(num_rows) + 0.5)
ax.set_xticklabels(relative_abundance_df_rev.columns, rotation=45, ha="right", fontsize=12)
ax.set_yticklabels(relative_abundance_df_rev.index, fontsize=12)
ax.set_title('Top 10 Genera in the Oral Microbiome', fontsize=20)

# Remove the border around the heatmap
for spine in ax.spines.values():
    spine.set_visible(False)

# Colorbar without an outline; the mappable carries no data of its own and is
# used only to expose cmap/norm to the colorbar
sm = plt.cm.ScalarMappable(cmap=cmap, norm=norm)
sm.set_array([])
cbar = plt.colorbar(sm, ax=ax, pad=0.02, aspect=20)  # pad/aspect tune the bar's spacing and thickness
cbar.outline.set_edgecolor('none')
cbar.ax.tick_params(labelsize=12)
cbar.set_label('Relative abundance', fontsize=12)

plt.tight_layout()  # Adjust layout to not cut off labels

# Save the plot. The database tag in the file name is derived from biom_path so
# the file name always matches the table that was actually loaded (previously
# 'rs210' was hard-coded here regardless of which table was active).
database_label = 'rs210' if 'rs210' in biom_path else 'woltka'
plt.savefig(f'../plots/genera-heatmap_plots/top_genera_heatmap_{database_label}_oral.png')


In [41]:
# # Assuming relative_abundance_df is your DataFrame
# # face_df = face_df.iloc[::-1]
# data = face_df.to_numpy()  # Convert your DataFrame to numpy array if it's not already

# num_rows, num_cols = data.shape

# fig, ax = plt.subplots(figsize=(18, 6))

# # Normalizing data for color mapping
# norm = plt.Normalize(data.min(), data.max())
# cmap = plt.get_cmap("magma_r")

# for i in range(num_rows):
#     for j in range(num_cols):
#         # Create a rectangle with rounded corners
#         rect = patches.FancyBboxPatch((j, i), 1, 1, boxstyle="round,pad=-0.02,rounding_size=0.2", 
#                                       edgecolor="white", facecolor=cmap(norm(data[i, j])))
#         ax.add_patch(rect)

# # Set the limits, labels, and other aesthetics
# ax.set_xlim(0, num_cols)
# ax.set_ylim(0, num_rows)
# ax.set_xticks(np.arange(num_cols) + 0.5)
# ax.set_yticks(np.arange(num_rows) + 0.5)
# ax.set_xticklabels(face_df.columns, rotation=45, ha="right")
# ax.set_yticklabels(face_df.index)
# ax.set_title('Heatmap of Top 20 Genera Relative Abundance', fontsize=20)
# ax.set_ylabel('Genera', fontsize=16)
# ax.set_xlabel('Sample Body Location', fontsize=16)

# # Remove the border around the heatmap
# for spine in ax.spines.values():
#     spine.set_visible(False)
    
# # Create a colorbar
# sm = plt.cm.ScalarMappable(cmap=cmap, norm=norm)
# sm.set_array([])  # You can alternatively specify the bounds of your data
# cbar = plt.colorbar(sm, ax=ax, pad=0.02, aspect=10)  # Adjust pad and aspect to fit your layout
# cbar.outline.set_edgecolor('none') # This removes the border around the colorbar

# plt.tight_layout()  # Adjust layout to not cut off labels

# # Saving the plot
# plt.savefig('../plots/genera-heatmap_plots/top_genera_heatmap_face_df.png')


In [42]:
# # Create figure
# fig = plt.figure(figsize=(18, 6))

# # Create GridSpec with 2 columns of different widths based on the number of samples
# gs = gridspec.GridSpec(1, 2, width_ratios=[len(face_df.columns), len(pits_df.columns)])

# # Create subplots using the GridSpec
# ax1 = plt.subplot(gs[0])
# ax2 = plt.subplot(gs[1])

# # Plot heatmap for face samples
# sns.heatmap(face_df, ax=ax1, cmap='magma_r')
# ax1.set_title('Face site samples')

# # Plot heatmap for pits samples
# sns.heatmap(pits_df, ax=ax2, cmap='magma_r')
# ax2.set_title('Pit site samples')

# # Display the plot with tight layout
# plt.tight_layout()
# plt.savefig('../plots/top_genera_heatmap.png')
