In [169]:
import pandas as pd
import numpy as np
import biom
from biom import load_table
from biom.util import biom_open
import qiime2 as q2
import csv
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import seaborn as sns
from scipy.stats import pearsonr
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [170]:
# Read in BIOM table
biom_path = '../tables/tables_woltka/per_genome/Genus_collapsed.biom'
biom_table = load_table(biom_path)
df = pd.DataFrame(biom_table.to_dataframe())
df = df.transpose() # transpose so features are columns
df.head()

Unnamed: 0,g__,g__Abiotrophia,g__Acinetobacter,g__Actinomyces,g__Aerococcus,g__Aggregatibacter,g__Alloiococcus,g__Alloprevotella,g__Anaerococcus,g__Anaeroglobus,...,g__Stomatobaculum,g__Streptobacillus,g__Streptococcus,g__Tepidiphilus,g__Thermaerobacter,g__Thiobacillus,g__Varibaculum,g__Veillonella,g__Winkia,g__Xanthomonas
15443.218.S87.L005,118.0,8664.0,8338.0,71717.0,670.0,1244.0,4.0,767.0,169.0,41.0,...,17.0,2.0,167198.0,52.0,110.0,136.0,79.0,2614.0,305.0,4310.0
15443.216.S85.L005,67.0,13134.0,3289.0,60989.0,162.0,1349.0,8.0,482.0,67.0,38.0,...,22.0,3.0,286081.0,53.0,43.0,105.0,30.0,4760.0,143.0,5562.0
15443.90.S13.L005,0.0,5.0,153.0,303.0,0.0,174.0,0.0,76.0,24.0,10.0,...,0.0,0.0,9148.0,2.0,0.0,0.0,6.0,496.0,15.0,17.0
15443.187.S56.L005,208.0,5375.0,7445.0,29939.0,94.0,2199.0,217.0,26623.0,149.0,59.0,...,136.0,32.0,196509.0,222.0,121.0,152.0,54.0,5782.0,25.0,807.0
15443.88.S12.L005,22.0,14.0,1878.0,131.0,10.0,7.0,3.0,15.0,396.0,2.0,...,0.0,0.0,9727.0,54.0,85.0,130.0,127.0,600.0,86.0,191.0


In [171]:
# Check any samples with no count data and filter them out
rows_with_sum_zero = df.sum(axis=1) == 0
df_zero_sum = df[rows_with_sum_zero]
print(f"Rows where the sum of values is 0: {df_zero_sum.index}")

# Drop that sample from the dataframe 
df = df.drop(df_zero_sum.index)

Rows where the sum of values is 0: Index([], dtype='object')


In [172]:
# # Edit sample names, convert index to string (if not already)
df.index = df.index.astype(str)
# Transform the index by keeping only the part between the first and third periods
df.index = [x.split('.')[1] for x in df.index]
df.head()

Unnamed: 0,g__,g__Abiotrophia,g__Acinetobacter,g__Actinomyces,g__Aerococcus,g__Aggregatibacter,g__Alloiococcus,g__Alloprevotella,g__Anaerococcus,g__Anaeroglobus,...,g__Stomatobaculum,g__Streptobacillus,g__Streptococcus,g__Tepidiphilus,g__Thermaerobacter,g__Thiobacillus,g__Varibaculum,g__Veillonella,g__Winkia,g__Xanthomonas
218,118.0,8664.0,8338.0,71717.0,670.0,1244.0,4.0,767.0,169.0,41.0,...,17.0,2.0,167198.0,52.0,110.0,136.0,79.0,2614.0,305.0,4310.0
216,67.0,13134.0,3289.0,60989.0,162.0,1349.0,8.0,482.0,67.0,38.0,...,22.0,3.0,286081.0,53.0,43.0,105.0,30.0,4760.0,143.0,5562.0
90,0.0,5.0,153.0,303.0,0.0,174.0,0.0,76.0,24.0,10.0,...,0.0,0.0,9148.0,2.0,0.0,0.0,6.0,496.0,15.0,17.0
187,208.0,5375.0,7445.0,29939.0,94.0,2199.0,217.0,26623.0,149.0,59.0,...,136.0,32.0,196509.0,222.0,121.0,152.0,54.0,5782.0,25.0,807.0
88,22.0,14.0,1878.0,131.0,10.0,7.0,3.0,15.0,396.0,2.0,...,0.0,0.0,9727.0,54.0,85.0,130.0,127.0,600.0,86.0,191.0


In [173]:
# Read in metadata
metadata_path = '../metadata/metadata.csv'
md = pd.read_csv(metadata_path)
md['sample_ID'] = md['sample_ID'].astype(str)
md.head()

Unnamed: 0,patient_number,sample_ID,subject_ID,microbiome_type,body_site,AD_status,les_or_nonles,age_months,age_years,sex,skin_group
0,1,72,1,human_skin,forehead,healthy,NL,5,0,M,healthy
1,1,73,1,human_skin,nose,healthy,NL,5,0,M,healthy
2,1,75,1,human_skin,knee_pit,healthy,NL,5,0,M,healthy
3,2,76,2,human_skin,elbow_pit,healthy,NL,2,0,F,healthy
4,2,78,2,human_skin,forehead,healthy,NL,2,0,F,healthy


In [174]:
# Kepp only human_skin samples (remove human_oral samples)
skin_md = md[md['microbiome_type'] == 'human_skin']
skin_sample_ids = skin_md['sample_ID'].values
filtered_df = df.loc[df.index.isin(skin_sample_ids)]
filtered_df.head()

Unnamed: 0,g__,g__Abiotrophia,g__Acinetobacter,g__Actinomyces,g__Aerococcus,g__Aggregatibacter,g__Alloiococcus,g__Alloprevotella,g__Anaerococcus,g__Anaeroglobus,...,g__Stomatobaculum,g__Streptobacillus,g__Streptococcus,g__Tepidiphilus,g__Thermaerobacter,g__Thiobacillus,g__Varibaculum,g__Veillonella,g__Winkia,g__Xanthomonas
218,118.0,8664.0,8338.0,71717.0,670.0,1244.0,4.0,767.0,169.0,41.0,...,17.0,2.0,167198.0,52.0,110.0,136.0,79.0,2614.0,305.0,4310.0
216,67.0,13134.0,3289.0,60989.0,162.0,1349.0,8.0,482.0,67.0,38.0,...,22.0,3.0,286081.0,53.0,43.0,105.0,30.0,4760.0,143.0,5562.0
90,0.0,5.0,153.0,303.0,0.0,174.0,0.0,76.0,24.0,10.0,...,0.0,0.0,9148.0,2.0,0.0,0.0,6.0,496.0,15.0,17.0
187,208.0,5375.0,7445.0,29939.0,94.0,2199.0,217.0,26623.0,149.0,59.0,...,136.0,32.0,196509.0,222.0,121.0,152.0,54.0,5782.0,25.0,807.0
88,22.0,14.0,1878.0,131.0,10.0,7.0,3.0,15.0,396.0,2.0,...,0.0,0.0,9727.0,54.0,85.0,130.0,127.0,600.0,86.0,191.0


In [175]:
# Collapsing to top 20 genera and adding an 'Other' category for the rest
top_genera = filtered_df.sum().sort_values(ascending=False).head(20).index.tolist()
df_top_genera = filtered_df[top_genera]
df_top_genera[' s__Other'] = filtered_df.drop(columns=top_genera).sum(axis=1)
df_top_genera = df_top_genera.transpose()
df_top_genera.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_top_genera[' s__Other'] = filtered_df.drop(columns=top_genera).sum(axis=1)


Unnamed: 0,218,216,90,187,88,85,188,208,205,204,...,189,91,96,87,76,78,79,73,81,75
g__Streptococcus,167198.0,286081.0,9148.0,196509.0,9727.0,51732.0,323431.0,20547.0,5965.0,6546.0,...,161337.0,67181.0,343811.0,4415158.0,244.0,48776.0,48044.0,50563.0,4937824.0,20149610.0
g__Cutibacterium,322266.0,141741.0,146671.0,155266.0,42790.0,22978.0,38387.0,685634.0,2518.0,548.0,...,905363.0,188465.0,502177.0,29067.0,1979.0,122151.0,115560.0,2203743.0,19261.0,348826.0
g__Staphylococcus,57446.0,12662.0,2699.0,7827.0,58645.0,3038.0,6751.0,51104.0,116.0,28.0,...,155047.0,33215.0,8585.0,589123.0,2117.0,8590.0,12584.0,13459.0,577970.0,1699151.0
g__Corynebacterium,69265.0,54834.0,1095.0,63782.0,4667.0,1821.0,46522.0,73877.0,74.0,21.0,...,100157.0,497.0,13148.0,727.0,350757.0,23829.0,16638.0,43322.0,2072.0,27412.0
g__Rothia,38157.0,53215.0,532.0,112141.0,2071.0,177.0,80316.0,7308.0,67911.0,70164.0,...,72659.0,5629.0,26118.0,40567.0,271.0,3050.0,5560.0,715.0,40342.0,98686.0


In [176]:
# Sort samples based on Cutibacterium relative abundances
cutibacterium_values = df_top_genera.loc[' g__Cutibacterium']
sorted_columns = cutibacterium_values.sort_values(ascending=False).index
df_top_genera_Cuti_sorted = df_top_genera[sorted_columns]
df_top_genera_Cuti_sorted.head()

Unnamed: 0,114,120,110,211,209,124,73,115,166,159,...,76,101,135,167,176,177,204,179,174,178
g__Streptococcus,21878.0,342944.0,5528.0,88086.0,923623.0,60033.0,50563.0,517399.0,51268.0,205716.0,...,244.0,8116.0,516.0,4136.0,143.0,30863.0,6546.0,491.0,214.0,23.0
g__Cutibacterium,5245204.0,4580448.0,3345705.0,3187686.0,2663630.0,2426528.0,2203743.0,1282868.0,1179478.0,999421.0,...,1979.0,1790.0,1066.0,890.0,551.0,549.0,548.0,360.0,234.0,73.0
g__Staphylococcus,8801.0,6873.0,31372.0,30294.0,35458.0,4319.0,13459.0,12919.0,11536.0,19794.0,...,2117.0,721.0,118.0,112.0,679.0,20054.0,28.0,1033.0,17869.0,47.0
g__Corynebacterium,105111.0,101167.0,35845.0,137877.0,52972.0,22258.0,43322.0,383389.0,216584.0,49423.0,...,350757.0,1099.0,209.0,298.0,81.0,723.0,21.0,40.0,197.0,26.0
g__Rothia,9014.0,82472.0,4283.0,7107.0,60518.0,6281.0,715.0,255111.0,30534.0,1498.0,...,271.0,7805.0,102.0,5276.0,93.0,417.0,70164.0,109.0,25.0,21.0


In [177]:
# Convert to relative abundance table
total_counts_per_sample = df_top_genera_Cuti_sorted.sum(axis=0)
relative_abundance_df = df_top_genera_Cuti_sorted.div(total_counts_per_sample, axis=1)
# Drop 'Other' 
relative_abundance_df.head()

Unnamed: 0,114,120,110,211,209,124,73,115,166,159,...,76,101,135,167,176,177,204,179,174,178
g__Streptococcus,0.003989,0.062135,0.001528,0.024885,0.196058,0.022412,0.021574,0.119454,0.031628,0.061269,...,0.000673,0.327166,0.127849,0.23516,0.031658,0.504157,0.061895,0.189869,0.011269,0.072555
g__Cutibacterium,0.956411,0.829886,0.924931,0.90055,0.565409,0.905911,0.94028,0.296181,0.727629,0.297661,...,0.005455,0.072157,0.264123,0.050603,0.121984,0.008968,0.005182,0.139211,0.012322,0.230284
g__Staphylococcus,0.001605,0.001245,0.008673,0.008558,0.007527,0.001612,0.005743,0.002983,0.007117,0.005895,...,0.005835,0.029064,0.029237,0.006368,0.150321,0.327589,0.000265,0.399459,0.940969,0.148265
g__Corynebacterium,0.019166,0.018329,0.009909,0.038951,0.011244,0.00831,0.018484,0.088515,0.133612,0.01472,...,0.966773,0.044302,0.051784,0.016943,0.017932,0.01181,0.000199,0.015468,0.010374,0.082019
g__Rothia,0.001644,0.014942,0.001184,0.002008,0.012846,0.002345,0.000305,0.058899,0.018837,0.000446,...,0.000747,0.314629,0.025273,0.299977,0.020589,0.006812,0.663427,0.04215,0.001316,0.066246


In [178]:
relative_abundance_df = relative_abundance_df.transpose()

In [179]:
# df = pd.read_csv('../csv_files/genera_relative_abundance.csv', index_col=0)
# df = df.transpose()
# df.head()

In [180]:
# # # Edit sample names, convert index to string (if not already)
# df.index = df.index.astype(str)
# # Transform the index by keeping only the part between the first and third periods
# df.index = [x.split('_')[0] for x in df.index]

In [181]:
# # Read in metadata
# metadata_path = '../metadata/metadata.csv'
# md = pd.read_csv(metadata_path)
# md['sample_ID'] = md['sample_ID'].astype(str)
# md.head()

In [182]:
sample_order = list(set(md['sample_ID']))
sample_order

['200',
 '209',
 '117',
 '185',
 '168',
 '218',
 '166',
 '162',
 '194',
 '81',
 '214',
 '159',
 '193',
 '115',
 '102',
 '198',
 '167',
 '177',
 '212',
 '170',
 '195',
 '82',
 '88',
 '90',
 '114',
 '87',
 '215',
 '151',
 '189',
 '76',
 '184',
 '163',
 '181',
 '176',
 '191',
 '216',
 '199',
 '192',
 '124',
 '205',
 '120',
 '211',
 '93',
 '101',
 '75',
 '206',
 '96',
 '213',
 '182',
 '100',
 '202',
 '173',
 '135',
 '179',
 '187',
 '210',
 '78',
 '84',
 '178',
 '174',
 '207',
 '196',
 '188',
 '72',
 '79',
 '217',
 '190',
 '110',
 '165',
 '172',
 '73',
 '169',
 '136',
 '203',
 '161',
 '85',
 '91',
 '157',
 '208',
 '186',
 '201',
 '112',
 '160',
 '197',
 '204',
 '123']

In [183]:
relative_abundance_df

Unnamed: 0,g__Streptococcus,g__Cutibacterium,g__Staphylococcus,g__Corynebacterium,g__Rothia,g__Neisseria,g__Acinetobacter,g__Haemophilus_D,g__Gemella,g__Prevotella,...,g__Klebsiella,g__Pauljensenia,g__Veillonella,g__Mycobacterium,g__Porphyromonas,g__F0422,g__Granulicatella,g__Haemophilus,g__Alloprevotella,s__Other
114,0.003989,0.956411,0.001605,0.019166,0.001644,0.002734,0.000071,0.000213,0.000051,0.000667,...,0.000080,0.001138,0.000177,0.002233,0.002586,0.000049,0.000140,0.000073,0.000410,0.004169
120,0.062135,0.829886,0.001245,0.018329,0.014942,0.002637,0.001149,0.001305,0.001120,0.018074,...,0.000214,0.008393,0.005390,0.001104,0.004322,0.000246,0.001588,0.000089,0.000592,0.025149
110,0.001528,0.924931,0.008673,0.009909,0.001184,0.000664,0.001706,0.000141,0.000019,0.000258,...,0.000206,0.000872,0.000116,0.024434,0.000536,0.000014,0.000048,0.000011,0.000092,0.021529
211,0.024885,0.900550,0.008558,0.038951,0.002008,0.000158,0.003205,0.000077,0.000490,0.006754,...,0.000092,0.002349,0.001596,0.000429,0.000086,0.000026,0.000229,0.000015,0.000254,0.006395
209,0.196058,0.565409,0.007527,0.011244,0.012846,0.000329,0.026063,0.000044,0.002706,0.068818,...,0.000609,0.030524,0.015796,0.001949,0.000059,0.000073,0.001991,0.000072,0.000187,0.030469
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
177,0.504157,0.008968,0.327589,0.011810,0.006812,0.000408,0.000882,0.000000,0.002744,0.000016,...,0.000000,0.000065,0.000131,0.000016,0.000000,0.000000,0.002908,0.000000,0.000000,0.133345
204,0.061895,0.005182,0.000265,0.000199,0.663427,0.005531,0.000132,0.128376,0.000057,0.000738,...,0.000000,0.031704,0.000038,0.000038,0.077061,0.000000,0.005408,0.004056,0.000000,0.014382
179,0.189869,0.139211,0.399459,0.015468,0.042150,0.012374,0.007734,0.000000,0.003094,0.000773,...,0.000000,0.000000,0.003867,0.000000,0.000000,0.000000,0.002320,0.001933,0.000000,0.180201
174,0.011269,0.012322,0.940969,0.010374,0.001316,0.003739,0.000105,0.001790,0.000000,0.002106,...,0.000000,0.003160,0.000527,0.000000,0.001896,0.000000,0.000000,0.000000,0.000000,0.010111


In [184]:
filtered_order = [index for index in sample_order if index in relative_abundance_df.index]
# Reorder the DataFrame using .loc with the filtered_order
df_reordered = relative_abundance_df.loc[filtered_order]

df_reordered

Unnamed: 0,g__Streptococcus,g__Cutibacterium,g__Staphylococcus,g__Corynebacterium,g__Rothia,g__Neisseria,g__Acinetobacter,g__Haemophilus_D,g__Gemella,g__Prevotella,...,g__Klebsiella,g__Pauljensenia,g__Veillonella,g__Mycobacterium,g__Porphyromonas,g__F0422,g__Granulicatella,g__Haemophilus,g__Alloprevotella,s__Other
200,0.020045,0.536218,0.030902,0.010341,0.046559,0.001841,0.008173,0.004233,0.000568,0.001600,...,0.000292,0.010134,0.001101,0.003854,0.005110,0.000069,0.000533,0.000327,0.000172,0.314694
209,0.196058,0.565409,0.007527,0.011244,0.012846,0.000329,0.026063,0.000044,0.002706,0.068818,...,0.000609,0.030524,0.015796,0.001949,0.000059,0.000073,0.001991,0.000072,0.000187,0.030469
117,0.167578,0.064989,0.002248,0.115255,0.087936,0.117025,0.002523,0.006831,0.003188,0.005696,...,0.000549,0.006455,0.004448,0.023475,0.049790,0.002184,0.003973,0.004061,0.024909,0.206221
185,0.438021,0.007363,0.009507,0.011227,0.037319,0.096897,0.002055,0.046102,0.016171,0.121949,...,0.000209,0.018438,0.019976,0.000049,0.031006,0.027430,0.027630,0.007413,0.028174,0.049213
168,0.142743,0.414946,0.010344,0.052167,0.149542,0.001438,0.001338,0.000290,0.001408,0.010307,...,0.000402,0.031138,0.001805,0.000295,0.001094,0.000257,0.000697,0.000102,0.000574,0.043550
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
201,0.033305,0.663472,0.008688,0.013700,0.099986,0.002726,0.004798,0.008560,0.000525,0.002073,...,0.000213,0.013586,0.000866,0.002981,0.009966,0.000000,0.001789,0.001377,0.000312,0.128123
112,0.001331,0.990218,0.001123,0.002371,0.000261,0.000643,0.000063,0.000044,0.000015,0.000158,...,0.000000,0.000116,0.000085,0.000552,0.000637,0.000050,0.000074,0.000009,0.000166,0.001650
160,0.000610,0.006856,0.985288,0.000410,0.000188,0.000020,0.000138,0.000132,0.000003,0.000056,...,0.002833,0.000082,0.000017,0.000000,0.000065,0.000000,0.000006,0.000003,0.000006,0.003283
204,0.061895,0.005182,0.000265,0.000199,0.663427,0.005531,0.000132,0.128376,0.000057,0.000738,...,0.000000,0.031704,0.000038,0.000038,0.077061,0.000000,0.005408,0.004056,0.000000,0.014382


In [185]:
g__Cutibacterium = df_reordered[' g__Cutibacterium'].to_numpy()
g__Streptococcus = df_reordered[' g__Streptococcus'].to_numpy()

In [186]:
# Calculate the Pearson correlation coefficient
correlation_matrix = np.corrcoef(g__Cutibacterium, g__Streptococcus)
correlation_coefficient = correlation_matrix[0, 1]

print(f'Pearson Correlation Coefficient: {correlation_coefficient}')

Pearson Correlation Coefficient: -0.42018628154174925


In [187]:
# Assuming the necessary data and variables (md, g__Cutibacterium, g__Streptococcus) are defined

# Define your color map
color_map = {'healthy': 'blue', 'AD_pos_L': 'red', 'AD_pos_NL': 'orange'}

plt.figure(figsize=(6, 6))

# Filter and plot only healthy points
# for skin_group, cutibacterium, streptococcus in zip(md['skin_group'], g__Cutibacterium, g__Streptococcus):
#     if skin_group == 'healthy':  # Check if the status is 'healthy'
#         color = color_map[skin_group]
#         plt.scatter(streptococcus, cutibacterium, color=color, edgecolors='none')

# Since we're only plotting 'healthy', we can simplify the legend
plt.legend(handles=[mpatches.Patch(color=color_map['healthy'], label='healthy')], bbox_to_anchor=(0.95, 0.9))  # Example starting point

# plt.title('Correlation between Streptoccocus and Cutibacterium in AD non-lesional samples', size = 10)
plt.ylabel('g__Streptococcus')
plt.xlabel('g__Cutibacterium')

# Calculate and plot the trend line only for healthy data
# First, filter data for healthy
ad_pos_cutibacterium = [cutibacterium for skin_group, cutibacterium in zip(md['skin_group'], g__Cutibacterium) if skin_group == 'healthy']
ad_pos_streptococcus = [streptococcus for skin_group, streptococcus in zip(md['skin_group'], g__Streptococcus) if skin_group == 'healthy']

# Calculate the trend line
z = np.polyfit(ad_pos_cutibacterium, ad_pos_streptococcus, 1)
p = np.poly1d(z)
# Sort the cutibacterium values for a smooth trend line
sorted_cutibacterium = np.sort(ad_pos_cutibacterium)
plt.plot(sorted_cutibacterium, p(sorted_cutibacterium), "r-")

# Calculate the Pearson correlation coefficient and p-value
correlation_coefficient, p_value = pearsonr(ad_pos_cutibacterium, ad_pos_streptococcus)

# Annotate the Pearson correlation coefficient and p-value
plt.text(0.92, 0.75, f'Pearson r: {correlation_coefficient:.2f}\np-value: {p_value:.2e}', transform=plt.gca().transAxes, ha='right', va='top', bbox=dict(boxstyle="round", alpha=0.5, color="w"))

# Use seaborn's regplot to plot the data with a regression line and confidence interval
sns.regplot(x=ad_pos_cutibacterium, y=ad_pos_streptococcus, color='blue', ci=95)

ax = plt.gca()
# ax.spines['top'].set_visible(False)
# ax.spines['right'].set_visible(False)

plt.savefig('../plots/correlation_plots/Cuti_vs_Strep_corr_healthy.png')


In [188]:
# Assuming the necessary data and variables (md, g__Cutibacterium, g__Streptococcus) are defined

# Define your color map
color_map = {'healthy': 'blue', 'AD_pos_L': 'red', 'AD_pos_NL': 'orange'}

plt.figure(figsize=(6, 6))

# Filter and plot only healthy points
# for skin_group, cutibacterium, streptococcus in zip(md['skin_group'], g__Cutibacterium, g__Streptococcus):
#     if skin_group == 'AD_pos_L':  # Check if the status is 'healthy'
#         color = color_map[skin_group]
#         plt.scatter(streptococcus, cutibacterium, color=color, edgecolors='none')

# Since we're only plotting 'healthy', we can simplify the legend
plt.legend(handles=[mpatches.Patch(color=color_map['AD_pos_L'], label='AD_pos_L')], bbox_to_anchor=(0.95, 0.9))  # Example starting point

# plt.title('Correlation between Streptoccocus and Cutibacterium in AD non-lesional samples', size = 10)
plt.ylabel('g__Streptococcus')
plt.xlabel('g__Cutibacterium')

# Calculate and plot the trend line only for healthy data
# First, filter data for healthy
ad_pos_cutibacterium = [cutibacterium for skin_group, cutibacterium in zip(md['skin_group'], g__Cutibacterium) if skin_group == 'AD_pos_L']
ad_pos_streptococcus = [streptococcus for skin_group, streptococcus in zip(md['skin_group'], g__Streptococcus) if skin_group == 'AD_pos_L']

# Calculate the trend line
z = np.polyfit(ad_pos_cutibacterium, ad_pos_streptococcus, 1)
p = np.poly1d(z)
# Sort the cutibacterium values for a smooth trend line
sorted_cutibacterium = np.sort(ad_pos_cutibacterium)
plt.plot(sorted_cutibacterium, p(sorted_cutibacterium), "r-")

# Calculate the Pearson correlation coefficient and p-value
correlation_coefficient, p_value = pearsonr(ad_pos_cutibacterium, ad_pos_streptococcus)

# Annotate the Pearson correlation coefficient and p-value
plt.text(0.92, 0.75, f'Pearson r: {correlation_coefficient:.2f}\np-value: {p_value:.2e}', transform=plt.gca().transAxes, ha='right', va='top', bbox=dict(boxstyle="round", alpha=0.5, color="w"))

# Use seaborn's regplot to plot the data with a regression line and confidence interval
sns.regplot(x=ad_pos_cutibacterium, y=ad_pos_streptococcus, color='red', ci=95)

ax = plt.gca()
# ax.spines['top'].set_visible(False)
# ax.spines['right'].set_visible(False)

plt.savefig('../plots/correlation_plots/Cuti_vs_Strep_corr_AD_pos_L.png')


In [189]:
# Assuming the necessary data and variables (md, g__Cutibacterium, g__Streptococcus) are defined

# Define your color map
color_map = {'healthy': 'blue', 'AD_pos_L': 'red', 'AD_pos_NL': 'orange'}

plt.figure(figsize=(6, 6))


# Since we're only plotting 'healthy', we can simplify the legend
plt.legend(handles=[mpatches.Patch(color=color_map['AD_pos_NL'], label='AD_pos_NL')], bbox_to_anchor=(0.95, 0.9))  # Example starting point

# plt.title('Correlation between Streptoccocus and Cutibacterium in AD non-lesional samples', size = 10)
plt.ylabel('g__Streptococcus')
plt.xlabel('g__Cutibacterium')

# Calculate and plot the trend line only for healthy data
# First, filter data for healthy
ad_pos_cutibacterium = [cutibacterium for skin_group, cutibacterium in zip(md['skin_group'], g__Cutibacterium) if skin_group == 'AD_pos_NL']
ad_pos_streptococcus = [streptococcus for skin_group, streptococcus in zip(md['skin_group'], g__Streptococcus) if skin_group == 'AD_pos_NL']

# Calculate the trend line
z = np.polyfit(ad_pos_cutibacterium, ad_pos_streptococcus, 1)
p = np.poly1d(z)
# Sort the cutibacterium values for a smooth trend line
sorted_cutibacterium = np.sort(ad_pos_cutibacterium)
plt.plot(sorted_cutibacterium, p(sorted_cutibacterium), "r-")

# Calculate the Pearson correlation coefficient and p-value
correlation_coefficient, p_value = pearsonr(ad_pos_cutibacterium, ad_pos_streptococcus)

# Annotate the Pearson correlation coefficient and p-value
plt.text(0.92, 0.75, f'Pearson r: {correlation_coefficient:.2f}\np-value: {p_value:.2e}', transform=plt.gca().transAxes, ha='right', va='top', bbox=dict(boxstyle="round", alpha=0.5, color="w"))

# Use seaborn's regplot to plot the data with a regression line and confidence interval
sns.regplot(x=ad_pos_cutibacterium, y=ad_pos_streptococcus, color='orange', ci=95)

ax = plt.gca()
# ax.spines['top'].set_visible(False)
# ax.spines['right'].set_visible(False)

plt.savefig('../plots/correlation_plots/Cuti_vs_Strep_corr_AD_pos_NL.png')
