# Cape Town vs Umtata Severity Distribution

In [7]:
# Import Python packages
import pandas as pd
import numpy as np
import biom
import matplotlib.pyplot as plt
import seaborn as sns
from itertools import cycle
import os
from matplotlib.colors import ListedColormap
from matplotlib.colors import to_rgba
from scipy.stats import mannwhitneyu
from statsmodels.stats.multitest import multipletests
from scipy.stats import ttest_ind


In [8]:
# Read the tab-separated sample metadata table
metadata_path = '../Data/Metadata/updated_clean_ant_skin_metadata_microbiome_type.tab'
metadata = pd.read_csv(metadata_path, sep='\t')

# Remove underscores from the sample IDs, then use the IDs as the row index
metadata['#sample-id'] = metadata['#sample-id'].str.replace('_', '')
metadata = metadata.set_index('#sample-id')

# Short group labels, keyed by the original case_type value
case_type_to_group = {
    'case-lesional skin': 'skin-ADL',
    'case-nonlesional skin': 'skin-ADNL',
    'control-nonlesional skin': 'skin-H',
    'case-anterior nares': 'nares-AD',
    'control-anterior nares': 'nares-H',
}
metadata['group'] = metadata['case_type'].map(case_type_to_group)

# Derive AD status from the part of the group label after the last hyphen:
# a suffix starting with 'AD' gives 'AD', any other suffix gives 'H'
metadata['ad_status'] = [
    'AD' if label.split('-')[-1].startswith('AD') else 'H'
    for label in metadata['group']
]

metadata

Unnamed: 0_level_0,PlateNumber,PlateLocation,i5,i5Sequence,i7,i7Sequence,identifier,Sequence,Plate ID,Well location,...,enrolment_season,hiv_exposure,hiv_status,household_size,o_scorad,FWD_filepath,REV_filepath,group,microbiome_type,ad_status
#sample-id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Ca009STL,1,A1,SA501,ATCGTACG,SA701,CGAGAGTT,SA701SA501,CGAGAGTT-ATCGTACG,1.010000e+21,A1,...,Autumn,Unexposed,negative,4.0,40,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,skin-ADL,skin,AD
900221,1,B1,SA502,ACTATCTG,SA701,CGAGAGTT,SA701SA502,CGAGAGTT-ACTATCTG,1.010000e+21,B1,...,Winter,Unexposed,negative,7.0,34,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,skin-ADL,skin,AD
Ca010EBL,1,C1,SA503,TAGCGAGT,SA701,CGAGAGTT,SA701SA503,CGAGAGTT-TAGCGAGT,1.010000e+21,C1,...,Spring,Unexposed,negative,7.0,21,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,skin-ADL,skin,AD
900460,1,D1,SA504,CTGCGTGT,SA701,CGAGAGTT,SA701SA504,CGAGAGTT-CTGCGTGT,1.010000e+21,D1,...,Spring,Unexposed,,4.0,40,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,skin-ADL,skin,AD
900051,1,E1,SA505,TCATCGAG,SA701,CGAGAGTT,SA701SA505,CGAGAGTT-TCATCGAG,1.010000e+21,E1,...,Autumn,Unexposed,negative,7.0,41,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,skin-ADL,skin,AD
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Ca006ONL2,6,H1,SA508,GACACCGT,SB701,CTCGACTT,SB701SA508,CTCGACTT-GACACCGT,1.010000e+21,H1,...,Autumn,Unexposed,negative,3.0,34,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,skin-ADL,skin,AD
Ca006ONNL,6,F2,SA506,CGTGAGTG,SB702,CGAAGTAT,SB702SA506,CGAAGTAT-CGTGAGTG,1.010000e+21,F2,...,Autumn,Unexposed,negative,3.0,34,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,skin-ADNL,skin,AD
Ca006ONNL2,6,H2,SA508,GACACCGT,SB702,CGAAGTAT,SB702SA508,CGAAGTAT-GACACCGT,1.010000e+21,H2,...,Autumn,Unexposed,negative,3.0,34,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,skin-ADNL,skin,AD
Ca006ONPN,6,F3,SA506,CGTGAGTG,SB703,TAGCAGCT,SB703SA506,TAGCAGCT-CGTGAGTG,1.010000e+21,F3,...,Autumn,Unexposed,negative,3.0,34,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,nares-AD,nares,AD


In [9]:
# Coerce o_scorad to numbers (unparseable entries become missing) and store it
# as nullable integers so the missing entries are kept as missing
metadata['o_scorad'] = pd.to_numeric(metadata['o_scorad'], errors='coerce').astype('Int64')

# Restrict to the rows whose area is Umtata
umtata_metadata = metadata.loc[metadata['area'] == 'Umtata']

# Masks for the two severity bands (both ends of the moderate band inclusive)
is_moderate = umtata_metadata['o_scorad'].between(15, 40)
is_severe = umtata_metadata['o_scorad'] > 40

# Count distinct participants (pid) per band, not sample rows
moderate_scorad = umtata_metadata.loc[is_moderate, 'pid'].nunique()
severe_scorad = umtata_metadata.loc[is_severe, 'pid'].nunique()

print("\n".join([
    "\nUmtata participants by severity:",
    f"Moderate (oSCORAD 15–40): {moderate_scorad}",
    f"Severe (oSCORAD >40): {severe_scorad}",
]))



Umtata participants by severity:
Moderate (oSCORAD 15–40): 27
Severe (oSCORAD >40): 32


In [10]:
# Convert o_scorad to numeric, handling any non-numeric values, and store it as
# nullable integers so missing values remain missing. Re-applying this to an
# already-converted column leaves it unchanged, so the cell can run on its own.
metadata['o_scorad'] = pd.to_numeric(metadata['o_scorad'], errors='coerce').astype('Int64')

# Filter for Cape Town participants.
# A dedicated name is used so this cell no longer overwrites `umtata_metadata`
# with Cape Town rows.
cape_town_metadata = metadata[metadata['area'] == 'Cape Town']

# Count distinct participants (pid) in each severity range
moderate_scorad = cape_town_metadata[
    (cape_town_metadata['o_scorad'] >= 15) &
    (cape_town_metadata['o_scorad'] <= 40)
]['pid'].nunique()

severe_scorad = cape_town_metadata[
    cape_town_metadata['o_scorad'] > 40
]['pid'].nunique()

# The header names the area that was actually filtered above
# (it previously read "Umtata" although the data were Cape Town).
print("\nCape Town participants by severity:")
print(f"Moderate (oSCORAD 15–40): {moderate_scorad}")
print(f"Severe (oSCORAD >40): {severe_scorad}")



Umtata participants by severity:
Moderate (oSCORAD 15–40): 24
Severe (oSCORAD >40): 21


In [11]:
# Count the distinct (non-missing) o_scorad values recorded per participant;
# a count above one means that participant's samples disagree on the score
pid_scorad = metadata.groupby('pid')['o_scorad'].nunique()
pids_with_multiple_scorads = pid_scorad.loc[pid_scorad > 1]

if pids_with_multiple_scorads.empty:
    print("\nAll participants have consistent o_scorad values across their samples")
else:
    print("\nParticipants with multiple different o_scorad values:")
    print(pids_with_multiple_scorads)

    # List the score and group of every sample for each conflicting participant
    for pid in pids_with_multiple_scorads.index:
        print(f"\nDetails for participant {pid}:")
        print(metadata.loc[metadata['pid'] == pid, ['o_scorad', 'group']])



All participants have consistent o_scorad values across their samples


In [12]:
# Cross-tabulate sample rows: one row per area, one column per group
area_group_counts = pd.crosstab(index=metadata['area'], columns=metadata['group'])

# Print the heading followed by the table
print("\nSample counts by area and group:", area_group_counts, sep="\n")



Sample counts by area and group:
group      nares-AD  nares-H  skin-ADL  skin-ADNL  skin-H
area                                                     
Cape Town        47       34        46         50      35
Umtata           61       55        61         61      52


In [13]:
# Count the distinct areas recorded per participant; a count above one means
# that participant's samples are attributed to more than one area
pid_areas = metadata.groupby('pid')['area'].nunique()
pids_with_multiple_areas = pid_areas.loc[pid_areas > 1]

if pids_with_multiple_areas.empty:
    print("\nAll participants' samples are from a single area")
else:
    print("\nParticipants with samples from multiple areas:")
    print(pids_with_multiple_areas)

    # List the area and group of every sample for each such participant
    for pid in pids_with_multiple_areas.index:
        print(f"\nDetails for participant {pid}:")
        print(metadata.loc[metadata['pid'] == pid, ['area', 'group']])



All participants' samples are from a single area


In [14]:
# Use one o_scorad per individual.
# Scores are made numeric and rows without a score are dropped *before*
# de-duplicating on pid, so a participant whose first-listed sample row has no
# score is still represented by one of their scored rows.
meta_pid = metadata[['pid', 'area', 'o_scorad']].copy()
meta_pid['o_scorad'] = pd.to_numeric(meta_pid['o_scorad'], errors='coerce')
meta_pid = meta_pid.dropna(subset=['o_scorad']).drop_duplicates(subset='pid')

# Extract and filter data (remove oSCORAD < 15). Casting to plain float keeps
# pandas' nullable integer dtype out of the scipy / matplotlib calls below.
scored = meta_pid[meta_pid['o_scorad'] >= 15]
ct_data = scored.loc[scored['area'] == 'Cape Town', 'o_scorad'].astype(float)
um_data = scored.loc[scored['area'] == 'Umtata', 'o_scorad'].astype(float)

# T-test comparing the two regions (called with scipy's default settings)
t_stat, p_val = ttest_ind(ct_data, um_data)

# Settings shared by both regions' panels
x_limits = (15, 83)      # x-range applied to every axis
severe_cutoff = 40       # x-position of the dashed moderate/severe divider
annotation_box = dict(facecolor='white', edgecolor='none', alpha=0.8)

# One entry per figure column: (title, per-participant scores, fill colour)
panels = [
    ('Cape Town (urban)', ct_data, '#d2b48c'),
    ('Umtata (rural)', um_data, 'salmon'),
]

# Create figure with 2x2 layout (histogram on top, boxplot underneath)
fig, axes = plt.subplots(2, 2, figsize=(15, 6), gridspec_kw={'height_ratios': [3, 1]})

for col, (title, scores, colour) in enumerate(panels):
    hist_ax = axes[0][col]
    box_ax = axes[1][col]

    # Histogram of scores with the severity bands annotated
    hist_ax.hist(scores, bins=20, edgecolor='black', color=colour)
    hist_ax.set_title(title, fontsize=20)
    hist_ax.set_xlabel('oSCORAD Scores', fontsize=18)
    hist_ax.set_ylabel('Count', fontsize=18)
    hist_ax.tick_params(axis='both', labelsize=14)
    hist_ax.set_ylim(0, 12)
    hist_ax.set_yticks(range(0, 12, 4))
    hist_ax.set_xlim(*x_limits)
    hist_ax.axvline(x=severe_cutoff, color='gray', linestyle='--', linewidth=2)
    # Band labels are placed in axes-fraction coordinates (transAxes)
    hist_ax.text(0.02, 0.95, '15-40: moderate', transform=hist_ax.transAxes,
                 fontsize=13, verticalalignment='top', bbox=annotation_box)
    hist_ax.text(0.8, 0.95, '>40: severe', transform=hist_ax.transAxes,
                 fontsize=13, verticalalignment='top', bbox=annotation_box)

    # Horizontal boxplot of the same scores on the same x-range
    box_ax.boxplot(scores, vert=False, patch_artist=True, widths=0.4,
                   boxprops=dict(facecolor=colour, color='black'),
                   medianprops=dict(color='black'))
    box_ax.set_yticks([])
    box_ax.set_xlim(*x_limits)
    box_ax.set_xlabel('oSCORAD', fontsize=16)
    box_ax.tick_params(axis='x', labelsize=13)
    # Sample-size label, positioned in data coordinates
    box_ax.text(76, 1.3, f"n={len(scores)}", ha='left', va='center', fontsize=13)

# Final layout and save
plt.suptitle('Distribution of AD Severity (oSCORAD) by Region', fontsize=22, y=0.95)
plt.tight_layout(rect=[0, 0, 1, 0.93])
plt.subplots_adjust(wspace=0.35, hspace=0.15)

plt.savefig('../Figures/Main/Fig_2D,F.png', dpi=600)

# Output t-test result
print(f"T-statistic: {t_stat:.3f}, P-value: {p_val:.3f}")


T-statistic: -0.887, P-value: 0.377
