# Cape Town vs Umtata Clinical Severity Distribution

In [11]:
# Import Python packages
import pandas as pd
import numpy as np
import biom
import matplotlib.pyplot as plt
import seaborn as sns
from itertools import cycle
import os
from matplotlib.colors import ListedColormap
from matplotlib.colors import to_rgba
from scipy.stats import mannwhitneyu
from statsmodels.stats.multitest import multipletests
from scipy.stats import ttest_ind


In [12]:
# Load the sample metadata (tab-separated file, one row per sample)
metadata_path = '../Metadata/16S_AD_South-Africa_metadata_subset.tsv'
metadata = pd.read_csv(metadata_path, sep='\t')

# Strip underscores from the sample IDs. regex=False makes this a literal
# replacement independent of the pandas version's default for `regex`.
metadata['#sample-id'] = metadata['#sample-id'].str.replace('_', '', regex=False)
# Use the cleaned sample ID as the row index
metadata = metadata.set_index('#sample-id')


# Create group column based on case_type to simplify group names
metadata['group'] = metadata['case_type'].map({
    'case-lesional_skin': 'skin-ADL',
    'case-nonlesional_skin': 'skin-ADNL',
    'control-nonlesional_skin': 'skin-H',
    'case-anterior_nares': 'nares-AD',
    'control-anterior_nares': 'nares-H'
})

# case_type values missing from the mapping above end up with a missing group;
# report them instead of failing later with an unrelated-looking error.
unmapped_case_types = metadata.loc[metadata['group'].isna(), 'case_type'].unique()
if len(unmapped_case_types) > 0:
    print(f"Warning: case_type values without a group mapping: {list(unmapped_case_types)}")

# Add AD status column based on group values: 'AD' when the part after the
# last '-' starts with 'AD', otherwise 'H'. na_action='ignore' keeps a missing
# group as a missing ad_status rather than calling .split() on NaN.
metadata['ad_status'] = metadata['group'].map(
    lambda grp: 'AD' if grp.split('-')[-1].startswith('AD') else 'H',
    na_action='ignore',
)

metadata

Unnamed: 0_level_0,PlateNumber,PlateLocation,i5,i5Sequence,i7,i7Sequence,identifier,Sequence,Plate ID,Well location,...,age_months,sex,enrolment_date,enrolment_season,hiv_exposure,hiv_status,household_size,o_scorad,group,ad_status
#sample-id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Ca009STL,1,A1,SA501,ATCGTACG,SA701,CGAGAGTT,SA701SA501,CGAGAGTT-ATCGTACG,1.010000e+21,A1,...,24.0,male,4/16/2015,Autumn,Unexposed,negative,4.0,40,skin-ADL,AD
900221,1,B1,SA502,ACTATCTG,SA701,CGAGAGTT,SA701SA502,CGAGAGTT-ACTATCTG,1.010000e+21,B1,...,9.0,female,8/11/2015,Winter,Unexposed,negative,7.0,34,skin-ADL,AD
Ca010EBL,1,C1,SA503,TAGCGAGT,SA701,CGAGAGTT,SA701SA503,CGAGAGTT-TAGCGAGT,1.010000e+21,C1,...,24.0,female,11/20/2014,Spring,Unexposed,negative,7.0,21,skin-ADL,AD
900460,1,D1,SA504,CTGCGTGT,SA701,CGAGAGTT,SA701SA504,CGAGAGTT-CTGCGTGT,1.010000e+21,D1,...,18.0,female,9/23/2015,Spring,Unexposed,,4.0,40,skin-ADL,AD
900051,1,E1,SA505,TCATCGAG,SA701,CGAGAGTT,SA701SA505,CGAGAGTT-TCATCGAG,1.010000e+21,E1,...,31.0,male,4/21/2015,Autumn,Unexposed,negative,7.0,41,skin-ADL,AD
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
900401,5,C12,SB503,AGAGTCAC,SB712,CGTAGCGA,SB712SB503,CGTAGCGA-AGAGTCAC,1.010000e+21,C12,...,21.0,female,9/17/2015,Spring,Exposed,negative,12.0,38,skin-ADNL,AD
900402,6,B4,SA502,ACTATCTG,SB704,TCTCTATG,SB704SA502,TCTCTATG-ACTATCTG,1.010000e+21,B4,...,21.0,,,,,,,,nares-AD,AD
Ca006ONL,6,F1,SA506,CGTGAGTG,SB701,CTCGACTT,SB701SA506,CTCGACTT-CGTGAGTG,1.010000e+21,F1,...,35.0,female,3/25/2015,Autumn,Unexposed,negative,3.0,34,skin-ADL,AD
Ca006ONNL,6,F2,SA506,CGTGAGTG,SB702,CGAAGTAT,SB702SA506,CGAAGTAT-CGTGAGTG,1.010000e+21,F2,...,35.0,female,3/25/2015,Autumn,Unexposed,negative,3.0,34,skin-ADNL,AD


In [13]:
# Coerce o_scorad to a numeric dtype; entries that cannot be parsed become NaN
metadata['o_scorad'] = pd.to_numeric(metadata['o_scorad'], errors='coerce')

# Look for AD lesional skin samples whose oSCORAD is below 15
is_lesional_sample = metadata['group'] == 'skin-ADL'
is_score_under_15 = metadata['o_scorad'] < 15
below_15 = metadata.loc[is_lesional_sample & is_score_under_15, ['area', 'o_scorad']]

below_15

Unnamed: 0_level_0,area,o_scorad
#sample-id,Unnamed: 1_level_1,Unnamed: 2_level_1


In [14]:
# Store o_scorad as a nullable integer column: non-numeric entries are coerced
# to missing first, then the column is cast to Int64 (missing values stay <NA>)
metadata['o_scorad'] = pd.to_numeric(metadata['o_scorad'], errors='coerce').astype('Int64')

# Keep only the samples collected in Umtata
umtata_metadata = metadata[metadata['area'] == 'Umtata']

# Count distinct participants (pid) in each oSCORAD severity band
umtata_scores = umtata_metadata['o_scorad']
umtata_is_moderate = (umtata_scores >= 15) & (umtata_scores <= 40)
umtata_is_severe = umtata_scores > 40

moderate_scorad = umtata_metadata.loc[umtata_is_moderate, 'pid'].nunique()
severe_scorad = umtata_metadata.loc[umtata_is_severe, 'pid'].nunique()

print("\nUmtata participants by severity:")
print(f"Moderate (oSCORAD 15–40): {moderate_scorad}")
print(f"Severe (oSCORAD >40): {severe_scorad}")



Umtata participants by severity:
Moderate (oSCORAD 15–40): 26
Severe (oSCORAD >40): 28


In [15]:
# Store o_scorad as a nullable integer column: non-numeric entries are coerced
# to missing first, then the column is cast to Int64 (missing values stay <NA>)
metadata['o_scorad'] = pd.to_numeric(metadata['o_scorad'], errors='coerce').astype('Int64')

# Keep only the samples collected in Cape Town
capetown_metadata = metadata[metadata['area'] == 'Cape Town']

# Count distinct participants (pid) in each oSCORAD severity band
capetown_scores = capetown_metadata['o_scorad']
capetown_is_moderate = (capetown_scores >= 15) & (capetown_scores <= 40)
capetown_is_severe = capetown_scores > 40

moderate_scorad = capetown_metadata.loc[capetown_is_moderate, 'pid'].nunique()
severe_scorad = capetown_metadata.loc[capetown_is_severe, 'pid'].nunique()

print("\nCape Town metadata participants by severity:")
print(f"Moderate (oSCORAD 15–40): {moderate_scorad}")
print(f"Severe (oSCORAD >40): {severe_scorad}")



Cape Town metadata participants by severity:
Moderate (oSCORAD 15–40): 23
Severe (oSCORAD >40): 20


In [16]:
# Verify that no participant carries more than one distinct o_scorad value
# across their samples (nunique does not count missing values)
pid_scorad = metadata.groupby('pid')['o_scorad'].nunique()
pids_with_multiple_scorads = pid_scorad.loc[pid_scorad > 1]

if pids_with_multiple_scorads.empty:
    print("\nAll participants have consistent o_scorad values across their samples")
else:
    print("\nParticipants with multiple different o_scorad values:")
    print(pids_with_multiple_scorads)

    # List the conflicting samples of every affected participant
    for pid in pids_with_multiple_scorads.index:
        print(f"\nDetails for participant {pid}:")
        print(metadata.loc[metadata['pid'] == pid, ['o_scorad', 'group']])



All participants have consistent o_scorad values across their samples


In [17]:
# Cross-tabulate the number of samples per area (rows) and group (columns)
area_group_counts = pd.crosstab(
    index=metadata['area'],
    columns=metadata['group'],
)

# Print the resulting table
print("\nSample counts by area and group:")
print(area_group_counts)



Sample counts by area and group:
group      nares-AD  nares-H  skin-ADL  skin-ADNL  skin-H
area                                                     
Cape Town        43       34        44         44      34
Umtata           53       50        55         55      50


In [18]:
# Verify that all samples of a participant were collected in one area
pid_areas = metadata.groupby('pid')['area'].nunique()
pids_with_multiple_areas = pid_areas.loc[pid_areas > 1]

if pids_with_multiple_areas.empty:
    print("\nAll participants' samples are from a single area")
else:
    print("\nParticipants with samples from multiple areas:")
    print(pids_with_multiple_areas)

    # List the samples of every participant that spans several areas
    for pid in pids_with_multiple_areas.index:
        print(f"\nDetails for participant {pid}:")
        print(metadata.loc[metadata['pid'] == pid, ['area', 'group']])



All participants' samples are from a single area


In [19]:
# oSCORAD values of the Cape Town AD lesional skin samples (missing scores dropped)
ct_lesional_mask = (
    (capetown_metadata['area'] == 'Cape Town')
    & (capetown_metadata['group'] == 'skin-ADL')
)
ct_data = capetown_metadata.loc[ct_lesional_mask, 'o_scorad'].dropna()

# Same selection for the Umtata AD lesional skin samples
um_lesional_mask = (
    (umtata_metadata['area'] == 'Umtata')
    & (umtata_metadata['group'] == 'skin-ADL')
)
um_data = umtata_metadata.loc[um_lesional_mask, 'o_scorad'].dropna()


In [20]:
# List every skin-ADL sample that has no o_scorad value, with its area and pid
lesional_without_score = (metadata['group'] == 'skin-ADL') & metadata['o_scorad'].isna()
metadata.loc[lesional_without_score, ['area', 'pid']]

# Result: 2 skin-ADL samples from Umtata and 1 from Cape Town are missing o_scorad

Unnamed: 0_level_0,area,pid
#sample-id,Unnamed: 1_level_1,Unnamed: 2_level_1
900224,Umtata,Ca-102-IR
900116,Cape Town,Ca-032-LN
900497,Umtata,Ca-157-LM


In [21]:
# Compare lesional-skin oSCORAD between Cape Town and Umtata with a t-test and
# draw, for each area, a histogram (top row) above a horizontal boxplot
# (bottom row). The figure is saved as a supplementary JPEG.

# ct_data / um_data are nullable-Int64 Series with missing values already
# dropped. Their conversion to NumPy depends on the pandas version, so hand
# SciPy and Matplotlib plain float arrays instead.
ct_scores = ct_data.astype(float).to_numpy()
um_scores = um_data.astype(float).to_numpy()

# T-test (two independent samples, SciPy defaults)
t_stat, p_val = ttest_ind(ct_scores, um_scores)

# Settings shared by all four panels
SCORE_XLIM = (15, 83)   # same x-range on every axis so the columns line up
SEVERE_CUTOFF = 40      # dashed line separating '15-40: moderate' from '>40: severe'


def _draw_area_column(hist_ax, box_ax, scores, title, color):
    """Draw one area's oSCORAD histogram and the boxplot beneath it.

    Parameters
    ----------
    hist_ax : matplotlib Axes that receives the histogram.
    box_ax : matplotlib Axes that receives the horizontal boxplot.
    scores : 1-D array of oSCORAD values for the area.
    title : str, title shown above the histogram.
    color : fill colour used for the histogram bars and the box.
    """
    # Histogram panel
    hist_ax.hist(scores, bins=20, edgecolor='black', color=color)
    hist_ax.set_title(title, fontsize=20)
    hist_ax.set_xlabel('oSCORAD Scores', fontsize=18)
    hist_ax.set_ylabel('Count', fontsize=18)
    hist_ax.tick_params(axis='both', labelsize=14)
    hist_ax.set_ylim(0, 12)
    hist_ax.set_yticks(range(0, 12, 4))  # ticks at 0, 4 and 8
    hist_ax.set_xlim(*SCORE_XLIM)
    hist_ax.axvline(x=SEVERE_CUTOFF, color='gray', linestyle='--', linewidth=2)
    # Severity labels are placed in axes coordinates (fractions of the panel)
    for x_frac, label in ((0.02, '15-40: moderate'), (0.8, '>40: severe')):
        hist_ax.text(x_frac, 0.95, label, transform=hist_ax.transAxes,
                     fontsize=13, verticalalignment='top',
                     bbox=dict(facecolor='white', edgecolor='none', alpha=0.8))

    # Boxplot panel
    box_ax.boxplot(scores, vert=False, patch_artist=True, widths=0.4,
                   boxprops=dict(facecolor=color, color='black'),
                   medianprops=dict(color='black'))
    box_ax.set_yticks([])
    box_ax.set_xlim(*SCORE_XLIM)
    box_ax.set_xlabel('oSCORAD', fontsize=16)
    box_ax.tick_params(axis='x', labelsize=13)
    # Sample size annotation, positioned in data coordinates
    box_ax.text(76, 1.3, f"n={len(scores)}", ha='left', va='center', fontsize=13)


# Create figure with 2x2 layout (histogram + boxplot)
fig, axes = plt.subplots(2, 2, figsize=(15, 6), gridspec_kw={'height_ratios': [3, 1]})

# Umtata in the LEFT column, Cape Town in the RIGHT column
_draw_area_column(axes[0][0], axes[1][0], um_scores, 'Umtata (rural)', 'salmon')
_draw_area_column(axes[0][1], axes[1][1], ct_scores, 'Cape Town (urban)', '#d2b48c')

# Final layout and save
plt.tight_layout(rect=[0, 0, 1, 0.93])
plt.subplots_adjust(wspace=0.35, hspace=0.15)
figure_path = '../Figures/Supplementary/Suppl_Fig_3A,B.jpg'
# Make sure the output folder exists so saving does not fail on a fresh checkout
os.makedirs(os.path.dirname(figure_path), exist_ok=True)
plt.savefig(figure_path, dpi=600)

# Output t-test result
print(f"T-statistic: {t_stat:.3f}, P-value: {p_val:.3f}")


T-statistic: -0.550, P-value: 0.584
