In [27]:
import pandas as pd

In [28]:
# Load the full sample-level metadata table (tab-separated) and report its row count
metadata_path = '../Data/Metadata/16S_AD_South-Africa_metadata_full.tsv'
md = pd.read_csv(metadata_path, sep='\t')
print("Number of total of samples: ", len(md))

Number of total of samples:  502


In [29]:
# Deduplicate so that each child (pid) contributes at most one row per case_type;
# when a (pid, case_type) pair repeats, the first row in file order is retained.
md_unique = md.drop_duplicates(['pid', 'case_type'])
print("Number of total of samples: ", len(md_unique))

Number of total of samples:  483


In [30]:
# Distinct children (pid) within each participant group
children_per_status = md_unique.groupby('participant')['pid'].nunique()
print("Number of children by AD status:\n", children_per_status)

Number of children by AD status:
 participant
case       110
control     87
Name: pid, dtype: int64


In [31]:
# Distinct children (pid) across the deduplicated metadata
n_children = md_unique['pid'].nunique()
print("Number of children: ", n_children)

Number of children:  197


In [32]:
# Rebind md_unique to a new frame whose age_months column is numeric;
# entries that cannot be parsed become NaN (errors='coerce').
md_unique = md_unique.assign(
    age_months=pd.to_numeric(md_unique['age_months'], errors='coerce')
)

age_months = md_unique['age_months']
print("Range of age_months:")
print("Min:", age_months.min())
print("Max:", age_months.max())

Range of age_months:
Min: 9.0
Max: 38.0


In [33]:
# Youngest and oldest age (months) within each participant group
age_range_by_status = md_unique.groupby('participant')['age_months'].agg(['min', 'max'])
print("Age range (in months) by AD status:\n", age_range_by_status)

Age range (in months) by AD status:
               min   max
participant            
case          9.0  38.0
control      12.0  36.0


In [34]:
# Row count per specimen value
specimen_counts = md_unique['specimen'].value_counts()
print("Number of skin and nasal samples: ", specimen_counts)

Number of skin and nasal samples:  specimen
skin     294
nasal    189
Name: count, dtype: int64


In [35]:
# Row count per case_type value
case_type_counts = md_unique['case_type'].value_counts()
print("Sample numbers by body site and case/control status: ", case_type_counts)

Sample numbers by body site and case/control status:  case_type
case-nonlesional skin       106
case-lesional skin          103
case-anterior nares         103
control-anterior nares       86
control-nonlesional skin     85
Name: count, dtype: int64


In [36]:
# Total rows remaining after deduplication
print("Number of total of samples: ", len(md_unique))

Number of total of samples:  483


In [37]:
# Sanity check: within every case_type, report any child (pid) that occurs on
# more than one row, listing the affected pids and sample ids.
for case_type in md_unique['case_type'].unique():
    same_type = md_unique.loc[md_unique['case_type'] == case_type]
    # keep=False flags every row of a repeated pid, not just the later ones
    repeated = same_type[same_type.duplicated('pid', keep=False)]

    if repeated.empty:
        print(f"No duplicate children found for case_type '{case_type}'.")
        continue

    print(f"Duplicate children found for case_type '{case_type}':")
    print(repeated[['pid', '#sample-id']])
    print()  # blank separator line

No duplicate children found for case_type 'case-lesional skin'.
No duplicate children found for case_type 'case-nonlesional skin'.
No duplicate children found for case_type 'case-anterior nares'.
No duplicate children found for case_type 'control-nonlesional skin'.
No duplicate children found for case_type 'control-anterior nares'.


### Subset of metadata included in analyses

In [38]:
def _pids_with(case_type):
    """Return the distinct pids that have a row of the given case_type."""
    return md_unique.loc[md_unique['case_type'] == case_type, 'pid'].unique()


# --- AD cases: keep children that have both a lesional and a nonlesional skin row ---
ad_lesional = _pids_with('case-lesional skin')
ad_nonlesional = _pids_with('case-nonlesional skin')
ad_with_both = set(ad_lesional).intersection(ad_nonlesional)

# --- Controls: keep children that have both a nonlesional skin and an anterior nares row ---
control_nl_skin = _pids_with('control-nonlesional skin')
control_nares = _pids_with('control-anterior nares')
control_with_both = set(control_nl_skin).intersection(control_nares)

# --- Pool the qualifying children from both cohorts ---
children_with_both = ad_with_both | control_with_both

# Retain every row that belongs to a qualifying child
keep_rows = md_unique['pid'].isin(children_with_both)
md_unique = md_unique[keep_rows]

print("Final sample count:", md_unique.shape[0])
print("Unique children retained:", md_unique['pid'].nunique())

# Per-cohort breakdown of the retained children
print("\nAD with both:", len(ad_with_both))
print("Control with both:", len(control_with_both))


Final sample count: 462
Unique children retained: 183

AD with both: 99
Control with both: 84


In [39]:
# Row count per case_type after restricting to children with paired samples
case_type_counts = md_unique['case_type'].value_counts()
print("Sample numbers by body site and case/control status: ", case_type_counts)

Sample numbers by body site and case/control status:  case_type
case-lesional skin          99
case-nonlesional skin       99
case-anterior nares         96
control-nonlesional skin    84
control-anterior nares      84
Name: count, dtype: int64


In [40]:
# Youngest and oldest age (months) within each recruitment area.
# The grouping key is 'area', so the label reads "by region".
print("Age range (in months) by region:\n",
      md_unique.groupby('area')['age_months'].agg(['min', 'max']))

Age range (in months) by AD status:
             min   max
area                 
Cape Town  11.0  36.0
Umtata      9.0  37.0


In [41]:
# Youngest and oldest age (months) within each participant group (case vs control).
# The grouping key is 'participant', so the label reads "by AD status".
print("Age range (in months) by AD status:\n",
      md_unique.groupby('participant')['age_months'].agg(['min', 'max']))

Age range (in months) by region:
               min   max
participant            
case          9.0  37.0
control      12.0  36.0


In [42]:
# Distinct children (pid) remaining in the subset
n_children = md_unique['pid'].nunique()
print("Number of children: ", n_children)

Number of children:  183


In [43]:
# Row count per specimen value in the subset
specimen_counts = md_unique['specimen'].value_counts()
print("Number of skin and nasal samples: ", specimen_counts)

Number of skin and nasal samples:  specimen
skin     282
nasal    180
Name: count, dtype: int64


In [44]:
# Save the subsetted metadata.
# index=False: the frame's index is only the positional index left over from
# read_csv (non-contiguous after filtering), not a metadata field. Writing it
# would prepend an unnamed column ahead of the real metadata columns.
md_unique.to_csv(
    '../Data/Metadata/16S_AD_South-Africa_metadata_subset.tsv',
    sep='\t',
    index=False,
)

### Generate summary statistics table

In [52]:
# Short alias for the subsetted metadata (same object, not a copy)
df = md_unique

# Child-level frame: the first row per pid, so that each child is counted
# once when summarising demographics
children_df = df.drop_duplicates('pid')

# Age summary formatted as "mean ± std"
def get_age_stats(child_df):
    """Format the ages of a group of children as ``mean ± std``.

    Parameters
    ----------
    child_df : pandas.DataFrame
        Frame with an ``age_months`` column; missing values are ignored.

    Returns
    -------
    str
        ``"<mean> ± <std>"`` with one decimal place each (std is pandas'
        default ``Series.std``), or ``"N/A"`` when no non-missing age exists.
    """
    valid_ages = child_df['age_months'].dropna()
    if valid_ages.empty:
        return "N/A"
    avg, spread = valid_ages.mean(), valid_ages.std()
    return f"{avg:.1f} ± {spread:.1f}"

# Sex distribution formatted as "<n>M / <n>F"
def get_sex_dist(child_df):
    """Format the male/female counts of a group of children.

    Parameters
    ----------
    child_df : pandas.DataFrame
        Frame with a ``sex`` column. Only the exact values ``'male'`` and
        ``'female'`` are counted; anything else (including missing values)
        is ignored.

    Returns
    -------
    str
        ``"<males>M / <females>F"``, or ``"N/A"`` when neither value occurs.
    """
    tally = child_df['sex'].value_counts()
    n_male, n_female = (tally.get(label, 0) for label in ('male', 'female'))
    if n_male + n_female == 0:
        return "N/A"
    return f"{n_male}M / {n_female}F"

def _split_by_area(frame):
    """Return the (Cape Town rows, Umtata rows) of ``frame``."""
    return frame[frame['area'] == 'Cape Town'], frame[frame['area'] == 'Umtata']


# Sample-level subsets (every retained row), by participant group and area
cases_data = df.loc[df['participant'] == 'case']
cases_ct, cases_um = _split_by_area(cases_data)

controls_data = df.loc[df['participant'] == 'control']
controls_ct, controls_um = _split_by_area(controls_data)

# Child-level subsets (one row per pid), used for the demographic columns
cases_children = children_df.loc[children_df['participant'] == 'case']
cases_ct_children, cases_um_children = _split_by_area(cases_children)

controls_children = children_df.loc[children_df['participant'] == 'control']
controls_ct_children, controls_um_children = _split_by_area(controls_children)

# Both cohorts pooled, split by area only
ct_children, um_children = _split_by_area(children_df)

# Build the summary table column by column.
# Row order is fixed: AD (Cape Town, Umtata, subtotal), controls (same three),
# then both cohorts pooled (Cape Town, Umtata, grand total).
samples_ct = df[df['area'] == 'Cape Town']
samples_um = df[df['area'] == 'Umtata']

# (sample-level frame, child-level frame) for every row of the table:
# sample-level frames feed the counts, child-level frames feed age and sex.
strata = [
    (cases_ct, cases_ct_children),
    (cases_um, cases_um_children),
    (cases_data, cases_children),
    (controls_ct, controls_ct_children),
    (controls_um, controls_um_children),
    (controls_data, controls_children),
    (samples_ct, ct_children),
    (samples_um, um_children),
    (df, children_df),
]
sample_frames = [samples for samples, _ in strata]
child_frames = [children for _, children in strata]

# Positions of the control rows; their lesional-skin count is fixed at 0
control_rows = {3, 4, 5}


def _n_matching(frame, column, value):
    """Return how many rows of ``frame`` have ``column`` equal to ``value``."""
    return len(frame[frame[column] == value])


summary_data = {
    'Cohort': ['AD', '', '', 'Controls', '', '', 'TOTAL', '', ''],
    'Group': ['Cape Town', 'Umtata', 'Subtotal', 'Cape Town', 'Umtata', 'Subtotal', 'Cape Town', 'Umtata', 'TOTAL'],

    # Distinct children (unique pids) per stratum
    'Children (n)': [samples['pid'].nunique() for samples in sample_frames],

    # Age as mean ± std, computed on one row per child
    'Age (months)': [get_age_stats(children) for children in child_frames],

    # Male/female counts, computed on one row per child
    'Sex (M/F)': [get_sex_dist(children) for children in child_frames],

    # Lesional skin samples (fixed at 0 for the control rows)
    'Lesional Skin': [
        0 if row in control_rows else _n_matching(samples, 'sample_type', 'lesional skin')
        for row, samples in enumerate(sample_frames)
    ],

    # Nonlesional skin samples
    'Nonlesional Skin': [
        _n_matching(samples, 'sample_type', 'nonlesional skin') for samples in sample_frames
    ],

    # Nasal samples
    'Nasal': [_n_matching(samples, 'specimen', 'nasal') for samples in sample_frames],

    # All samples in the stratum
    'Total Samples': [len(samples) for samples in sample_frames],
}

summary_df = pd.DataFrame(summary_data)
summary_df

Unnamed: 0,Cohort,Group,Children (n),Age (months),Sex (M/F),Lesional Skin,Nonlesional Skin,Nasal,Total Samples
0,AD,Cape Town,44,24.7 ± 7.5,23M / 21F,44,44,43,131
1,,Umtata,55,20.3 ± 6.7,28M / 24F,55,55,53,163
2,,Subtotal,99,22.3 ± 7.3,51M / 45F,99,99,96,294
3,Controls,Cape Town,34,24.3 ± 7.2,16M / 18F,0,34,34,68
4,,Umtata,50,21.7 ± 6.9,33M / 16F,0,50,50,100
5,,Subtotal,84,22.8 ± 7.1,49M / 34F,0,84,84,168
6,TOTAL,Cape Town,78,24.5 ± 7.3,39M / 39F,44,78,77,199
7,,Umtata,105,21.0 ± 6.8,61M / 40F,55,105,103,263
8,,TOTAL,183,22.5 ± 7.2,100M / 79F,99,183,180,462
