In [65]:
import pandas as pd

In [66]:
# Load the tab-separated metadata table into `md`.
md = pd.read_csv(
    '../Data/Metadata/16S_AD_South-Africa_metadata_full.tsv',
    sep='\t',
)
# Report how many rows were read.
print("Number of total of samples: ", len(md))

Number of total of samples:  502


In [67]:
# Collapse repeated rows: for every (pid, case_type) combination only the
# first row of `md` is retained, so a pid has at most one row per case_type.
md_unique = md.drop_duplicates(
    subset=['pid', 'case_type'],
    keep='first',
)
# Row count after de-duplication.
print("Number of total of samples: ", len(md_unique))

Number of total of samples:  483


In [68]:
# Number of distinct pid values within each `participant` group.
print(
    "Number of children by AD status:\n",
    md_unique.groupby('participant')['pid'].nunique(),
)

Number of children by AD status:
 participant
case       110
control     87
Name: pid, dtype: int64


In [69]:
# Number of distinct pid values in the de-duplicated table
# (nunique() does not count missing values).
print(
    "Number of children: ",
    md_unique['pid'].nunique(),
)

Number of children:  197


In [70]:
# Rebind md_unique to a new frame whose age_months column is numeric.
# errors='coerce' converts any value that cannot be parsed into NaN instead of
# raising, so unparseable ages are silently treated as missing; min()/max()
# below skip NaN.
md_unique = md_unique.assign(
    age_months=pd.to_numeric(md_unique['age_months'], errors='coerce')
)

# Overall span of the numeric age_months column.
print("Range of age_months:")
print("Min:", md_unique['age_months'].min())
print("Max:", md_unique['age_months'].max())

Range of age_months:
Min: 9.0
Max: 38.0


In [71]:
# Smallest and largest age_months within each `participant` group.
print(
    "Age range (in months) by AD status:\n",
    md_unique.groupby('participant')['age_months'].agg(['min', 'max']),
)

Age range (in months) by AD status:
               min   max
participant            
case          9.0  38.0
control      12.0  36.0


In [72]:
# Tally of rows per `specimen` value.
print(
    "Number of skin and nasal samples: ",
    md_unique['specimen'].value_counts(),
)

Number of skin and nasal samples:  specimen
skin     294
nasal    189
Name: count, dtype: int64


In [73]:
# Tally of rows per `case_type` value.
print(
    "Sample numbers by body site and case/control status: ",
    md_unique['case_type'].value_counts(),
)

Sample numbers by body site and case/control status:  case_type
case-nonlesional skin       106
case-lesional skin          103
case-anterior nares         103
control-anterior nares       86
control-nonlesional skin     85
Name: count, dtype: int64


In [74]:
# Row count of the de-duplicated table.
print("Number of total of samples: ", len(md_unique))

Number of total of samples:  483


In [75]:
# Loop through each unique case_type
for case in md_unique['case_type'].unique():
    subset = md_unique[md_unique['case_type'] == case]
    duplicates = subset[subset.duplicated('pid', keep=False)]

    if not duplicates.empty:
        print(f"Duplicate children found for case_type '{case}':")
        print(duplicates[['pid', '#sample-id']])
        print()  # blank line for readability
    else:
        print(f"No duplicate children found for case_type '{case}'.")

No duplicate children found for case_type 'case-lesional skin'.
No duplicate children found for case_type 'case-nonlesional skin'.
No duplicate children found for case_type 'case-anterior nares'.
No duplicate children found for case_type 'control-nonlesional skin'.
No duplicate children found for case_type 'control-anterior nares'.


### Subset of metadata included in analyses

In [76]:
# Restrict the table to children that have the required pair of sample types.

# === CASE (AD) CHILDREN ===
# A case child is retained when it has both a lesional and a nonlesional
# skin row.
# NOTE(review): cases are not required to have an anterior-nares row, whereas
# controls below are — confirm this asymmetry is intended.
ad_lesional = md_unique.loc[md_unique['case_type'] == 'case-lesional skin', 'pid'].unique()
ad_nonlesional = md_unique.loc[md_unique['case_type'] == 'case-nonlesional skin', 'pid'].unique()
ad_with_both = set(ad_lesional).intersection(ad_nonlesional)

# === CONTROL CHILDREN ===
# A control child is retained when it has both a nonlesional skin row and an
# anterior-nares row.
control_nl_skin = md_unique.loc[md_unique['case_type'] == 'control-nonlesional skin', 'pid'].unique()
control_nares = md_unique.loc[md_unique['case_type'] == 'control-anterior nares', 'pid'].unique()
control_with_both = set(control_nl_skin).intersection(control_nares)

# === COMBINE CASE + CONTROL ===
children_with_both = ad_with_both | control_with_both

# Rebind md_unique to the rows of retained children only. Every row of a
# retained pid is kept, whatever its case_type.
md_unique = md_unique.loc[md_unique['pid'].isin(children_with_both)]

print("Final sample count:", md_unique.shape[0])
print("Unique children retained:", md_unique['pid'].nunique())

# Size of each retained group
print("\nAD with both:", len(ad_with_both))
print("Control with both:", len(control_with_both))


Final sample count: 462
Unique children retained: 183

AD with both: 99
Control with both: 84


In [77]:
# Tally of rows per `case_type` value after the cohort filter.
print(
    "Sample numbers by body site and case/control status: ",
    md_unique['case_type'].value_counts(),
)

Sample numbers by body site and case/control status:  case_type
case-lesional skin          99
case-nonlesional skin       99
case-anterior nares         96
control-nonlesional skin    84
control-anterior nares      84
Name: count, dtype: int64


In [78]:
# Smallest and largest age_months within each `participant` group, computed
# on the filtered table.
print(
    "Age range (in months) by AD status:\n",
    md_unique.groupby('participant')['age_months'].agg(['min', 'max']),
)

Age range (in months) by AD status:
               min   max
participant            
case          9.0  37.0
control      12.0  36.0


In [79]:
# Number of distinct pid values remaining after the cohort filter.
print(
    "Number of children: ",
    md_unique['pid'].nunique(),
)

Number of children:  183


In [80]:
# Tally of rows per `specimen` value after the cohort filter.
print(
    "Number of skin and nasal samples: ",
    md_unique['specimen'].value_counts(),
)

Number of skin and nasal samples:  specimen
skin     282
nasal    180
Name: count, dtype: int64
