#### Wrangling the metadata used for the Lymberopoulos et al., 2021 paper. Cleaned for upload to tmap_geography repo.

In [None]:
import pandas as pd
import numpy as np

In [None]:
## Load the GMrepo metadata of all genus-level data, already filtered for the healthy phenotype.
## The first CSV column is used as the row index.
metadata = pd.read_csv("metadata_health.csv", index_col=0)

### Filtering and refactoring metadata

In [None]:
## Keep adults only: drop children and rows with a missing age (a missing age could hide a child)
## NOTE(review): the strict `> 18` comparison also drops 18-year-olds - confirm this cutoff is intended
print(metadata["host_age"].min())
print(metadata["host_age"].isna().sum())

# comparisons against NaN evaluate to False, so the age test alone already rejects
# missing ages; the explicit not-null test is kept to make that intent visible
is_adult = metadata["host_age"].gt(18) & metadata["host_age"].notna()
metadata = metadata.loc[is_adult]

# sanity check: the minimum age should now exceed 18 and no missing ages should remain
print(metadata["host_age"].min())
print(metadata["host_age"].isna().sum())

In [None]:
## Exclude countries that contribute fewer than 20 rows (samples)
country_sizes = metadata["country"].value_counts()
countries = country_sizes[country_sizes >= 20]
metadata = metadata[metadata["country"].isin(countries.index)]

# check sample sizes of the included countries
metadata["country"].value_counts()

In [None]:
## Make BMI into categories
def bmi(row):
    """Return the weight category for the numeric ``BMI`` entry of *row*.

    Categories: below 18.5 "underweight", below 25 "normal", below 30
    "overweight", 30 and above "obese". A value that satisfies none of the
    comparisons (e.g. NaN) yields ``None``, so missing BMI stays missing.
    """
    value = row["BMI"]
    # exclusive upper bounds, checked in ascending order
    upper_bounds = ((18.5, "underweight"), (25, "normal"), (30, "overweight"))
    for upper_bound, label in upper_bounds:
        if value < upper_bound:
            return label
    # explicit comparison (rather than a bare fallback) so NaN does not become "obese"
    return "obese" if value >= 30 else None
    
# categorise row by row, replacing the numeric BMI column with its category labels
# (re-running this cell would feed the labels back into bmi(), which compares against numbers)
metadata["BMI"] = metadata.apply(bmi, axis=1)
metadata["BMI"].value_counts()

#### Obtaining descriptive statistics

In [None]:
## Counting the different sequencing instruments used
## (number of rows per distinct `instrument_model` value)
metadata["instrument_model"].value_counts()

In [None]:
## Number of missing BMI and sex values in the metadata (BMI first, then sex)
for column_name in ("BMI", "sex"):
    print(metadata[column_name].isna().sum())

In [None]:
## Per-country number of missing BMI and sex values
countries = [
    "Brazil",
    "Canada",
    "China",
    "Denmark",
    "France",
    "Germany",
    "Italy",
    "New Zealand",
    "Spain",
    "Tanzania, United Republic of",
    "United Kingdom",
    "United States of America",
]

for country in countries:
    # select the country's rows once and reuse them for both columns
    in_country = metadata[metadata["country"] == country]
    print(country)
    print(in_country["BMI"].isna().sum())
    print(in_country["sex"].isna().sum())

In [None]:
## Per-country frequency of every BMI category and sex value, for the demographics table
countries = [
    "Brazil",
    "Canada",
    "China",
    "Denmark",
    "France",
    "Germany",
    "Italy",
    "New Zealand",
    "Spain",
    "Tanzania, United Republic of",
    "United Kingdom",
    "United States of America",
]

for country in countries:
    # select the country's rows once and reuse them for both columns
    in_country = metadata[metadata["country"] == country]
    print(country)
    print(in_country["BMI"].value_counts())
    print(in_country["sex"].value_counts())

In [None]:
## Per-country mean and SD of age (rounded to one decimal), for the demographics table
countries = [
    "Brazil",
    "Canada",
    "China",
    "Denmark",
    "France",
    "Germany",
    "Italy",
    "New Zealand",
    "Spain",
    "Tanzania, United Republic of",
    "United Kingdom",
    "United States of America",
]

for country in countries:
    # ages of the rows belonging to this country
    country_ages = metadata.loc[metadata["country"] == country, "host_age"]
    print(country)
    print(round(country_ages.mean(), 1))
    print(round(country_ages.std(), 1))

In [None]:
## Mean and SD of age over all countries together, rounded to one decimal
ages = metadata["host_age"]
print(round(ages.mean(), 1))
print(round(ages.std(), 1))

#### Dummy-coding categorical data and saving it 

In [None]:
## Dummy-code the categorical columns sex, BMI and country.
## Each category becomes its own indicator column named "<column>: <category>".
cat_columns = ["sex", "BMI", "country"]
metadata_dummies = pd.get_dummies(metadata, prefix_sep=": ", columns=cat_columns)

In [None]:
## Save the dummy-coded metadata; the DataFrame index is written as the first CSV column
metadata_dummies.to_csv("metadata_health_all.csv")

### Filtering genus data to align with metadata

In [None]:
genus_data = pd.read_csv("genus_health.csv")
metadata_final = pd.read_csv("metadata_health_all.csv")

## keep only the genus rows whose run_id also occurs in the metadata
shared_runs = genus_data["run_id"].isin(metadata_final["run_id"])
genus_data = genus_data[shared_runs].set_index("run_id")
metadata_final = metadata_final.set_index("run_id")

# checking that they are of the same length
# NOTE(review): equal lengths do not prove that both tables hold the same run_ids in the
# same order; metadata rows without genus data are not removed here - verify downstream
print(len(metadata_final) == len(genus_data))

In [None]:
## exclude empty columns that appear after removing samples
# A taxon counts as empty when its values sum to zero over the retained samples.
# The totals are computed separately instead of being appended to the table as a
# temporary 'sum' row, so the sample rows, index and dtypes of genus_data are never
# disturbed and no positional bookkeeping (last row) is needed.
column_totals = genus_data.sum(axis=0)
empty_taxa = column_totals[column_totals == 0].index.tolist()

# drop the empty taxa columns
genus_data = genus_data.drop(columns=empty_taxa)

In [None]:
# drop 'Unknown' column
# errors="ignore": the column may already be absent (removed as an all-zero taxon, or this
# cell was re-run), in which case there is nothing left to drop and no KeyError is raised
genus_data = genus_data.drop(columns=["Unknown"], errors="ignore")

In [None]:
## Save the filtered genus table to a new file; the input "genus_health.csv" is left untouched.
## The run_id index is written as the first CSV column.
genus_data.to_csv("genus_health_all.csv")