In [None]:
import pandas as pd
import pyreadr

In [None]:
# Toggle for writing results to disk: code guarded by `if save_files:` only
# saves its output when this is True.
save_files = True

# Process the metadata

1. Match the metadata to the samples in the OTU matrix. For this, some of the sample IDs need to be corrected.
2. Curate the chip annotation and the gender
3. Join the metadata with the reaction abundance to prepare the dataset for model fitting

To overwrite the saved files, set the boolean `save_files` in the cell above to `True`.

## 1. Filter the metadata file to only keep samples that are present in the data matrix

This step is necessary because the metadata was obtained at a later date than the samples and contains irrelevant rows.

In [None]:
# Metadata tables: the previously processed metadata and the updated export.
meta_original = pd.read_csv("../data/processed_files/temp_metadata.csv", index_col=0)
meta_updated = pd.read_csv("../data/original_files/After_Chip126_2022_december.tsv", sep="\t")

# Sample-ID curation table, indexed by its "metadata" column (the column itself is kept).
ID_curation = pd.read_csv(
    "../data/original_files/unmatch_ID_BY.csv", index_col=0
).set_index("metadata", drop=False)

# OTU matrix: extract the pandas data frame for the only object available in the file.
data = pyreadr.read_r("../data/processed_files/otumat.rds")
otu_table = data[None]

In [None]:
# Remove the first row of the metadata, which holds the data type of each column
# rather than a sample.
# NOTE: this cell is not idempotent. After it has run, the frame is indexed by
# sample ID and the label 0 no longer exists, so reload the metadata before
# running it a second time.
meta_updated = meta_updated.drop(index=0)

# Curate the IDs of the misannotated samples.
# The old -> new lookup is built from the two columns explicitly. Note that
# `value in some_series` tests the Series *index*, not its values, so a plain
# dict avoids depending on how ID_curation happens to be indexed.
id_corrections = dict(zip(ID_curation["metadata"], ID_curation["otu table"]))
curated_IDs_updated = meta_updated["#SampleID"].map(
    # IDs without a correction are kept unchanged.
    lambda sample_id: id_corrections.get(sample_id, sample_id)
)
meta_updated["#SampleID"] = curated_IDs_updated
meta_updated.index = meta_updated["#SampleID"]
meta_updated

The two dataframes do not contain the same number of samples: there are 4248 samples in the original metadata and 4286 in the new metadata. Those 38 extra samples need to be removed. To do so, we match the updated metadata to the OTU matrix.


In [None]:
# Keep only the metadata rows whose label is a column of the OTU matrix,
# in the order of those columns.
# NOTE(review): recent pandas versions raise a KeyError if a requested label is
# missing from the metadata index — confirm for the version in use.
meta = meta_updated.loc[otu_table.columns]
meta

## 2. Curate the chip annotation

In [None]:
# Curate the chip annotation
def correct_chip(s):
    """Normalise a chip annotation to its canonical label.

    Any string containing "Chip157" is collapsed to exactly "Chip157".
    Every other value is returned unchanged.

    Parameters
    ----------
    s : object
        A single chip annotation. Usually a string, but non-string values
        (for example the NaN pandas uses for an empty cell) are accepted.

    Returns
    -------
    object
        "Chip157" if ``s`` is a string containing that label, otherwise ``s``
        itself.
    """
    # Guard against non-string values: `"Chip157" in nan` would raise a TypeError.
    if isinstance(s, str) and "Chip157" in s:
        return "Chip157"
    return s


# Normalise every chip annotation element-wise, then list the distinct labels
# that remain so the result can be checked by eye.
meta["IontorrentChip"] = meta["IontorrentChip"].map(correct_chip)
meta["IontorrentChip"].unique()

In [None]:
# Write the curated metadata to disk only when saving is enabled via `save_files`.
if save_files:
    meta.to_csv("../data/processed_files/metadata.csv")