# Parse BPA for input into pipeline

For each strain we want one file per omics data type, with the samples from all conditions (treatments) merged into that file.

In [1]:
import hashlib

import matplotlib
import pandas as pd
# pd.set_option('display.max_rows', None)

In [2]:
infile_path = "../multi_omics_master_heatmap_table.tsv"
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk.
# NOTE(review): the saved output of this cell looks like the tail of a pandas
# DtypeWarning (mixed types in a column) - confirm against a fresh run.
data = pd.read_csv(infile_path, sep="\t", low_memory=False)

# two different metabolomics and proteomics platforms used, so the five raw
# experiment labels are collapsed onto three omics types
exp_old = data["Type_of_Experiment"].unique().tolist()
exp_new = ["Proteomics", "Proteomics", "Metabolomics",
           "Metabolomics", "Transcriptomics"]
# The mapping is positional: exp_old is in order of first appearance in the
# file, so exp_new must list the replacement labels in that same order.
# zip() would silently truncate on a length mismatch and leave labels
# unmapped, so fail loudly instead.
if len(exp_old) != len(exp_new):
    raise ValueError(
        "Expected {} experiment types but found {}: {}".format(
            len(exp_new), len(exp_old), exp_old
        )
    )
exp_map = dict(zip(exp_old, exp_new))
# Assign the relabelled column back rather than mutating the frame in place.
data["Type_of_Experiment"] = data["Type_of_Experiment"].replace(exp_map)

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
# take one strain as example first
strain = "AJ292"

# Keep only this strain's rows. `data` itself is narrowed here, so choosing a
# different strain means re-running the loading cell first.
data = data.loc[data["Strain"] == strain]

# One row per distinct (replicate, treatment, omics type) combination,
# looked up by replicate name.
sample_columns = ["replicate_name", "Treatment_Type", "Type_of_Experiment"]
sample_info = data[sample_columns].drop_duplicates().set_index("replicate_name")

In [4]:
def _pivot_omics(table, omics_name):
    """Return a replicate x entity matrix of Log_Counts for one omics type.

    Rows of `table` whose "Type_of_Experiment" equals `omics_name` are
    reshaped so that every "replicate_name" becomes a row and every
    "entity_id" becomes a column. If a (replicate, entity) pair occurs more
    than once, pivot_table's default aggregation (the mean) is applied.
    """
    subset = table[table["Type_of_Experiment"] == omics_name]
    return subset[["entity_id", "replicate_name", "Log_Counts"]].pivot_table(
        index="replicate_name",
        columns="entity_id",
        values="Log_Counts",
    )


meta = _pivot_omics(data, "Metabolomics")
prot = _pivot_omics(data, "Proteomics")
tran = _pivot_omics(data, "Transcriptomics")

# check for missing values in data: total number of empty cells per block,
# in the order metabolomics, proteomics, transcriptomics
missing = [
    block.isnull().sum(axis=1).sum()
    for block in (meta, prot, tran)
]

In [5]:
def remap_samples(sample_info, omics_block, omics_name):
    """Relabel one omics matrix by treatment and save it as ``<omics_name>.tsv``.

    Parameters
    ----------
    sample_info : pandas.DataFrame
        Indexed by "replicate_name", with "Treatment_Type" and
        "Type_of_Experiment" columns.
    omics_block : pandas.DataFrame
        Measurement matrix whose index is also named "replicate_name".
    omics_name : str
        Value of "Type_of_Experiment" to select; also the stem of the
        output file name.

    Returns
    -------
    pandas.DataFrame
        `omics_block` restricted to the replicates listed for `omics_name`,
        sorted by treatment, with each row labelled
        ``<Treatment_Type>_<position>``. The position is the 0-based row
        number in the sorted table, so it keeps counting across treatments
        rather than restarting for each one.

    Side effect: the returned table is written, tab-separated, to
    ``<omics_name>.tsv`` in the current working directory.
    """
    belongs_here = sample_info["Type_of_Experiment"] == omics_name
    treatments = sample_info.loc[belongs_here, "Treatment_Type"]

    # Inner join on the shared replicate_name index: replicates missing from
    # either side are dropped.
    joined = pd.DataFrame(treatments).merge(
        omics_block, left_index=True, right_index=True
    )

    # Swap the replicate name for the treatment, order rows by treatment, and
    # end up with a plain 0..n-1 index that supplies the label suffix.
    by_treatment = (
        joined.reset_index()
        .set_index("Treatment_Type")
        .drop("replicate_name", axis=1)
        .sort_values("Treatment_Type")
        .reset_index()
    )

    position = pd.Series(by_treatment.index, dtype=str)
    by_treatment["tmp"] = by_treatment["Treatment_Type"] + "_" + position
    relabelled = by_treatment.drop("Treatment_Type", axis=1).set_index("tmp")
    relabelled.index.name = None

    relabelled.to_csv(omics_name + ".tsv", sep="\t")
    return relabelled
    
# Relabel every omics block in turn; each call also writes <omics name>.tsv.
mapped_meta, mapped_prot, mapped_tran = [
    remap_samples(sample_info, omics_block, omics_name)
    for omics_block, omics_name in (
        (meta, "Metabolomics"),
        (prot, "Proteomics"),
        (tran, "Transcriptomics"),
    )
]

In [6]:
# Build the sample -> growth medium table ("targets") from the sample labels.
# Labels have the form "<Treatment_Type>_<position>", so the treatment is
# everything before the LAST underscore. Splitting on the first underscore
# would truncate treatment names that themselves contain "_".
# NOTE(review): only the metabolomics labels are used here; this presumably
# relies on the proteomics and transcriptomics blocks carrying the same
# labels - verify.
sample_labels = pd.Series(mapped_meta.index, index=mapped_meta.index)
growth_media = sample_labels.str.rsplit("_", n=1).str[0]
pheno = pd.DataFrame({"Growth_Media": growth_media})
pheno.index.name = None
pheno.to_csv("targets.tsv", sep="\t")

In [7]:
# Print an MD5 checksum for each generated table and for the input table, in
# the "MD5 (<path>) = <digest>" layout of the BSD/macOS `md5` command, but
# computed in Python so the cell also runs where that command is unavailable.
def _md5_hex(path, chunk_size=1 << 20):
    """Return the hexadecimal MD5 digest of the file at `path`.

    The file is read in `chunk_size`-byte pieces so that a large table does
    not have to be held in memory at once.
    """
    digest = hashlib.md5()
    with open(path, "rb") as handle:
        for chunk in iter(lambda: handle.read(chunk_size), b""):
            digest.update(chunk)
    return digest.hexdigest()


for checked_path in (
    "Metabolomics.tsv",
    "Proteomics.tsv",
    "Transcriptomics.tsv",
    "targets.tsv",
    "../multi_omics_master_heatmap_table.tsv",
):
    print("MD5 ({}) = {}".format(checked_path, _md5_hex(checked_path)))

MD5 (Metabolomics.tsv) = 9dafa5f07f26d05d1a2c8243fc1210d2
MD5 (Proteomics.tsv) = da31039054c4a3e389be76cb62e52080
MD5 (Transcriptomics.tsv) = 654d1344e5ee48503153fad2e1ed838d
MD5 (targets.tsv) = fd7fbb00c917cdaa8789ece2f9515cf8
MD5 (../multi_omics_master_heatmap_table.tsv) = 2cf0f076d94b9ed0f0810862997a76fe
