In [1]:
import re
import numpy as np
import pandas as pd

In [2]:
def load_data():
    infile_path = "../data/multi_omics_master_heatmap_table.tsv.gz"
    data = pd.read_csv(infile_path, sep="\t", low_memory=False, compression="gzip")
    data["Species_Strain"] = data["Species"] + " " + data["Strain"]
    data.replace(regex={
        r'^Proteomics MS1 DDA':'Proteomics_MS1', 
        r'^Proteomics MS2 DIA/SWATH':'Proteomics_MS2', 
        r'^Metabolomics GC-MS':'Metabolomics_GCMS',
        r'^Metabolomics LC-MS':'Metabolomics_LCMS',
        r';':'_',
    }, inplace=True)
    # for rnaseq, the id columns are swapped!
    rnaseq = data["Type_of_Experiment"] == "RNA-Seq"
    data.loc[rnaseq,['entity_id','additional_id']] = data.loc[rnaseq,['additional_id','entity_id']].values
    data = data[data['entity_id'].notna()]
    return data

data = load_data()

In [3]:
# reformat and standardise data to match below sample
# columns are features, rows are samples
#
#         MEGF9    GINS2   FAM13A    CPT1A    SNX10   TRIM45     ELP2    ALOX5
# A0FJ 5.340520 5.353213 4.849088 6.191176 4.591788 3.150658 6.581283 2.657454
# A13E 5.292700 3.639934 4.924028 6.139249 3.981164 3.280685 7.150755 2.979337
# A0G0 5.015130 4.453413 4.350443 6.504090 3.073839 2.542987 6.430628 3.373541

def get_unique(data):
    sp_strain = data["Species_Strain"].unique()
    omics = data["Type_of_Experiment"].unique()
    treat = data["Treatment_Type"].unique()
    return sp_strain, omics, treat
    
def split_modality(data, omics):
    """Take whole dataframe, separate into omics for a strain"""
    data = data.loc[
        (data["Type_of_Experiment"].str.contains(omics))
    ]
    data["entity_id"] = data["entity_id"] + "_" + data["Type_of_Experiment"]
    return data[["entity_id", "Treatment_Type", "replicate_name", "Units", "Log_Counts"]].set_index(["entity_id"])

def split_replicate(data, replicate_name):
    """Split into replicate"""
    return data[data["replicate_name"] == replicate_name]

def split_treatment(data, treatment, replicate_name):
    """Split into treatment"""
    data = pd.DataFrame(data[data["Treatment_Type"] == treatment]["Log_Counts"]).T
    data.rename({"Log_Counts":replicate_name}, inplace=True)
    data.columns.name = None
    return data

def split_data(data, omics):
    """Split data into diablo-compatible format"""
    modalities = split_modality(data, omics)
    treatments = modalities["Treatment_Type"].unique()
    
    samples = list()
    for rep in modalities["replicate_name"].unique():
        replicate = split_replicate(modalities, rep)
        for treatment in treatments:
            sample = split_treatment(replicate, treatment, rep)
            if sample.empty:
                continue
            else:
                sample = sample.T.groupby(by=sample.T.index, as_index=True).mean()
                samples.append(sample)
    return samples

def rescale_log(data, base=2):
    data = base**data
    data = data.fillna(0) + 1
    return np.log2(data)


In [4]:
# pick ids corresponding to strains of interest
# we want strains represented across all omics
def find_multiomics_strains(data, omic_main):
    print(omic_main)
    omic_data = [set(data.loc[(data["Type_of_Experiment"].str.contains(omic))]["Strain"].unique().tolist()) for omic in omic_main]
    omic_strains = omic_data[0].intersection(omic_data[1], omic_data[2], omic_data[3], omic_data[4])
    return omic_strains

def find_sample_names(data, omic_strains):
    print(omic_strains)
    omic_names = [data.loc[(data["Strain"].str.contains(strain))]["replicate_name"].unique() for strain in omic_strains]
    sample_names = [data.loc[(data["Strain"] == strain)]["replicate_name"].unique().tolist() for strain in omic_strains] 
    return sample_names

def extract_info(data, sample_names):
    treatments = data.drop_duplicates(["replicate_name", "Treatment_Type"])
    treatments = [treatments.loc[(treatments["replicate_name"].isin(sample_name))] for sample_name in sample_names]
    treatments = [t[["replicate_name", "Treatment_Type"]].set_index("replicate_name") for t in treatments]
    return treatments

def extract_data(normalised, strain_samples, strain):
    print(strain)
    return [x.loc[strain_samples[strain].index].fillna(0) + 1 for x in normalised]

In [5]:
def main(data):
    omics_main = list(data["Type_of_Experiment"].unique())
    # for each omics platform, extract a unique feature id
    samples = [split_data(data, omic) for omic in omics_main]
    # 
    samples = [pd.concat(sample, axis=1, join="outer") for sample in samples]
    # this is not log scale data
    lfq = [x.fillna(0) for x in samples[:1]]
    # add offset by unlogging > +1 > relogging
    log = [rescale_log(x) for x in samples[1:]]
    normalised = lfq + log
    normalised = [x.T for x in normalised]

    omic_strains = find_multiomics_strains(data, omics_main)
    sample_names = find_sample_names(data, omic_strains)
    sample_info = extract_info(data, sample_names)
    strain_samples = dict(zip(omic_strains, sample_info))    
    
    for key in strain_samples.keys():
        strain = extract_data(normalised, strain_samples, key)
        for omic, data in tuple(zip(omics_main, strain)):
            outfile_path = "".join(["../results/", key, "_", omic, ".tsv"])
            info_path = "".join(["../results/", key, "_info.tsv"])
            data.to_csv(outfile_path, sep="\t")
            strain_samples[key].to_csv(info_path, sep="\t")

main(data)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  


['Proteomics_MS1', 'Proteomics_MS2', 'Metabolomics_GCMS', 'Metabolomics_LCMS', 'RNA-Seq']
{'SP444', '5448', 'MS14385', 'AJ218', 'AJ055', 'BPH2819', 'B36', 'BPH2986', 'MS14386', 'BPH2947', 'BPH2900', 'AJ292', 'PS006', 'BPH2760', 'KPC2', 'HKU419', 'MS14384', '03-311-0071', 'PS003', '04153260899A', 'MS14387'}
SP444


Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike


5448
MS14385
AJ218
AJ055
BPH2819
B36
BPH2986
MS14386
BPH2947
BPH2900
AJ292
PS006
BPH2760
KPC2
HKU419
MS14384
03-311-0071
PS003
04153260899A
MS14387
