In [1]:
import re
import numpy as np
import pandas as pd

In [2]:
infile_path = "multi_omics_master_heatmap_table.tsv"
data = pd.read_csv(infile_path, sep="\t", low_memory=False)
data["Species_Strain"] = data["Species"] + " " + data["Strain"]
data.replace(regex={
    r'^Proteomics MS1 DDA':'Proteomics_MS1', 
    r'^Proteomics MS2 DIA/SWATH':'Proteomics_MS2', 
    r'^Metabolomics GC-MS':'Metabolomics_GCMS',
    r'^Metabolomics LC-MS':'Metabolomics_LCMS',
    r';':'_',
}, inplace=True)
# for rnaseq, the id columns are swapped!
rnaseq = data["Type_of_Experiment"] == "RNA-Seq"
data.loc[rnaseq,['entity_id','additional_id']] = data.loc[rnaseq,['additional_id','entity_id']].values
data = data[data['entity_id'].notna()]
data

Unnamed: 0,Species,Strain,Type_of_Experiment,entity_id,Treatment_Type,replicate_name,Units,Log_Counts,Imputed,additional_id,Species_Strain
0,Escherichia coli,B36,Proteomics_MS1,WP_000002542.1,Sera,50979,LFQ,8.918769,False,EW036_RS06795,Escherichia coli B36
1,Escherichia coli,B36,Proteomics_MS1,WP_000002953.1,Sera,50979,LFQ,9.318898,False,EW036_RS22610,Escherichia coli B36
2,Escherichia coli,B36,Proteomics_MS1,WP_000003071.1,Sera,50979,LFQ,10.183896,False,EW036_RS05240,Escherichia coli B36
3,Escherichia coli,B36,Proteomics_MS1,WP_000003317.1,Sera,50979,LFQ,8.208334,False,EW036_RS07025,Escherichia coli B36
4,Escherichia coli,B36,Proteomics_MS1,WP_000003382.1,Sera,50979,LFQ,9.662881,False,EW036_RS00895,Escherichia coli B36
5,Escherichia coli,B36,Proteomics_MS1,WP_000003638.1,Sera,50979,LFQ,8.250029,False,EW036_RS08900,Escherichia coli B36
6,Escherichia coli,B36,Proteomics_MS1,WP_000003806.1,Sera,50979,LFQ,10.137702,False,EW036_RS23285,Escherichia coli B36
7,Escherichia coli,B36,Proteomics_MS1,WP_000003820.1,Sera,50979,LFQ,10.892056,False,EW036_RS20870,Escherichia coli B36
8,Escherichia coli,B36,Proteomics_MS1,WP_000004202.1,Sera,50979,LFQ,7.969379,False,EW036_RS17905,Escherichia coli B36
9,Escherichia coli,B36,Proteomics_MS1,WP_000004421.1,Sera,50979,LFQ,9.155700,False,EW036_RS02570,Escherichia coli B36


In [3]:
# get type > sp_st_treatment 

#         MEGF9    GINS2   FAM13A    CPT1A    SNX10   TRIM45     ELP2    ALOX5
# A0FJ 5.340520 5.353213 4.849088 6.191176 4.591788 3.150658 6.581283 2.657454
# A13E 5.292700 3.639934 4.924028 6.139249 3.981164 3.280685 7.150755 2.979337
# A0G0 5.015130 4.453413 4.350443 6.504090 3.073839 2.542987 6.430628 3.373541

# treatment foo treatment foo treatment bar
# df.loc[(df['column_name'] >= A) & (df['column_name'] <= B)]

def get_unique(data):
    sp_strain = data["Species_Strain"].unique()
    omics = data["Type_of_Experiment"].unique()
    treat = data["Treatment_Type"].unique()
    return sp_strain, omics, treat
    
def split_modality(data, omics):
    """Take whole dataframe, separate into omics for a strain"""
    data = data.loc[
        (data["Type_of_Experiment"].str.contains(omics))
    ]
    data["entity_id"] = data["entity_id"] + "_" + data["Type_of_Experiment"]
    return data[["entity_id", "Treatment_Type", "replicate_name", "Units", "Log_Counts"]].set_index(["entity_id"])

def split_replicate(data, replicate_name):
    """Split into replicate"""
    return data[data["replicate_name"] == replicate_name]

def split_treatment(data, treatment, replicate_name):
    """Split into treatment"""
    data = pd.DataFrame(data[data["Treatment_Type"] == treatment]["Log_Counts"]).T
    data.rename({"Log_Counts":replicate_name}, inplace=True)
    data.columns.name = None
    return data

def split_data(data, omics):
    """Split data into diablo-compatible format"""
    modalities = split_modality(data, omics)
    treatments = modalities["Treatment_Type"].unique()
    
    samples = list()
    for rep in modalities["replicate_name"].unique():
        replicate = split_replicate(modalities, rep)
        for treatment in treatments:
            sample = split_treatment(replicate, treatment, rep)
            if sample.empty:
                continue
            else:
                sample = sample.T.groupby(by=sample.T.index, as_index=True).mean()
                samples.append(sample)
    return samples

def rescale_log(data, base=2):
    data = base**data
    data = data.fillna(0) + 1
    return np.log2(data)


In [27]:
def main():
    omics = list(data["Type_of_Experiment"].unique())
    # for each omics platform, extract a unique feature id
    samples = [split_data(data, omic) for omic in omics]
    # 
    samples = [pd.concat(sample, axis=1, join="outer") for sample in samples]
    lfq = [x.fillna(0) for x in samples[:1]]
    log = [rescale_log(x) for x in samples[1:]]
    normalised = lfq + log
    normalised = [x.T for x in normalised]
    return omics, normalised
    
omics, normalised = main()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  after removing the cwd from sys.path.


In [28]:
[x.shape for x in normalised]
[x.values.min() for x in normalised]

[0.0, 0.0, 0.0, 0.0, 0.0]

In [None]:
# need find samples with all omics types

replicates = [x for x in data["replicate_name"].unique()]
omics = data["Type_of_Experiment"].unique()

# uniq_samples = list()
# for omic in omics:
#     sample = data.loc[(data["Type_of_Experiment"] == omic)]["replicate_name"].unique()
#     uniq_samples.append(sample)

unique_samples = [set(list(data.loc[(data["Type_of_Experiment"] == omic)]["replicate_name"].unique())) for omic in omics]
prot_uniq = unique_samples[0].union(unique_samples[1])
meta_uniq = unique_samples[2].union(unique_samples[3])
rseq_uniq = unique_samples[4]

In [29]:
normalised[0]

Unnamed: 0,WP_000002019.1_Proteomics_MS1,WP_000002056.1_Proteomics_MS1,WP_000002068.1_Proteomics_MS1,WP_000002071.1_Proteomics_MS1,WP_000002080.1_Proteomics_MS1,WP_000002529.1_Proteomics_MS1,WP_000002542.1_Proteomics_MS1,WP_000002678.1_Proteomics_MS1,WP_000002682.1_Proteomics_MS1,WP_000002683.1_Proteomics_MS1,...,WP_146707953.1_Proteomics_MS1,WP_146707954.1_Proteomics_MS1,WP_146707955.1_Proteomics_MS1,WP_146707961.1_Proteomics_MS1,WP_146707962.1_Proteomics_MS1,WP_146707963.1_Proteomics_MS1,WP_146707965.1_Proteomics_MS1,WP_146707966.1_Proteomics_MS1,WP_146707967.1_Proteomics_MS1,WP_146708028.1_Proteomics_MS1
50979,0.0,0.0,0.0,0.0,0.0,0.000000,8.918769,0.0,0.0,0.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0
50980,0.0,0.0,0.0,0.0,0.0,0.000000,8.890510,0.0,0.0,0.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0
50981,0.0,0.0,0.0,0.0,0.0,0.000000,8.833721,0.0,0.0,0.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0
50982,0.0,0.0,0.0,0.0,0.0,0.000000,8.901409,0.0,0.0,0.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0
50983,0.0,0.0,0.0,0.0,0.0,0.000000,8.940731,0.0,0.0,0.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0
50984,0.0,0.0,0.0,0.0,0.0,0.000000,8.884217,0.0,0.0,0.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0
50973,0.0,0.0,0.0,0.0,0.0,0.000000,8.873919,0.0,0.0,0.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0
50974,0.0,0.0,0.0,0.0,0.0,0.000000,8.934079,0.0,0.0,0.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0
50975,0.0,0.0,0.0,0.0,0.0,0.000000,8.966916,0.0,0.0,0.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0
50976,0.0,0.0,0.0,0.0,0.0,0.000000,8.954995,0.0,0.0,0.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0


In [None]:
# rseq_uniq.intersection(prot_uniq, meta_uniq)
saureus = data.loc[(data["Species"] == "Staphylococcus aureus")]["replicate_name"].unique()


In [None]:
saureus = data.loc[(data["Species"] == "Staphylococcus aureus")]["replicate_name"].unique()
normalised[4].loc[saureus]

In [None]:
data["Species"].unique()

In [31]:
data["Units"].unique()

array(['LFQ', 'Log2PeakArea', 'log2_abundance', 'log2_cpm'], dtype=object)