# Correlated multi-omics features in SARS-CoV-2 infected samples

Examine the correlations returned by the integrative analysis.

Here we test the hypothesis that the multi-omics data contain features which are absent from the single-omics data.

Steps:
- Flatten correlations into a more human-readable format
- Preserve intra-omics correlations, or split them out into separate dataframes?
- Also look at the top loadings returned by sPLS-DA

In [1]:
import numpy as np
import pandas as pd

In [61]:
def _read_delimited(path, sep="\t", **read_options):
    """Load a delimited text file into a DataFrame (tab-separated by default)."""
    return pd.read_csv(path, sep=sep, **read_options)


# --- input locations -------------------------------------------------------
# single-block results
prot_plsda_infile = "../results/results/singleblock/proteome/proteome_1_PLSDA_max.txt"
prot_splsda_infile = "../results/results/singleblock/proteome/proteome_1_sPLSDA_max.txt"
tran_plsda_infile = "../results/results/singleblock/translatome/translatome_1_PLSDA_max.txt"
tran_splsda_infile = "../results/results/singleblock/translatome/translatome_1_sPLSDA_max.txt"
# multi-block results
diablo_prot_infile = "../results/results/multiblock/proteome_1_DIABLO_var_keepx_max.txt"
diablo_tran_infile = "../results/results/multiblock/translatome_1_DIABLO_var_keepx_max.txt"
diablo_corr_infile = "../results/results/multiblock/DIABLO_var_keepx_correlations.txt"
# annotation tables
panther_annotations_infile = "../results/results/multiblock/panther.txt"
mapping_annotations_infile = "../results/results/multiblock/mapping.csv"
covid_annotations_infile = "../results/results/multiblock/uniprot_SARS-COV2_annotations.tab"
# feature-name map files (only their "key" and "val" columns are read below)
prot_map_infile = "../data/proteome_mapfile.txt"
tran_map_infile = "../data/translatome_mapfile.txt"

# --- load everything into dataframes ---------------------------------------
prot_plsda = _read_delimited(prot_plsda_infile)
prot_splsda = _read_delimited(prot_splsda_infile)
tran_plsda = _read_delimited(tran_plsda_infile)
tran_splsda = _read_delimited(tran_splsda_infile)
diablo_prot = _read_delimited(diablo_prot_infile)
diablo_tran = _read_delimited(diablo_tran_infile)
diablo_corr = _read_delimited(diablo_corr_infile)
# the panther table is read without a header row, so its columns are numbered
panther_annotations = _read_delimited(panther_annotations_infile, header=None)
covid_annotations = _read_delimited(covid_annotations_infile)
# the mapping table is the only comma-separated input
mapping_annotations = _read_delimited(mapping_annotations_infile, sep=",")
_map_columns = ["key", "val"]
prot_map = _read_delimited(prot_map_infile, usecols=_map_columns)
tran_map = _read_delimited(tran_map_infile, usecols=_map_columns)

In [108]:
# Flatten the correlation matrix into a more human readable long format:
# one row per (level_0, level_1, 0) = (feature, feature, correlation) triple.
flat_corr = diablo_corr.stack().reset_index()

# Keep the pairs whose first feature carries the '_proteome' tag and whose
# second feature carries the '_translatome' tag.  Which orientation we pick
# should not matter because the matrix is expected to be symmetrical; the
# assertions at the end of this cell verify exactly that.
prot = flat_corr.level_0.str.contains('_proteome')
tran = flat_corr.level_1.str.contains('_translatome')
multiblock = flat_corr[prot & tran]

# Mirrored orientation (translatome feature first), used only for the check.
_prot = flat_corr.level_1.str.contains('_proteome')
_tran = flat_corr.level_0.str.contains('_translatome')
_multiblock = flat_corr[_prot & _tran]

_pair_cols = ["level_0", "level_1"]
ver1 = multiblock.reset_index(drop=True).sort_values(by=_pair_cols, ignore_index=True)
# Swap the two feature columns of the mirrored selection so that both
# versions list the proteome feature first, then sort them identically.
ver2 = _multiblock.reset_index(drop=True)[["level_1", "level_0", 0]]
ver2.columns = ["level_0", "level_1", 0]
ver2 = ver2.sort_values(by=_pair_cols, ignore_index=True)

# An empty selection would make the symmetry check meaningless, so fail with
# a message that says what is actually wrong.
assert not ver1.empty, \
    "No proteome/translatome correlation pairs were found"
assert ver1[_pair_cols].equals(ver2[_pair_cols]), \
    "Order of correlated features must be identical"
assert np.allclose(ver1[0], ver2[0]), \
    "Correlation values must be identical in both orientations"

# we want correlations across modalities only for this step
prot_tran = flat_corr[(flat_corr.level_0.str.contains("_proteome")) & (flat_corr.level_1.str.contains("_translatome"))]
# mirrored orientation; not used further in this cell, kept so that the name
# stays available to any later cell
tran_prot = flat_corr[(flat_corr.level_1.str.contains("_proteome")) & (flat_corr.level_0.str.contains("_translatome"))]
prot_tran.columns = ["proteome", "translatome", "correlation"]
prot_tran = prot_tran.sort_values(by=["proteome", "translatome"], ignore_index=True)

# clean up feature names: strip the block suffix.  Both patterns are anchored
# to the end of the string so that a name is never cut in the middle.
prot_tran = prot_tran.replace(regex={r'_proteome$': '', r'_translatome$': ''})

# reverse mapping of gene name back onto uniprot id.
# The map columns are renamed up front so that the merged frame is labelled by
# name instead of by position (no reliance on pandas' _x/_y suffix order or on
# the column order inside the map files).
_prot_ids = prot_map.rename(columns={"key": "prot_uniprot", "val": "prot_gene"})
_tran_ids = tran_map.rename(columns={"key": "tran_uniprot", "val": "tran_gene"})

# how="right" keeps every correlated pair, including those without a map entry
prot_tran_annot = pd.merge(
    _prot_ids, prot_tran, how="right", left_on="prot_gene", right_on="proteome"
).sort_values(by=["proteome", "translatome"], ignore_index=True)
prot_tran_annot = pd.merge(
    _tran_ids, prot_tran_annot, how="right", left_on="tran_gene", right_on="translatome"
).sort_values(by=["proteome", "translatome"], ignore_index=True)

# strip the remaining name suffixes from every string value
prot_tran_annot = prot_tran_annot.replace(regex={r'_prot$': '', r'_tran$': '', r'__FEATUREID$': ''})

# drop the raw feature-name columns and fix the column order that the
# annotation step relies on
prot_tran_annot = prot_tran_annot[["tran_uniprot", "tran_gene", "prot_uniprot", "prot_gene", "correlation"]]

# PANTHER annotations: keep columns 1, 2 and 4 and cut every string value at
# its first comma
panther_annot = panther_annotations.loc[:, [1, 2, 4]]
panther_annot = panther_annot.replace(regex={r',.*$': ''})
panther_annot.columns = [1, 2, 3]

# COVID annotations: strip a trailing "_HUMAN" from string values, then discard
# the columns that are not needed; the remaining three columns are relabelled
# 1, 2, 3 so that they line up with the PANTHER table
covid_annot = covid_annotations.replace(regex={r'_HUMAN$': ''})
covid_annot = covid_annot.drop(columns=["Status", "Gene names", "Entry name", "Length"])
covid_annot.columns = [1, 2, 3]

# stack both sources into a single annotation lookup table
annot = pd.concat([panther_annot, covid_annot], ignore_index=True)

# annotate genes: look up each partner of a correlated pair in the annotation
# table, keeping every pair even when no annotation matches (how="right")
_pair_cols = ["tran_uniprot", "tran_gene", "prot_uniprot", "prot_gene", "correlation"]
_prot_cols = ["tmp2", "prot_annotation", "prot_category"]
_tran_cols = ["tmp1", "tran_annotation", "tran_category"]

# proteome partner first; the three annotation columns land in front
prot_tran_annot = pd.merge(annot, prot_tran_annot, how="right", left_on=1, right_on="prot_uniprot")
prot_tran_annot.columns = _prot_cols + _pair_cols

# then the translatome partner, again prepended to the existing columns
prot_tran_annot = pd.merge(annot, prot_tran_annot, how="right", left_on=1, right_on="tran_uniprot")
prot_tran_annot.columns = _tran_cols + _prot_cols + _pair_cols

# final layout: the temporary merge keys (tmp1/tmp2) are left out, and rows
# are ordered from the strongest positive correlation downwards
cols = ["tran_uniprot", "tran_annotation", "tran_category", "tran_gene", "prot_uniprot", "prot_annotation", "prot_category", "prot_gene", "correlation"]
prot_tran_annot = prot_tran_annot[cols].sort_values(by="correlation", ascending=False, ignore_index=True)

# tab-separated output; the row index is written as the first column
prot_tran_annot.to_csv("multiblock_correlations_annotated.tsv", sep="\t")