# Correlated multi-omics features in SARS-CoV-2 infected samples

Annotate the correlations returned by the integrative analysis.

In [1]:
import numpy as np
import pandas as pd

These results are already provided as part of the repository in `results`. Steps listed here are for reproducibility.

```
# Feature names.
# Running these commands merges the per-component feature tables of each
# analysis into one table per omics block.
# This will work after running the pipeline with example.sh.
# NOTE(review): ${dir} is a relative path without a trailing slash and is
# reused as a file prefix *after* `cd ${dir}`. As written,
# "${dir}proteome_1_sPLSDA_max.txt" expands to
# "../results/case_study_1/EXAMPLEproteome_1_sPLSDA_max.txt", resolved from
# inside the new working directory. Confirm the intended layout (for example
# an absolute path ending in "/") before running.
dir="../results/case_study_1/EXAMPLE"
export dir

cd ${dir}
# Each command below keeps a single header line (taken from the component-1
# file), appends the lines of every file matching the glob, and drops the lines
# containing "importance" (presumably the repeated header rows - confirm).
# The merged table is written to the current working directory.
(head -n1 ${dir}proteome_1_sPLSDA_max.txt; cat ${dir}proteome_*sPLSDA*max.txt | grep -v "importance") \
    > proteome_sPLSDA_keepx_max.txt
(head -n1 ${dir}translatome_1_sPLSDA_max.txt; cat ${dir}translatome_*sPLSDA*max.txt | grep -v "importance") \
    > translatome_sPLSDA_keepx_max.txt
(head -n1 ${dir}proteome_1_DIABLO_var_keepx_max.txt; cat ${dir}proteome_*DIABLO*keepx_max*txt | grep -v "importance") \
    > proteome_DIABLO_keepx_max.txt
(head -n1 ${dir}translatome_1_DIABLO_var_keepx_max.txt; cat ${dir}translatome_*DIABLO*keepx_max*txt | grep -v "importance") \
    > translatome_DIABLO_keepx_max.txt
```

In [2]:
# Input and output locations, relative to this notebook.
# change these file paths accordingly

# inputs: merged per-omics feature tables and the multiblock correlation matrix
prot_splsda_infile = "../../results/case_study_1/proteome_sPLSDA_keepx_max.txt"
tran_splsda_infile = "../../results/case_study_1/translatome_sPLSDA_keepx_max.txt"
diablo_corr_infile = "../../results/case_study_1/DIABLO_var_keepx_correlations.txt"
# inputs: annotation and mapping tables
panther_annotations_infile = "../../data/case_study_1/panther.txt"
mapping_annotations_infile = "../../data/case_study_1/mapping.csv"
covid_annotations_infile = "../../data/uniprot_SARS-COV2_annotations.tab"
prot_map_infile = "../../data/case_study_1/proteome_mapfile.txt"
tran_map_infile = "../../data/case_study_1/translatome_mapfile.txt"

# outputs: annotated tables written by the cells below
prot_splsda_outfile = "../../results/case_study_1/proteome_splsda.tsv"
tran_splsda_outfile = "../../results/case_study_1/translatome_splsda.tsv"
corr_annotated_outfile = "../../results/case_study_1/multiblock_correlations_annotated.tsv"

# Load everything up front so a missing or misnamed file fails here.
prot_splsda = pd.read_csv(prot_splsda_infile, sep="\t")
tran_splsda = pd.read_csv(tran_splsda_infile, sep="\t")
diablo_corr = pd.read_csv(diablo_corr_infile, sep="\t")
# the panther table is read without a header row, so its columns are 0, 1, 2, ...
panther_annotations = pd.read_csv(panther_annotations_infile, sep="\t", header=None)
covid_annotations = pd.read_csv(covid_annotations_infile, sep="\t")
mapping_annotations = pd.read_csv(mapping_annotations_infile, sep=",")
# only the "key" and "val" columns of the map files are loaded
prot_map = pd.read_csv(prot_map_infile, sep="\t", usecols=["key", "val"])
tran_map = pd.read_csv(tran_map_infile, sep="\t", usecols=["key", "val"])

In [3]:
def _annotate_by_key(features, lookup, lookup_key, feature_key, renames):
    """Attach the columns of ``lookup`` to ``features`` and return a new frame.

    Parameters
    ----------
    features : pandas.DataFrame
        Table to annotate; every one of its rows is kept (right join).
    lookup : pandas.DataFrame
        Annotation table; it is not modified.
    lookup_key : hashable
        Column of ``lookup`` holding the identifier to match on.
    feature_key : str
        Column of ``features`` holding the identifier to match on.
    renames : dict
        Maps the remaining ``lookup`` column labels to their names in the
        result, so that nothing has to be relabelled by position afterwards.

    Returns
    -------
    pandas.DataFrame
        The renamed ``lookup`` columns followed by the columns of ``features``.
        The ``lookup_key`` column is dropped; ``feature_key`` carries the id.
    """
    labelled = lookup.rename(columns=renames)
    merged = pd.merge(
        labelled, features, how="right", left_on=lookup_key, right_on=feature_key
    )
    return merged.drop(columns=[lookup_key])


# flatten the correlations for a more human readable format:
# one row per feature pair, with columns level_0, level_1 and 0
flat_corr = diablo_corr.stack().reset_index()

# order of cols doesn't matter here, matrix is symmetrical
prot = flat_corr.level_0.str.contains('_proteome')
tran = flat_corr.level_1.str.contains('_translatome')
multiblock = flat_corr[prot & tran]

# the block below proves the above point by selecting the mirrored orientation
_prot = flat_corr.level_1.str.contains('_proteome')
_tran = flat_corr.level_0.str.contains('_translatome')
_multiblock = flat_corr[_prot & _tran]

ver1 = multiblock.sort_values(by=["level_0", "level_1"], ignore_index=True)
# swap the two feature columns of the mirrored selection so both versions line up
ver2 = _multiblock[["level_1", "level_0", 0]].copy()
ver2.columns = ["level_0", "level_1", 0]
ver2 = ver2.sort_values(by=["level_0", "level_1"], ignore_index=True)

assert not ver1.empty, \
    "No proteome/translatome feature pairs found in the correlation matrix"
assert ver1[["level_0", "level_1"]].equals(ver2[["level_0", "level_1"]]), \
    "Order of correlated features must be identical"
assert np.isclose(ver1[0], ver2[0]).all(), \
    "Correlation values must be identical in both orientations"

# we want correlations across modalities only for this step
# (copies, so relabelling below never touches a slice of flat_corr)
prot_tran = multiblock.copy()
tran_prot = _multiblock.copy()  # mirrored orientation; not used further in this cell
prot_tran.columns = ["proteome", "translatome", "correlation"]
prot_tran = prot_tran.sort_values(by=["proteome", "translatome"], ignore_index=True)

# clean up file names
prot_tran = prot_tran.replace(regex={r'_prot_proteome$': '', r'_tran_translatome': ''})
prot_tran = prot_tran.replace(regex={r'_proteome$': '', r'_translatome': ''})

# use panther annotations: keep columns 1, 2 and 4, cut each value at its first comma
panther_annot = panther_annotations[[1,2,4]].replace(regex={r',.*$': ''})
panther_annot.columns = [1,2,3]

# use covid annotations: strip the _HUMAN suffix and drop the unused columns;
# exactly three columns must remain for the relabelling below to succeed
covid_annot = (
    covid_annotations
    .replace(regex={r'_HUMAN$': ''})
    .drop(columns=["Status", "Gene names", "Entry name", "Length"])
)
covid_annot.columns = [1, 2, 3]

# merge annotations; column 1 is the identifier, columns 2 and 3 the annotation
annot = pd.concat([panther_annot, covid_annot], ignore_index=True)

# annotate the proteome side first, then the translatome side
prot_tran_annot = _annotate_by_key(
    prot_tran, annot, 1, "proteome",
    {2: "prot_annotation", 3: "prot_category"},
)
prot_tran_annot = _annotate_by_key(
    prot_tran_annot, annot, 1, "translatome",
    {2: "tran_annotation", 3: "tran_category"},
)
cols = [
    "translatome", "tran_annotation", "tran_category",
    "proteome", "prot_annotation", "prot_category", "correlation"
]
# This intermediate ranking is kept on purpose: the default sort is not
# guaranteed to be stable, so removing it could reorder rows with tied
# correlations in the exported file.
prot_tran_annot = prot_tran_annot[cols].sort_values(
    by="correlation", ascending=False, ignore_index=True
)

# add feature names: first two columns of the mapping table are (name, id)
id_to_feature = mapping_annotations.iloc[:,0:2].copy()
id_to_feature.columns = ["name", "id"]
prot_tran_annot = _annotate_by_key(
    prot_tran_annot, id_to_feature, "id", "proteome", {"name": "prot_name"}
)
prot_tran_annot = _annotate_by_key(
    prot_tran_annot, id_to_feature, "id", "translatome", {"name": "tran_name"}
)
cols = ["translatome", "tran_name", "tran_annotation", "tran_category", "proteome", "prot_name", "prot_annotation", "prot_category", "correlation"]
prot_tran_annot = prot_tran_annot[cols].sort_values(
    by="correlation", ascending=False, ignore_index=True
)
# written once, after all annotation steps; the row index is included as the
# first column of the file
prot_tran_annot.to_csv(corr_annotated_outfile, sep="\t")
prot_tran_annot

Unnamed: 0,translatome,tran_name,tran_annotation,tran_category,proteome,prot_name,prot_annotation,prot_category,correlation
0,Q7Z3J2,C16orf62,VPS35 endosomal protein sorting factor-like;VP...,,Q7Z3J2,C16orf62,VPS35 endosomal protein sorting factor-like;VP...,,0.977400
1,Q00839,HNRNPU,Heterogeneous nuclear ribonucleoprotein U;HNRN...,,P0DTC6,,Non-structural protein 6 (ns6) (Accessory prot...,Severe acute respiratory syndrome coronavirus ...,0.973599
2,Q00839,HNRNPU,Heterogeneous nuclear ribonucleoprotein U;HNRN...,,P0DTC1,,Replicase polyprotein 1a (pp1a) (ORF1a polypro...,Severe acute respiratory syndrome coronavirus ...,0.971105
3,Q7Z3J2,C16orf62,VPS35 endosomal protein sorting factor-like;VP...,,Q9NPE3,NOP10,H/ACA ribonucleoprotein complex subunit 3;NOP1...,RNA binding protein(PC00031),0.969983
4,Q07021,C1QBP,Complement component 1 Q subcomponent-binding ...,,P0DTC6,,Non-structural protein 6 (ns6) (Accessory prot...,Severe acute respiratory syndrome coronavirus ...,0.969632
...,...,...,...,...,...,...,...,...,...
6365,Q9Y281,,Cofilin-2;CFL2;ortholog,non-motor actin binding protein(PC00165),Q15465,SHH,Sonic hedgehog protein;SHH;ortholog,,-0.962754
6366,P47756,CAPZB,F-actin-capping protein subunit beta;CAPZB;ort...,non-motor actin binding protein(PC00165),Q99519,NEU1,Sialidase-1;NEU1;ortholog,hydrolase(PC00121),-0.963688
6367,Q9NX63,CHCHD3,MICOS complex subunit MIC19;CHCHD3;ortholog,,Q99519,NEU1,Sialidase-1;NEU1;ortholog,hydrolase(PC00121),-0.972778
6368,P47756,CAPZB,F-actin-capping protein subunit beta;CAPZB;ort...,non-motor actin binding protein(PC00165),Q15465,SHH,Sonic hedgehog protein;SHH;ortholog,,-0.977356


In [4]:
# Annotate the single-omics proteome table and export it.
# The table is read again from prot_splsda_infile. The row labels are moved
# into an "index" column so that replace(), which acts on values, can strip the
# suffixes from them; they are then restored as the index.
prot_splsda = (
    pd.read_csv(prot_splsda_infile, sep="\t")
    .reset_index()
    .replace(regex={r'_prot$': ''})
    .replace(regex={r'_tran$': ''})
    .replace(regex={r'__FEATUREID$': ''})
    .set_index("index")
)

# right join: every feature of the table is kept, matched on the "index" level
prot_splsda_annot = pd.merge(annot, prot_splsda, how="right", left_on=1, right_on="index")
# the three leading columns come from annot (1, 2, 3); relabel them, keep the rest
kept_labels = prot_splsda_annot.columns.to_list()[3:]
prot_splsda_annot.columns = ["index", "name", "annotation"] + kept_labels
prot_splsda_annot = (
    prot_splsda_annot
    .sort_values(by="importance", ascending=False, ignore_index=True)
    .set_index("index")
)
prot_splsda_annot.to_csv(prot_splsda_outfile, sep="\t")
prot_splsda_annot

Unnamed: 0_level_0,name,annotation,level_0,name,annotation,Control_10h,Control_24h,Control_2h,Control_6h,Virus_10h,...,Contrib.Control_2h,Contrib.Control_6h,Contrib.Virus_10h,Contrib.Virus_24h,Contrib.Virus_2h,Contrib.Virus_6h,Contrib,GroupContrib,color,importance
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
P08174,Complement decay-accelerating factor;CD55;orth...,,0,Complement decay-accelerating factor;CD55;orth...,,0.519820,0.013958,0.558856,-2.479036,0.662507,...,False,False,False,False,True,False,False,Virus_2h,black,0.793975
Q16719,Kynureninase;KYNU;ortholog,hydrolase(PC00121),1,Kynureninase;KYNU;ortholog,hydrolase(PC00121),0.656747,0.311750,0.803123,-2.509965,0.474387,...,True,False,False,False,False,False,False,Control_2h,#C2C2C2,0.368682
Q99988,Growth/differentiation factor 15;GDF15;ortholog,growth factor(PC00112),2,,,-0.773852,-1.221887,1.355051,0.397736,-1.156558,...,False,False,False,False,True,False,False,Virus_2h,black,0.359381
Q8WVV9_P14866,,,3,,,-1.208401,-1.085086,0.919806,0.180927,-1.364740,...,False,False,False,False,True,False,False,Virus_2h,black,0.343092
Q2UVX4,Complement C3;C3;ortholog,protease inhibitor(PC00191),4,,,-0.192311,-0.355171,1.092389,-1.992105,0.225023,...,False,False,False,False,True,False,False,Virus_2h,black,0.284355
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Q71RC2,La-related protein 4;LARP4;ortholog,RNA binding protein(PC00031),105,,,0.896134,0.500945,-1.020269,-0.273502,1.500922,...,False,False,True,False,False,False,False,Virus_10h,#CC79A7,-0.346225
P04183,Thymidine kinase,nucleotide kinase(PC00172),106,,,-0.714823,2.420303,-0.278035,-0.482255,-0.578664,...,False,False,False,False,False,False,False,Control_24h,#F68B33,-0.376758
P31350,Ribonucleoside-diphosphate reductase subunit M...,reductase(PC00198),107,,,-0.608445,2.650162,-0.049620,-0.474293,-0.641835,...,False,False,False,False,False,False,False,Control_24h,#F68B33,-0.389780
P00374,Dihydrofolate reductase;DHFR;ortholog,,108,,,-0.613884,2.413696,0.159712,-0.793074,-0.606045,...,False,False,False,False,False,False,False,Control_24h,#F68B33,-0.391759


In [5]:
# Annotate the single-omics translatome table and export it.
# The table is read again from tran_splsda_infile. The row labels are moved
# into an "index" column so that replace(), which acts on values, can strip the
# suffixes from them; they are then restored as the index.
tran_splsda = (
    pd.read_csv(tran_splsda_infile, sep="\t")
    .reset_index()
    .replace(regex={r'_prot$': ''})
    .replace(regex={r'_tran$': ''})
    .replace(regex={r'__FEATUREID$': ''})
    .set_index("index")
)

# right join: every feature of the table is kept, matched on the "index" level
tran_splsda_annot = pd.merge(annot, tran_splsda, how="right", left_on=1, right_on="index")
# the three leading columns come from annot (1, 2, 3); relabel them, keep the rest
kept_labels = tran_splsda_annot.columns.to_list()[3:]
tran_splsda_annot.columns = ["index", "name", "annotation"] + kept_labels
tran_splsda_annot = (
    tran_splsda_annot
    .sort_values(by="importance", ascending=False, ignore_index=True)
    .set_index("index")
)
tran_splsda_annot.to_csv(tran_splsda_outfile, sep="\t")
tran_splsda_annot

Unnamed: 0_level_0,name,annotation,level_0,name,annotation,Control_10h,Control_24h,Control_2h,Control_6h,Virus_10h,...,Contrib.Control_2h,Contrib.Control_6h,Contrib.Virus_10h,Contrib.Virus_24h,Contrib.Virus_2h,Contrib.Virus_6h,Contrib,GroupContrib,color,importance
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
P11142,Heat shock cognate 71 kDa protein;HSPA8;ortholog,,0,Heat shock cognate 71 kDa protein;HSPA8;ortholog,,1.773624,-0.810417,-1.077815,0.705951,0.784379,...,False,False,False,False,False,False,False,Control_10h,#388ECC,0.459400
P08865,40S ribosomal protein SA;RPSA;ortholog,ribosomal protein(PC00202),1,40S ribosomal protein SA;RPSA;ortholog,ribosomal protein(PC00202),2.082822,-0.579825,-0.789736,0.320292,0.767054,...,False,False,False,False,False,False,False,Control_10h,#388ECC,0.396760
Q14978,Nucleolar and coiled-body phosphoprotein 1;NOL...,,2,,,1.686611,-0.546829,-0.836080,0.829102,0.417661,...,False,False,False,False,False,False,False,Control_10h,#388ECC,0.349158
P40616,ADP-ribosylation factor-like protein 1;ARL1;or...,G-protein(PC00020),3,,,-0.608382,0.458432,0.557968,1.289721,-1.698614,...,False,True,False,False,False,False,False,Control_6h,#009E73,0.282703
P19338,Nucleolin;NCL;ortholog,,4,Nucleolin;NCL;ortholog,,1.769154,-0.522630,-0.681460,0.798107,0.295510,...,False,False,False,False,False,False,False,Control_10h,#388ECC,0.280191
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Q7Z3J2,VPS35 endosomal protein sorting factor-like;VP...,,195,VPS35 endosomal protein sorting factor-like;VP...,,-0.051014,-0.771387,-0.097190,-0.381565,-0.310444,...,False,False,False,True,False,False,False,Virus_24h,#F0E442,-0.309439
Q9Y281,Cofilin-2;CFL2;ortholog,non-motor actin binding protein(PC00165),196,,,-0.236979,-0.513830,-0.003641,-0.317454,-0.316224,...,False,False,False,True,False,False,False,Virus_24h,#F0E442,-0.309923
Q07021,Complement component 1 Q subcomponent-binding ...,,197,Complement component 1 Q subcomponent-binding ...,,0.212073,-0.388812,-0.613739,-0.443112,-0.400930,...,False,False,False,True,False,False,False,Virus_24h,#F0E442,-0.344041
Q03701,CCAAT/enhancer-binding protein zeta;CEBPZ;orth...,DNA-binding transcription factor(PC00218),198,,,-0.335722,-0.460192,-0.530397,-0.171901,-0.268045,...,False,False,False,True,False,False,False,Virus_24h,#F0E442,-0.352308


```
# from this data, we then perform these steps
# extract correlations of interest from data
# Field 1 of the annotated table is the row index written by to_csv, so fields
# 2, 6 and 10 are translatome, proteome and correlation. The second cut keeps
# the proteome identifiers, tr splits joined identifiers on "_" into one per
# line, and tail drops the header line.
cut -f2,6,10 multiblock_correlations_annotated.tsv | cut -f2 | tr '_' '\n' | tail -n +2 | sort | uniq > for_grep.txt

# use the correlations returned from translatome and proteome
# take only the proteome (translatome are mostly translation involved)
# put these directly into a drug target search database
wget 'http://unmtid-shinyapps.net/download/drug.target.interaction.tsv.gz'
# the download carries a .gz extension; unpack it so the commands below can
# read drug.target.interaction.tsv (-f overwrites a copy left by an earlier run)
gunzip -f drug.target.interaction.tsv.gz
# keep the header line plus every line matching one of the identifiers
(head -n1 drug.target.interaction.tsv; grep -f for_grep.txt drug.target.interaction.tsv) > drug_targets.txt
# read the file written by the previous command: first column, header dropped,
# de-duplicated, double quotes removed
cut -f1 drug_targets.txt | tail -n +2 | sort | uniq | tr -d "\"" > potential_drugs.txt
```