# Run GO alignment on the precomputed new CliXO results

update 4/28/20: refine notebook for submission

In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import networkx as nx
import seaborn as sns

# latex rendering of text in graphs
import matplotlib as mpl
from matplotlib import rcParams
rcParams['font.family'] = 'sans-serif'
rcParams['font.sans-serif'] = ['Arial']

% matplotlib inline

import ddot
from ddot import Ontology

# load ASD-CHD network genes

In [4]:
# Input locations used by the cells below. Each default is the original
# hard-coded location; set the matching environment variable to point the
# notebook at a different copy of the data without editing this cell.
import os

# supplemental tables workbook (the 'ASD_CHD_network_genes' sheet is read from it)
path_to_supp_tables = os.environ.get(
    'ASD_CHD_SUPP_TABLES',
    '../../../../manuscript/tables_19_01/supplemental_tables.xlsx')

# PCnet graph, stored as a networkx gpickle
path_to_PCnet = os.environ.get(
    'PCNET_GPICKLE',
    '/Users/brinrosenthal/Documents/CCBB_tickets_data/PCnet/G_PCnet.gpickle')


In [5]:
# Read the ASD-CHD network gene table from the supplemental workbook.
# (An earlier version of this cell read 'G_ASD_CHD_PCnetnodes_190117.txt' instead.)
ASD_CHD_df = pd.read_excel(path_to_supp_tables, sheet_name='ASD_CHD_network_genes')
print(len(ASD_CHD_df))

# index the rows by gene while keeping 'gene' available as a regular column
ASD_CHD_df = ASD_CHD_df.set_index('gene', drop=False)
ASD_CHD_df.head()

844


Unnamed: 0_level_0,gene,snv_indel_DECIPHERv9.25,snv_indel_DECIPHERv9.31,small_CNV_less1MB_DECIPHERv9.25,ASD_CHD_DNV,sum_damaging_variants,ASD_HC,CHD_HC,z_ASD,z_CHD,z_both,gtex_heart_percentile_exp,gtex_brain_percentile_exp
gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
NAA15,NAA15,0.0,0.0,0.0,0.0,0.0,1,1,18.901197,20.892283,394.889167,0.733689,0.62083
PTEN,PTEN,1.0,1.0,0.0,1.0,2.0,1,1,16.395135,18.819067,308.541142,0.896062,0.894637
POGZ,POGZ,3.0,3.0,0.0,1.0,4.0,1,1,14.18654,14.641398,207.710787,0.908309,0.953178
KMT2C,KMT2C,0.0,0.0,0.0,1.0,1.0,1,1,13.462226,14.954173,201.316464,0.8731,0.908995
KDM5B,KDM5B,1.0,1.0,0.0,1.0,2.0,1,1,10.464431,13.490241,141.167693,0.677893,0.698533


# Load PCnet

In [6]:
# Load the PCnet graph and report its node and edge counts.
# NOTE(review): nx.read_gpickle was removed in networkx 3.0, so this cell
# needs an older networkx release.
G_pcnet = nx.read_gpickle(path_to_PCnet)

for pcnet_count in (G_pcnet.number_of_nodes(), G_pcnet.number_of_edges()):
    print(pcnet_count)

19781
2724724


In [7]:
# Restrict PCnet to the genes indexing ASD_CHD_df and report the subgraph size
asd_chd_genes = list(ASD_CHD_df.index)
G_ASD_CHD = G_pcnet.subgraph(asd_chd_genes)

for subgraph_count in (G_ASD_CHD.number_of_nodes(), G_ASD_CHD.number_of_edges()):
    print(subgraph_count)

844
28559


# Load precomputed clixo results

In [8]:
# Parameters identifying which precomputed clixo run to load;
# they are spliced into the results file name below.
alpha = '01'
beta = '45'

# update 11/21/19: remove non- network proximal seeds
clixo_table_path = 'new_clixo/ASDCHD_parsed_alpha.cosine.{}_beta_.{}.txt'.format(alpha, beta)
ont = Ontology.from_table(clixo_table_path)

In [9]:
# Print the number of term_2_gene entries before and after collapsing the ontology.
# Author's note from the original cell: term2gene includes all genes, not just unique genes.
n_terms_before_collapse = len(ont.term_2_gene)
print(n_terms_before_collapse)

# NOTE: `ont` is rebound to the collapsed ontology, so every later cell sees the collapsed version
ont = ont.collapse_ontology()

n_terms_after_collapse = len(ont.term_2_gene)
print(n_terms_after_collapse)

120
collapse command: /usr/local/lib/python2.7/dist-packages/ddot/alignOntology/collapseRedundantNodes /tmp/tmp265MH1
120


In [10]:
# show the 15 largest values in ont.term_sizes, largest first
term_sizes = pd.Series(ont.term_sizes)
term_sizes.sort_values(ascending=False).head(15)

119    844
118    344
117    324
116    302
115    240
114    220
113    206
112    191
111    177
110    144
109    134
108    119
107    110
106    109
105    108
dtype: int64

# Align the data-driven ontology with the Gene Ontology (GO)

In [11]:

# The GO branches were parsed separately (in a bigger instance) and saved as gpickles.
# BP branch: read the graph, report its node and edge counts, then wrap it as an Ontology.
G_BP = nx.read_gpickle('GO_branches/G_BP.gpickle')

for bp_count in (G_BP.number_of_nodes(), G_BP.number_of_edges()):
    print(bp_count)

go_BP = Ontology.from_networkx(G_BP)
go_BP

30496
136316


17211 genes, 13285 terms, 108384 gene-term relations, 27932 term-term relations
node_attributes: ['name', u'Branch', u'Vis:Shape', u'Vis:Border Paint', u'Term_Description', u'Vis:Fill Color']
edge_attributes: [u'Vis:Visible']

In [12]:
# MF branch: read the graph, report its node and edge counts, then wrap it as an Ontology.
G_MF = nx.read_gpickle('GO_branches/G_MF.gpickle')

for mf_count in (G_MF.number_of_nodes(), G_MF.number_of_edges()):
    print(mf_count)

go_MF = Ontology.from_networkx(G_MF)
go_MF

21486
55542


17177 genes, 4309 terms, 49990 gene-term relations, 5552 term-term relations
node_attributes: ['name', u'Branch', u'Vis:Shape', u'Vis:Border Paint', u'Term_Description', u'Vis:Fill Color']
edge_attributes: [u'Vis:Visible']

In [13]:
# CC branch: read the graph, report its node and edge counts, then wrap it as an Ontology.
G_CC = nx.read_gpickle('GO_branches/G_CC.gpickle')

for cc_count in (G_CC.number_of_nodes(), G_CC.number_of_edges()):
    print(cc_count)

go_CC = Ontology.from_networkx(G_CC)
go_CC

19996
59989


18248 genes, 1748 terms, 57114 gene-term relations, 2875 term-term relations
node_attributes: ['name', u'Branch', u'Vis:Shape', u'Vis:Border Paint', u'Term_Description', u'Vis:Fill Color']
edge_attributes: [u'Vis:Visible']

In [14]:
# Choose which GO branch the data-driven ontology is aligned against.
# Valid values are 'MF', 'BP' and 'CC'; the choice is also used in the
# output file name written at the end of the notebook.
focal_branch = 'CC'

if focal_branch == 'MF':
    go_focal = go_MF
elif focal_branch == 'BP':
    go_focal = go_BP
elif focal_branch == 'CC':
    go_focal = go_CC
else:
    # fail loudly: otherwise go_focal would be undefined, or silently left
    # over from a previous run of this cell with a different branch
    raise ValueError(
        "focal_branch must be one of 'MF', 'BP', 'CC'; got %r" % (focal_branch,))

In [15]:
# Align the data-driven ontology against the selected GO branch.
# The options are collected in one place so they are easy to inspect and tune.
align_options = dict(
    iterations=100,
    update_self=['Term_Description'],
    align_label='Term_Description',
    verbose=True,
    mutual_collapse=False,
)
alignment = ont.align(go_focal, **align_options)
alignment.head()

Alignment command: /usr/local/lib/python2.7/dist-packages/ddot/alignOntology/calculateFDRs /tmp/tmpgrLRxX /tmp/tmpy1aFJo 0.05 criss_cross /tmp/tmp2QhHH1 100 4 gene


Unnamed: 0_level_0,Term,Similarity,FDR
Term,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
859,GO:0001518,0.369817,0.0
887,GO:0071564,0.231228,0.0
897,GO:0034706,0.212281,0.0
851,GO:0070765,0.202726,0.0
873,GO:0000788,0.195862,0.0


In [16]:
# number of rows in the alignment table
alignment.shape[0]

9

In [17]:
# After the alignment, ont.node_attr carries the Aligned_* columns.
node_attr = ont.node_attr

# add a new column which is just the term label (no GO): a copy of the table's index
node_attr['term_label'] = node_attr.index.tolist()

print(len(node_attr))
node_attr.sort_values('Aligned_Similarity', ascending=False).head(100)


9


Unnamed: 0,Aligned_Term,Aligned_Similarity,Aligned_FDR,Aligned_Term_Description,Label,term_label
859,GO:0001518,0.369817,0.0,voltage-gated sodium channel complex,859\nvoltage-gated sodium channel complex,859
887,GO:0071564,0.231228,0.0,npBAF complex,887\nnpBAF complex,887
897,GO:0034706,0.212281,0.0,sodium channel complex,897\nsodium channel complex,897
851,GO:0070765,0.202726,0.0,gamma-secretase complex,851\ngamma-secretase complex,851
873,GO:0000788,0.195862,0.0,nuclear nucleosome,873\nnuclear nucleosome,873
866,GO:0070776,0.168028,0.0,MOZ/MORF histone acetyltransferase complex,866\nMOZ/MORF histone acetyltransferase complex,866
844,GO:0044299,0.126141,0.0,C-fiber,844\nC-fiber,844
870,GO:0072487,0.119038,0.0,MSL complex,870\nMSL complex,870
877,GO:0048787,0.104082,0.0,presynaptic active zone membrane,877\npresynaptic active zone membrane,877


In [18]:
# display the full node attribute table, in index order, including the term_label column
ont.node_attr

Unnamed: 0,Aligned_Term,Aligned_Similarity,Aligned_FDR,Aligned_Term_Description,Label,term_label
844,GO:0044299,0.126141,0.0,C-fiber,844\nC-fiber,844
851,GO:0070765,0.202726,0.0,gamma-secretase complex,851\ngamma-secretase complex,851
859,GO:0001518,0.369817,0.0,voltage-gated sodium channel complex,859\nvoltage-gated sodium channel complex,859
866,GO:0070776,0.168028,0.0,MOZ/MORF histone acetyltransferase complex,866\nMOZ/MORF histone acetyltransferase complex,866
870,GO:0072487,0.119038,0.0,MSL complex,870\nMSL complex,870
873,GO:0000788,0.195862,0.0,nuclear nucleosome,873\nnuclear nucleosome,873
877,GO:0048787,0.104082,0.0,presynaptic active zone membrane,877\npresynaptic active zone membrane,877
887,GO:0071564,0.231228,0.0,npBAF complex,887\nnpBAF complex,887
897,GO:0034706,0.212281,0.0,sodium channel complex,897\nsodium channel complex,897


In [19]:
# Write the aligned node attributes to disk; the file name records alpha, beta and the GO branch.
# NOTE(review): the file is named '.tsv' but is written comma-separated (sep=',').
# Left unchanged so that anything already reading this file keeps working; confirm
# which of the two is intended before changing either.
aligned_out_path = 'new_clixo/aligned_ont_cosine_alpha{}_beta{}_{}.tsv'.format(
    alpha, beta, focal_branch)
ont.node_attr.to_csv(aligned_out_path, sep=',', index=True)