# Make a GO term network out of WebGestalt results

In [2]:
# import matplotlib
# matplotlib.use('TkAgg')

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import networkx as nx
import seaborn as sns

import community

import mygene
mg = mygene.MyGeneInfo()

# font settings for text in graphs (sans-serif, Arial)
import matplotlib as mpl
from matplotlib import rcParams
rcParams['font.family'] = 'sans-serif'
rcParams['font.sans-serif'] = ['Arial']

% matplotlib inline

import visJS2jupyter.visJS_module
import visJS2jupyter.visualizations


# Load the enrichment results

In [11]:
focal_geneset='ASD_EPI'

# WebGestalt enrichment result file for each supported focal gene set.
enrichment_files = {
    'ASD_CHD': '../../webgestalt_GIANT_brain_p2/ASD_CHD/enrichment_results_wg_result1533159103.txt',
    'ASD_EPI': '../../webgestalt_GIANT_brain_p2/ASD_EPI/enrichment_results_wg_result1537204266.txt',
}

# Fail immediately on an unsupported gene set instead of hitting a NameError
# on GO_df further down.
if focal_geneset not in enrichment_files:
    raise ValueError('Unknown focal_geneset %r; expected one of %s'
                     % (focal_geneset, sorted(enrichment_files)))

GO_df = pd.read_csv(enrichment_files[focal_geneset],
                    sep='\t',index_col='geneset')

GO_df = GO_df[GO_df['FDR']<0.01] # stronger filter on FDR


#GO_df = pd.read_csv('webgestalt_RNA_downregulated_GO_BP/enrichment_results_wg_result1513734536.txt',
#                    sep='\t',index_col='geneset')

#GO_df = GO_df[GO_df['FDR']<0.0000000000001] # stronger filter on FDR
print(len(GO_df))
GO_df.head()

220


Unnamed: 0_level_0,description,link,C,O,E,R,PValue,FDR,overlapGene,OverlapGene_UserID
geneset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
GO:0006836,neurotransmitter transport,http://amigo.geneontology.org/amigo/term/GO:00...,161,52,13.541463,3.840058,0.0,0.0,41;320;815;1812;1814;2555;2571;3749;5663;5864;...,TRIM9;SLC32A1;RIMS4;DRD1;DRD3;ADGRL1;NLGN1;RIM...
GO:0007267,cell-cell signaling,http://amigo.geneontology.org/amigo/term/GO:00...,1361,237,114.471622,2.070382,0.0,0.0,25;40;41;153;186;273;320;324;331;408;491;775;7...,HCN4;FRAT1;CDH8;CHST4;CCL27;WIF1;CHRNA7;TRIM9;...
GO:0007268,chemical synaptic transmission,http://amigo.geneontology.org/amigo/term/GO:00...,525,136,44.156945,3.079923,0.0,0.0,40;41;273;320;491;783;785;815;869;1006;1139;11...,CDH8;CHRNA7;TRIM9;CHRNB3;CRH;CRHBP;SLC32A1;RIM...
GO:0007399,nervous system development,http://amigo.geneontology.org/amigo/term/GO:00...,1889,311,158.880892,1.957441,0.0,0.0,25;40;186;320;322;323;347;401;429;460;491;577;...,BCL2L11;FARP1;OLIG2;OLFM1;AVIL;DLL3;FUT9;PPP1R...
GO:0022008,neurogenesis,http://amigo.geneontology.org/amigo/term/GO:00...,1256,201,105.640233,1.902684,0.0,0.0,25;322;323;347;401;429;460;491;577;815;869;885...,FARP1;OLIG2;OLFM1;AVIL;DLL3;RAB35;STMN2;IL1RAP...


In [12]:

#edge_overlap_frac = .9 # for downregulated
edge_overlap_frac = 0.6 # for upregulated

# Split every term's ';'-separated overlap-gene string exactly once, instead of
# re-reading and re-splitting it for each of the n*n term pairs.
term_ids = GO_df.index.tolist()
term_genes = dict((term, genes.split(';'))
                  for term, genes in zip(term_ids, GO_df['OverlapGene_UserID']))
term_gene_sets = dict((term, set(genes)) for term, genes in term_genes.items())

# Connect two terms when the number of shared genes, as a fraction of the
# smaller term's gene list, exceeds edge_overlap_frac. Each qualifying pair is
# recorded in both orders, (A,B,overlap) and (B,A,overlap).
edge_list=[]
for GO_1 in term_ids:
    for GO_2 in term_ids:
        if GO_1 == GO_2:
            continue

        genes_overlap = len(term_gene_sets[GO_1] & term_gene_sets[GO_2])
        genes_overlap_frac = float(genes_overlap)/min(len(term_genes[GO_1]),len(term_genes[GO_2]))

        if genes_overlap_frac>edge_overlap_frac:
            edge_list.append((GO_1,GO_2,genes_overlap))

len(edge_list)

5406

In [13]:
# Build an undirected graph of GO terms. Each (term_1, term_2, n_shared_genes)
# triple in edge_list becomes an edge whose weight is the shared-gene count.
G_GO = nx.Graph()
G_GO.add_weighted_edges_from(edge_list)
# Add every term in GO_df as well, so terms without any edge are still nodes.
G_GO.add_nodes_from(GO_df.index.tolist())


In [14]:
# add features to the network
nx.set_node_attributes(G_GO,'description',dict(GO_df.loc[G_GO.nodes()]['description']))
node_to_fdr = {}
for n in G_GO.nodes():
    if GO_df.loc[n]['FDR']==0:
        node_to_fdr[n]=20
    else:
        node_to_fdr[n]=-np.log10(GO_df.loc[n]['FDR'])
    
nx.set_node_attributes(G_GO,'-logFDR',node_to_fdr)
nx.set_node_attributes(G_GO,'p-value',dict(GO_df.loc[G_GO.nodes()]['PValue']))
nx.set_node_attributes(G_GO,'size_of_set',dict(GO_df.loc[G_GO.nodes()]['C']))
nx.set_node_attributes(G_GO,'num_overlap',dict(GO_df.loc[G_GO.nodes()]['O']))

# Draw the network

In [15]:
# Select which enrichment network the drawing cells below operate on.
enrich_type='ORA'
if enrich_type =='GSEA':
    # NOTE(review): G_GSEA is not defined in the visible part of this notebook;
    # this branch would raise NameError unless it is created elsewhere.
    G_draw = G_GSEA
elif enrich_type=='ORA':
    G_draw = G_GO


In [16]:
def wrap_GO_term(term,split_on=' ',max_line=25):
    '''
    Break a long GO term into several lines for use as a node label.

    The term is split on `split_on`; a leading 'GO' token is dropped. Words are
    appended to the current line, each followed by a single space, until the
    characters accumulated on that line (spaces not counted) reach `max_line`;
    the next word then starts a new line. Because the check happens before a
    word is added, a line can end up longer than `max_line`.

    Returns the wrapped string, including the trailing space after the last word.
    '''
    words = term.split(split_on)
    if words[0]=='GO':
        words = words[1:]

    pieces = []
    line_len = 0
    for word in words:
        # current line is already full -> start a new one before this word
        if line_len>=max_line:
            pieces.append('\n')
            line_len = 0
        pieces.append(word+' ')
        line_len += len(word)

    return ''.join(pieces)

In [17]:
# Build line-wrapped node labels, indexed by the same ids as the graph nodes.
if enrich_type=='ORA':
    # ORA: wrap the human-readable description of each GO term
    label_index = GO_df.index.tolist()
    label_text = [wrap_GO_term(d,max_line=20) for d in GO_df['description'].tolist()]
elif enrich_type=='GSEA':
    # GSEA: the index entries themselves are wrapped, split on '_'
    label_index = GSEA_df.index.tolist()
    label_text = [wrap_GO_term(d,split_on='_',max_line=20) for d in label_index]
wrapped_description = pd.Series(label_text,label_index)

nx.set_node_attributes(G_draw,'wrapped_description',dict(wrapped_description))
wrapped_description.head()

GO:0006836        neurotransmitter transport 
GO:0007267               cell-cell signaling 
GO:0007268    chemical synaptic transmission 
GO:0007399        nervous system development 
GO:0022008                      neurogenesis 
dtype: object

In [18]:
from networkx.drawing.nx_agraph import graphviz_layout

In [19]:
# Lay the graph out with graphviz 'neato', then divide both coordinates by 300.
raw_pos = graphviz_layout(G_draw,prog='neato')
pos = dict((node, np.array([xy[0]/300.0,xy[1]/300.0])) for node, xy in raw_pos.items())

In [20]:
# development versions of return_node_to_color and return_edge_to_color

def return_node_to_color(G,field_to_map='degree',cmap=mpl.cm.jet,alpha = 1.0, color_vals_transform = None,ceil_val=10,
                        color_max_frac = 1.0,color_min_frac = 0.0,vmin=None,vmax=None):
    '''
    Return a dictionary mapping nodes (keys) to 'rgba(r, g, b, alpha)' color
    strings (values), based on the selected field_to_map.

        - field_to_map must be a node attribute present on every node
        - cmap must be a valid matplotlib colormap
        - alpha is written unchanged as the fourth rgba component
        - color_vals_transform is one of None, 'log', 'sqrt', 'ceil':
            'log'  : non-positive values are replaced by the smallest positive
                     value, logged, then shifted so the minimum is 0
            'sqrt' : square root of each value
            'ceil' : values are capped at ceil_val
        - color_max_frac and color_min_frac allow user to set lower and upper
          ranges for colormap
        - vmin / vmax are the data values mapped to the two ends of the range;
          they default to the (nan-ignoring) min and max of the transformed data

    Nodes whose value is NaN are colored grey (200,200,200). If vmin equals
    vmax, every node is mapped to the low end of the range rather than dividing
    by zero. An empty graph yields an empty dictionary.
    '''

    nodes_with_data = [(n[0],n[1][field_to_map]) for n in G.nodes(data=True)]
    if not nodes_with_data:
        return {}

    nodes,data = zip(*nodes_with_data)

    if color_vals_transform == 'log':
        min_dn0 = np.nanmin([d for d in data if d>0])
        data = [np.log(np.max([d,min_dn0])) for d in data]  # set the zero d values to minimum non0 value
        data_min = np.nanmin(data)
        data = [(d-data_min) for d in data] # shift so we don't have any negative values
    elif color_vals_transform == 'sqrt':
        data = [np.sqrt(d) for d in data]
    elif color_vals_transform == 'ceil':
        data = [min(d,ceil_val) for d in data]

    # if vmin and vmax aren't set, set them to min and max of the data
    if vmin is None:
        vmin = np.nanmin(data)
    if vmax is None:
        vmax = np.nanmax(data)

    node_to_mapField = dict(zip(nodes,data))

    color_to_mult = 256*(color_max_frac-color_min_frac)
    color_to_add = 256*color_min_frac
    # diagnostic output, kept so existing notebook output is unchanged
    print(color_to_mult)
    print(color_to_add)
    print(np.nanmax(list(node_to_mapField.values())))

    value_span = vmax-vmin

    def _value_to_rgba(value):
        # missing data -> neutral grey
        if np.isnan(value):
            return (200,200,200,alpha)
        # constant data (vmax == vmin) maps to the low end instead of dividing by zero
        frac = float(value-vmin)/value_span if value_span != 0 else 0.0
        rgba = cmap(int(frac*color_to_mult+color_to_add))
        # colormap channels are floats in [0,1]; scale and clamp to the valid 0-255 range
        red,green,blue = [min(int(channel*256),255) for channel in rgba[:3]]
        return (red,green,blue,alpha)

    node_to_color = dict((n,'rgba'+str(_value_to_rgba(node_to_mapField[n]))) for n in G.nodes())

    return node_to_color

def return_edge_to_color(G,field_to_map='degree',cmap=mpl.cm.jet,alpha = 1.0, color_vals_transform = None,ceil_val=10,
                        vmin=None,vmax=None):
    '''
    Return a dictionary mapping edges (keys, as (node_1, node_2) tuples) to
    'rgba(r, g, b, alpha)' color strings (values), based on the selected
    field_to_map.

        - field_to_map must be an edge attribute present on every edge
        - cmap must be a valid matplotlib colormap
        - alpha is written unchanged as the fourth rgba component
        - color_vals_transform is one of None, 'log', 'sqrt', 'ceil':
            'log'  : natural log of each value, shifted so the minimum is 0
            'sqrt' : square root of each value
            'ceil' : values are capped at ceil_val (same meaning as in
                     return_node_to_color)
        - vmin / vmax are the data values mapped to the two ends of the
          colormap; they default to the min and max of the transformed data

    If vmin equals vmax, every edge is mapped to the low end of the colormap
    rather than dividing by zero. A graph without edges yields an empty
    dictionary.
    '''

    edges_with_data = [(e[0],e[1],e[2][field_to_map]) for e in G.edges(data=True)]
    if not edges_with_data:
        return {}

    edges1,edges2,data = zip(*edges_with_data)

    if color_vals_transform == 'log':
        data = [np.log(d) for d in data]
        data_min = np.min(data)
        data = [(d-data_min) for d in data] # shift so we don't have any negative values
    elif color_vals_transform == 'sqrt':
        data = [np.sqrt(d) for d in data]
    elif color_vals_transform == 'ceil':
        # cap at the ceiling value (min, not max), consistent with return_node_to_color
        data = [min(d,ceil_val) for d in data]

    # if vmin and vmax aren't set, set them to min and max of the data
    if vmin is None:
        vmin = np.nanmin(data)
    if vmax is None:
        vmax = np.nanmax(data)

    edge_to_mapField = dict(zip(zip(edges1,edges2),data))

    value_span = vmax-vmin

    def _value_to_rgba(value):
        # constant data (vmax == vmin) maps to the low end instead of dividing by zero
        frac = float(value-vmin)/value_span if value_span != 0 else 0.0
        rgba = cmap(int(frac*256))
        # colormap channels are floats in [0,1]; scale and clamp to the valid 0-255 range
        red,green,blue = [min(int(channel*256),255) for channel in rgba[:3]]
        return (red,green,blue,alpha)

    edge_to_color = dict((e,'rgba'+str(_value_to_rgba(edge_to_mapField[e]))) for e in G.edges())

    return edge_to_color


In [22]:
nodes = G_draw.nodes()
nodes_data = G_draw.nodes(data=True)
edges = G_draw.edges()

# node color encodes -log10(FDR)
node_to_color = return_node_to_color(G_draw,field_to_map='-logFDR',cmap=mpl.cm.autumn_r,alpha = 1)

# node size: integer part of -log10(FDR) for ORA; (value + 3) capped at 10 for GSEA
node_to_size = {}
for n in nodes:
    if enrich_type=='ORA':
        node_to_size[n] = int(node_to_fdr[n]) #min(node_to_fdr[n]+3,10)
    elif enrich_type=='GSEA':
        node_to_size[n]=min(node_to_fdr[n]+3,10)


if enrich_type=='ORA':
    # fetch each attribute dictionary once rather than once per node
    neglog_fdr_attr = nx.get_node_attributes(G_draw,'-logFDR')
    pvalue_attr = nx.get_node_attributes(G_draw,'p-value')
    size_of_set_attr = nx.get_node_attributes(G_draw,'size_of_set')

    # builtin float replaces the np.float alias, which newer NumPy no longer provides
    nodes_dict = [{"id":n,"color":node_to_color[n],
                   "node_size":node_to_size[n],
                   "neglogFDR":float(neglog_fdr_attr[n]),
                   "pvalue":float(pvalue_attr[n]),
                   "size_of_set":float(size_of_set_attr[n]),
                   "node_label":wrapped_description.loc[n],
                   "title":GO_df.loc[n]['description'],
                  "x":pos[n][0]*1000,
                  "y":pos[n][1]*1000} for n in nodes
                  ]
elif enrich_type=='GSEA':
    nodes_dict = [{"id":n,"color":node_to_color[n],
                   "node_size":node_to_size[n],
                   "node_label":wrapped_description.loc[n],
                   "title":n,
                  "x":pos[n][0]*1000,
                  "y":pos[n][1]*1000} for n in nodes
                  ]
node_map = dict(zip(nodes,range(len(nodes))))  # map to indices for source/target in edges

# computed for reference only; the edges below are drawn with one fixed color
edge_to_color = return_edge_to_color(G_draw,field_to_map='weight',cmap=mpl.cm.BuPu)

# iterate over the edges directly, so this also works when G.edges() is not an
# integer-indexable list
edges_dict = [{"source":node_map[edge[0]], "target":node_map[edge[1]],
              "color":'rgba(150,150,250,.4)',"title":'test'} for edge in edges]

# set some network-wide styles
visJS2jupyter.visJS_module.visjs_network(nodes_dict,edges_dict,
                          node_size_multiplier=5,
                          node_label_field='node_label',
                          node_size_transform = '',
                          node_size_field='node_size',
                          node_color_highlight_border='red',
                          node_color_highlight_background='#D3918B',
                          node_color_hover_border='blue',
                          node_color_hover_background='#8BADD3',
                          node_font_size=45,
                          edge_arrow_to=False,
                          physics_enabled=False,
                          edge_color_highlight='#8A324E',
                          edge_color_hover='#8BADD3',
                          edge_width=5,
                          max_velocity=15,
                          min_velocity=1,
                          scaling_factor=1,
                          export_network=True,
                          export_file='GO_'+focal_geneset+'.json',
                          edge_smooth_enabled=False,
                         node_scaling_label_draw_threshold=0)

256.0
0.0
20.0
220
220


In [9]:
# Interactive IPython help lookup (trailing '?') for visjs_network; leftover
# introspection that is not part of the analysis itself.
visJS2jupyter.visJS_module.visjs_network?