# BZR Dataset Analysis

From Wikipedia:

Benzodiazepine receptors, which are found on postsynaptic nerve endings in the central nervous system (CNS), are part of the GABA receptor complex. GABA is the primary inhibitory neurotransmitter of the CNS. The GABA receptor complex is composed of two α-subunits and two β-subunits.

In [None]:
# IPython shell escape: runs the `mlflow ui` command from the notebook.
# NOTE(review): presumably this keeps the cell busy until it is interrupted -- confirm.
!mlflow ui

In [None]:
import networkx as nx
import numpy as np
import scipy as sc
import os
import re
import util

In [None]:

def _data_lines(path, skip_blank=True):
    """Yield each line of the text file ``path`` with surrounding whitespace removed.

    Blank lines are dropped unless ``skip_blank`` is false.  The file is opened
    lazily, so a missing file raises ``IOError`` on the first iteration.
    """
    with open(path) as handle:
        for raw in handle:
            line = raw.strip()
            if line or not skip_blank:
                yield line


def read_graphfile_viz(datadir, dataname, max_nodes=None, node_map=None):
    """Load every graph of a dataset together with display labels for its nodes.

    The dataset is read from ``<datadir>/<dataname>/<dataname>_*.txt``:

    * ``_graph_indicator.txt`` -- line ``i`` holds the graph id of node ``i``
      (node ids are 1-based line numbers).
    * ``_graph_labels.txt`` -- one integer class label per graph.
    * ``_A.txt`` -- one ``src, dst`` node-id pair per line.
    * ``_node_labels.txt`` -- optional; one label per node, kept as a string.
    * ``_node_attributes.txt`` -- optional; comma/whitespace separated floats,
      one line per node.

    Args:
        datadir: Directory that contains one sub-directory per dataset.
        dataname: Dataset name, used both as sub-directory and as file prefix.
        max_nodes: Accepted for interface compatibility; currently ignored.
        node_map: Optional dict from node-label string to a display name.  When
            given, nodes are displayed as ``"<name>-<node id>"``; otherwise the
            raw label is used.

    Returns:
        A list with one ``(G, mapping)`` tuple per graph, in graph-id order.
        ``G`` is built from the graph's edge list (so nodes without any edge
        are absent).  ``G.graph['label']`` is the class label re-indexed to
        ``0..k-1`` in order of first appearance; nodes carry ``'label'`` and
        ``'feat'`` attributes when the optional files exist, and
        ``G.graph['feat_dim']`` holds the attribute vector length.
        ``mapping`` maps node id -> display string.

    Raises:
        IOError: If the graph-indicator, graph-label or adjacency file is missing.
    """
    prefix = os.path.join(datadir, dataname, dataname)

    # Node id (1-based line number) -> id of the graph that owns the node.
    graph_indic = {
        node_id: int(line)
        for node_id, line in enumerate(
            _data_lines(prefix + '_graph_indicator.txt'), start=1
        )
    }

    print(f"node map: {node_map}")

    # Keep each label as ONE string per node.  (Extending the list with the
    # string itself would add one entry per character and shift every node
    # that follows a multi-digit label.)
    node_labels = []
    try:
        node_labels = list(_data_lines(prefix + '_node_labels.txt'))
    except IOError:
        print('No node labels')

    node_attrs = []
    try:
        # Blank lines are kept here so attribute rows stay aligned with node ids.
        for line in _data_lines(prefix + '_node_attributes.txt', skip_blank=False):
            values = [float(tok) for tok in re.split(r"[,\s]+", line) if tok]
            node_attrs.append(np.array(values))
    except IOError:
        print('No node attributes')

    # Re-index the raw class labels to 0..k-1 in order of first appearance.
    raw_graph_labels = [int(line) for line in _data_lines(prefix + '_graph_labels.txt')]
    label_map_to_int = {}
    for val in raw_graph_labels:
        label_map_to_int.setdefault(val, len(label_map_to_int))
    graph_labels = np.array([label_map_to_int[val] for val in raw_graph_labels])

    # Graph id -> edges; an edge is assigned to the graph of its source node.
    adj_list = {graph_id: [] for graph_id in range(1, len(graph_labels) + 1)}
    for line in _data_lines(prefix + '_A.txt'):
        src, dst = (int(tok) for tok in line.split(",")[:2])
        adj_list[graph_indic[src]].append((src, dst))

    graphs = []
    for graph_id in range(1, len(adj_list) + 1):
        G = nx.from_edgelist(adj_list[graph_id])
        G.graph['label'] = graph_labels[graph_id - 1]
        if node_attrs:
            G.graph['feat_dim'] = node_attrs[0].shape[0]

        mapping = {}
        for u in util.node_iter(G):
            attrs = util.node_dict(G)[u]
            label = None
            if node_labels:
                # Node ids are 1-based, the per-node lists are 0-based.
                label = node_labels[u - 1]
                attrs['label'] = label
            if node_attrs:
                attrs['feat'] = node_attrs[u - 1]

            if label is None:
                # No node-label file: fall back to the node id.
                mapping[u] = str(u)
            elif node_map is not None:
                # Labels missing from node_map are shown as-is instead of failing.
                mapping[u] = f"{node_map.get(label, label)}-{u}"
            else:
                mapping[u] = f"{label}"
        graphs.append((G, mapping))

    return graphs

In [None]:
def get_bzr_node_map():
    """Return the node-label lookup used for the BZR graphs.

    Returns:
        dict mapping each label id, kept as a string (e.g. ``'0'``), to the
        element symbol listed next to it in the table below (e.g. ``'O'``).
    """
    # One "<label id> <element symbol>" pair per line.  Rows are split on any
    # whitespace, so the table keeps working if tabs are converted to spaces
    # or the indentation of the literal changes.
    label_table = """0	O
    1	C
    2	N
    3	F
    4	Cl
    5	S
    6	Br
    7	Si
    8	Na
    9	I
    10	Hg
    11	B
    12	K
    13	P
    14	Au
    15	Cr
    16	Sn
    17	Ca
    18	Cd
    19	Zn
    20	V
    21	As
    22	Li
    23	Cu
    24	Co
    25	Ag
    26	Se
    27	Pt
    28	Al
    29	Bi
    30	Sb
    31	Ba
    32	Fe
    33	H
    34	Ti
    35	Tl
    36	Sr
    37	In
    38	Dy
    39	Ni
    40	Be
    41	Mg
    42	Nd
    43	Pd
    44	Mn
    45	Zr
    46	Pb
    47	Yb
    48	Mo
    49	Ge
    50	Ru
    51	Eu
    52	Sc
    53	Gd"""

    node_map = {}
    for row in label_table.splitlines():
        label_id, symbol = row.split()
        node_map[label_id] = symbol

    return node_map

In [None]:
# Load every BZR graph; node display names come from the BZR label lookup.
node_map = get_bzr_node_map()
bzr_graphs = read_graphfile_viz('./../dataset', 'BZR', node_map=node_map)

In [None]:
# Work on copies of graph #28 and its node-display mapping so the loaded
# dataset itself is never modified by the analysis below.
G, labels = (part.copy() for part in bzr_graphs[28])

# Print every node together with its attribute dict.
for node_with_attrs in G.nodes(data=True):
    print(node_with_attrs)

In [None]:
# Show the degree of every node in G.
G.degree()

In [None]:
# Density: number of edges present divided by the maximum possible number of edges.
nx.density(G)

In [None]:
# Why is it called transitivity? If A = B and B = C, then A must equal C.
# Similarly, in triadic closure, if person A knows person B and person B knows
# person C, then person A probably knows person C: hence, transitivity.
nx.transitivity(G)

## Centrality Measures

Centrality measures quantify the importance of each node in a graph, i.e. how “central” the node is. The following measures are considered:

- Degree
- Betweenness centrality - Measures the importance of a node by how often it lies on the shortest paths between all pairs of nodes in the graph.
- Closeness centrality - Measures how close a node is to all other nodes: the reciprocal of its average shortest-path distance to the nodes it can reach.
- Eigenvector centrality - Measures the importance of a node as a function of the importance of its neighbors. A node connected to highly important nodes receives a higher eigenvector centrality score than a node connected to less important nodes.

In [None]:
# Per-node centrality scores; each call returns a {node: score} dict.
betweenness_dict = nx.betweenness_centrality(G)
eigenvector_dict = nx.eigenvector_centrality(G, max_iter=600)
pageranks = nx.pagerank(G, max_iter=400)

# nx.degree(G) returns a DegreeView, which is not a dict (it has no .items()).
# Passed as-is, set_node_attributes treats it as a constant and stores the
# whole view on every node, so materialise it as {node: degree} first.
degree_dict = dict(nx.degree(G))

# Store each measure as a node attribute of G.
nx.set_node_attributes(G, betweenness_dict, 'betweenness')
nx.set_node_attributes(G, eigenvector_dict, 'eigenvector')
nx.set_node_attributes(G, degree_dict, 'degree')
nx.set_node_attributes(G, dict(pageranks), 'pagerank')

In [None]:
from operator import itemgetter

# (node, degree) pairs ordered from the highest degree to the lowest.
sorted_degree = list(dict(degree_dict).items())
sorted_degree.sort(key=itemgetter(1), reverse=True)

print("Top nodes by degree:")
for node_and_degree in sorted_degree:
    print(node_and_degree)

In [None]:
# Draw G, using the display strings in `labels` as the text shown on each node.
nx.draw(G, labels=labels, with_labels=True, node_size=1000)

In [None]:
# Keep only the part of each display string before the first "-".
only_labels = {node: text.partition("-")[0] for node, text in labels.items()}
nx.draw_kamada_kawai(G, labels=only_labels, with_labels=True, node_size=1000)

In [None]:
# Draw G with the Kamada-Kawai layout, showing the shortened labels on the nodes.
nx.draw_kamada_kawai(G, labels=only_labels, with_labels = True, node_size=1000)

In [None]:
 # A dictionary
# (node, PageRank score) pairs ordered from the highest score to the lowest.
sorted_pageranks = list(dict(pageranks).items())
sorted_pageranks.sort(key=itemgetter(1), reverse=True)
sorted_pageranks

In [None]:
import pandas as pd

# Node-attribute table: one row per node (indexed by node id), one column per attribute.
graph0 = pd.DataFrame.from_dict(
    {node: attrs for node, attrs in G.nodes(data=True)},
    orient='index',
)

In [None]:
# Summary statistics of the node-attribute table.
graph0.describe()

In [None]:
# Mean centrality per node label.  The aggregations are named by string:
# passing NumPy callables such as np.mean to groupby .agg() is deprecated in
# recent pandas releases and emits a FutureWarning.
df_centrality_measures = (
    graph0.groupby('label')
    .agg({'betweenness': 'mean', 'pagerank': 'mean', 'eigenvector': 'mean'})
    .reset_index()
)

In [None]:
# Replace each label id with its entry in node_map (None when the id is not in the map).
df_centrality_measures['label'] = df_centrality_measures['label'].apply(node_map.get)

In [None]:
import matplotlib.pyplot as plt

# Three bar charts side by side, one per centrality measure, one bar per label.
fig, ax1 = plt.subplots(1, 3, figsize=(11,5))
fig.suptitle('Centrality Measures for A Drug')
for axis, measure in zip(ax1, ('betweenness', 'pagerank', 'eigenvector')):
    df_centrality_measures.plot(kind='bar', x='label', y=measure, ax=axis)

In [None]:
# Mean centrality per node label.  The aggregations are named by string:
# passing NumPy callables such as np.mean to groupby .agg() is deprecated in
# recent pandas releases and emits a FutureWarning.
df_centrality_measures = (
    graph0.groupby('label')
    .agg({'betweenness': 'mean', 'pagerank': 'mean', 'eigenvector': 'mean'})
    .reset_index()
)

In [None]:
# Replace each label id with its entry in node_map (None when the id is not in the map).
df_centrality_measures['label'] = df_centrality_measures['label'].apply(node_map.get)

In [None]:
import matplotlib.pyplot as plt

# Three bar charts side by side, one per centrality measure, one bar per label.
fig, ax1 = plt.subplots(1, 3, figsize=(11,5))
fig.suptitle('Centrality Measures for A Drug')
for axis, measure in zip(ax1, ('betweenness', 'pagerank', 'eigenvector')):
    df_centrality_measures.plot(kind='bar', x='label', y=measure, ax=axis)

In [None]:
# Total betweenness per node label.  The PageRank node attribute is stored as
# 'pagerank' (singular); aggregating a 'pageranks' column raises a KeyError.
(
    graph0.groupby('label')
    .agg({'betweenness': 'sum', 'pagerank': 'sum'})
    .reset_index()
    .plot(kind='bar', x='label', y='betweenness')
)

In [None]:
# Total PageRank per node label.  The PageRank node attribute is stored as
# 'pagerank' (singular); a 'pageranks' column does not exist and raises a KeyError.
(
    graph0.groupby('label')
    .agg({'betweenness': 'sum', 'pagerank': 'sum'})
    .reset_index()
    .plot(kind='bar', x='label', y='pagerank')
)

# AIDS Dataset Analysis

The DTP AIDS Antiviral Screen has checked tens of thousands of compounds for evidence of anti-HIV activity. Available are screening results and chemical structural data on compounds that are not covered by a confidentiality agreement.

Screening Results (May 2004 Release)
The results of the screening tests are evaluated and placed in one of three categories:

CA - Confirmed active
CM - Confirmed moderately active
CI - Confirmed inactive


Chemical Structural Data - AIDS Screened (October 99 release)
The 2D structure (connection table) for each of the 42,390 compounds was retrieved from the DTP's Drug Information System. Conversion to a 3D structure was accomplished using the program Corina, created by Prof. Gasteiger's group.


In [None]:
import networkx as nx
import numpy as np
import scipy as sc
import os
import re

import util

def get_aids_node_map():
    """Return the node-label lookup used for the AIDS graphs.

    Returns:
        dict mapping each label id, kept as a string (e.g. ``'0'``), to the
        element symbol listed next to it in the table below (e.g. ``'C'``).
    """
    # One "<label id> <element symbol>" pair per line.  Rows are split on any
    # whitespace, so the table keeps working if tabs are converted to spaces
    # and trailing blanks never end up in a symbol.
    label_table = """0	C
1	O
2	N
3	Cl
4	F
5	S
6	Se
7	P
8	Na
9	I
10	Co
11	Br
12	Li
13	Si
14	Mg
15	Cu
16	As
17	B
18	Pt
19	Ru
20	K
21	Pd
22	Au
23	Te
24	W
25	Rh
26	Zn
27	Bi
28	Pb
29	Ge
30	Sb
31	Sn
32	Ga
33	Hg
34	Ho
35	Tl
36	Ni
37	Tb"""

    node_map = {}
    for row in label_table.splitlines():
        label_id, symbol = row.split()
        node_map[label_id] = symbol

    return node_map

In [None]:
# Load every AIDS graph; node display names come from the AIDS label lookup.
node_map = get_aids_node_map()
aids_graphs = read_graphfile_viz('./../dataset', 'AIDS', node_map=node_map)

In [None]:
# Work on copies of the first graph and its node-display mapping so the loaded
# dataset itself is never modified by the analysis below.
G, labels = (part.copy() for part in aids_graphs[0])

# Print every node together with its attribute dict.
for node_with_attrs in G.nodes(data=True):
    print(node_with_attrs)

In [None]:
# Show the degree of every node in G.
G.degree()

In [None]:
# Density: number of edges present divided by the maximum possible number of edges.
nx.density(G)

In [None]:
# Why is it called transitivity? If A = B and B = C, then A must equal C.
# Similarly, in triadic closure, if person A knows person B and person B knows
# person C, then person A probably knows person C: hence, transitivity.
nx.transitivity(G)

## Centrality Measures

Centrality measures quantify the importance of each node in a graph, i.e. how “central” the node is. The following measures are considered:

- Degree
- Betweenness centrality - Measures the importance of a node by how often it lies on the shortest paths between all pairs of nodes in the graph.
- Closeness centrality - Measures how close a node is to all other nodes: the reciprocal of its average shortest-path distance to the nodes it can reach.
- Eigenvector centrality - Measures the importance of a node as a function of the importance of its neighbors. A node connected to highly important nodes receives a higher eigenvector centrality score than a node connected to less important nodes.

In [None]:
# Per-node centrality scores; each call returns a {node: score} dict.
betweenness_dict = nx.betweenness_centrality(G)
eigenvector_dict = nx.eigenvector_centrality(G, max_iter=600)
pageranks = nx.pagerank(G, max_iter=400)

# nx.degree(G) returns a DegreeView, which is not a dict (it has no .items()).
# Passed as-is, set_node_attributes treats it as a constant and stores the
# whole view on every node, so materialise it as {node: degree} first.
degree_dict = dict(nx.degree(G))

# Store each measure as a node attribute of G.
nx.set_node_attributes(G, betweenness_dict, 'betweenness')
nx.set_node_attributes(G, eigenvector_dict, 'eigenvector')
nx.set_node_attributes(G, degree_dict, 'degree')
nx.set_node_attributes(G, dict(pageranks), 'pagerank')

In [None]:
from operator import itemgetter

# (node, degree) pairs ordered from the highest degree to the lowest.
sorted_degree = list(dict(degree_dict).items())
sorted_degree.sort(key=itemgetter(1), reverse=True)

print("Top nodes by degree:")
for node_and_degree in sorted_degree:
    print(node_and_degree)

In [None]:
# Draw G, using the display strings in `labels` as the text shown on each node.
nx.draw(G, labels=labels, with_labels=True, node_size=1000)

In [None]:
# Keep only the part of each display string before the first "-".
only_labels = {node: text.partition("-")[0] for node, text in labels.items()}

In [None]:
# Draw G with the Kamada-Kawai layout, showing the shortened labels on the nodes.
nx.draw_kamada_kawai(G, labels=only_labels, with_labels = True, node_size=1000)

In [None]:
# Number of (graph, mapping) entries loaded for the AIDS dataset.
len(aids_graphs)

In [None]:
 # A dictionary
# (node, PageRank score) pairs ordered from the highest score to the lowest.
sorted_pageranks = list(dict(pageranks).items())
sorted_pageranks.sort(key=itemgetter(1), reverse=True)
sorted_pageranks

In [None]:
import pandas as pd

# Node-attribute table: one row per node (indexed by node id), one column per attribute.
graph0 = pd.DataFrame.from_dict(
    {node: attrs for node, attrs in G.nodes(data=True)},
    orient='index',
)

In [None]:
# Preview the first rows of the node-attribute table.
graph0.head()

In [None]:
# Inverse lookup: node_map value -> node_map key.
node_map_reversed = dict(zip(node_map.values(), node_map.keys()))
node_map_reversed

In [None]:
# Mean centrality per node label.  The aggregations are named by string:
# passing NumPy callables such as np.mean to groupby .agg() is deprecated in
# recent pandas releases and emits a FutureWarning.
df_centrality_measures = (
    graph0.groupby('label')
    .agg({'betweenness': 'mean', 'pagerank': 'mean', 'eigenvector': 'mean'})
    .reset_index()
)

In [None]:
# Replace each label id with its entry in node_map (None when the id is not in the map).
df_centrality_measures['label'] = df_centrality_measures['label'].apply(node_map.get)

In [None]:
import matplotlib.pyplot as plt

# Three bar charts side by side, one per centrality measure, one bar per label.
fig, ax1 = plt.subplots(1, 3, figsize=(11,5))
fig.suptitle('Centrality Measures for A Drug')
for axis, measure in zip(ax1, ('betweenness', 'pagerank', 'eigenvector')):
    df_centrality_measures.plot(kind='bar', x='label', y=measure, ax=axis)

In [None]:
# Mean PageRank per label.  The aggregated column is named 'pagerank'
# (singular); plotting y='pageranks' raises a KeyError.
df_centrality_measures.plot(kind='bar', x='label', y='pagerank')

# DHFR Dataset Analysis

In [None]:
# DHFR is loaded without a node_map, so nodes are displayed by their raw label.
dhfr_graphs = read_graphfile_viz('./../dataset', 'DHFR')

In [None]:
# Work on copies of graph #1 and its node-display mapping so the loaded
# dataset itself is never modified by the analysis below.
G, labels = (part.copy() for part in dhfr_graphs[1])

# Print every node together with its attribute dict.
for node_with_attrs in G.nodes(data=True):
    print(node_with_attrs)

In [None]:
# Show the degree of every node in G.
G.degree()

In [None]:
# Density: number of edges present divided by the maximum possible number of edges.
nx.density(G)

In [None]:
# Why is it called transitivity? If A = B and B = C, then A must equal C.
# Similarly, in triadic closure, if person A knows person B and person B knows
# person C, then person A probably knows person C: hence, transitivity.
nx.transitivity(G)

## Centrality Measures

Centrality measures quantify the importance of each node in a graph, i.e. how “central” the node is. The following measures are considered:

- Degree
- Betweenness centrality - Measures the importance of a node by how often it lies on the shortest paths between all pairs of nodes in the graph.
- Closeness centrality - Measures how close a node is to all other nodes: the reciprocal of its average shortest-path distance to the nodes it can reach.
- Eigenvector centrality - Measures the importance of a node as a function of the importance of its neighbors. A node connected to highly important nodes receives a higher eigenvector centrality score than a node connected to less important nodes.

In [None]:
# Per-node centrality scores; each call returns a {node: score} dict.
betweenness_dict = nx.betweenness_centrality(G)
eigenvector_dict = nx.eigenvector_centrality(G, max_iter=600)
pageranks = nx.pagerank(G, max_iter=400)

# nx.degree(G) returns a DegreeView, which is not a dict (it has no .items()).
# Passed as-is, set_node_attributes treats it as a constant and stores the
# whole view on every node, so materialise it as {node: degree} first.
degree_dict = dict(nx.degree(G))

# Store each measure as a node attribute of G.
nx.set_node_attributes(G, betweenness_dict, 'betweenness')
nx.set_node_attributes(G, eigenvector_dict, 'eigenvector')
nx.set_node_attributes(G, degree_dict, 'degree')
nx.set_node_attributes(G, dict(pageranks), 'pagerank')

In [None]:
from operator import itemgetter

# (node, degree) pairs ordered from the highest degree to the lowest.
sorted_degree = list(dict(degree_dict).items())
sorted_degree.sort(key=itemgetter(1), reverse=True)

print("Top nodes by degree:")
for node_and_degree in sorted_degree:
    print(node_and_degree)

In [None]:
# Draw G, using the display strings in `labels` as the text shown on each node.
nx.draw(G,labels=labels, with_labels=True, node_size=1000)

In [None]:
# Keep only the part of each display string before the first "-".
only_labels = {node: text.partition("-")[0] for node, text in labels.items()}

In [None]:
# Draw G with the Kamada-Kawai layout, showing the shortened labels on the nodes.
nx.draw_kamada_kawai(G, labels=only_labels, with_labels = True, node_size=1000)

In [None]:
 # A dictionary
# (node, PageRank score) pairs ordered from the highest score to the lowest.
sorted_pageranks = list(dict(pageranks).items())
sorted_pageranks.sort(key=itemgetter(1), reverse=True)
sorted_pageranks

In [None]:
import pandas as pd

# Node-attribute table: one row per node (indexed by node id), one column per attribute.
graph0 = pd.DataFrame.from_dict(
    {node: attrs for node, attrs in G.nodes(data=True)},
    orient='index',
)

In [None]:
# Summary statistics of the node-attribute table.
graph0.describe()

In [None]:
df_centrality_measures = graph0.groupby('label').agg({'betweenness': np.sum, 'pageranks': np.sum}).reset_index()

In [None]:
df_centrality_measures.plot(kind='bar', x='label', y='betweenness')

In [None]:
df_centrality_measures.plot(kind='bar', x='label', y='pageranks')