# Make a 6-Metanode-network

In [1]:
import sys
sys.path.append('../../hetnet-ml/src')

import pandas as pd
import graph_tools as gt
from hetio.hetnet import MetaGraph

In [2]:
def num_metapaths(nodes, edges, source='Chemicals & Drugs', target='Disorders', max_length=4):
    """Count the metapaths that can be extracted between two metanodes.

    Both tables are passed through ``gt.add_colons`` and then handed to
    ``gt.get_abbrev_dict_and_edge_tuples``; the abbreviations and edge tuples
    it returns are used to build a ``MetaGraph``, from which the metapaths
    are extracted and counted.

    Parameters
    ----------
    nodes, edges
        Node and edge tables of the network, in the form used throughout
        this notebook (i.e. before ``gt.add_colons`` is applied).
    source : str
        Metanode the metapaths start from. Defaults to 'Chemicals & Drugs'.
    target : str
        Metanode the metapaths end at. Defaults to 'Disorders'.
    max_length : int
        Length limit forwarded to ``extract_metapaths``. Defaults to 4.

    Returns
    -------
    int
        Number of metapaths returned by ``MetaGraph.extract_metapaths``.
    """
    abbrevs, edge_tuples = gt.get_abbrev_dict_and_edge_tuples(gt.add_colons(nodes), gt.add_colons(edges))
    metagraph = MetaGraph.from_edge_tuples(edge_tuples, abbrevs)
    return len(metagraph.extract_metapaths(source, target, max_length))

In [3]:
# Read the filtered node and edge tables, passing each freshly-read frame
# through `gt.remove_colons` before it is stored.
nodes = pd.read_csv('../data/nodes_VER31_R_consolidated_condensed_filtered_001.csv').pipe(gt.remove_colons)
edges = pd.read_csv('../data/edges_VER31_R_consolidated_condensed_filtered_001.csv').pipe(gt.remove_colons)

In [4]:
# Baseline: metapath count for the network as loaded, before any filtering
print(
    '{:,} Potnential Metapaths'.format(
        num_metapaths(nodes, edges)
    )
)

8,745 Potnential Metapaths


Some node types have little to do with biomedical knowledge, so these will also be removed.

- REMOVE NODES:
    - Organizations
    - Activities & Behaviors
    - Concepts & Ideas
    - Procedures
    - Devices
    - Living Beings

NOTE: While `Living Beings` seems like it would be useful, many of these are far too general, making for relationships and paths that are not meaningful (`Researcher - Associated With - Amino Acids` for example).  It is too difficult to filter out the general terms from the more specific, so all Living Beings will be removed.

### Removal of un-needed metanodes

In [5]:
# Remove nodes of types that are less-useful
# Remove nodes of types that are less-useful.
# The filter is a plain reassignment (matching the later filtering cells)
# instead of an in-place drop, and `nodes` is queried directly because it was
# already passed through `gt.remove_colons` when it was loaded.
remove_types = ['Organizations', 'Activities & Behaviors', 'Concepts & Ideas', 'Procedures', 'Devices', 'Living Beings']
nodes = nodes.query('label not in @remove_types')

In [6]:
# Keep only the edges whose start and end IDs are both still present in the nodes
ok_ids = nodes['id'].unique()
edges = edges[edges['start_id'].isin(ok_ids) & edges['end_id'].isin(ok_ids)]

print('{:,} Unique IDs in the nodes'.format(len(ok_ids)))
print('{:,} Unique IDs found wihtin the remaining edges'.format(
    len(set(edges['start_id']) | set(edges['end_id']))
))

148,381 Unique IDs in the nodes
135,879 Unique IDs found wihtin the remaining edges


In [7]:
# Drop nodes that no longer take part in any edge, then compare the number of
# IDs referenced by the edges with the number of IDs left in the nodes.
ok_ids = list(set(edges['start_id']).union(set(edges['end_id'])))
nodes = nodes.query('id in @ok_ids')

print('Remvoed IDs from nodes that no longer have edges...')
print('{:,} IDs found in edges'.format(len(ok_ids)))
# Count from `nodes` itself so this line is an independent check on the
# edge-side total printed above, not a second computation of the same set.
print('{:,} IDs found in nodes'.format(len(nodes['id'].unique())))

Remvoed IDs from nodes that no longer have edges...
135,879 IDs found in edges
135,879 IDs found in nodes


In [8]:
# Metapath count for the network after the node and edge filtering above
print('After cutting down to 6 Most Revelent Metanodes... ')
print(
    '{:,} Potnential Metapaths'.format(
        num_metapaths(nodes, edges)
    )
)

After cutting down to 6 Most Revelent Metanodes... 
2,439 Potnential Metapaths


In [9]:
# Number of remaining nodes for each value of the `label` column
nodes['label'].value_counts()

Chemicals & Drugs              64641
Disorders                      29882
Genes & Molecular Sequences    19054
Anatomy                        14241
Physiology                      7013
Phenomena                       1048
Name: label, dtype: int64

In [10]:
# Number of distinct edge types among the remaining edges
edges['type'].nunique()

30

In [11]:
# Number of remaining edges of each type
edges['type'].value_counts()

LOCATION_OF_AloCD         949442
REGULATES_CDreg>CD        810322
INTERACTS_WITH_CDiwG      677059
TREATS_CDtDO              518241
LOCATION_OF_AloG          513366
ASSOCIATED_WITH_DOawDO    460999
LOCATION_OF_AloDO         450282
STIMULATES_CDstG          373995
AFFECTS_CDafA             308558
ASSOCIATED_WITH_GawDO     258232
AFFECTS_GafPS             240690
INHIBITS_GinCD            239618
LOCATION_OF_AloA          223253
AUGMENTS_CDagPS           218588
INTERACTS_WITH_GiwG       200611
REGULATES_Greg>G          200257
RELATED_TO_CDrtCD         178639
DISRUPTS_CDdsPS           174447
AUGMENTS_GagDO            129113
AFFECTS_GafA              123446
AFFECTS_PSafDO            111961
INHIBITS_GinDO            105307
AFFECTS_PSafPS             75484
AFFECTS_CDafPH             71088
CAUSES_DOcPS               49057
ASSOCIATED_WITH_PSawCD     32321
AFFECTS_PHafDO             29140
LOCATION_OF_AloPS          27199
AFFECTS_PHafPS             21182
AFFECTS_PHafG              12378
Name: type

In [12]:
# Write the filtered network to disk; each table is passed through
# `gt.add_colons` immediately before it is saved (row index omitted).
nodes.pipe(gt.add_colons).to_csv('../data/nodes_VER31_R_cons_6_metanode.csv', index=False)
edges.pipe(gt.add_colons).to_csv('../data/edges_VER31_R_cons_6_metanode.csv', index=False)