# Using KGFE to understand important features

Given a list of genes, proteins, and metabolites that have been mapped onto the SPOKE knowledge graph, we want to identify other nodes in the graph that are in some way "similar" to these input nodes.

In [1]:
# imports
import numpy as np
import pandas as pd
import networkx as nx
import kgfe

## 1. Load proteins and metabolites

In [2]:
# Read the input feature table from the Excel workbook. The preview below shows
# one row per feature with its Lasso_CV value and external identifiers
# (UniProt, PubChem, Chembl_ID, KEGG).
# NOTE(review): the path is relative to the notebook's working directory.
data = pd.read_excel('LC_mets_prots_V1_V2_Compare.xlsx')

In [3]:
# Preview the first rows of the loaded table.
data.head()

Unnamed: 0,ID,Lasso_CV,Name,UniProt,Protein,PubChem,Chembl_ID,KEGG
0,met10,-0.100203,"1,2,4-benzenetriol",,,10787.0,CHEMBL3092389,C02814
1,met1054,-0.052794,pyridine,,,1049.0,CHEMBL266158,C00747
2,met1073,0.051544,uric acid,,,1175.0,CHEMBL792,C00366
3,met1091,-0.054557,formononetin,,,5280378.0,CHEMBL242341,C00858
4,met142,0.02583,4-Hydroxyvalproic acid,,,134467.0,CHEMBL3706504,C16649


## 2. Load SPOKE

In [5]:
# Load the SPOKE knowledge graph from its gzipped JSON-lines dump as a
# directed networkx graph, with the loader's verbose output turned off.
spoke_graph = kgfe.spoke_loader.load_spoke_networkx(
    '../../graph_utils/spoke_2021.jsonl.gz',
    directed=True,
    verbose=False,
)

In [6]:
# Undirected version of the SPOKE graph: nx.Graph(...) builds a new undirected
# graph from the directed one loaded above, so edge direction is ignored by
# everything that uses spoke_graph_ud.
spoke_graph_ud = nx.Graph(spoke_graph)

In [7]:
# Sorted list of every node ID in the undirected SPOKE graph.
spoke_ids = sorted(spoke_graph_ud.nodes)

In [8]:
# Lookup table built by kgfe from the undirected graph.
# NOTE(review): presumably maps node names to SPOKE node IDs -- confirm against
# kgfe.graph_info.get_names_to_ids.
names_to_ids = kgfe.graph_info.get_names_to_ids(spoke_graph_ud)

### Map proteins and metabolites to SPOKE nodes

In [9]:
# Number of entries in the names_to_ids lookup table.
len(names_to_ids)

617546

In [11]:
# Mapping from SPOKE identifiers to node IDs for the 'Protein' category.
# It is indexed below with the table's UniProt column.
uniprot_to_ids = kgfe.graph_info.spoke_identifiers_to_ids(spoke_graph_ud, 'Protein')

In [23]:
# NOTE(review): interactive IPython help lookup (signature and docstring of
# spoke_identifiers_to_ids); leftover from development -- consider removing
# this cell before sharing the notebook.
kgfe.graph_info.spoke_identifiers_to_ids?

Signature: kgfe.graph_info.spoke_identifiers_to_ids(graph, category, source=None)
Docstring:
Returns a mapping from SPOKE identifiers to IDs.

category: 'Protein', 'Gene', 'Compound', 'Disease', etc
source: 'KEGG', ...
File:      ~/isb/kg_feature_engineering/src/kgfe/graph_info.py
Type:      function

In [24]:
# Mapping from SPOKE identifiers to node IDs for the 'Compound' category
# (no explicit `source` is passed). It is indexed below with the table's
# Chembl_ID column.
chembl_to_ids = kgfe.graph_info.spoke_identifiers_to_ids(spoke_graph_ud, 'Compound')

In [28]:
# Pair each UniProt accession in the table with its SPOKE node ID, giving a
# list of (accession, node_id) tuples.
# Non-string entries (rows with no UniProt value) are skipped. Accessions that
# are absent from SPOKE are skipped as well, instead of aborting the cell with
# a KeyError; this matches how the ChEMBL identifiers are mapped.
prot_spoke_ids = [
    (uniprot_id, uniprot_to_ids[uniprot_id])
    for uniprot_id in data['UniProt']
    if isinstance(uniprot_id, str) and uniprot_id in uniprot_to_ids
]

In [32]:
# Pair each ChEMBL identifier in the table with its SPOKE node ID, giving a
# list of (chembl_id, node_id) tuples. Non-string entries (rows with no
# Chembl_ID value) and identifiers that SPOKE does not contain are skipped.
chem_spoke_ids = [
    (chembl_id, chembl_to_ids[chembl_id])
    for chembl_id in data['Chembl_ID']
    if isinstance(chembl_id, str) and chembl_id in chembl_to_ids
]

In [33]:
# Show the (ChEMBL identifier, SPOKE node ID) pairs that were mapped.
chem_spoke_ids

[('CHEMBL3092389', 1051506),
 ('CHEMBL266158', 1529834),
 ('CHEMBL792', 1282230),
 ('CHEMBL242341', 440588),
 ('CHEMBL454808', 1168062),
 ('CHEMBL449129', 1175855),
 ('CHEMBL78', 268338),
 ('CHEMBL228057', 5555),
 ('CHEMBL366563', 930796),
 ('CHEMBL1256480', 1328370),
 ('CHEMBL394875', 1843928),
 ('CHEMBL1236395', 1497509),
 ('CHEMBL8165', 759937),
 ('CHEMBL1226', 1551766)]

In [31]:
# Number of proteins from the table that were mapped to SPOKE nodes.
len(prot_spoke_ids)

54

In [34]:
# Number of compounds from the table that were mapped to SPOKE nodes.
len(chem_spoke_ids)

14

In [None]:
# TODO: add the mapped SPOKE ids to the table

## 3. Run topic PageRank

In [24]:
# Personalized ("topic") PageRank seeded on the mapped protein nodes.
# prot_spoke_ids holds (UniProt accession, SPOKE node ID) tuples, while
# nx.pagerank expects the personalization dict to be keyed by graph nodes, so
# only the node ID of each pair is used as a key. Every seed gets equal weight.
# NOTE(review): the metabolite nodes in chem_spoke_ids are not part of the
# seed set -- confirm whether they should be included as well.
pr_results_personalized = nx.pagerank(
    spoke_graph_ud,
    personalization={node_id: 1 for _uniprot_id, node_id in prot_spoke_ids},
)

In [25]:
# Baseline PageRank on the same undirected graph with no personalization
# (default arguments); used as the denominator of pr_results_ratio.
pr_results_base = nx.pagerank(spoke_graph_ud)

`pr_results_ratio` is the ratio of each node's topic (personalized) PageRank score to its baseline PageRank score. It indicates how much more likely a node is to be visited in the personalized random walk than in the unpersonalized one, and is an attempt to control for nodes that are important globally.

In [39]:
# For every node in the graph, divide its personalized PageRank score by its
# baseline PageRank score.
pr_results_ratio = {
    node_id: pr_results_personalized[node_id] / pr_results_base[node_id]
    for node_id in spoke_ids
}

## Visualizing subgraphs

In [35]:
import ipycytoscape