In [1]:
import os
import pandas as pd
import numpy as np
from networkx import pagerank, from_pandas_edgelist # conda install 'networkx<2.7'
import networkx as nx

import random
from tqdm.notebook import tqdm

In [2]:
def load_pegasus_results():
    np.random.seed(42)
    
    # Creating base data for 20 genes
    data = {
        "Gene": [f"g{i+1}" for i in range(20)],
        "Gene NCBI ID": [f"{i+1}" for i in range(20)],
        "Chr": [f"Chr{np.random.randint(1, 23)}" for _ in range(20)],
        "nSNPs": np.random.randint(10, 101, size=20),
        "Start": np.arange(5, 205, 10),
        "Stop": np.arange(10, 210, 10),
        "Pvalue": np.random.uniform(0.0001, 0.05, 20),
        "Score": np.random.uniform(0, 3, size=20)
    }

    # Creating DataFrame
    data = pd.DataFrame(data)
    return data

def load_catalog():
    np.random.seed(42)

    pegasus_data = load_pegasus_results()
    snp_list = []
    pvalue_list = []
    mapped_genes_list = []
    
    snp_counter = 1
    for _, row in pegasus_data.iterrows():
        gene = row["Gene"]
        n_snps = row["nSNPs"]
        
        for _ in range(n_snps):
            snp_list.append(f"snp{snp_counter}")
            pvalue_list.append(np.random.uniform(0.001, 0.05))
            
            # Randomly select 1 to 4 genes to map
            mapped_genes_count = random.randint(1, 4)
            mapped_genes = random.sample(list(pegasus_data["Gene"]), mapped_genes_count)
            mapped_genes_list.append(", ".join(mapped_genes))
            
            snp_counter += 1
    
    snp_data = pd.DataFrame({
        "SNP": snp_list,
        "pValue": pvalue_list,
        "mappedGenes": mapped_genes_list
    })
    
    return snp_data

# Load gwas catalog and pegasus output
gwas_catalog = load_catalog()
pegasus_data = load_pegasus_results()

In [3]:
gwas_catalog

Unnamed: 0,SNP,pValue,mappedGenes
0,snp1,0.015517,"g1, g11, g5"
1,snp2,0.009098,"g4, g15, g18, g13"
2,snp3,0.001766,"g15, g3, g14, g7"
3,snp4,0.021747,"g20, g10"
4,snp5,0.020349,"g3, g7, g10"
...,...,...,...
1278,snp1279,0.042466,g14
1279,snp1280,0.031809,"g19, g5, g17, g11"
1280,snp1281,0.037032,g15
1281,snp1282,0.038618,"g5, g4, g2"


In [4]:
pegasus_data

Unnamed: 0,Gene,Gene NCBI ID,Chr,nSNPs,Start,Stop,Pvalue,Score
0,g1,1,Chr7,73,5,10,0.019185,1.275468
1,g2,2,Chr20,69,15,20,0.049163,0.623825
2,g3,3,Chr15,30,25,30,0.023391,1.703101
3,g4,4,Chr11,42,35,40,0.043011,0.09394
4,g5,5,Chr8,85,45,50,0.034047,2.526854
5,g6,6,Chr21,67,55,60,0.02258,1.349262
6,g7,7,Chr7,31,65,70,0.000762,1.185451
7,g8,8,Chr19,98,75,80,0.047116,2.779977
8,g9,9,Chr11,58,85,90,0.028208,2.181816
9,g10,10,Chr11,100,95,100,0.019332,0.979622


In [5]:
def load_seeds_and_targets():
    # Load seeds
    pegasus_data = load_pegasus_results()
    gene_seeds_ncbi = pegasus_data.loc[pegasus_data['Pvalue'] <= 0.05, 'Gene NCBI ID'].tolist()

    # Load targets
    gwas_catalog = load_catalog()
    ncbi_targets = set() 
    for i, row in gwas_catalog.iterrows():
        gns = row["mappedGenes"].split(", ")
        for gn in gns:
            if gn in pegasus_data['Gene'].values:
                ncbi_targets.add(pegasus_data.loc[pegasus_data['Gene'] == gn, 'Gene NCBI ID'].iloc[0])
        ncbi_targets_sorted = sorted(list(ncbi_targets), key=lambda x: int(x[0:]))
    
    return gene_seeds_ncbi, ncbi_targets_sorted

seeds, targets = load_seeds_and_targets()

In [6]:
seeds

['1',
 '2',
 '3',
 '4',
 '5',
 '6',
 '7',
 '8',
 '9',
 '10',
 '11',
 '12',
 '13',
 '14',
 '15',
 '16',
 '17',
 '18',
 '19',
 '20']

In [7]:
targets

['1',
 '2',
 '3',
 '4',
 '5',
 '6',
 '7',
 '8',
 '9',
 '10',
 '11',
 '12',
 '13',
 '14',
 '15',
 '16',
 '17',
 '18',
 '19',
 '20']

In [8]:
# Generate node pairs
def generate_node_pairs(num_rows, num_nodes, min_frequency):
    pairs = []
    node_counts = {f"{i}": 0 for i in range(1, num_nodes + 1)}
    
    # Initial loop to generate up to num_rows pairs
    while len(pairs) < num_rows:
        node1 = f"{random.randint(1, num_nodes)}"
        node2 = f"{random.randint(1, num_nodes)}"
        if node1 != node2 and int(node1[0:]) < int(node2[0:]):
            pairs.append((node1, node2))
            node_counts[node1] += 1
    
    # Ensure each of node 1 to 5 appear at least min_frequency times
    for i in range(1, 6):
        node = f"{i}"
        additional_pairs_needed = min_frequency - node_counts[node]
        for _ in range(additional_pairs_needed):
            if len(pairs) >= num_rows:
                break
            node2 = f"{random.randint(i+1, num_nodes)}"
            pairs.append((node, node2))
            node_counts[node] += 1
    
    return pairs

# Generate graph
def load_graph_nx():
    # Generate 100 rows of node pairs
    num_rows = 100
    num_nodes = 20
    min_frequency = 10
    node_pairs = generate_node_pairs(num_rows, num_nodes, min_frequency)

    # Convert to DataFrame
    node_data = pd.DataFrame(node_pairs, columns=["node1", "node2"])
    node_data_without_duplicates = node_data.drop_duplicates()

    graph = from_pandas_edgelist(node_data_without_duplicates, source="node1", target="node2")
    
    return graph, node_data_without_duplicates

graph, node = load_graph_nx()

In [9]:
node

Unnamed: 0,node1,node2
0,4,17
1,8,13
2,8,19
3,8,15
4,10,14
...,...,...
90,7,13
95,6,10
96,12,15
98,3,19


In [10]:
def init_rwr_scores_nx(graph, data):   
    pegasus_genes = set(data['Gene NCBI ID'])
    pegasus_scores = dict(zip(data['Gene NCBI ID'], data['Score']))
    
    pagerank_seeds = {}
    for node in graph.nodes:
        if node in pegasus_genes:
            pagerank_seeds[node] = pegasus_scores[node]
        else:
            pagerank_seeds[node] = 0
            
    return pagerank_seeds

pagerank_seeds = init_rwr_scores_nx(graph, pegasus_data)

In [11]:
pagerank_seeds

{'4': 0.09393987736667575,
 '17': 1.7602534969915447,
 '8': 2.7799765973813826,
 '13': 2.8835160730480474,
 '19': 1.8211027430600542,
 '15': 2.2419603304121427,
 '10': 0.9796223064175061,
 '14': 2.533601546034454,
 '16': 1.6190763971672393,
 '2': 0.6238249886045665,
 '6': 1.349262400109297,
 '7': 1.1854507080054433,
 '20': 0.8279975460676301,
 '12': 1.562502780077471,
 '18': 2.895765921792414,
 '11': 1.711331923216198,
 '3': 1.7031009834599744,
 '5': 2.5268543237849954,
 '9': 2.1818159875692626,
 '1': 1.275467623473734}

In [12]:
def perform_rwr_nx(alpha, graph, seeds):   
    rwr_scores = pagerank(graph, alpha=alpha, personalization=seeds)
    return rwr_scores

alpha = 0.85
rwr_scores = perform_rwr_nx(alpha, graph, pagerank_seeds)

In [13]:
rwr_scores

{'4': 0.0565917203549835,
 '17': 0.0687262133483769,
 '8': 0.058592731857878,
 '13': 0.06629294697569106,
 '19': 0.058292905014017404,
 '15': 0.06255482389576751,
 '10': 0.05121814301017755,
 '14': 0.046529200123154736,
 '16': 0.05287550482894114,
 '2': 0.04347036242118573,
 '6': 0.036902951052819405,
 '7': 0.041682605335521414,
 '20': 0.040293461359168406,
 '12': 0.05360502281592967,
 '18': 0.044742259979236844,
 '11': 0.03264168298131158,
 '3': 0.054942749992860904,
 '5': 0.04267181384441596,
 '9': 0.06257794919287901,
 '1': 0.024794951615682966}

In [14]:
def process_rwr_results_nx(scores, graph, data, seeds):
    node2idx = {str(n): i for i, n in enumerate(graph.nodes)}
    idx2node = {v: k for k, v in node2idx.items()}
    ncbi2gene = dict(zip(data['Gene NCBI ID'], data['Gene']))
    
    seeds_vals = np.fromiter(seeds.values(), dtype="float")
    max_val = np.max(seeds_vals[~np.isinf(seeds_vals)])
    rwr_results = []
    for i, node in enumerate(graph.nodes):
        row = {}
        init_score = seeds[node]
        if np.isinf(init_score):
            init_score = max_val
        row["Gene NCBI ID"] = node
        row["Gene"] = ncbi2gene[node] if node in ncbi2gene.keys() else "-"
        row["Initial Score"] = init_score
        row["Final Score"] = scores[node]
        rwr_results.append(row)
    rwr_results = pd.DataFrame(rwr_results).sort_values(by="Final Score", ascending=False)

    return rwr_results

rwr_results = process_rwr_results_nx(rwr_scores, graph, pegasus_data, pagerank_seeds)

In [15]:
rwr_results

Unnamed: 0,Gene NCBI ID,Gene,Initial Score,Final Score
1,17,g17,1.760253,0.068726
3,13,g13,2.883516,0.066293
18,9,g9,2.181816,0.062578
5,15,g15,2.24196,0.062555
2,8,g8,2.779977,0.058593
4,19,g19,1.821103,0.058293
0,4,g4,0.09394,0.056592
16,3,g3,1.703101,0.054943
13,12,g12,1.562503,0.053605
8,16,g16,1.619076,0.052876
