In [11]:
import os 
import sys

import networkx as nx
import numpy as np
import pandas as pd

sys.path.append("../src/modeval")
from modulescomparison import ModulesComparison
from modulecontainers import Modules 

In [12]:
sys.path.insert(0,os.path.abspath("../src/modeval/"))

In [13]:
## Get the known RegulonDB network with modification described in inspect_input_elements.ipynb
known_trn = pd.read_csv("../data/Regulatory/QCd_Network.csv", index_col=0)
known_trn.head()

Unnamed: 0,regulatorId,regulatorName,RegulatorGeneName,regulatedId,regulatedName,function,confidenceLevel
313,RDBECOLIPDC00358,DsrA,dsrA,RDBECOLIGNC00450,hns,-,S
315,RDBECOLIPDC00358,DsrA,dsrA,RDBECOLIGNC00539,lrp,-,S
316,RDBECOLIPDC00358,DsrA,dsrA,RDBECOLIGNC00804,rbsA,-,S
317,RDBECOLIPDC00358,DsrA,dsrA,RDBECOLIGNC00805,rbsB,-,S
318,RDBECOLIPDC00358,DsrA,dsrA,RDBECOLIGNC00806,rbsC,-,S


In [14]:
## Load the annotation E Coli 
annotation = pd.read_csv("../data/Annotation/gene_info.csv")
annotation.head()

Unnamed: 0,locus_tag,gene_name,old_locus_tag,start,end,strand,gene_product,COG,uniprot
0,b0001,thrL,,190,255,+,thr operon leader peptide,No COG annotation,P0AD86
1,b0002,thrA,,337,2799,+,fused aspartate kinase/homoserine dehydrogenase 1,Amino acid transport and metabolism,P00561
2,b0003,thrB,,2801,3733,+,homoserine kinase,Nucleotide transport and metabolism,P00547
3,b0004,thrC,,3734,5020,+,threonine synthase,Amino acid transport and metabolism,P00934
4,b0005,yaaX,,5234,5530,+,DUF2502 domain-containing protein YaaX,Function unknown,P75616


In [15]:
# Define the set of all known genes (E)
all_genes = annotation.locus_tag.values 

# Define a set of all known regulators
known_regulators = known_trn['RegulatorGeneName'].unique()

# Map gene names to locus tags for comparable output with GENIE3 ouput
annotation_map = annotation[annotation.gene_name.notna()].set_index("gene_name")["locus_tag"].to_dict()
known_trn['targets_tags'] = known_trn["regulatedName"].map(annotation_map)

In [16]:
# Iterate over regulators and gene targets to load a graph from edges
known_ecoli_trn = nx.Graph() 
for i in known_trn[['RegulatorGeneName', 'targets_tags']].itertuples():
    known_ecoli_trn.add_edges_from([tuple([i[1], i[2]])]) 
    
edges_ecoli_trn = known_ecoli_trn.edges

# Find the neighbors of the regulators to define gene targets 
known_modules = []
for reg in known_regulators:
    known_modules.append(list(known_ecoli_trn.neighbors(reg)))

In [17]:
## Load the results from GENIE3 undirected run w/o regulator input
genie3_undirected = pd.read_csv("../data/predicted_results/GENIE3_5_ET_Ksqrt_nTrees1000_nRegs0_2024-06-11.csv", index_col=0)
genie3_undirected.head()

Unnamed: 0,regulatoryGene,targetGene,weight
1,b0573,b0574,0.041188
2,b0575,b0574,0.039607
3,b0574,b0575,0.039262
4,b2000,b2001,0.038284
5,b1973,b0296,0.03597


In [18]:
## Load the results from GENIE3 directed run w/o regulator input
genie3_directed = pd.read_csv("../data/predicted_results/GENIE3_4_RF_Ksqrt_nTrees1000_nRegs106_2024-06-26.csv", index_col=0)
genie3_directed.head()

Unnamed: 0,regulatoryGene,targetGene,weight
1,b1921,b1071,0.344796
2,b1921,b1924,0.34082
3,b0571,b0572,0.339699
4,b1921,b1925,0.336633
5,b1921,b1926,0.330987


In [30]:
directed_list = []
undirected_list = []

for i in range(100, 10000, 100):
    # Load the predicted undirected network
    ecoli_trn_undirected = nx.Graph()
    ecoli_trn_undirected.add_edges_from([tuple(val) for val in genie3_undirected[['regulatoryGene', 'targetGene']].head(i).values])
        # Load the predicted directed network
    ecoli_trn_directed = nx.Graph()
    ecoli_trn_directed.add_edges_from([tuple(val) for val in genie3_directed[['regulatoryGene', 'targetGene']].head(i).values])

    # Filter the network for connections with known regulators
    # Append neighborhood connected genes to the list of genesets
    regulator_ids = annotation[annotation['gene_name'].isin(known_regulators)]['locus_tag'].values

    modules_undirected = []
    for reg in regulator_ids:
        if reg in ecoli_trn_undirected.nodes:
            module = list(ecoli_trn_undirected.neighbors(reg))
            modules_undirected.append(module)

    # Get the sets 
    undirected_modules_as_sets = [set(module) for module in modules_undirected]
    
    modules_directed = []
    for reg in regulator_ids:
        if reg in ecoli_trn_directed.nodes:
            module = list(ecoli_trn_directed.neighbors(reg))
            modules_directed.append(module)

    # Get the sets 
    directed_modules_as_sets = [set(module) for module in modules_directed]

    ModulesB = Modules(known_modules)
    ModulesA_undirected = Modules(undirected_modules_as_sets)
    ModulesA_directed = Modules(directed_modules_as_sets)

    undirect_comp = ModulesComparison(ModulesA_undirected, ModulesB, all_genes).score(None)
    direct_comp = ModulesComparison(ModulesA_directed, ModulesB, all_genes).score(None)

    directed_list.append((i, direct_comp['precision'], direct_comp['recall'], direct_comp['recovery'], direct_comp['relevance'], direct_comp['F1rprr']))
    undirected_list.append((i, undirect_comp['precision'], undirect_comp['recall'], undirect_comp['recovery'], undirect_comp['relevance'], undirect_comp['F1rprr']))

directed_scores = pd.DataFrame(directed_list, columns=['edges', 'precision', 'recall', 'recovery', 'relevance', 'f1rprr'])
undirected_scores = pd.DataFrame(undirected_list, columns=['edges', 'precision', 'recall', 'recovery', 'relevance', 'f1rprr'])
    

In [33]:
directed_scores

Unnamed: 0,edges,precision,recall,recovery,relevance,f1rprr
0,100,0.002694,0.004932,0.289750,0.056400,0.006721
1,200,0.004052,0.009425,0.208927,0.102920,0.010888
2,300,0.004430,0.010680,0.196678,0.116267,0.012011
3,400,0.004725,0.011865,0.194786,0.125187,0.012943
4,500,0.005154,0.013351,0.180458,0.132976,0.014186
...,...,...,...,...,...,...
94,9500,0.011102,0.006452,0.095347,0.085162,0.014965
95,9600,0.011178,0.006389,0.095063,0.085358,0.014913
96,9700,0.011275,0.006360,0.095342,0.086113,0.014925
97,9800,0.011290,0.006306,0.094598,0.085265,0.014845


In [34]:
undirected_scores

Unnamed: 0,edges,precision,recall,recovery,relevance,f1rprr
0,100,0.000001,0.000153,0.208333,0.005518,0.000005
1,200,0.000008,0.000804,0.500000,0.012403,0.000033
2,300,0.000020,0.001191,0.438025,0.016057,0.000080
3,400,0.000159,0.001481,0.392011,0.018510,0.000570
4,500,0.000159,0.001481,0.392011,0.018510,0.000570
...,...,...,...,...,...,...
94,9500,0.004315,0.010686,0.173925,0.113662,0.011768
95,9600,0.004312,0.010732,0.169772,0.114237,0.011773
96,9700,0.004455,0.010836,0.170448,0.114385,0.012071
97,9800,0.004455,0.010836,0.170448,0.114385,0.012071


In [35]:
#merge the data frames, compute combined scores, and find which number of edges has the highest combined score
combined_scores = pd.merge(directed_scores, undirected_scores, on='edges', suffixes=('_directed', '_undirected'))
combined_scores['combined_f1rprr'] = combined_scores['f1rprr_directed'] + combined_scores['f1rprr_undirected']
max_combined_score = combined_scores.loc[combined_scores['combined_f1rprr'].idxmax()]

print("Edge count with highest combined F1rprr score:")
print(max_combined_score)

Edge count with highest combined F1rprr score:
edges                   7900.000000
precision_directed         0.010275
recall_directed            0.007442
recovery_directed          0.100343
relevance_directed         0.093375
f1rprr_directed            0.015850
precision_undirected       0.004075
recall_undirected          0.010815
recovery_undirected        0.216044
relevance_undirected       0.108215
f1rprr_undirected          0.011373
combined_f1rprr            0.027223
Name: 78, dtype: float64


Based on the F1rprr scores, it would appear that 7900 edges is the optimal number.