In [1]:

import os 
import sys

import networkx as nx
import pandas as pd

sys.path.append("../src/modeval")
from modulescomparison import ModulesComparison
from modulecontainers import Modules 

In [2]:
from modeval import ebcubed

In [3]:
# Put the absolute path of the modeval sources at the front of sys.path so it
# is searched before every other entry when resolving imports.
# NOTE(review): the relative form of this path is already appended in the first
# cell and the modeval imports run before this cell -- confirm this cell is
# still needed, or move it above those imports.
modeval_src_dir = os.path.abspath("../src/modeval/")
sys.path.insert(0, modeval_src_dir)


In [4]:
# Known RegulonDB regulatory network, with the modifications described in
# inspect_input_elements.ipynb. The first CSV column is used as the row index.
known_trn_path = "../data/Regulatory/QCd_Network.csv"
known_trn = pd.read_csv(known_trn_path, index_col=0)
known_trn.head()

Unnamed: 0,regulatorId,regulatorName,RegulatorGeneName,regulatedId,regulatedName,function,confidenceLevel
313,RDBECOLIPDC00358,DsrA,dsrA,RDBECOLIGNC00450,hns,-,S
315,RDBECOLIPDC00358,DsrA,dsrA,RDBECOLIGNC00539,lrp,-,S
316,RDBECOLIPDC00358,DsrA,dsrA,RDBECOLIGNC00804,rbsA,-,S
317,RDBECOLIPDC00358,DsrA,dsrA,RDBECOLIGNC00805,rbsB,-,S
318,RDBECOLIPDC00358,DsrA,dsrA,RDBECOLIGNC00806,rbsC,-,S


In [5]:
# E. coli gene annotation table (locus tag, gene name, coordinates, product, ...)
annotation_path = "../data/Annotation/gene_info.csv"
annotation = pd.read_csv(annotation_path)
annotation.head()

Unnamed: 0,locus_tag,gene_name,old_locus_tag,start,end,strand,gene_product,COG,uniprot
0,b0001,thrL,,190,255,+,thr operon leader peptide,No COG annotation,P0AD86
1,b0002,thrA,,337,2799,+,fused aspartate kinase/homoserine dehydrogenase 1,Amino acid transport and metabolism,P00561
2,b0003,thrB,,2801,3733,+,homoserine kinase,Nucleotide transport and metabolism,P00547
3,b0004,thrC,,3734,5020,+,threonine synthase,Amino acid transport and metabolism,P00934
4,b0005,yaaX,,5234,5530,+,DUF2502 domain-containing protein YaaX,Function unknown,P75616


In [6]:
# Universe of genes (E): every locus tag listed in the annotation table
all_genes = annotation["locus_tag"].values

# Regulators: the distinct gene names found in the regulator column of the known network
known_regulators = known_trn["RegulatorGeneName"].unique()

# Translate target gene names into locus tags so the known targets use the same
# identifiers as the GENIE3 output. Target names that are absent from the
# annotation end up as NaN in the new 'targets_tags' column.
named_genes = annotation[annotation["gene_name"].notna()]
annotation_map = dict(zip(named_genes["gene_name"], named_genes["locus_tag"]))
known_trn["targets_tags"] = known_trn["regulatedName"].map(annotation_map)

In [7]:

# Build the known network from (regulator, target) pairs. Regulators are keyed by
# gene name and targets by locus tag, exactly as in the two columns selected here.
# Rows whose target name could not be mapped to a locus tag (NaN in 'targets_tags')
# are dropped first: otherwise NaN is inserted as a graph node and shows up as a
# "gene" inside the known modules.
known_edges = known_trn[['RegulatorGeneName', 'targets_tags']].dropna()

known_ecoli_trn = nx.Graph()
# add_edges_from accepts any iterable of 2-tuples, so all edges are added in one call
known_ecoli_trn.add_edges_from(known_edges.itertuples(index=False, name=None))

edges_ecoli_trn = known_ecoli_trn.edges

# One module per regulator: the genes adjacent to it in the known network.
# A regulator left without any mapped target is not a node of the graph; it is
# skipped (the same membership guard used when building the GENIE3 modules)
# instead of letting neighbors() raise NetworkXError.
known_modules = [
    list(known_ecoli_trn.neighbors(reg))
    for reg in known_regulators
    if reg in known_ecoli_trn
]

In [8]:
# GENIE3 predictions from the run without a regulator input list.
# The first CSV column is used as the row index.
genie3_path = "../data/predicted_results/GENIE3_5_ET_Ksqrt_nTrees1000_nRegs0_2024-06-11.csv"
genie3_res = pd.read_csv(genie3_path, index_col=0)
genie3_res.head()

Unnamed: 0,regulatoryGene,targetGene,weight
1,b0573,b0574,0.041188
2,b0575,b0574,0.039607
3,b0574,b0575,0.039262
4,b2000,b2001,0.038284
5,b1973,b0296,0.03597


In [9]:
# Undirected graph built from the first 2000 rows of the GENIE3 table.
# NOTE(review): head(2000) is "the 2000 strongest links" only if the file is
# sorted by descending weight -- confirm.
# NOTE(review): nx.Graph discards the regulatoryGene -> targetGene direction, so
# neighbors() of a gene also returns genes listed as its regulators -- confirm
# this is intended.
top_edges = genie3_res[['regulatoryGene', 'targetGene']].head(2000)
ecoli_trn = nx.Graph()
ecoli_trn.add_edges_from(top_edges.itertuples(index=False, name=None))


In [10]:
# Locus tags of the annotated genes whose name appears among the known regulators
is_known_regulator = annotation['gene_name'].isin(known_regulators)
regulator_ids = annotation.loc[is_known_regulator, 'locus_tag'].values

# For every such regulator that is a node of the GENIE3 graph, collect its
# neighbours as one module; regulators absent from the graph are skipped.
modules = [
    list(ecoli_trn.neighbors(reg_id))
    for reg_id in regulator_ids
    if reg_id in ecoli_trn.nodes
]


In [11]:
# Turn every predicted module into a set, then show how many modules there are
modules_as_sets = list(map(set, modules))
len(modules_as_sets)

12

In [13]:
#regulator_modules = [[gene] for gene in regulator_ids]
#regulator_modules

In [15]:
# Wrap the module collections in the project's Modules container.
# ModulesA: modules derived from the GENIE3 graph (a list of sets of locus tags).
ModulesA = Modules(modules_as_sets)
# ModulesB: modules derived from the known network (a list of lists).
ModulesB = Modules(known_modules)
# NOTE(review): all_genes is a flat array of locus tags rather than a collection
# of modules, and Modules_genes is not used by any later visible cell -- confirm
# whether this line is needed.
Modules_genes = Modules(all_genes)

In [18]:
# Compare the GENIE3-derived modules (A) with the known modules (B).
# all_genes is passed as the third argument -- presumably the gene universe used
# to build the membership matrices; verify against ModulesComparison.
ModComp = ModulesComparison(ModulesA, ModulesB, all_genes)

In [20]:
# Column sums of each membership matrix, one value per module (A first, then B)
for memberships in (ModComp.membershipsA, ModComp.membershipsB):
    print(memberships.sum(axis=0))

M0      5
M1      1
M2      2
M3      4
M4     12
M5      1
M6      1
M7      2
M8      5
M9      2
M10     1
M11     6
dtype: uint64
M0       8
M1      12
M2      22
M3       8
M4       6
        ..
M124    37
M125    11
M126     9
M127     8
M128    34
Length: 129, dtype: uint64


In [21]:
# Compute the comparison scores between the two module sets.
# baselines=None is passed explicitly -- presumably this skips any baseline
# normalisation; verify against ModulesComparison.score.
scores = ModComp.score(baselines=None)

In [22]:
# Display the dictionary of scores computed in the previous cell
scores

{'recoveries': array([0.625     , 0.16666667, 0.25      , 0.23529412, 0.25      ,
        0.        , 0.05882353, 0.07142857, 0.26315789, 0.25      ,
        0.025     , 0.71428571]),
 'relevances': array([0.        , 0.        , 0.        , 0.        , 0.16666667,
        0.        , 0.        , 0.        , 0.125     , 0.1       ,
        0.1       , 0.08333333, 0.        , 0.        , 0.08333333,
        0.06666667, 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.0952381 , 0.        , 0.125     , 0.        ,
        0.16129032, 0.03846154, 0.        , 0.        , 0.        ,
        0.1       , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.02173913, 0.        , 0.05882353, 0.07142857,
        0.        , 0.25      , 0.01030928, 0.        , 0.07142857,
        0.01639344, 0.        , 0.        , 0.08333333, 0.        ,
        0.        , 0.        , 0.        , 0.625     , 0.02439024,
        0.03225806, 0.        , 0.15789474, 0.        

Based on the recoveries and relevances, our observed modules map onto the known modules with moderate accuracy (neither particularly high nor particularly low), whereas the known modules map fairly poorly onto the observed modules. This indicates that a small but non-negligible portion of the observed modules is contained within the known modules, while the known modules are not well predicted by the observed modules. Note that the scores were computed without baselines, so no statistical significance can be claimed here.

The recall and precision values are fairly low as well, indicating that few genes from the observed modules were actually contained within the known modules, and even fewer genes from the known modules were contained within the observed modules. Because both recall and precision are low, the resulting F1rp value is low as well.

With a low F1rp value and fairly low recoveries and relevances, it is no surprise that the overall harmonic mean, f1rprr, is also quite low. This is strong evidence that the observed modules represent a poor predictive model.