In [1]:
import json
from pymodulon.io import load_json_model
from pymodulon import example_data
from pymodulon.core import IcaData

import networkx as nx
import os 
import sys
import numpy as np
import pandas as pd

sys.path.append("../src/modeval")
from modulescomparison import ModulesComparison
from modulecontainers import Modules 

In [2]:
## Load the known RegulonDB network (with the modifications described in
## inspect_input_elements.ipynb); the first CSV column becomes the index.
trn_csv_path = "../data/Regulatory/QCd_Network.csv"
known_trn = pd.read_csv(trn_csv_path, index_col=0)
known_trn.head(n=5)

Unnamed: 0,regulatorId,regulatorName,RegulatorGeneName,regulatedId,regulatedName,function,confidenceLevel
313,RDBECOLIPDC00358,DsrA,dsrA,RDBECOLIGNC00450,hns,-,S
315,RDBECOLIPDC00358,DsrA,dsrA,RDBECOLIGNC00539,lrp,-,S
316,RDBECOLIPDC00358,DsrA,dsrA,RDBECOLIGNC00804,rbsA,-,S
317,RDBECOLIPDC00358,DsrA,dsrA,RDBECOLIGNC00805,rbsB,-,S
318,RDBECOLIPDC00358,DsrA,dsrA,RDBECOLIGNC00806,rbsC,-,S


In [3]:
## Load the E. coli gene annotation table
annotation_csv_path = "../data/Annotation/gene_info.csv"
annotation = pd.read_csv(annotation_csv_path)
annotation.head(n=5)

Unnamed: 0,locus_tag,gene_name,old_locus_tag,start,end,strand,gene_product,COG,uniprot
0,b0001,thrL,,190,255,+,thr operon leader peptide,No COG annotation,P0AD86
1,b0002,thrA,,337,2799,+,fused aspartate kinase/homoserine dehydrogenase 1,Amino acid transport and metabolism,P00561
2,b0003,thrB,,2801,3733,+,homoserine kinase,Nucleotide transport and metabolism,P00547
3,b0004,thrC,,3734,5020,+,threonine synthase,Amino acid transport and metabolism,P00934
4,b0005,yaaX,,5234,5530,+,DUF2502 domain-containing protein YaaX,Function unknown,P75616


In [4]:
# Universe of all annotated genes (E), expressed as locus tags
all_genes = annotation["locus_tag"].values

# Every distinct regulator (by gene name) present in the known network
known_regulators = known_trn["RegulatorGeneName"].unique()

# Translate target gene names into locus tags so the known network is
# comparable with GENIE3 output; annotation rows without a gene name are
# left out of the lookup table.
named_genes = annotation.dropna(subset=["gene_name"])
annotation_map = dict(zip(named_genes["gene_name"], named_genes["locus_tag"]))
known_trn["targets_tags"] = known_trn["regulatedName"].map(annotation_map)

In [5]:

# Build the known regulatory network as a graph with one edge per
# (regulator gene name, target locus tag) row of known_trn.
# Rows with a missing regulator or target are skipped: a regulatedName that
# has no entry in annotation_map maps to NaN, and NaN would otherwise be
# added as a graph node and leak into the modules built below.
mapped_edges = known_trn[['RegulatorGeneName', 'targets_tags']].dropna()

# NOTE(review): the graph is undirected, so neighbors() below equals a
# regulator's targets only as long as regulator gene names and target locus
# tags never coincide -- confirm, or consider a directed graph.
known_ecoli_trn = nx.Graph()
known_ecoli_trn.add_edges_from(mapped_edges.itertuples(index=False, name=None))

edges_ecoli_trn = known_ecoli_trn.edges

# One module per regulator: the nodes adjacent to it in the graph.
# A regulator whose rows were all dropped above is not a node of the graph
# and is skipped, so known_modules may be shorter than known_regulators.
known_modules = [
    list(known_ecoli_trn.neighbors(reg))
    for reg in known_regulators
    if reg in known_ecoli_trn
]

In [7]:
# Display the A table from pymodulon's example data as loaded (rows are
# still labelled by name at this point).
example_data.A 

Unnamed: 0,control__wt_glc__1,control__wt_glc__2,fur__wt_dpd__1,fur__wt_dpd__2,fur__wt_fe__1,fur__wt_fe__2,fur__delfur_dpd__1,fur__delfur_dpd__2,fur__delfur_fe2__1,fur__delfur_fe2__2,...,efeU__menFentC_ale29__1,efeU__menFentC_ale29__2,efeU__menFentC_ale30__1,efeU__menFentC_ale30__2,efeU__menFentCubiC_ale36__1,efeU__menFentCubiC_ale36__2,efeU__menFentCubiC_ale37__1,efeU__menFentCubiC_ale37__2,efeU__menFentCubiC_ale38__1,efeU__menFentCubiC_ale38__2
AllR/AraC/FucR,0.378690,-0.378690,2.457678,2.248678,-0.327344,-0.259164,1.777251,2.690655,0.656937,0.319583,...,1.041336,2.203940,3.698292,0.856998,1.557323,0.337806,0.943742,1.736640,0.499461,1.581476
ArcA-1,-0.440210,0.440210,-5.367360,-5.684301,0.131174,0.348843,-4.436389,-4.770469,-1.799113,-1.474222,...,-6.471714,-6.549861,-3.109145,-2.716183,-2.531192,-1.461022,-0.408849,-0.210397,-5.700321,-6.237836
ArcA-2,0.762258,-0.762258,2.619623,2.900696,3.120724,2.743634,1.989803,1.555835,1.782500,1.530811,...,2.789653,3.959650,1.585147,0.811182,0.300414,2.537535,1.061408,2.634524,0.125513,1.178747
ArgR,-0.289630,0.289630,-10.085719,-13.187916,2.371129,1.861918,-8.708701,-7.881588,-1.237027,-1.235604,...,-11.263744,-10.366813,-0.289217,0.389228,-5.142768,-5.014526,-3.648777,-4.125952,-4.286326,-5.475940
AtoC,0.250770,-0.250770,1.844767,2.055052,0.299345,0.425502,1.801217,1.790987,0.921254,1.410026,...,3.821909,3.306573,2.652394,1.910173,0.927772,1.327549,1.846321,0.909667,2.064662,2.371405
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
uncharacterized-4,0.597480,-0.597480,-8.707623,-7.812802,0.323410,1.331085,-7.711778,-6.652678,1.087082,1.005071,...,-3.803681,-3.201872,-3.475330,-3.297351,-5.398908,-5.611798,-5.662947,-4.948391,-4.582840,-5.443213
uncharacterized-5,-0.497327,0.497327,-16.669187,-17.120903,0.228383,0.056799,-14.972028,-13.434012,-3.997564,-4.300753,...,-27.990992,-28.543522,-3.447801,-1.648500,-5.807751,-6.571386,-8.404540,-8.295630,-6.039384,-7.430400
uncharacterized-6,0.581999,-0.581999,-7.842130,-9.298792,2.352067,2.750720,-6.310719,-6.370995,2.112088,1.685886,...,0.866587,1.694439,0.436875,0.705672,3.500528,3.741011,5.988410,5.595631,7.619875,6.295446
ydcI-KO,0.269057,-0.269057,-0.254271,0.184537,0.651527,0.691914,0.016636,-0.465781,1.212097,1.122801,...,-3.646464,-3.606055,-0.665810,-0.830586,-1.044574,0.119876,-0.045670,-0.826608,-1.393160,-1.587978


In [8]:
# Bind the tables of pymodulon's example data to local names:
# annotation-style tables first, then the A, M and X tables.
example_trn, example_sample, example_gene = (
    example_data.trn,
    example_data.sample_table,
    example_data.gene_table,
)
example_A, example_M, example_X = (
    example_data.A,
    example_data.M,
    example_data.X,
)

In [9]:
# Replace the iModulon labels with positional integer ids (0..n-1) on both
# the columns of M and the rows of A.
# The relabelling is purely positional, so both tables must describe the
# same number of iModulons.
assert example_M.shape[1] == example_A.shape[0], (
    "M columns and A rows must describe the same number of iModulons"
)

# Relabel copies instead of assigning to .columns / .index on the original
# objects: those objects were obtained from example_data and may be shared
# with it (NOTE(review): confirm), in which case an in-place relabel would
# silently change what example_data.M / example_data.A show afterwards.
example_M = example_M.copy()
example_M.columns = list(range(len(example_M.columns)))
example_A = example_A.copy()
example_A.index = list(range(len(example_A.index)))
example_A

Unnamed: 0,control__wt_glc__1,control__wt_glc__2,fur__wt_dpd__1,fur__wt_dpd__2,fur__wt_fe__1,fur__wt_fe__2,fur__delfur_dpd__1,fur__delfur_dpd__2,fur__delfur_fe2__1,fur__delfur_fe2__2,...,efeU__menFentC_ale29__1,efeU__menFentC_ale29__2,efeU__menFentC_ale30__1,efeU__menFentC_ale30__2,efeU__menFentCubiC_ale36__1,efeU__menFentCubiC_ale36__2,efeU__menFentCubiC_ale37__1,efeU__menFentCubiC_ale37__2,efeU__menFentCubiC_ale38__1,efeU__menFentCubiC_ale38__2
0,0.378690,-0.378690,2.457678,2.248678,-0.327344,-0.259164,1.777251,2.690655,0.656937,0.319583,...,1.041336,2.203940,3.698292,0.856998,1.557323,0.337806,0.943742,1.736640,0.499461,1.581476
1,-0.440210,0.440210,-5.367360,-5.684301,0.131174,0.348843,-4.436389,-4.770469,-1.799113,-1.474222,...,-6.471714,-6.549861,-3.109145,-2.716183,-2.531192,-1.461022,-0.408849,-0.210397,-5.700321,-6.237836
2,0.762258,-0.762258,2.619623,2.900696,3.120724,2.743634,1.989803,1.555835,1.782500,1.530811,...,2.789653,3.959650,1.585147,0.811182,0.300414,2.537535,1.061408,2.634524,0.125513,1.178747
3,-0.289630,0.289630,-10.085719,-13.187916,2.371129,1.861918,-8.708701,-7.881588,-1.237027,-1.235604,...,-11.263744,-10.366813,-0.289217,0.389228,-5.142768,-5.014526,-3.648777,-4.125952,-4.286326,-5.475940
4,0.250770,-0.250770,1.844767,2.055052,0.299345,0.425502,1.801217,1.790987,0.921254,1.410026,...,3.821909,3.306573,2.652394,1.910173,0.927772,1.327549,1.846321,0.909667,2.064662,2.371405
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
87,0.597480,-0.597480,-8.707623,-7.812802,0.323410,1.331085,-7.711778,-6.652678,1.087082,1.005071,...,-3.803681,-3.201872,-3.475330,-3.297351,-5.398908,-5.611798,-5.662947,-4.948391,-4.582840,-5.443213
88,-0.497327,0.497327,-16.669187,-17.120903,0.228383,0.056799,-14.972028,-13.434012,-3.997564,-4.300753,...,-27.990992,-28.543522,-3.447801,-1.648500,-5.807751,-6.571386,-8.404540,-8.295630,-6.039384,-7.430400
89,0.581999,-0.581999,-7.842130,-9.298792,2.352067,2.750720,-6.310719,-6.370995,2.112088,1.685886,...,0.866587,1.694439,0.436875,0.705672,3.500528,3.741011,5.988410,5.595631,7.619875,6.295446
90,0.269057,-0.269057,-0.254271,0.184537,0.651527,0.691914,0.016636,-0.465781,1.212097,1.122801,...,-3.646464,-3.606055,-0.665810,-0.830586,-1.044574,0.119876,-0.045670,-0.826608,-1.393160,-1.587978


In [10]:
# Wrap the (renumbered) example M and A tables and the X table in an
# IcaData object, then attach the TRN, gene table and sample table through
# attribute assignment.
# NOTE(review): the assignments go through IcaData's own attribute handling;
# their order is kept as written in case it matters -- confirm before
# reordering.
ica_data = IcaData(example_M,example_A,example_X)
ica_data.trn = example_trn
ica_data.gene_table = example_gene
ica_data.sample_table = example_sample

In [11]:
# Build a second IcaData object from the tables held by ica_data, this time
# passing the gene table and TRN at construction together with
# threshold_method='dagostino' and optimize_cutoff=True.
decomposition_tables = (ica_data.M, ica_data.A, ica_data.X)
icadata_options = {
    "gene_table": ica_data.gene_table,
    "trn": ica_data.trn,
    "threshold_method": 'dagostino',
    "optimize_cutoff": True,
}
modulons = IcaData(*decomposition_tables, **icadata_options)



In [12]:
# Show the gene-level table returned for the iModulon labelled 0.
modulons.view_imodulon(0)

Unnamed: 0,gene_weight,start,end,strand,gene_name,length,operon,COG,accession,regulator
b0061,0.159019,65854,66550,-,araD,696,araBAD,Carbohydrate transport and metabolism,NC_000913.3,"araC,crp,rpoD"
b0062,0.164871,66834,68337,-,araA,1503,araBAD,Carbohydrate transport and metabolism,NC_000913.3,"araC,crp,rpoD"
b0063,0.17655,68347,70048,-,araB,1701,araBAD,Carbohydrate transport and metabolism,NC_000913.3,"araC,crp,rpoD"
b0505,0.161867,532450,532933,+,allA,483,allA,Nucleotide transport and metabolism,NC_000913.3,allR
b0507,0.265127,533915,535697,+,gcl,1782,gcl-hyi-glxR-ybbW-allB-ybbY-glxK,Function unknown,NC_000913.3,"allR,rpoD"
b0508,0.316052,535709,536486,+,hyi,777,gcl-hyi-glxR-ybbW-allB-ybbY-glxK,Carbohydrate transport and metabolism,NC_000913.3,"allR,rpoD"
b0509,0.321224,536585,537464,+,glxR,879,gcl-hyi-glxR-ybbW-allB-ybbY-glxK,Lipid transport and metabolism,NC_000913.3,"allR,rpoD"
b0511,0.150539,537632,539087,+,ybbW,1455,gcl-hyi-glxR-ybbW-allB-ybbY-glxK,Nucleotide transport and metabolism,NC_000913.3,"allR,rpoD,yeiE"
b0512,0.248491,539146,540508,+,allB,1362,gcl-hyi-glxR-ybbW-allB-ybbY-glxK,Nucleotide transport and metabolism,NC_000913.3,"allR,rpoD"
b1900,0.091065,1983554,1985069,-,araG,1515,araFGH,Inorganic ion transport and metabolism,NC_000913.3,"araC,crp,nac,rpoD,rpoS"


In [13]:
# For every iModulon, collect the index labels (locus tags) of the table
# returned by view_imodulon, giving one list of genes per iModulon.
locus_tags = [
    list(modulons.view_imodulon(imodulon_name).index)
    for imodulon_name in modulons.imodulon_names
]


In [14]:
# Display the per-iModulon lists of locus tags.
locus_tags

[['b0061',
  'b0062',
  'b0063',
  'b0505',
  'b0507',
  'b0508',
  'b0509',
  'b0511',
  'b0512',
  'b1900',
  'b1901',
  'b2799',
  'b2800',
  'b2801',
  'b2802',
  'b2803',
  'b2841',
  'b4460'],
 ['b0042',
  'b0113',
  'b0124',
  'b0330',
  'b0346',
  'b0429',
  'b0430',
  'b0431',
  'b0432',
  'b0458',
  'b0557',
  'b0621',
  'b0721',
  'b0722',
  'b0723',
  'b0724',
  'b0836',
  'b0848',
  'b1020',
  'b1189',
  'b1190',
  'b1297',
  'b1298',
  'b1332',
  'b1422',
  'b1518',
  'b1607',
  'b1748',
  'b1777',
  'b1778',
  'b2181',
  'b2210',
  'b2241',
  'b2242',
  'b2518',
  'b2582',
  'b2587',
  'b2847',
  'b2848',
  'b2980',
  'b3351',
  'b3426',
  'b3452',
  'b3453',
  'b3528',
  'b3603',
  'b3820',
  'b3894',
  'b3962',
  'b4705'],
 ['b0034',
  'b0429',
  'b0430',
  'b0431',
  'b0432',
  'b0733',
  'b0734',
  'b0763',
  'b0972',
  'b0973',
  'b0974',
  'b0975',
  'b0976',
  'b0977',
  'b0978',
  'b0979',
  'b1109',
  'b1256',
  'b2579',
  'b2720',
  'b2725',
  'b2998',
  'b4070

In [15]:
# Turn each iModulon's list of locus tags into a set (one set per module).
modules_as_sets = list(map(set, locus_tags))
modules_as_sets

[{'b0061',
  'b0062',
  'b0063',
  'b0505',
  'b0507',
  'b0508',
  'b0509',
  'b0511',
  'b0512',
  'b1900',
  'b1901',
  'b2799',
  'b2800',
  'b2801',
  'b2802',
  'b2803',
  'b2841',
  'b4460'},
 {'b0042',
  'b0113',
  'b0124',
  'b0330',
  'b0346',
  'b0429',
  'b0430',
  'b0431',
  'b0432',
  'b0458',
  'b0557',
  'b0621',
  'b0721',
  'b0722',
  'b0723',
  'b0724',
  'b0836',
  'b0848',
  'b1020',
  'b1189',
  'b1190',
  'b1297',
  'b1298',
  'b1332',
  'b1422',
  'b1518',
  'b1607',
  'b1748',
  'b1777',
  'b1778',
  'b2181',
  'b2210',
  'b2241',
  'b2242',
  'b2518',
  'b2582',
  'b2587',
  'b2847',
  'b2848',
  'b2980',
  'b3351',
  'b3426',
  'b3452',
  'b3453',
  'b3528',
  'b3603',
  'b3820',
  'b3894',
  'b3962',
  'b4705'},
 {'b0034',
  'b0429',
  'b0430',
  'b0431',
  'b0432',
  'b0733',
  'b0734',
  'b0763',
  'b0972',
  'b0973',
  'b0974',
  'b0975',
  'b0976',
  'b0977',
  'b0978',
  'b0979',
  'b1109',
  'b1256',
  'b2579',
  'b2720',
  'b2725',
  'b2998',
  'b4070

In [16]:
# Wrap the observed modules (iModulon gene sets) and the known modules
# (per-regulator target lists) in Modules containers and set up their
# comparison against the full list of annotated genes.
ModulesA = Modules(modules_as_sets)
ModulesB = Modules(known_modules)
# NOTE(review): all_genes is a flat array of locus tags, whereas the two
# calls above receive a list of gene collections -- confirm that wrapping it
# in Modules is intended; Modules_genes is not used in the visible cells.
Modules_genes = Modules(all_genes)
# NOTE(review): the visible scoring cells build their own ModulesComparison
# instead of reusing mod_comp.
mod_comp = ModulesComparison(ModulesA, ModulesB, all_genes)

In [20]:
# Score the observed modules against the known modules, using all annotated
# genes as the gene list, and store the selected scores as a one-row table.
# Keys are the entries read from the score result; values are the column
# names used in the resulting DataFrame.
score_columns = {
    'precision': 'precision',
    'recall': 'recall',
    'F1rp': 'f1rp',
    'recovery': 'recovery',
    'relevance': 'relevance',
    'F1rr': 'f1rr',
    'F1rprr': 'f1rprr',
}
scores = ModulesComparison(ModulesA, ModulesB, all_genes).score(None)
scores_list = [tuple(scores[score_key] for score_key in score_columns)]
scores_method = pd.DataFrame(scores_list, columns=list(score_columns.values()))

In [21]:
# Display the scores obtained with all annotated genes as the gene list.
scores_method

Unnamed: 0,precision,recall,f1rp,recovery,relevance,f1rr,f1rprr
0,0.012267,0.030796,0.017545,0.231147,0.227852,0.229488,0.032597


In [22]:
# Sorted list of the distinct genes that appear in at least one known module
# (note: a different, smaller gene list than all_genes).
allgenes = sorted(set().union(*known_modules))

In [23]:
# Repeat the scoring, this time restricting the gene list to the genes found
# in the known modules (allgenes). This rebinds scores_list and
# scores_method.
# Keys are the entries read from the score result; values are the column
# names used in the resulting DataFrame.
score_columns = {
    'precision': 'precision',
    'recall': 'recall',
    'F1rp': 'f1rp',
    'recovery': 'recovery',
    'relevance': 'relevance',
    'F1rr': 'f1rr',
    'F1rprr': 'f1rprr',
}
scores = ModulesComparison(ModulesA, ModulesB, allgenes).score(None)
scores_list = [tuple(scores[score_key] for score_key in score_columns)]
scores_method = pd.DataFrame(scores_list, columns=list(score_columns.values()))

In [24]:
# Display the scores obtained with the gene list restricted to genes of the
# known modules.
scores_method

Unnamed: 0,precision,recall,f1rp,recovery,relevance,f1rr,f1rprr
0,0.030314,0.085846,0.044806,0.259555,0.248805,0.254066,0.076178
