In [1]:
# To detect 'type 2' models, (models with low reaction accuracy.. due to bad species naming)
import libsbml
import numpy as np
import os
import pickle
import pandas as pd
import sys
import matplotlib.pyplot as plt
%matplotlib inline  

PROJ_DIR = "/Users/woosubs/Desktop/AutomateAnnotation/AnnotationRecommender/"
MOD_DIR = os.path.join(PROJ_DIR, "annotation_recommender")
sys.path.append(MOD_DIR)

BIOMD_12 = 'BIOMD0000000012.xml'
BASE_DIR = '/Users/woosubs/Desktop/AutomateAnnotation/'
DATA_DIR = os.path.join(BASE_DIR, "DATA")
ALGO_DIR = os.path.join(DATA_DIR, "algo")
CHEBI_DIR = os.path.join(DATA_DIR, "chebi")
RHEA_DIR = os.path.join(DATA_DIR, "rhea")
BIOMODEL_DIR = os.path.join(DATA_DIR, "biomodels/curated_biomodels_31mar2021")
BIGG_DIR = '/Users/woosubs/Desktop/AutomateAnnotation/DATA/bigg'
ecoli_fpath = os.path.join(BIGG_DIR, "e_coli_core.xml")


from annotation_recommender import species_annotation as sa
from annotation_recommender import reaction_annotation as ra
from annotation_recommender import constants as cn
from annotation_recommender import iterator as it
from annotation_recommender import tools

# chebi to shortened formula
with open(os.path.join(CHEBI_DIR, 'chebi_shortened_formula_30apr2022.pickle'), 'rb') as f:
  ref_shortened_chebi_to_formula = pickle.load(f)
# shortened formula to chebi
with open(os.path.join(CHEBI_DIR, 'shortened_formula_to_chebis_20jul2022.pickle'), 'rb') as f:
  ref_shortened_formula_to_chebi = pickle.load(f)

with open(os.path.join(CHEBI_DIR, 'chebi_synonyms.pickle'), 'rb') as f:
  chebi_synonyms = pickle.load(f)
chebi_low_synonyms = dict()
for one_k in chebi_synonyms.keys():
  chebi_low_synonyms[one_k] = list(set([val.lower() for val in chebi_synonyms[one_k]]))

with open(os.path.join(RHEA_DIR, 'kegg2rhea_master.pickle'), 'rb') as handle:
  ref_kegg2rhea_master = pickle.load(handle)
with open(os.path.join(RHEA_DIR, 'kegg2rhea_bi.pickle'), 'rb') as handle:
  ref_kegg2rhea_bi = pickle.load(handle)

# load reference matrix
with open(os.path.join(ALGO_DIR, 'binary_ref_df.pickle'), 'rb') as handle:
    ref_mat = pickle.load(handle)
# check its shape
print(ref_mat.shape)

(13651, 3790)


In [2]:
df = pd.read_csv("eval_species_annotation.csv", index_col=0)

In [3]:
len(np.unique(df['model']))

47

In [4]:
print(df.shape)
df.head()

(1699, 7)


Unnamed: 0,model,species_id,species_name,len_word,match_score,num_cands,accuracy
0,BIOMD0000000172.xml,GLCo,glc(ext),8,0.5,15,0
1,BIOMD0000000172.xml,GLCi,glc(int),8,0.5,20,0
2,BIOMD0000000172.xml,ATP,atp,3,1.0,2,1
3,BIOMD0000000172.xml,G6P,glu6p,5,0.8,1,1
4,BIOMD0000000172.xml,ADP,adp,3,1.0,3,1


In [5]:
X = df[['len_word', 'match_score', 'num_cands']]
y = df['accuracy']

In [6]:
from sklearn.linear_model import LogisticRegressionCV

In [7]:
df_0 = df[df['accuracy']==0] 
df_1 = df[df['accuracy']==1]

clf_res = []
for _ in range(10):
  df_sel_comb = pd.concat([df_0, df_1.sample(n=df_0.shape[0], replace=False)], axis=0, join='inner')
  X = df_sel_comb[['match_score', 'num_cands']]
  y = df_sel_comb['accuracy']
  clf = LogisticRegressionCV(cv=10, random_state=0, class_weight='balanced').fit(X, y)
  clf_res.append(clf.score(X, y))
print(np.mean(clf_res))

0.8280201342281879


In [8]:
for idx, one_val in enumerate(clf.coef_[0]):
  print("Coefficient of %s: %.02f" % (['match_score', 'num_cands'][idx], one_val))

Coefficient of match_score: 8.46
Coefficient of num_cands: 0.00


In [9]:
clf.intercept_

array([-7.08934583])

In [10]:
clf.predict(X)

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1,
       0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0,
       1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0,

In [11]:
# re-run using statsmodels;
import statsmodels.api as sm

df_sel_comb = pd.concat([df_0, df_1.sample(n=df_0.shape[0], replace=False)], axis=0, join='inner')
X = df_sel_comb[['len_word', 'match_score', 'num_cands']]
y = df_sel_comb['accuracy']
log_reg = sm.Logit(y, X).fit()

Optimization terminated successfully.
         Current function value: 0.600155
         Iterations 8


In [12]:
print(log_reg.summary())

                           Logit Regression Results                           
Dep. Variable:               accuracy   No. Observations:                  596
Model:                          Logit   Df Residuals:                      593
Method:                           MLE   Df Model:                            2
Date:                Tue, 23 Aug 2022   Pseudo R-squ.:                  0.1342
Time:                        08:26:59   Log-Likelihood:                -357.69
converged:                       True   LL-Null:                       -413.12
Covariance Type:            nonrobust   LLR p-value:                 8.508e-25
                  coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------
len_word       -0.0075      0.008     -0.883      0.377      -0.024       0.009
match_score     1.0623      0.181      5.877      0.000       0.708       1.417
num_cands      -0.1633      0.026     -6.213    

In [None]:
# TODO: Now, using the cv(s), figure out models that 'could' be evaluated by this algorithm
