In [1]:
# Try to use char2vec to predict species annotation
# Main goal is to speed up the process
import collections
import compress_pickle
import itertools
import libsbml
import numpy as np
import os
import pickle
import pandas as pd
import re
import sys
import matplotlib.pyplot as plt
%matplotlib inline  

BIOMD_12 = 'BIOMD0000000012.xml'
BASE_DIR = '/Users/woosubs/Desktop/AutomateAnnotation/'
DATA_DIR = os.path.join(BASE_DIR, "DATA")
ALGO_DIR = os.path.join(DATA_DIR, "algo")
CHEBI_DIR = os.path.join(DATA_DIR, "chebi")
RHEA_DIR = os.path.join(DATA_DIR, "rhea")
BIOMODEL_DIR = os.path.join(DATA_DIR, "biomodels/curated_biomodels_31mar2021")
BIGG_DIR = '/Users/woosubs/Desktop/AutomateAnnotation/DATA/bigg'

# dir for alternative methods for species
ALT_DIR = '/Users/woosubs/Desktop/AutomateAnnotation/AMAS_suppl/alt_methods_species_data'
ecoli_fpath = os.path.join(BIGG_DIR, "e_coli_core.xml")

PROJ_DIR = os.path.join(os.getcwd(), os.pardir)
AMAS_DIR = os.path.join(PROJ_DIR, "AMAS")
sys.path.append(PROJ_DIR)

from AMAS import species_annotation as sa
from AMAS import reaction_annotation as ra
from AMAS import recommender
from AMAS import constants as cn
from AMAS import iterator as it
from AMAS import tools

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [2]:
with open(os.path.join(cn.REF_DIR, 'chebi_low_synonyms_comp.lzma'), 'rb') as f:
  chebi_low_synonyms = compress_pickle.load(f)

In [3]:
all_chars = set(itertools.chain(*[itertools.chain(*chebi_low_synonyms[val]) for val in chebi_low_synonyms]))

In [4]:
def getCountOfIndividualCharacters(inp_str):
  """
  Get a list of characters
  between a-z and 0-9. 
  
  Parameters
  ----------
  inp_str: str
  
  Returns
  -------
  : list
  """
  return collections.Counter(itertools.chain(*re.findall('[a-z0-9]+', inp_str)))

In [5]:
def getTupleFromDict(inp_dict):
  """
  Get a list of tuples,
  where each tuple has an item and a key
  from a dictionary. 
  e.g.) {'key': 'x', 'y'}
  will be transofrmed as 
  [('x', 'key'), ('y', 'key')]
   
  Parameters
  ----------
  inp_dict: dict
  
  Returns
  -------
  : list-tuple
  """
  res_list = []
  for one_k in inp_dict.keys():
    one_itm = inp_dict[one_k]
    res_list.append([(val, one_k) for val in one_itm])
  return list(itertools.chain(*res_list))

In [6]:
chebi_str_tups = getTupleFromDict(chebi_low_synonyms)
chebi_count_tups = [(getCountOfIndividualCharacters(val[0]), val[1]) for val in chebi_str_tups]

In [7]:
chebi_str_tups[:3]

[('l-norepinephrine', 'CHEBI:18357'),
 ('norepinefrina', 'CHEBI:18357'),
 ('norepinephrine', 'CHEBI:18357')]

In [144]:
# with open(os.path.join(ALT_DIR, "chebi_name_id_tuples.pickle"), "wb") as f:   #Pickling
#   pickle.dump(chebi_str_tups, f)

In [8]:
col_vals = [chr(val) for val in range(ord('a'), ord('z')+1)] + [str(val) for val in range(0, 10)]
print(col_vals)

['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9']


In [9]:
nref_mat = pd.DataFrame(0, columns=col_vals,
                       index=range(0, len(chebi_count_tups)))

In [10]:
for idx, itm in enumerate(chebi_count_tups):
  for one_k in itm[0]:
    nref_mat.loc[idx, one_k] = itm[0][one_k]

In [12]:
print(nref_mat.shape)
nref_mat

(431024, 36)


Unnamed: 0,a,b,c,d,e,f,g,h,i,j,...,0,1,2,3,4,5,6,7,8,9
0,0,0,0,0,3,0,0,1,2,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,2,1,0,0,2,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,3,0,0,1,2,0,...,0,0,0,0,0,0,0,0,0,0
3,2,0,0,1,2,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,2,0,0,1,2,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
431019,10,1,3,2,9,0,0,7,3,0,...,0,5,10,2,3,2,1,0,1,0
431020,10,4,1,5,9,2,0,7,4,0,...,0,4,3,5,4,2,1,0,0,2
431021,9,2,4,2,8,1,0,6,3,0,...,0,4,9,3,3,2,1,0,1,0
431022,8,1,4,6,9,1,0,7,8,0,...,3,12,7,3,2,1,1,1,1,2


In [13]:
# nref_mat.to_csv(os.path.join(ALT_DIR, 'charcount_df.csv'), index=False)

In [124]:
# dum_df = pd.read_csv(os.path.join(ALT_DIR, 'charcount_df.csv'))
# dum_df

In [14]:
chebi_low_synonyms['CHEBI:99995']

['2-[(2s,4as,12as)-5-methyl-6-oxo-8-[(1-oxo-2-phenylethyl)amino]-2,3,4,4a,12,12a-hexahydropyrano[2,3-c][1,5]benzoxazocin-2-yl]-n-(2-methoxyethyl)acetamide']

In [46]:
# query_mat = pd.DataFrame(0, index=col_vals, columns=['one_spec'])
# one_char = 'glucose'
# one_count = getCountOfIndividualChracters(one_char)
# for one_char in one_count:
#   query_mat.loc[one_char, 'one_spec'] = one_count[one_char]

In [16]:
div_col = nref_mat.apply(lambda row : np.sqrt(np.sum([val**2 for val in row])), axis = 1)
norm_df = nref_mat.divide(div_col, axis=0)

In [18]:
div_col

0          5.744563
1          5.000000
2          5.656854
3          5.000000
4          5.477226
            ...    
431019    26.627054
431020    29.274562
431021    26.608269
431022    31.080541
431023    23.173260
Length: 431024, dtype: float64

In [19]:
# to make sure if this was done well
norm_df.iloc[:2, :].apply(lambda row : np.sqrt(np.sum([val**2 for val in row])), axis = 1)

0    1.0
1    1.0
dtype: float64

In [20]:
# norm_df.to_csv(os.path.join(ALT_DIR, 'charcount_df_normalized.csv'), index=False)

In [None]:
# Something like below; last two columns to be synonyms & chebis

In [24]:
import copy

In [25]:
comb_df = copy.deepcopy(norm_df)

comb_df['synonym'] = [val[0] for val in chebi_str_tups]
comb_df['chebi'] = [val[1] for val in chebi_str_tups]

In [26]:
comb_df.head()

Unnamed: 0,a,b,c,d,e,f,g,h,i,j,...,2,3,4,5,6,7,8,9,synonym,chebi
0,0.0,0.0,0.0,0.0,0.522233,0.0,0.0,0.174078,0.348155,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,l-norepinephrine,CHEBI:18357
1,0.2,0.0,0.0,0.0,0.4,0.2,0.0,0.0,0.4,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,norepinefrina,CHEBI:18357
2,0.0,0.0,0.0,0.0,0.53033,0.0,0.0,0.176777,0.353553,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,norepinephrine,CHEBI:18357
3,0.4,0.0,0.0,0.2,0.4,0.0,0.0,0.0,0.2,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,noradrenaline,CHEBI:18357
4,0.365148,0.0,0.0,0.182574,0.365148,0.0,0.0,0.0,0.182574,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,(r)-noradrenaline,CHEBI:18357


In [27]:
compress_pickle.dump(comb_df,
                     os.path.join(cn.REF_DIR, 'charcount_df_scaled.lzma'), 
                     compression="lzma", set_default_extension=False)

In [140]:
# pd.read_csv(os.path.join(ALT_DIR, 'charcount_df_normalized.csv'))

Unnamed: 0,a,b,c,d,e,f,g,h,i,j,...,0,1,2,3,4,5,6,7,8,9
0,0.000000,0.000000,0.000000,0.000000,0.530330,0.000000,0.0,0.176777,0.353553,0.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1,0.200000,0.000000,0.000000,0.000000,0.400000,0.200000,0.0,0.000000,0.400000,0.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
2,0.277350,0.000000,0.000000,0.000000,0.554700,0.000000,0.0,0.000000,0.000000,0.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
3,0.000000,0.000000,0.000000,0.000000,0.371391,0.000000,0.0,0.185695,0.371391,0.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
4,0.113228,0.113228,0.000000,0.226455,0.452911,0.000000,0.0,0.226455,0.226455,0.0,...,0.000000,0.226455,0.226455,0.000000,0.113228,0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
466443,0.375558,0.037556,0.112667,0.075112,0.338002,0.000000,0.0,0.262891,0.112667,0.0,...,0.000000,0.187779,0.375558,0.075112,0.112667,0.075112,0.037556,0.000000,0.037556,0.000000
466444,0.341593,0.136637,0.034159,0.170797,0.307434,0.068319,0.0,0.239115,0.136637,0.0,...,0.000000,0.136637,0.102478,0.170797,0.136637,0.068319,0.034159,0.000000,0.000000,0.068319
466445,0.338241,0.075165,0.150329,0.075165,0.300658,0.037582,0.0,0.225494,0.112747,0.0,...,0.000000,0.150329,0.338241,0.112747,0.112747,0.075165,0.037582,0.000000,0.037582,0.000000
466446,0.257396,0.032174,0.128698,0.193047,0.289570,0.032174,0.0,0.225221,0.257396,0.0,...,0.096523,0.386094,0.225221,0.096523,0.064349,0.032174,0.032174,0.032174,0.032174,0.064349


In [49]:
div_val = np.sqrt(np.sum([val**2 for val in query_mat['one_spec']]))
print(div_val)
norm_query = query_mat.divide(div_val)

2.6457513110645907


In [112]:
multi_mat = norm_df.dot(norm_query)

In [113]:
max_val = multi_mat['one_spec'].max()
print(max_val)

0.9999999999999998


In [114]:
multi_mat[multi_mat==1.0].index

RangeIndex(start=0, stop=466448, step=1)

In [115]:
cand_index = multi_mat[abs(multi_mat['one_spec']-max_val)<0.00001].index

In [116]:
cands = [chebi_count_tups[val] for val in cand_index]

In [117]:
cands

[(Counter({'g': 1, 'l': 1, 'u': 1, 'c': 1, 'o': 1, 's': 1, 'e': 1}),
  'CHEBI:4167'),
 (Counter({'g': 1, 'l': 1, 'u': 1, 'c': 1, 'o': 1, 's': 1, 'e': 1}),
  'CHEBI:42758'),
 (Counter({'g': 1, 'l': 1, 'u': 1, 'c': 1, 'o': 1, 's': 1, 'e': 1}),
  'CHEBI:17234')]