In [1]:
# Try to use char2vec to predict species annotation
# Main goal is to speed up the process
import collections
import compress_pickle
import itertools
import libsbml
import numpy as np
import os
import pickle
import pandas as pd
import sys
import matplotlib.pyplot as plt
%matplotlib inline  

BIOMD_12 = 'BIOMD0000000012.xml'
BASE_DIR = '/Users/woosubs/Desktop/AutomateAnnotation/'
DATA_DIR = os.path.join(BASE_DIR, "DATA")
ALGO_DIR = os.path.join(DATA_DIR, "algo")
CHEBI_DIR = os.path.join(DATA_DIR, "chebi")
RHEA_DIR = os.path.join(DATA_DIR, "rhea")
BIOMODEL_DIR = os.path.join(DATA_DIR, "biomodels/curated_biomodels_31mar2021")
BIGG_DIR = '/Users/woosubs/Desktop/AutomateAnnotation/DATA/bigg'
ecoli_fpath = os.path.join(BIGG_DIR, "e_coli_core.xml")

PROJ_DIR = os.path.join(os.getcwd(), os.pardir)
AMAS_DIR = os.path.join(PROJ_DIR, "AMAS")
sys.path.append(PROJ_DIR)

from AMAS import species_annotation as sa
from AMAS import reaction_annotation as ra
from AMAS import recommender
from AMAS import constants as cn
from AMAS import iterator as it
from AMAS import tools

In [2]:
with open(os.path.join(cn.REF_DIR, 'chebi_low_synonyms_comp.lzma'), 'rb') as f:
  chebi_low_synonyms = compress_pickle.load(f)

In [3]:
import re

In [4]:
all_chars = set(itertools.chain(*[itertools.chain(*chebi_low_synonyms[val]) for val in chebi_low_synonyms]))

In [16]:
def getCountOfIndividualChracters(inp_str):
  """
  Get a list of characters
  between a-z and 0-9. 
  
  Parameters
  ----------
  inp_str: str
  
  Returns
  -------
  : list
  """
  return collections.Counter(itertools.chain(*re.findall('[a-z0-9]+', inp_str)))

In [17]:
def getTupleFromDict(inp_dict):
  """
  Get a list of tuples,
  where each tuple has an item and a key
  from a dictionary. 
  e.g.) {'key': 'x', 'y'}
  will be transofrmed as 
  [('x', 'key'), ('y', 'key')]
   
  Parameters
  ----------
  inp_dict: dict
  
  Returns
  -------
  : list-tuple
  """
  res_list = []
  for one_k in inp_dict.keys():
    one_itm = inp_dict[one_k]
    res_list.append([(val, one_k) for val in one_itm])
  return list(itertools.chain(*res_list))

In [18]:
chebi_str_tups = getTupleFromDict(chebi_low_synonyms)
chebi_count_tups = [(getCountOfIndividualChracters(val[0]), val[1]) for val in chebi_str_tups]

In [19]:
chebi_count_tups[:3]

[(Counter({'n': 3, 'o': 1, 'r': 2, 'e': 3, 'p': 2, 'i': 2, 'h': 1}),
  'CHEBI:18357'),
 (Counter({'n': 3, 'o': 1, 'r': 2, 'e': 2, 'p': 1, 'i': 2, 'f': 1, 'a': 1}),
  'CHEBI:18357'),
 (Counter({'a': 1, 'r': 2, 't': 1, 'e': 2, 'n': 1, 'o': 1, 'l': 1}),
  'CHEBI:18357')]

In [20]:
col_vals = [chr(val) for val in range(ord('a'), ord('z')+1)] + [str(val) for val in range(0, 10)]
print(col_vals)

['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9']


In [21]:
nref_mat = pd.DataFrame(0, columns=col_vals,
                       index=range(0, len(chebi_count_tups)))

In [22]:
for idx, itm in enumerate(chebi_count_tups):
  for one_k in itm[0]:
    nref_mat.loc[idx, one_k] = itm[0][one_k]

In [106]:
print(nref_mat.shape)
nref_mat

(466448, 36)


Unnamed: 0,a,b,c,d,e,f,g,h,i,j,...,0,1,2,3,4,5,6,7,8,9
0,0,0,0,0,3,0,0,1,2,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,2,1,0,0,2,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,2,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,2,0,0,1,2,0,...,0,0,0,0,0,0,0,0,0,0
4,1,1,0,2,4,0,0,2,2,0,...,0,2,2,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
466443,10,1,3,2,9,0,0,7,3,0,...,0,5,10,2,3,2,1,0,1,0
466444,10,4,1,5,9,2,0,7,4,0,...,0,4,3,5,4,2,1,0,0,2
466445,9,2,4,2,8,1,0,6,3,0,...,0,4,9,3,3,2,1,0,1,0
466446,8,1,4,6,9,1,0,7,8,0,...,3,12,7,3,2,1,1,1,1,2


In [110]:
chebi_low_synonyms['CHEBI:99995']

['2-[(2s,4as,12as)-5-methyl-6-oxo-8-[(1-oxo-2-phenylethyl)amino]-2,3,4,4a,12,12a-hexahydropyrano[2,3-c][1,5]benzoxazocin-2-yl]-n-(2-methoxyethyl)acetamide']

In [109]:
chebi_count_tups[466443]

(Counter({'2': 10,
          's': 3,
          '4': 3,
          'a': 10,
          '1': 5,
          '5': 2,
          'm': 4,
          'e': 9,
          't': 5,
          'h': 7,
          'y': 8,
          'l': 5,
          '6': 1,
          'o': 10,
          'x': 5,
          '8': 1,
          'p': 2,
          'n': 6,
          'i': 3,
          '3': 2,
          'd': 2,
          'r': 2,
          'c': 3,
          'b': 1,
          'z': 2}),
 'CHEBI:99995')

In [46]:
query_mat = pd.DataFrame(0, index=col_vals, columns=['one_spec'])
one_char = 'glucose'
one_count = getCountOfIndividualChracters(one_char)
for one_char in one_count:
  query_mat.loc[one_char, 'one_spec'] = one_count[one_char]

In [34]:
div_col = nref_mat.apply(lambda row : np.sqrt(np.sum([val**2 for val in row])), axis = 1)
norm_df = nref_mat.divide(div_col, axis=0)

In [49]:
div_val = np.sqrt(np.sum([val**2 for val in query_mat['one_spec']]))
print(div_val)
norm_query = query_mat.divide(div_val)

2.6457513110645907


In [112]:
multi_mat = norm_df.dot(norm_query)

In [113]:
max_val = multi_mat['one_spec'].max()
print(max_val)

0.9999999999999998


In [114]:
multi_mat[multi_mat==1.0].index

RangeIndex(start=0, stop=466448, step=1)

In [115]:
cand_index = multi_mat[abs(multi_mat['one_spec']-max_val)<0.00001].index

In [116]:
cands = [chebi_count_tups[val] for val in cand_index]

In [117]:
cands

[(Counter({'g': 1, 'l': 1, 'u': 1, 'c': 1, 'o': 1, 's': 1, 'e': 1}),
  'CHEBI:4167'),
 (Counter({'g': 1, 'l': 1, 'u': 1, 'c': 1, 'o': 1, 's': 1, 'e': 1}),
  'CHEBI:42758'),
 (Counter({'g': 1, 'l': 1, 'u': 1, 'c': 1, 'o': 1, 's': 1, 'e': 1}),
  'CHEBI:17234')]