In [30]:
import bioservices
import json

from cache import kegg_cached_reqest
from helpers import print_percent_done

In [31]:
k = bioservices.kegg.KEGG()
parser = bioservices.kegg.KEGGParser()

# This file is WIP still:
### TODO:
- Convert the enzymes to a SMILES strings
- Allow to fetch data from Kegg for ECs that are in Brenda (or other databases)
    - perhaps first, a master list of all FlavoEnzymes can be made, then itterated over
        

### QUESTIONS:
- SMILES string
    - https://www.genome.jp/tools/simcomp/

### Configurables
Edit the folling options before running the scraper

In [32]:
useCaching = True # setting this parameter to False will fetch new data from server, instead of using cache
keywords = ['FAD', 'FMN', 'flavin', 'flavoenzyme']
import_file = "export/kegg.json"
export_file = "export/kegg.json"

### Helper functions

In [60]:
def read_past_data(path=import_file):
    try:
        with open(path) as json_file:
            return json.load(json_file)
    except:
        return {}

In [61]:
def get_ids(keyword):
    results = k.find(database='enzyme', query=keyword)
    results_array = results.split('\n')
    ids_array = [i.split('\t')[0] for i in results_array if (i)]
    return set(ids_array)

In [62]:
def get_ids_for_keywords(keywords):
    all_ids_set = set()
    for keyword in keywords:
        kw_ids = get_ids(keyword);
        all_ids_set = all_ids_set | kw_ids
    return all_ids_set

In [63]:
def get_ec_data(id):
    resp = kegg_cached_reqest(request_name=f'kegg_ec_request', ec=id, request_fn=k.get, useCaching=useCaching)
    ec_parse = parser.parse(resp)
    return ec_parse

In [64]:
def get_all_data(ids, previous_json, verbose=False):    
    data_dict = previous_json
    for index,id in enumerate(ids):
        ec_data = get_ec_data(id)
        name = ec_data.get('SYSNAME',id) # if sysname not found, will use ec_number
        ec_number = id.replace('ec:','')
        
        ec_data['KEGG_ID'] = id
        ec_data['EC_NUMBER'] = ec_number
        # WARNING: keys are now EC numbers istead of sysname, but can be changed 
        data_dict[ec_number] = ec_data 
        if verbose: 
            print_percent_done(index=index, length=len(ids))
    return data_dict

In [82]:
def scrape_kegg():
    print(f'Kegg scraping script started...')
    # Reading past results from KEGG
    previous_json = read_past_data()
    print(f'1. Successfully read previous json data, that has total of {len(previous_json)} records')
    prev_ids = {enzyme['KEGG_ID'] for enzyme in list(previous_json.values())}
    
    # Getting IDs of all entries that are missing from past
#     new_ids = get_ids_for_keywords(keywords)
    new_ids = set(brenda_blacklist) - set(benda_not_found)
    
    all_ids = new_ids - prev_ids    
                
    # If new ids have been found, fetch the data
    if len(all_ids) > 0:
        print(f'2. Following potential flavins are missing from past results:')
        [print(f'{ec}', end=" | ") for ec in all_ids]
    
        # Scraping the data
        print(f'\n3. Fetching the data')
        flavins = get_all_data(all_ids, previous_json, verbose=True)
        
        # Writing out the results to the file
        with open(export_file, 'w') as outfile:
            json.dump(flavins, outfile)
        print(f'\nSuccessfully written out {len(all_ids)} results to "{export_file}"')
    else:
        print("[i] Doesn't look like there are any new flavins on KEGG!")

### Running the program

In [None]:
scrape_kegg()

In [26]:
# prints all the ECs of items without SYSNAME
kegg = read_past_data()
sysnames = [ec for (ec, details) in list(kegg.items()) if not details.get('SYSNAME')]
sysnames

['3.4.22.61',
 '1.16.8.1',
 '1.6.5.11',
 '2.7.7.95',
 '1.6.8.2',
 '1.14.99.40',
 '1.14.99.7',
 '3.4.24.15',
 '1.6.8.1',
 '1.7.99.5',
 '1.13.11.32',
 '2.5.1.77',
 '3.4.24.59',
 '3.4.22.3',
 '1.5.1.29',
 '3.4.24.69',
 '3.4.21.109',
 '3.4.16.4',
 '3.4.21.1',
 '1.14.19.7',
 '1.1.99.15',
 '3.4.22.56']

### Seeing how many brenda flavins are in Kegg

In [27]:
brenda = read_past_data('export/brenda.json')
brenda_ecs = sorted(brenda.keys())
# brenda_flavins = get_all_data(brenda_ecs, previous_json = {}, verbose=True)
benda_not_found=[]
for index,id in enumerate(brenda_ecs):
        try:
            ec_data = get_ec_data(id)
        except:
            benda_not_found.append(id)

In [29]:
len(benda_not_found)

781

In [93]:
CPD = k.parse(k.get("cpd:C00156"))

In [94]:
CPD.keys()

dict_keys(['ENTRY', 'NAME', 'FORMULA', 'EXACT_MASS', 'MOL_WEIGHT', 'REACTION', 'PATHWAY', 'MODULE', 'ENZYME', 'DBLINKS', 'ATOM', 'BOND'])

In [95]:
CPD['FORMULA']

'C7H6O3'

In [2]:
import requests

In [105]:
cpd = 'cpd:C00156'
url = f'https://www.genome.jp/dbget-bin/www_bget?-f+m+compound+{cpd}'
resp = requests.get(url)
print(resp.text)

 
 
 
 10 10  0  0  0  0  0  0  0  0999 V2000
   25.6200  -14.7656    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0
   25.6137  -13.3697    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0
   24.4098  -15.4637    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0
   26.8430  -15.4637    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0
   24.4098  -12.6779    0.0000 O   0  0  0  0  0  0  0  0  0  0  0  0
   26.8174  -12.6716    0.0000 O   0  0  0  0  0  0  0  0  0  0  0  0
   24.4098  -16.8659    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0
   26.8430  -16.8659    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0
   25.6200  -17.5831    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0
   25.6137  -18.9727    0.0000 O   0  0  0  0  0  0  0  0  0  0  0  0
  1  2  1  0     0  0
  1  3  1  0     0  0
  1  4  2  0     0  0
  2  5  1  0     0  0
  2  6  2  0     0  0
  3  7  2  0     0  0
  4  8  1  0     0  0
  7  9  1  0     0  0
  9 10  1  0     0  0
  8  9  2  0     0  0
M  END



In [6]:

    
target_db = 'uniprot'
source_db = 'compound'
cpd = 'cpd:C00156'
url = f'http://rest.kegg.jp/conv/{target_db}/{source_db}'
resp = requests.get(url)
print(resp.text)




In [25]:
# perhaps we can use openbable or pybel?
# https://pypi.org/project/openbabel/3.0.0/
# to convert to smiles string?