In [30]:
import bioservices
import json

from cache import kegg_cached_reqest
from helpers import print_percent_done

In [31]:
# Shared KEGG client: the helpers below call k.find and k.get on it.
k = bioservices.kegg.KEGG()
# Parser applied to the responses fetched in get_ec_data.
parser = bioservices.kegg.KEGGParser()

# This file is WIP still:
### TODO:
- Convert the enzymes to SMILES strings
- Allow fetching data from KEGG for ECs that are in BRENDA (or other databases)
    - perhaps a master list of all flavoenzymes can be made first and then iterated over
        

### QUESTIONS:
- SMILES string
    - https://www.genome.jp/tools/simcomp/

### Configurables
Edit the following options before running the scraper

In [32]:
# Passed through to kegg_cached_reqest by get_ec_data.
useCaching = True # setting this parameter to False will fetch new data from server, instead of using cache
# Search terms for the keyword-based ID lookup (get_ids_for_keywords).
keywords = ['FAD', 'FMN', 'flavin', 'flavoenzyme']
# read_past_data reads from import_file by default and scrape_kegg writes to
# export_file. Both currently name the same JSON file, so a run updates it in place.
import_file = "export/kegg.json"
export_file = "export/kegg.json"

### Helper functions

In [60]:
def read_past_data(path=import_file):
    """Load previously exported results from a JSON file.

    Args:
        path: Location of the JSON file; defaults to ``import_file``.

    Returns:
        The decoded JSON content, or an empty dict when the file cannot be
        opened or does not contain valid JSON.
    """
    try:
        with open(path) as json_file:
            return json.load(json_file)
    # OSError covers a missing or unreadable file; ValueError covers malformed
    # JSON (json.JSONDecodeError) and undecodable bytes (UnicodeDecodeError).
    # A bare ``except`` would additionally hide KeyboardInterrupt and genuine
    # programming errors, so only these expected failures fall back to {}.
    except (OSError, ValueError):
        return {}

In [61]:
def get_ids(keyword):
    """Search KEGG's 'enzyme' database for ``keyword`` and return the hit IDs.

    The response of ``k.find`` is split into lines; for every non-empty line
    the text before the first tab is taken as the ID.

    Returns:
        A set with one ID per non-empty response line.
    """
    response_text = k.find(database='enzyme', query=keyword)
    entry_ids = set()
    for line in response_text.split('\n'):
        if not line:
            # Blank lines (e.g. after a trailing newline) carry no entry.
            continue
        entry_ids.add(line.split('\t')[0])
    return entry_ids

In [62]:
def get_ids_for_keywords(keywords):
    """Return the union of the ID sets that ``get_ids`` yields for each keyword.

    Args:
        keywords: Iterable of search terms; each one is looked up in order.

    Returns:
        A set containing every ID found for at least one keyword (empty when
        ``keywords`` is empty).
    """
    return set().union(*(get_ids(keyword) for keyword in keywords))

In [63]:
def get_ec_data(id):
    """Fetch one KEGG entry through the request cache and return it parsed.

    ``id`` is forwarded as the ``ec`` argument of ``kegg_cached_reqest`` with
    ``k.get`` as the request function and the module-level ``useCaching`` flag;
    the response is then handed to ``parser.parse``.
    """
    return parser.parse(
        kegg_cached_reqest(
            request_name=f'kegg_ec_request',
            ec=id,
            request_fn=k.get,
            useCaching=useCaching,
        )
    )

In [64]:
def get_all_data(ids, previous_json, verbose=False):
    """Fetch KEGG data for ``ids`` and merge it with earlier results.

    Args:
        ids: Sized iterable of KEGG enzyme IDs; a leading ``'ec:'`` prefix is
            stripped to obtain the EC number.
        previous_json: Dict of earlier results keyed by EC number. It is
            shallow-copied, so the caller's dict is left untouched.
        verbose: When True, report progress after every fetched entry.

    Returns:
        A new dict holding the entries of ``previous_json`` plus one entry per
        ID. Each fetched entry gets the extra fields ``'KEGG_ID'`` (the ID as
        passed in) and ``'EC_NUMBER'``; an existing entry with the same EC
        number is replaced.
    """
    # Copy instead of aliasing so the argument is not mutated behind the
    # caller's back.
    data_dict = dict(previous_json)
    total = len(ids)
    for index, kegg_id in enumerate(ids):
        ec_data = get_ec_data(kegg_id)
        ec_number = kegg_id.replace('ec:', '')

        ec_data['KEGG_ID'] = kegg_id
        ec_data['EC_NUMBER'] = ec_number
        # NOTE: records are keyed by EC number (not by SYSNAME).
        data_dict[ec_number] = ec_data
        if verbose:
            print_percent_done(index=index, length=total)
    return data_dict

In [82]:
def scrape_kegg(ids=None):
    """Fetch KEGG entries missing from the stored results and save the merge.

    Args:
        ids: Optional iterable of KEGG enzyme IDs to consider, for example
            ``get_ids_for_keywords(keywords)``. When omitted, the candidates
            are ``brenda_blacklist`` minus ``benda_not_found``; both names
            must already exist in the notebook namespace before the call.

    Side effects:
        Prints progress. If at least one candidate is not yet stored,
        fetches it and overwrites ``export_file`` with the merged results.
    """
    print('Kegg scraping script started...')
    # Reading past results from KEGG
    previous_json = read_past_data()
    print(f'1. Successfully read previous json data, that has total of {len(previous_json)} records')
    prev_ids = {enzyme['KEGG_ID'] for enzyme in previous_json.values()}

    # Candidate IDs: explicit argument if given, otherwise the notebook globals.
    if ids is None:
        new_ids = set(brenda_blacklist) - set(benda_not_found)
    else:
        new_ids = set(ids)

    # Keep only the IDs that are missing from the past results
    all_ids = new_ids - prev_ids

    if not all_ids:
        print("[i] Doesn't look like there are any new flavins on KEGG!")
        return

    print('2. Following potential flavins are missing from past results:')
    # Plain loop: a list comprehension used only for printing builds a
    # throw-away list of None values.
    for ec in all_ids:
        print(f'{ec}', end=" | ")

    # Scraping the data
    print('\n3. Fetching the data')
    flavins = get_all_data(all_ids, previous_json, verbose=True)

    # Serialise before opening the file. Opening with 'w' truncates it at once,
    # so an encoding error inside json.dump would leave a damaged export, and
    # with the current settings that is the same file past results are read from.
    payload = json.dumps(flavins)
    with open(export_file, 'w') as outfile:
        outfile.write(payload)
    print(f'\nSuccessfully written out {len(all_ids)} results to "{export_file}"')

### Running the program

In [83]:
# Run the scraper. Called without arguments, scrape_kegg() reads the globals
# `brenda_blacklist` and `benda_not_found`, so both must be defined in the
# kernel before this cell is executed.
scrape_kegg()

Kegg scraping script started...
1. Successfully read previous json data, that has total of 766 records
2. Following potential flavins are missing from past results:
2.7.7.95 | 1.6.8.2 | 1.14.99.40 | 1.14.99.7 | 3.4.24.15 | 1.6.8.1 | 1.7.99.5 | 1.13.11.32 | 7.2.1.1 | 2.5.1.77 | 3.4.24.59 | 1.1.1.205 | 3.4.22.3 | 1.5.1.29 | 3.4.24.69 | 3.4.21.109 | 3.4.16.4 | 3.4.21.1 | 1.14.19.7 | 3.4.22.61 | 1.1.99.15 | 3.4.22.56 | 1.3.3.6 | 
3. Fetching the data
Getting cached data...
kegg_ec_request?ec=2.7.7.95


----> Progress: ░░░░░░░░░ 	4% done
Getting cached data...
kegg_ec_request?ec=1.6.8.2


----> Progress: ░░░░░░░░░ 	9% done
Getting cached data...
kegg_ec_request?ec=1.14.99.40


----> Progress: █░░░░░░░░ 	13% done
Getting cached data...
kegg_ec_request?ec=1.14.99.7


----> Progress: █░░░░░░░░ 	17% done
Getting cached data...
kegg_ec_request?ec=3.4.24.15


----> Progress: ██░░░░░░░ 	22% done
Getting cached data...
kegg_ec_request?ec=1.6.8.1


----> Progress: ██░░░░░░░ 	26% done
Getting cached 

In [12]:
# Show the EC numbers of stored entries whose SYSNAME is missing or empty
kegg = read_past_data()
sysnames = [ec for ec in kegg if not kegg[ec].get('SYSNAME')]
sysnames

['3.4.22.61']

### Seeing how many brenda flavins are in Kegg

In [69]:
# Probe KEGG for every EC number in the BRENDA export and collect the ones
# whose lookup fails.
brenda = read_past_data('export/brenda.json')
brenda_ecs = sorted(brenda.keys())
# brenda_flavins = get_all_data(brenda_ecs, previous_json = {}, verbose=True)
benda_not_found = []
for brenda_ec in brenda_ecs:
    try:
        get_ec_data(brenda_ec)
    # Any failure of the lookup is recorded as "not found". Catching Exception
    # rather than using a bare except lets a kernel interrupt stop the loop
    # instead of being counted as a missing entry.
    except Exception:
        benda_not_found.append(brenda_ec)

Getting cached data...
kegg_ec_request?ec=1.1.1.1
Getting cached data...
kegg_ec_request?ec=1.1.1.125
Getting cached data...
kegg_ec_request?ec=1.1.1.181
Getting cached data...
kegg_ec_request?ec=1.1.1.184
Getting cached data...
kegg_ec_request?ec=1.1.1.189
Getting cached data...
kegg_ec_request?ec=1.1.1.193
Getting cached data...
kegg_ec_request?ec=1.1.1.215
Getting cached data...
kegg_ec_request?ec=1.1.1.216
Getting cached data...
kegg_ec_request?ec=1.1.1.217
Getting cached data...
kegg_ec_request?ec=1.1.1.229
Getting cached data...
kegg_ec_request?ec=1.1.1.25
Getting cached data...
kegg_ec_request?ec=1.1.1.267
Getting cached data...
kegg_ec_request?ec=1.1.1.27
Getting cached data...
kegg_ec_request?ec=1.1.1.28
Getting cached data...
kegg_ec_request?ec=1.1.1.284
Getting cached data...
kegg_ec_request?ec=1.1.1.289
Getting cached data...
kegg_ec_request?ec=1.1.1.306
Getting cached data...
kegg_ec_request?ec=1.1.1.328
Getting cached data...
kegg_ec_request?ec=1.1.1.35
Getting cached dat

In [70]:
# BRENDA EC numbers for which get_ec_data raised in the probing loop.
benda_not_found

['1.1.1.B3',
 '1.1.1.B58',
 '1.1.99.B1',
 '1.1.99.B3',
 '1.1.99.B9',
 '1.14.13.B1',
 '1.14.19.B8',
 '1.14.19.B9',
 '1.3.99.B2',
 '1.4.99.B3',
 '1.5.99.B2',
 '1.5.99.B4',
 '1.7.1.B1',
 '1.7.1.B3',
 '1.8.1.B4',
 '2.1.1.B117',
 '4.2.1.B25',
 '4.2.1.B26',
 '5.3.3.B2']

In [24]:
# Sorted keys of `kegg`, the stored results loaded via read_past_data() in the SYSNAME check cell.
sorted(kegg.keys())

['1.1.2.3',
 '1.1.5.3',
 '1.1.5.4',
 '1.1.5.9',
 '1.1.99.27',
 '1.12.98.1',
 '1.13.11.79',
 '1.13.12.17',
 '1.14.13.111',
 '1.14.13.113',
 '1.14.13.148',
 '1.14.13.32',
 '1.14.13.8',
 '1.14.14.20',
 '1.14.14.27',
 '1.14.14.3',
 '1.14.14.34',
 '1.14.14.5',
 '1.14.14.8',
 '1.14.14.9',
 '1.14.19.22',
 '1.14.19.23',
 '1.14.19.25',
 '1.14.19.3',
 '1.14.19.30',
 '1.14.19.31',
 '1.14.19.33',
 '1.14.19.34',
 '1.14.19.35',
 '1.14.19.42',
 '1.14.19.43',
 '1.14.19.44',
 '1.14.99.66',
 '1.2.99.7',
 '1.2.99.8',
 '1.3.8.2',
 '1.4.3.4',
 '1.5.1.20',
 '1.5.1.30',
 '1.5.1.36',
 '1.5.1.37',
 '1.5.1.38',
 '1.5.1.39',
 '1.5.1.40',
 '1.5.1.41',
 '1.5.1.42',
 '1.5.1.45',
 '1.6.2.4',
 '1.7.1.17',
 '2.1.1.148',
 '2.1.1.343',
 '2.1.1.349',
 '2.1.1.74',
 '2.5.1.129',
 '2.5.1.9',
 '2.6.1.114',
 '2.7.1.161',
 '2.7.1.180',
 '2.7.1.26',
 '2.7.1.42',
 '2.7.10.2',
 '2.7.7.2',
 '2.7.8.28',
 '3.1.3.102',
 '3.4.22.61',
 '3.5.99.1',
 '3.6.1.18',
 '4.3.1.32',
 '4.3.3.5',
 '4.6.1.15',
 '6.2.1.2',
 '6.2.1.41',
 '6.2.1.42',


In [73]:
# Inspect the parsed entry for 3.4.22.61, the EC number listed by the SYSNAME check.
get_ec_data('3.4.22.61')

Getting cached data...
kegg_ec_request?ec=3.4.22.61


{'ENTRY': 'EC 3.4.22.61                Enzyme',
 'NAME': ['caspase-8;',
  'FLICE, FADD-like ICE;',
  'MACH;',
  'MORT1-associated CED-3 homolog;',
  'Mch5;',
  'mammalian Ced-3 homolog 5;',
  'CASP-8;',
  'ICE-like apoptotic protease 5;',
  'FADD-homologous ICE/CED-3-like protease;',
  'apoptotic cysteine protease;',
  'apoptotic protease Mch-5;',
  'CAP4'],
 'CLASS': 'Hydrolases;             Acting on peptide bonds (peptidases);             Cysteine endopeptidases',
 'REACTION': ['Strict',
  'requirement',
  'for',
  'Asp',
  'at',
  'position',
  'P1',
  'and',
  'has',
  'a',
  'preferred',
  'cleavage',
  'sequence',
  'of',
  '(Leu/Asp/Val)-Glu-Thr-Asp!(Gly/Ser/Ala)'],
 'COMMENT': ["Caspase-8 is an initiator caspase, as are caspase-2 (EC 3.4.22.55), caspase-9 (EC 3.4.22.62) and caspase-10 (EC 3.4.22.63) [1]. Caspase-8 is the apical activator of the extrinsic (death receptor) apoptosis pathway, triggered by death receptor ligation [2]. It contains two tandem death effector domains 

In [93]:
# Fetch the KEGG entry "cpd:C00156" and parse it with the client's own parse method.
CPD = k.parse(k.get("cpd:C00156"))

In [94]:
# List the fields available in the parsed compound entry.
CPD.keys()

dict_keys(['ENTRY', 'NAME', 'FORMULA', 'EXACT_MASS', 'MOL_WEIGHT', 'REACTION', 'PATHWAY', 'MODULE', 'ENZYME', 'DBLINKS', 'ATOM', 'BOND'])

In [95]:
# Show the FORMULA field of the parsed compound entry.
CPD['FORMULA']

'C7H6O3'

In [97]:
import requests

In [105]:
# Download the record for one KEGG compound from the DBGET endpoint and print
# the raw response body (the `-f+m` form of the URL yields the atom/bond table
# shown in the output).
cpd = 'cpd:C00156'
url = f'https://www.genome.jp/dbget-bin/www_bget?-f+m+compound+{cpd}'
# Without a timeout, requests waits indefinitely on an unresponsive server.
resp = requests.get(url, timeout=30)
# Fail loudly on HTTP errors instead of printing an error page as if it were data.
resp.raise_for_status()
print(resp.text)

 
 
 
 10 10  0  0  0  0  0  0  0  0999 V2000
   25.6200  -14.7656    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0
   25.6137  -13.3697    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0
   24.4098  -15.4637    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0
   26.8430  -15.4637    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0
   24.4098  -12.6779    0.0000 O   0  0  0  0  0  0  0  0  0  0  0  0
   26.8174  -12.6716    0.0000 O   0  0  0  0  0  0  0  0  0  0  0  0
   24.4098  -16.8659    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0
   26.8430  -16.8659    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0
   25.6200  -17.5831    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0
   25.6137  -18.9727    0.0000 O   0  0  0  0  0  0  0  0  0  0  0  0
  1  2  1  0     0  0
  1  3  1  0     0  0
  1  4  2  0     0  0
  2  5  1  0     0  0
  2  6  2  0     0  0
  3  7  2  0     0  0
  4  8  1  0     0  0
  7  9  1  0     0  0
  9 10  1  0     0  0
  8  9  2  0     0  0
M  END

