In [5]:
import bioservices
import json

from cache import kegg_cached_reqest
from helpers import print_percent_done

In [6]:
k = bioservices.kegg.KEGG()
parser = bioservices.kegg.KEGGParser()

### Configurables
Edit the folling options before running the scraper

In [7]:
keywords = ['FAD', 'FMN', 'flavin', 'flavoenzyme']
import_file = "export/kegg.json"
export_file = "export/kegg.json"

### Helper functions

In [23]:
def read_past_data(path=import_file):
    try:
        with open(path) as json_file:
            return json.load(json_file)
    except:
        return {}

In [24]:
def get_ids(keyword):
    results = k.find(database='enzyme', query=keyword)
    results_array = results.split('\n')
    ids_array = [i.split('\t')[0] for i in results_array if (i)]
    return set(ids_array)

In [25]:
def get_ids_for_keywords(keywords):
    all_ids_set = set()
    for keyword in keywords:
        kw_ids = get_ids(keyword);
        all_ids_set = all_ids_set | kw_ids
    return all_ids_set

In [26]:
def get_ec_data(id):
    #     entry = k.get(id)
    resp = kegg_cached_reqest(request_name=f'kegg_ec_request', ec=id, request_fn=k.get)

    ec_parse = parser.parse(resp)
    return ec_parse

In [37]:
def get_all_data(ids, previous_json, verbose=False):    
    data_dict = previous_json
    data_dict = {} # TODO: remove
    for index,id in enumerate(ids):
        ec_data = get_ec_data(id)
        name = ec_data.get('SYSNAME',id) # if sysname not found, will use ec_number
        ec_number = id.replace('ec:','')
        
        ec_data['KEGG_ID'] = id
        ec_data['EC_NUMBER'] = ec_number
        # TODO: this is now ID istead of sysname, but can be changed 
        data_dict[ec_number] = ec_data 
        if verbose: 
            print_percent_done(index=index, length=len(ids))
    return data_dict

In [38]:
def scrape_kegg():
    print(f'Kegg scraping script started...')
    # Reading past results from KEGG
    previous_json = read_past_data()
    print(f'1. Successfully read previous json data, that has total of {len(previous_json)} records')
    prev_ids = {enzyme['KEGG_ID'] for enzyme in list(previous_json.values())}
#     print(list(previous_json.values())[0]['EC'])
    
    # Getting IDs of all entries that are missing from past
    new_ids = get_ids_for_keywords(keywords)
    
    all_ids = new_ids - prev_ids
    all_ids = new_ids; # TODO: remove
    
                
    # If new ids have been found, fetch the data
    if len(all_ids) > 0:
        print(f'2. Following potential flavins are missing from past results:')
        [print(f'{ec}', end=" | ") for ec in all_ids]
    
        # Scraping the data
        print(f'\n3. Fetching the data')
        flavins = get_all_data(all_ids, previous_json, verbose=True)
        
        # Writing out the results to the file
        with open(export_file, 'w') as outfile:
            json.dump(flavins, outfile)
        print(f'\nSuccessfully written out {len(all_ids)} results to "{export_file}"')
    else:
        print("[i] Doesn't look like there are any new flavins on KEGG!")

### Running the program

In [39]:
scrape_kegg()

Kegg scraping script started...
1. Successfully read previous json data, that has total of 78 records
2. Following potential flavins are missing from past results:
ec:1.2.99.7 | ec:6.2.1.41 | ec:1.14.99.66 | ec:1.5.1.42 | ec:6.2.1.50 | ec:2.7.1.26 | ec:4.3.1.32 | ec:1.14.19.35 | ec:1.5.1.45 | ec:1.6.2.4 | ec:1.14.19.33 | ec:2.6.1.114 | ec:1.7.1.17 | ec:2.5.1.129 | ec:1.13.12.17 | ec:1.1.2.3 | ec:2.1.1.74 | ec:3.1.3.102 | ec:1.1.5.3 | ec:1.5.1.36 | ec:2.7.1.161 | ec:2.7.8.28 | ec:1.12.98.1 | ec:1.14.13.8 | ec:6.2.1.51 | ec:1.14.14.20 | ec:4.6.1.15 | ec:1.14.14.8 | ec:1.14.19.44 | ec:1.14.13.111 | ec:1.14.14.5 | ec:1.14.19.3 | ec:1.14.13.148 | ec:2.1.1.343 | ec:1.14.19.42 | ec:1.14.19.30 | ec:3.4.22.61 | ec:1.5.1.39 | ec:6.2.1.57 | ec:1.14.14.34 | ec:1.4.3.4 | ec:2.7.1.42 | ec:1.14.13.32 | ec:1.14.19.23 | ec:1.14.19.34 | ec:1.13.11.79 | ec:2.5.1.9 | ec:1.5.1.38 | ec:1.14.19.43 | ec:1.14.13.113 | ec:1.5.1.41 | ec:6.2.1.2 | ec:1.14.19.25 | ec:1.1.5.9 | ec:1.1.99.27 | ec:1.14.19.22 | ec:1.1

In [40]:
# prints all the ECs of items without SYSNAME
kegg = read_past_data()
sysnames = [ec for (ec, details) in list(kegg.items()) if not details.get('SYSNAME')]
sysnames

['3.4.22.61']

In [17]:
(kegg['(7Z,10Z)-hexadeca-7,10-dienoyl-[glycerolipid],ferredoxin:oxygen oxidoreductase (13,14 cis-dehydrogenating)']).keys()

KeyError: '(7Z,10Z)-hexadeca-7,10-dienoyl-[glycerolipid],ferredoxin:oxygen oxidoreductase (13,14 cis-dehydrogenating)'

In [41]:
# list(kegg.values())[0]
# next(iter())
kegg.keys()

dict_keys(['1.2.99.7', '6.2.1.41', '1.14.99.66', '1.5.1.42', '6.2.1.50', '2.7.1.26', '4.3.1.32', '1.14.19.35', '1.5.1.45', '1.6.2.4', '1.14.19.33', '2.6.1.114', '1.7.1.17', '2.5.1.129', '1.13.12.17', '1.1.2.3', '2.1.1.74', '3.1.3.102', '1.1.5.3', '1.5.1.36', '2.7.1.161', '2.7.8.28', '1.12.98.1', '1.14.13.8', '6.2.1.51', '1.14.14.20', '4.6.1.15', '1.14.14.8', '1.14.19.44', '1.14.13.111', '1.14.14.5', '1.14.19.3', '1.14.13.148', '2.1.1.343', '1.14.19.42', '1.14.19.30', '3.4.22.61', '1.5.1.39', '6.2.1.57', '1.14.14.34', '1.4.3.4', '2.7.1.42', '1.14.13.32', '1.14.19.23', '1.14.19.34', '1.13.11.79', '2.5.1.9', '1.5.1.38', '1.14.19.43', '1.14.13.113', '1.5.1.41', '6.2.1.2', '1.14.19.25', '1.1.5.9', '1.1.99.27', '1.14.19.22', '1.14.14.3', '1.5.1.37', '4.3.3.5', '1.5.1.20', '2.1.1.349', '6.2.1.59', '2.1.1.148', '1.14.14.27', '6.2.1.49', '6.2.1.42', '1.3.8.2', '1.5.1.40', '3.6.1.18', '1.5.1.30', '2.7.1.180', '1.2.99.8', '1.1.5.4', '1.14.14.9', '2.7.10.2', '3.5.99.1', '1.14.19.31', '2.7.7.2'])