In [2]:
import json
import os

import bioservices

from cache import kegg_cached_reqest
from helpers import print_percent_done

In [3]:
# KEGG service object from bioservices; get_ids() calls k.find and
# get_ec_data() hands k.get to kegg_cached_reqest as the request function.
k = bioservices.kegg.KEGG()
# Parser applied by get_ec_data() to the response returned by kegg_cached_reqest.
parser = bioservices.kegg.KEGGParser()

### Configurables
Edit the following options before running the scraper.

In [4]:
# Forwarded by get_ec_data() to kegg_cached_reqest as its useCaching argument.
useCaching = True # setting this parameter to False will fetch new data from server, instead of using cache
# Search terms that scrape_kegg() passes to get_ids_for_keywords().
keywords = ['FAD', 'FMN', 'flavin', 'flavoenzyme']
# JSON file that read_past_data() loads the results of earlier runs from.
import_file = "export/kegg.json"
# JSON file that scrape_kegg() writes to when new entries were found.
# It is the same path as import_file, so the merged result replaces the previous export.
export_file = "export/kegg.json"

### Helper functions

In [5]:
def read_past_data(path=None):
    """Load previously scraped KEGG records from a JSON file.

    Parameters
    ----------
    path : str, optional
        JSON file to read. When omitted, the notebook-level ``import_file``
        setting is looked up at call time, so re-running the configuration
        cell takes effect without having to re-run this definition.

    Returns
    -------
    The decoded JSON content, or an empty dict when the file cannot be
    opened or does not contain valid JSON (e.g. on the very first run).
    """
    if path is None:
        path = import_file
    try:
        with open(path) as json_file:
            return json.load(json_file)
    except (OSError, ValueError):
        # OSError covers a missing/unreadable file; ValueError covers
        # malformed JSON (json.JSONDecodeError) and undecodable bytes.
        # Anything else (e.g. KeyboardInterrupt) is allowed to propagate.
        return {}

In [6]:
def get_ids(keyword):
    """Return the set of KEGG enzyme identifiers matching ``keyword``.

    Queries the 'enzyme' database through ``k.find``. The response is
    treated as newline-separated rows; the text before the first tab of
    every non-empty row is taken as the identifier.
    """
    listing = k.find(database='enzyme', query=keyword)
    return {row.split('\t')[0] for row in listing.split('\n') if row}

In [7]:
def get_ids_for_keywords(keywords):
    """Return the union of get_ids() results over all ``keywords``.

    An empty ``keywords`` iterable yields an empty set.
    """
    return set().union(*(get_ids(term) for term in keywords))

In [25]:
def get_ec_data(id):
    """Fetch one KEGG entry and return it parsed.

    The request goes through ``kegg_cached_reqest`` under the request name
    'kegg_ec_request', with ``k.get`` as the request function and the
    notebook-level ``useCaching`` flag. The response is then handed to
    ``parser.parse`` and its result is returned unchanged.
    """
    raw_entry = kegg_cached_reqest(
        request_name='kegg_ec_request',
        ec=id,
        request_fn=k.get,
        useCaching=useCaching,
    )
    return parser.parse(raw_entry)

In [26]:
def get_all_data(ids, previous_json, verbose=False):
    """Fetch the KEGG entry for every id and merge it into ``previous_json``.

    Parameters
    ----------
    ids : iterable of str
        KEGG identifiers to fetch; a leading 'ec:' is stripped to obtain the
        EC number. Must support ``len()`` when ``verbose`` is True.
    previous_json : dict
        Records from an earlier run. NOTE: this dict is updated in place and
        is also the object that gets returned.
    verbose : bool, optional
        When True, progress is reported through ``print_percent_done`` after
        each entry.

    Returns
    -------
    dict
        ``previous_json`` with one record added (or replaced) per id. Each
        new record is the result of get_ec_data() extended with 'KEGG_ID'
        and 'EC_NUMBER'.
    """
    data_dict = previous_json
    for index, kegg_id in enumerate(ids):
        ec_data = get_ec_data(kegg_id)
        ec_number = kegg_id.replace('ec:', '')

        ec_data['KEGG_ID'] = kegg_id
        ec_data['EC_NUMBER'] = ec_number
        # WARNING: records are keyed by EC number instead of SYSNAME
        # (not every entry has a SYSNAME), but this can be changed.
        data_dict[ec_number] = ec_data
        if verbose:
            print_percent_done(index=index, length=len(ids))
    return data_dict

In [27]:
def scrape_kegg():
    """Fetch KEGG entries that are missing from the previous export and save the merged result.

    Steps:
      1. Load earlier results with read_past_data().
      2. Look up the ids matching the configured ``keywords`` and keep only
         those whose KEGG_ID is not already stored.
      3. If anything is missing, fetch it with get_all_data() and write the
         merged records to ``export_file`` as JSON; otherwise only report
         that there is nothing new.

    Progress is reported with print(); nothing is returned.
    """
    print('Kegg scraping script started...')
    # Reading past results from KEGG
    previous_json = read_past_data()
    print(f'1. Successfully read previous json data, that has total of {len(previous_json)} records')
    known_ids = {enzyme['KEGG_ID'] for enzyme in previous_json.values()}

    # Getting IDs of all entries that are missing from past
    found_ids = get_ids_for_keywords(keywords)
    missing_ids = found_ids - known_ids

    if not missing_ids:
        print("[i] Doesn't look like there are any new flavins on KEGG!")
        return

    print('2. Following potential flavins are missing from past results:')
    # One print call instead of a throw-away list comprehension; output is unchanged.
    print(*missing_ids, sep=" | ", end=" | ")

    # Scraping the data
    print('\n3. Fetching the data')
    flavins = get_all_data(missing_ids, previous_json, verbose=True)

    # Make sure the target directory exists so a first run does not fail
    # at the very end, after all entries have been fetched.
    export_dir = os.path.dirname(export_file)
    if export_dir:
        os.makedirs(export_dir, exist_ok=True)

    # Writing out the results to the file
    with open(export_file, 'w') as outfile:
        json.dump(flavins, outfile)
    print(f'\nSuccessfully written out {len(missing_ids)} results to "{export_file}"')

### Running the program

In [28]:
# Run the scraper: only entries missing from the previous export are fetched,
# and export_file is rewritten only when something new was found.
scrape_kegg()

Kegg scraping script started...
1. Successfully read previous json data, that has total of 78 records
[i] Doesn't look like there are any new flavins on KEGG!


In [12]:
# Show the EC numbers of stored records whose SYSNAME is missing or empty.
# (Despite its name, `sysnames` holds EC numbers, not system names.)
kegg = read_past_data()
sysnames = [ec_number for ec_number, record in kegg.items() if not record.get('SYSNAME')]
sysnames

['3.4.22.61']

In [24]:
# All stored EC numbers, sorted as strings (lexicographic, not numeric order).
sorted(kegg)

['1.1.2.3',
 '1.1.5.3',
 '1.1.5.4',
 '1.1.5.9',
 '1.1.99.27',
 '1.12.98.1',
 '1.13.11.79',
 '1.13.12.17',
 '1.14.13.111',
 '1.14.13.113',
 '1.14.13.148',
 '1.14.13.32',
 '1.14.13.8',
 '1.14.14.20',
 '1.14.14.27',
 '1.14.14.3',
 '1.14.14.34',
 '1.14.14.5',
 '1.14.14.8',
 '1.14.14.9',
 '1.14.19.22',
 '1.14.19.23',
 '1.14.19.25',
 '1.14.19.3',
 '1.14.19.30',
 '1.14.19.31',
 '1.14.19.33',
 '1.14.19.34',
 '1.14.19.35',
 '1.14.19.42',
 '1.14.19.43',
 '1.14.19.44',
 '1.14.99.66',
 '1.2.99.7',
 '1.2.99.8',
 '1.3.8.2',
 '1.4.3.4',
 '1.5.1.20',
 '1.5.1.30',
 '1.5.1.36',
 '1.5.1.37',
 '1.5.1.38',
 '1.5.1.39',
 '1.5.1.40',
 '1.5.1.41',
 '1.5.1.42',
 '1.5.1.45',
 '1.6.2.4',
 '1.7.1.17',
 '2.1.1.148',
 '2.1.1.343',
 '2.1.1.349',
 '2.1.1.74',
 '2.5.1.129',
 '2.5.1.9',
 '2.6.1.114',
 '2.7.1.161',
 '2.7.1.180',
 '2.7.1.26',
 '2.7.1.42',
 '2.7.10.2',
 '2.7.7.2',
 '2.7.8.28',
 '3.1.3.102',
 '3.4.22.61',
 '3.5.99.1',
 '3.6.1.18',
 '4.3.1.32',
 '4.3.3.5',
 '4.6.1.15',
 '6.2.1.2',
 '6.2.1.41',
 '6.2.1.42',
