In [3]:
import json
import os

import bioservices

In [4]:
# Shared KEGG client; get_ids and get_ec_data issue their find/get calls through it.
k = bioservices.kegg.KEGG()
# Parser applied to the raw entries returned by k.get (see get_ec_data).
parser = bioservices.kegg.KEGGParser()

### Configurables
Edit the following options before running the scraper.

In [5]:
# Search terms; scrape_kegg looks each one up in the KEGG 'enzyme' database.
keywords = ['FAD', 'FMN', 'flavin', 'flavoenzyme']
# JSON file holding the results of earlier runs (default input of read_past_data).
import_file = "export/kegg.json"
# JSON file scrape_kegg writes the merged results to. It is the same path as
# import_file, so each run extends the previous export in place.
export_file = "export/kegg.json"

### Helper functions

In [6]:
def read_past_data(path=import_file):
    """Load previously scraped KEGG records from a JSON file.

    Args:
        path: Location of the JSON file to read. Defaults to the value
            ``import_file`` had when this cell was executed.

    Returns:
        The decoded JSON content, or an empty dict when the file is missing,
        unreadable, or does not contain valid JSON.
    """
    try:
        with open(path) as json_file:
            return json.load(json_file)
    except FileNotFoundError:
        # Expected on the first run: nothing has been exported yet.
        return {}
    except (OSError, ValueError) as error:
        # json.JSONDecodeError and UnicodeDecodeError are ValueError subclasses.
        # Stay best-effort (start from scratch) but say so, instead of hiding
        # a corrupt or unreadable file. KeyboardInterrupt and programming
        # errors are deliberately no longer swallowed.
        print(f'[!] Could not read past data from "{path}": {error}')
        return {}

In [7]:
def get_ids(keyword):
    """Search the KEGG enzyme database for ``keyword`` and return the hit ids.

    The response text is split into lines; for every non-empty line the text
    before the first tab is taken as the id.
    """
    response = k.find(database='enzyme', query=keyword)
    ids_array = []
    for line in response.split('\n'):
        if not line:
            # Skip blank lines such as the one after a trailing newline.
            continue
        ids_array.append(line.split('\t')[0])
    return ids_array

In [8]:
def get_ec_data(id):
    """Fetch the KEGG entry for ``id`` and return it in parsed form."""
    # The parameter name shadows the builtin ``id``; it is kept because it is
    # part of the function's signature.
    return parser.parse(k.get(id))

In [9]:
def get_all_data(ids, previous_json, verbose=False):
    """Fetch the KEGG entry for every id and merge it with earlier results.

    Args:
        ids: Iterable of KEGG ids to fetch.
        previous_json: Dict of records from earlier runs. It is left
            untouched; the new records are added to a shallow copy.
        verbose: When true, print a ". " progress marker per fetched entry.

    Returns:
        A new dict holding the previous records plus one record per fetched
        id. Each new record stores its id under 'EC' and is keyed by its
        'SYSNAME' value, or by the id itself when the entry has no 'SYSNAME'.
    """
    # Copy so the caller's dict is not silently modified as a side effect.
    data_dict = dict(previous_json)
    for ec_id in ids:
        ec_data = get_ec_data(ec_id)
        ec_data['EC'] = ec_id
        name = ec_data.get('SYSNAME', ec_id)
        data_dict[name] = ec_data
        if verbose:
            print(". ", end="")
    return data_dict

In [13]:
def scrape_kegg():
    """Fetch the KEGG enzyme entries matching ``keywords`` that are not stored yet.

    Reads the earlier results with ``read_past_data``, searches KEGG for every
    configured keyword, fetches the entries whose ids are not already present
    and writes the merged records to ``export_file`` as JSON. Nothing is
    written when no new ids are found.
    """
    print('Kegg scraping script started...')
    # Reading past results from KEGG
    previous_json = read_past_data()
    print(f'1. Successfully read previous json data, that has total of {len(previous_json)} records')
    # A set keeps the "already stored?" check cheap.
    prev_ids = {enzyme['EC'] for enzyme in previous_json.values()}

    # Getting IDs of all entries that are missing from past
    found_ids = set()
    for keyword in keywords:
        found_ids.update(get_ids(keyword))
    # Sorted so the fetch order and the exported file are reproducible.
    all_ids = sorted(found_ids - prev_ids)

    # If new ids have been found, fetch the data
    if all_ids:
        print('2. Following potential flavins are missing from past results:')
        for ec in all_ids:
            print(ec, end=" | ")

        # Scraping the data
        print('\n3. Fetching the data')
        flavins = get_all_data(all_ids, previous_json, verbose=True)

        # Writing out the results to the file. On a first run the target
        # directory may not exist yet; create it so the fetched data is not
        # lost to a failing open().
        export_dir = os.path.dirname(export_file)
        if export_dir:
            os.makedirs(export_dir, exist_ok=True)
        with open(export_file, 'w') as outfile:
            json.dump(flavins, outfile)
        print(f'\nSuccessfully written out {len(all_ids)} results to "{export_file}"')
    else:
        print("[i] Doesn't look like there are any new flavins on KEGG!")

### Running the program

In [14]:
# Run the scraper: looks up the configured keywords and writes any entries
# not yet stored to export_file.
scrape_kegg()

Kegg scraping script started...
1. Successfully read previous json data, that has total of 78 records
[i] Doesn't look like there are any new flavins on KEGG!


In [23]:
# Shows the keys of the stored records that have no SYSNAME. get_all_data
# keys such records by their EC id, so the keys listed here are EC ids.
kegg = read_past_data()
sysnames = [key for key, record in kegg.items() if not record.get('SYSNAME')]
sysnames

['ec:3.4.22.61']