In [None]:
import pandas as pd
from io import StringIO
import requests
import lxml
import importlib
import json

import importlib
import cache
importlib.reload(cache)

from helpers import print_percent_done



### TODO still
- Deduplicate the data within the entities so they are not repeating

### Configurables
Edit the folling options before running the scraper
- Add the terms to search for on Brenda's website here:
- Make sure the output and input files are what you want

In [43]:
useCaching = False # setting this parameter to False will fetch new data from server, instead of using cache
terms = ['FAD','FMN','flavoenzyme','flavin', 'flavoprotein']
import_file = "export/brenda.json"
export_file = "export/brenda_new_export.json" # made the filename different so that old one can not be overriden

In [44]:
def brenda_request(url):
    headers = {
      "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.75 Safari/537.36",
      "X-Requested-With": "XMLHttpRequest"
    }
    response = cache.cached_reqest(url, headers=headers, useCaching=useCaching)
    return response
    

In [45]:
def search_ligands_brenda(term):
    columns = ['Ligand','EC Number', 'Role', 'Id', 'Structure', 'Discard']
    url = f'https://www.brenda-enzymes.org/result_download.php?a=13&RN=&RNV=1&os=1&pt=&FNV=1&tt=&SYN=&Textmining=&T[0]=2&T[1]=2&V[1]=1&V[2]=2&W[3]={term}&T[3]=2&nolimit=1'
    response = brenda_request(url)
    df = pd.read_csv(StringIO(response), sep='\t', names=columns)
    return df

In [46]:
def search_enzymes_brenda(term):
    columns = ['EC Number', 'Recommended Name', 'Synonyms', 'Commentary', 'Discard']
    url = f'https://www.brenda-enzymes.org/result_download.php?a=9&RN=&RNV=1&os=1&pt=&FNV=&tt=&SYN=&Textmining=&T[0]=2&T[1]=2&W[2]={term}&T[2]=2&nolimit=1'    
    response = brenda_request(url)
    df = pd.read_csv(StringIO(response), sep='\t', names=columns)
    return df

In [47]:
def brenda_get_enzyme_data(id):
    link = f'https://www.brenda-enzymes.info/enzyme.php?ecno={id}#NATURAL%20SUBSTRATE'
    response = brenda_request(link)
    return response

In [48]:
def search_all_terms(terms, search_fn):
    all_dfs = []

    for term in terms:
        df = search_fn(term)
        if (len(df) < 2):
            print(f'[!] skipping search for {term} since nothing was found')
            continue
        else:
            all_dfs.append(df)
    return pd.concat(all_dfs)

In [49]:
def get_all_ecs(terms):
    
    enzymes_list = set(search_all_terms(terms,search_enzymes_brenda)['EC Number'])
    ligands_list = set(search_all_terms(terms,search_ligands_brenda)['EC Number'])

    ec_set = enzymes_list | ligands_list
    print(f'total ecs found: {len(ec_set)}')

    return ec_set

# SOAP helper 

In [59]:
import zeep


from zeep import Client
import hashlib

email = 'si485@dispostable.com'
password = 'si485@dispostable'

wsdl = "https://www.brenda-enzymes.org/soap/brenda_zeep.wsdl"
password = hashlib.sha256(password.encode("utf-8")).hexdigest()
client = Client(wsdl)

In [51]:
poster_child = '1.14.13.2'

In [52]:
def brendaSOAP(parameters, fn_name):
    fn = client.service[fn_name]
    client.settings.strict = True
    return cache.generic_cached_reqest(request_name=f'brenda_{fn}', params=parameters, request_fn=fn, useCaching=useCaching)

def getSynonyms(ecNumber):
    parameters = (email, password, f"ecNumber*{ecNumber}", 'organism*', 'synonyms*', 'commentary*', 'literature*')
    return brendaSOAP(parameters, 'getSynonyms')

def getReactions(ecNumber):
    parameters = (email, password,f"ecNumber*{ecNumber}", "reaction*", "commentary*", "literature*", "organism*")
    return brendaSOAP(parameters, 'getReaction')

def getSystematicName(ecNumber):
    parameters = (email, password,f"ecNumber*{ecNumber}", "organism*", "systematicName*")
    return brendaSOAP(parameters, 'getSystematicName')

def getSubstrate(ecNumber):
    parameters = (email, password,f"ecNumber*{ecNumber}", "organism*", "substrate*", "reactionPartners*", "ligandStructureId*")
    return brendaSOAP(parameters,'getSubstrate')

def getProduct(ecNumber):
    parameters = (email, password,f"ecNumber*{ecNumber}", "organism*", "product*", "reactionPartners*", "ligandStructureId*")
    return brendaSOAP(parameters,'getProduct')

def getPdb(ecNumber):
    parameters = (email, password,f"ecNumber*{ecNumber}", "organism*", "pdb*")
    return brendaSOAP(parameters,'getPdb')

In [17]:
all_ecs = get_all_ecs(terms)

found JSON cache
Getting cached data...
https://www.brenda-enzymes.org/result_download.php?a=9&RN=&RNV=1&os=1&pt=&FNV=&tt=&SYN=&Textmining=&T[0]=2&T[1]=2&W[2]=FAD&T[2]=2&nolimit=1
found JSON cache
Getting cached data...
https://www.brenda-enzymes.org/result_download.php?a=9&RN=&RNV=1&os=1&pt=&FNV=&tt=&SYN=&Textmining=&T[0]=2&T[1]=2&W[2]=FMN&T[2]=2&nolimit=1
found JSON cache
Getting cached data...
https://www.brenda-enzymes.org/result_download.php?a=9&RN=&RNV=1&os=1&pt=&FNV=&tt=&SYN=&Textmining=&T[0]=2&T[1]=2&W[2]=flavoenzyme&T[2]=2&nolimit=1
[!] skipping search for flavoenzyme since nothing was found
found JSON cache
Getting cached data...
https://www.brenda-enzymes.org/result_download.php?a=9&RN=&RNV=1&os=1&pt=&FNV=&tt=&SYN=&Textmining=&T[0]=2&T[1]=2&W[2]=flavin&T[2]=2&nolimit=1
found JSON cache
Getting cached data...
https://www.brenda-enzymes.org/result_download.php?a=9&RN=&RNV=1&os=1&pt=&FNV=&tt=&SYN=&Textmining=&T[0]=2&T[1]=2&W[2]=flavoprotein&T[2]=2&nolimit=1
found JSON cache
Get

### Initializing BrendaDB

In [53]:
def read_past_data(path=import_file):
    try:
        with open(path) as json_file:
            return json.load(json_file)
    except:
        return {}

In [54]:
def create_brenda_ec_entry(ec):
    return {
        'SYSNAME': getSystematicName(ec),
        'REACTIONS': getReactions(ec),
        'NAME': getSynonyms(ec),
        'SUBSTRATE': getSubstrate(ec),
        'PRODUCT': getProduct(ec),
        'PDB': getPdb(ec),
        'EC_NUMBER': ec
    }

In [61]:
client = Client(wsdl)
fn = client.service['getSynonyms']
resp = fn(email, password, "ecNumber*1.1.99.15", 'organism*', 'synonyms*', 'commentary*', 'literature*')


# getSubstrate('1.1.99.15')
resp

[]

In [62]:
fn

<zeep.proxy.OperationProxy at 0x120a027b8>

In [None]:
def create_brenda_db(list_of_ecs, previous_db, verbose=False):
    new_db = previous_db
    for index, ec in enumerate(list_of_ecs):
        if ec not in previous_db:
            try:
                entry = create_brenda_ec_entry(ec)
                sysnames = entry['SYSNAME']
                entry_name = sorted(sysnames)[0]['ecNumber']
                new_db[entry_name] = (entry)
                if verbose: 
                    print_percent_done(index=index, length=len(list_of_ecs))
            except:
                print(f"[!] couldn't fetch info for {ec}")
        
    
    # Writing out the results to the file
    with open(export_file, 'w') as outfile:
        json.dump(new_db, outfile)
    print(f'\nSuccessfully written out {len(new_db)} results to "{export_file}"')

    return new_db

### Running the Update/Scrape Script

In [19]:
previous_db = read_past_data(import_file)

# Example of the BrendaDB:
# brenda_enzymes = {
#             ecNumber: {
#                 'SYSNAME': "...", # should this be an array?
#                 'REACTIONS': [{reaction},],
#                 'NAME': ['...',],
#                 'SUBSTRATE': [{substrate},],
#                 'PRODUCT': [{product},],
#                 'PDB': [{pdb}]
#                 'EC_NUMBER': 'ec:1.1.1.1'
#             },
#         }
sorted_ecs = sorted(list(all_ecs))
db = create_brenda_db(sorted_ecs, previous_db, verbose=True)

Getting cached data...
brenda_<zeep.proxy.OperationProxy object at 0x133f75588>?68a8b3ecc5cecaa6b4bf94e0c89ff02d5d77366ca90f1774aae28e37f3b202a7=68a8b3ecc5cecaa6b4bf94e0c89ff02d5d77366ca90f1774aae28e37f3b202a7&ecNumber*1.1.1.205=ecNumber*1.1.1.205&organism*=organism*&si485@dispostable.com=si485@dispostable.com&systematicName*=systematicName*
Getting cached data...
brenda_<zeep.proxy.OperationProxy object at 0x133f75e80>?68a8b3ecc5cecaa6b4bf94e0c89ff02d5d77366ca90f1774aae28e37f3b202a7=68a8b3ecc5cecaa6b4bf94e0c89ff02d5d77366ca90f1774aae28e37f3b202a7&commentary*=commentary*&ecNumber*1.1.1.205=ecNumber*1.1.1.205&literature*=literature*&organism*=organism*&reaction*=reaction*&si485@dispostable.com=si485@dispostable.com
Getting cached data...
brenda_<zeep.proxy.OperationProxy object at 0x133f75358>?68a8b3ecc5cecaa6b4bf94e0c89ff02d5d77366ca90f1774aae28e37f3b202a7=68a8b3ecc5cecaa6b4bf94e0c89ff02d5d77366ca90f1774aae28e37f3b202a7&commentary*=commentary*&ecNumber*1.1.1.205=ecNumber*1.1.1.205&lite

NameError: name 'db' is not defined

In [77]:
# check if there are ever more then two sysnames
[db[ez_name]['SYSNAME'][0]['ecNumber'] for ez_name in db.keys() if len(db[ez_name]['SYSNAME']) > 1]

[]

In [78]:
# dataframe to easily work with
df = pd.DataFrame(db).T

In [79]:
df['NAME']['1.1.1.125']

[{'commentary': None,
  'synonyms': '2-deoxy-D-gluconate 3-dehydrogenase',
  'literature': [726324],
  'organism': 'Escherichia coli',
  'ecNumber': '1.1.1.125'},
 {'commentary': None,
  'synonyms': '2-deoxygluconate dehydrogenase',
  'literature': [0],
  'organism': None,
  'ecNumber': '1.1.1.125'},
 {'commentary': None,
  'synonyms': '2-deoxygluconic acid dehydrogenase',
  'literature': [285818],
  'organism': 'Pseudomonas sp.',
  'ecNumber': '1.1.1.125'},
 {'commentary': None,
  'synonyms': 'KduD',
  'literature': [726324],
  'organism': 'Escherichia coli',
  'ecNumber': '1.1.1.125'}]

In [80]:
# next(iter(db.keys()))
db.keys()

dict_keys(['1.1.1.1', '1.1.1.125', '1.1.1.181', '1.1.1.184', '1.1.1.189', '1.1.1.193', '1.1.1.215', '1.1.1.216', '1.1.1.217', '1.1.1.229', '1.1.1.27', '1.7.1.2', '1.5.99.12', '1.1.1.25', '1.1.1.267', '1.1.1.28', '1.1.1.284', '1.1.1.289', '1.1.1.306', '1.1.1.328', '1.1.1.35', '1.1.1.404', '1.1.1.47', '1.1.1.94', '1.1.1.B3', '1.1.1.B58', '1.1.2.3', '1.1.2.4', '1.1.3.10', '1.1.3.12', '1.1.3.13', '1.1.3.15', '1.1.3.17', '1.1.3.19', '1.1.3.2', '1.1.3.20', '1.1.3.21', '1.1.3.23', '1.1.3.28', '1.1.3.37', '1.1.3.38', '1.1.3.39', '1.1.3.4', '1.1.3.41', '1.1.3.42', '1.1.3.43', '1.1.3.44', '1.1.3.45', '1.1.3.46', '1.1.3.47', '1.1.3.49', '1.1.3.5', '1.1.3.6', '1.1.3.7', '1.1.3.8', '1.1.3.9', '1.1.5.10', '1.1.5.12', '1.1.5.3', '1.1.5.4', '1.1.5.9', '1.1.98.3', '1.1.98.4', '1.1.99.1', '1.1.99.11', '1.1.99.13', '1.1.99.18', '1.1.99.2', '1.1.99.20', '1.1.99.21', '1.1.99.27', '1.1.99.29', '1.1.99.3', '1.1.99.31', '1.1.99.37', '1.1.99.39', '1.1.99.4', '1.1.99.40', '1.1.99.42', '1.1.99.6', '1.1.99.9', '1