In [44]:
import pandas as pd
from io import StringIO
import requests
import lxml
import importlib
import json

import importlib
import cache
importlib.reload(cache)

from helpers import print_percent_done

### TODO
- 

### Configurables
Edit the following options before running the scraper:
1. Add the terms to search for on BRENDA's website to the `terms` list below
1. Make sure the output and input files are what you want
1. Update the blacklist if new errors arise

In [2]:
useCaching = True # True: reuse cached responses; False: fetch new data from the server instead of using the cache
blacklist = [
    # EC numbers for which no data is requested (BRENDA has some corrupt data).
    # If you run the script and find that some ECs produce errors, add them to this list.
    '1.1.1.205',
    '1.1.99.15',
    '1.13.11.32',
    '1.14.19.7',
    '1.14.99.40',
    '1.14.99.7',
    '1.3.3.6',
    '1.3.99.B12',
    '1.4.99.B4',
    '1.5.1.29',
    '1.6.8.1',
    '1.6.8.2',
    '1.7.99.5',
    '1.8.1.B1',
    '1.8.99.B1',
    '2.5.1.77',
    '2.7.7.95',
    '3.4.16.4',
    '3.4.21.1',
    '3.4.21.109',
    '3.4.22.3',
    '3.4.22.56',
    '3.4.22.61',
    '3.4.24.15',
    '3.4.24.59',
    '3.4.24.69',
    '7.2.1.1'
]
# Search terms; each one is sent to both the enzyme search and the ligand search.
terms = ['FAD','FMN','flavoenzyme','flavin', 'flavoprotein']
import_file = "export/brenda.json" # previously exported database, loaded as the starting point
export_file = "export/brenda_new_export.json" # deliberately different from import_file so the old export cannot be overwritten

In [3]:
def brenda_request(url):
    """Request ``url`` through the caching layer with browser-like headers.

    Args:
        url: Address to fetch.

    Returns:
        Whatever ``cache.cached_reqest`` returns for this URL. The module-level
        ``useCaching`` flag is forwarded to it unchanged.
    """
    return cache.cached_reqest(
        url,
        headers={
            "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.75 Safari/537.36",
            "X-Requested-With": "XMLHttpRequest",
        },
        useCaching=useCaching,
    )
    

In [4]:
def search_ligands_brenda(term):
    """Run BRENDA's ligand search for ``term`` and parse the download as a table.

    Args:
        term: Search text interpolated verbatim into the query URL.

    Returns:
        pandas.DataFrame read from the tab-separated response, with the columns
        'Ligand', 'EC Number', 'Role', 'Id', 'Structure' and 'Discard'.
    """
    query_url = f'https://www.brenda-enzymes.org/result_download.php?a=13&RN=&RNV=1&os=1&pt=&FNV=1&tt=&SYN=&Textmining=&T[0]=2&T[1]=2&V[1]=1&V[2]=2&W[3]={term}&T[3]=2&nolimit=1'
    raw_table = brenda_request(query_url)
    return pd.read_csv(
        StringIO(raw_table),
        sep='\t',
        names=['Ligand','EC Number', 'Role', 'Id', 'Structure', 'Discard'],
    )

In [5]:
def search_enzymes_brenda(term):
    """Run BRENDA's enzyme search for ``term`` and parse the download as a table.

    Args:
        term: Search text interpolated verbatim into the query URL.

    Returns:
        pandas.DataFrame read from the tab-separated response, with the columns
        'EC Number', 'Recommended Name', 'Synonyms', 'Commentary' and 'Discard'.
    """
    query_url = f'https://www.brenda-enzymes.org/result_download.php?a=9&RN=&RNV=1&os=1&pt=&FNV=&tt=&SYN=&Textmining=&T[0]=2&T[1]=2&W[2]={term}&T[2]=2&nolimit=1'
    raw_table = brenda_request(query_url)
    return pd.read_csv(
        StringIO(raw_table),
        sep='\t',
        names=['EC Number', 'Recommended Name', 'Synonyms', 'Commentary', 'Discard'],
    )

In [6]:
def brenda_get_enzyme_data(id):
    """Fetch the BRENDA enzyme page for a single EC number.

    Args:
        id: EC number placed in the ``ecno`` query parameter.

    Returns:
        The value returned by ``brenda_request`` for the enzyme page URL.
    """
    return brenda_request(
        f'https://www.brenda-enzymes.info/enzyme.php?ecno={id}#NATURAL%20SUBSTRATE'
    )

In [7]:
def search_all_terms(terms, search_fn):
    """Run ``search_fn`` for every term and stack the non-empty results.

    A result with fewer than two rows is treated as "nothing found": a notice
    is printed and that term contributes nothing to the output.

    Args:
        terms: Iterable of search terms.
        search_fn: Callable taking one term and returning a pandas.DataFrame.

    Returns:
        pandas.DataFrame produced by ``pd.concat`` over the kept results, in
        term order (the original row indices are preserved).
    """
    collected = []
    for term in terms:
        hits = search_fn(term)
        if len(hits) >= 2:
            collected.append(hits)
            continue
        print(f'[!] skipping search for {term} since nothing was found')
    return pd.concat(collected)

In [8]:
def get_all_ecs(terms):
    """Collect every EC number found by the enzyme and the ligand searches.

    Args:
        terms: Search terms passed to both searches.

    Returns:
        set containing the union of the 'EC Number' columns of both result
        tables. The size of the set is printed as a side effect.
    """
    enzyme_ecs = set(search_all_terms(terms, search_enzymes_brenda)['EC Number'])
    ligand_ecs = set(search_all_terms(terms, search_ligands_brenda)['EC Number'])

    combined = enzyme_ecs.union(ligand_ecs)
    print(f'total ecs found: {len(combined)}')
    return combined

# SOAP helper 

In [9]:
import os

import zeep


from zeep import Client
import hashlib

# BRENDA SOAP credentials. Set the BRENDA_EMAIL / BRENDA_PASSWORD environment
# variables to use your own account without writing it into the notebook.
# The throw-away account below is kept only as a fallback so that existing
# runs behave exactly as before.
email = os.environ.get('BRENDA_EMAIL', 'si485@dispostable.com')
password = os.environ.get('BRENDA_PASSWORD', 'si485@dispostable')

wsdl = "https://www.brenda-enzymes.org/soap/brenda_zeep.wsdl"
# From here on `password` holds the SHA-256 hex digest of the plaintext;
# the get* helpers pass this digest as the second request parameter.
password = hashlib.sha256(password.encode("utf-8")).hexdigest()
client = Client(wsdl)

In [10]:
# A single EC number kept at hand, presumably for ad-hoc testing of the SOAP
# helpers; it is not referenced by any other code visible in this notebook.
poster_child = '1.14.13.2'

In [11]:
def brendaSOAP(parameters, fn_name):
    """Call one BRENDA SOAP operation through the caching layer.

    Args:
        parameters: Tuple of request parameters handed to the cache helper as ``params``.
        fn_name: Name of the operation to look up on ``client.service``; also
            used to build the cache request name ``brenda_<fn_name>``.

    Returns:
        Whatever ``cache.brenda_cached_reqest`` returns for this request.
    """
    soap_operation = client.service[fn_name]
    client.settings.strict = True
    return cache.brenda_cached_reqest(
        request_name=f'brenda_{fn_name}',
        params=parameters,
        request_fn=soap_operation,
        useCaching=useCaching,
    )

def getSynonyms(ecNumber):
    """Return the sorted, de-duplicated 'synonyms' values for an EC number.

    Args:
        ecNumber: EC number to query through the 'getSynonyms' SOAP operation.

    Returns:
        Sorted list of the distinct ``synonyms`` fields in the response.
    """
    query = (email, password, f"ecNumber*{ecNumber}", 'organism*', 'synonyms*', 'commentary*', 'literature*')
    records = brendaSOAP(query, 'getSynonyms')
    unique_synonyms = set()
    for record in records:
        unique_synonyms.add(record['synonyms'])
    return sorted(unique_synonyms)

def getReactions(ecNumber):
    """Return the sorted, de-duplicated 'reaction' values for an EC number.

    Args:
        ecNumber: EC number to query through the 'getReaction' SOAP operation.

    Returns:
        Sorted list of the distinct ``reaction`` fields in the response.
    """
    query = (email, password,f"ecNumber*{ecNumber}", "reaction*", "commentary*", "literature*", "organism*")
    records = brendaSOAP(query, 'getReaction')
    unique_reactions = set()
    for record in records:
        unique_reactions.add(record['reaction'])
    return sorted(unique_reactions)

def getSystematicName(ecNumber):
    """Return the systematic name BRENDA reports for an EC number.

    Args:
        ecNumber: EC number to query through the 'getSystematicName' SOAP operation.

    Returns:
        The ``systematicName`` field of the first record in the response.

    Raises:
        IndexError: If the response contains no records. This is the same
            exception type the bare ``resp[0]`` raised before, but the message
            now names the affected EC number.
    """
    parameters = (email, password,f"ecNumber*{ecNumber}", "organism*", "systematicName*")
    resp = brendaSOAP(parameters, 'getSystematicName')
    if len(resp) == 0:
        raise IndexError(f'BRENDA returned no systematic name for EC {ecNumber}')
    return resp[0]['systematicName'] # there should be only one sysname per entry

def getSubstrate(ecNumber):
    """Return the sorted, de-duplicated 'substrate' values for an EC number.

    Args:
        ecNumber: EC number to query through the 'getSubstrate' SOAP operation.

    Returns:
        Sorted list of the distinct ``substrate`` fields in the response.
    """
    query = (email, password,f"ecNumber*{ecNumber}", "organism*", "substrate*", "reactionPartners*", "ligandStructureId*")
    records = brendaSOAP(query,'getSubstrate')
    unique_substrates = set()
    for record in records:
        unique_substrates.add(record['substrate'])
    return sorted(unique_substrates)

def getProduct(ecNumber):
    """Return the sorted, de-duplicated 'product' values for an EC number.

    Args:
        ecNumber: EC number to query through the 'getProduct' SOAP operation.

    Returns:
        Sorted list of the distinct ``product`` fields in the response.
    """
    query = (email, password,f"ecNumber*{ecNumber}", "organism*", "product*", "reactionPartners*", "ligandStructureId*")
    records = brendaSOAP(query,'getProduct')
    unique_products = set()
    for record in records:
        unique_products.add(record['product'])
    return sorted(unique_products)

def getPdb(ecNumber):
    """Return the sorted, de-duplicated 'pdb' values for an EC number.

    Args:
        ecNumber: EC number to query through the 'getPdb' SOAP operation.

    Returns:
        Sorted list of the distinct ``pdb`` fields in the response.
    """
    query = (email, password,f"ecNumber*{ecNumber}", "organism*", "pdb*")
    records = brendaSOAP(query,'getPdb')
    unique_pdb_ids = set()
    for record in records:
        unique_pdb_ids.add(record['pdb'])
    return sorted(unique_pdb_ids)

In [12]:
# Union of the EC numbers returned by the enzyme and ligand searches for every configured term.
all_ecs = get_all_ecs(terms)

Getting cached data...
https://www.brenda-enzymes.org/result_download.php?a=9&RN=&RNV=1&os=1&pt=&FNV=&tt=&SYN=&Textmining=&T[0]=2&T[1]=2&W[2]=FAD&T[2]=2&nolimit=1
Getting cached data...
https://www.brenda-enzymes.org/result_download.php?a=9&RN=&RNV=1&os=1&pt=&FNV=&tt=&SYN=&Textmining=&T[0]=2&T[1]=2&W[2]=FMN&T[2]=2&nolimit=1
Getting cached data...
https://www.brenda-enzymes.org/result_download.php?a=9&RN=&RNV=1&os=1&pt=&FNV=&tt=&SYN=&Textmining=&T[0]=2&T[1]=2&W[2]=flavoenzyme&T[2]=2&nolimit=1
[!] skipping search for flavoenzyme since nothing was found
Getting cached data...
https://www.brenda-enzymes.org/result_download.php?a=9&RN=&RNV=1&os=1&pt=&FNV=&tt=&SYN=&Textmining=&T[0]=2&T[1]=2&W[2]=flavin&T[2]=2&nolimit=1
Getting cached data...
https://www.brenda-enzymes.org/result_download.php?a=9&RN=&RNV=1&os=1&pt=&FNV=&tt=&SYN=&Textmining=&T[0]=2&T[1]=2&W[2]=flavoprotein&T[2]=2&nolimit=1
Getting cached data...
https://www.brenda-enzymes.org/result_download.php?a=13&RN=&RNV=1&os=1&pt=&FNV=1&t

### Initializing BrendaDB

In [13]:
def read_past_data(path=None):
    """Load a previously exported BRENDA database from a JSON file.

    Args:
        path: Location of the JSON file. When omitted, the module-level
            ``import_file`` is used. It is looked up at call time, so editing
            the configuration takes effect without redefining this function.

    Returns:
        The decoded JSON content, or an empty dict when the file is missing,
        cannot be read, or does not contain valid JSON.
    """
    if path is None:
        path = import_file
    try:
        with open(path) as json_file:
            return json.load(json_file)
    except FileNotFoundError:
        # No earlier export yet: start from an empty database.
        return {}
    except (OSError, ValueError) as error:
        # ValueError covers json.JSONDecodeError and decoding errors. Unlike the
        # former bare `except:`, KeyboardInterrupt/SystemExit are not swallowed.
        print(f'[!] could not read past data from "{path}": {error}')
        return {}

In [14]:
def create_brenda_ec_entry(ec):
    """Assemble the database entry for one EC number.

    The SOAP helpers are called in the order the fields are listed, and the
    resulting dict keeps that key order, ending with 'EC_NUMBER'.

    Args:
        ec: EC number to look up.

    Returns:
        dict with the keys 'SYSNAME', 'REACTIONS', 'SYNONYMS', 'SUBSTRATE',
        'PRODUCT', 'PDB' and 'EC_NUMBER'.
    """
    field_fetchers = (
        ('SYSNAME', getSystematicName),
        ('REACTIONS', getReactions),
        ('SYNONYMS', getSynonyms),
        ('SUBSTRATE', getSubstrate),
        ('PRODUCT', getProduct),
        ('PDB', getPdb),
    )
    entry = {field: fetch(ec) for field, fetch in field_fetchers}
    entry['EC_NUMBER'] = ec
    return entry

In [15]:
def create_brenda_db(list_of_ecs, previous_db, verbose=False):
    """Add entries for new EC numbers to a database and write it to ``export_file``.

    EC numbers already present in ``previous_db`` or listed in ``blacklist``
    are skipped. A failure while fetching one EC is reported and does not stop
    the run.

    Note:
        ``previous_db`` is extended in place; the returned dict is the very
        same object.

    Args:
        list_of_ecs: Sequence of EC numbers to process.
        previous_db: dict of existing entries keyed by EC number.
        verbose: When true, print a message and a progress bar per fetched EC.

    Returns:
        The database dict, including the newly fetched entries.
    """
    new_db = previous_db
    total = len(list_of_ecs)
    for position, ec in enumerate(list_of_ecs):
        if ec in previous_db or ec in blacklist:
            continue
        if verbose:
            print(f'Fetching data for {ec} ...')
        try:
            new_db[ec] = create_brenda_ec_entry(ec)
            if verbose:
                print_percent_done(index=position, length=total)
        except Exception as e:
            print(f"[!] couldn't fetch info for {ec}")
            print("     Error: ", e)
            print("\n\n")

    # Persist the combined result.
    with open(export_file, 'w') as outfile:
        json.dump(new_db, outfile)
    print(f'\nSuccessfully written out {len(new_db)} results to "{export_file}"')

    return new_db

### Running the Update/Scrape Script

In [45]:
# Shape of the BrendaDB built below (one entry per EC number):
# brenda_enzymes = {
#     '1.1.1.1': {
#         'SYSNAME': '...',            # first systematic name returned (should this be an array?)
#         'REACTIONS': ['...', ...],   # sorted, de-duplicated strings
#         'SYNONYMS': ['...', ...],
#         'SUBSTRATE': ['...', ...],
#         'PRODUCT': ['...', ...],
#         'PDB': ['...', ...],
#         'EC_NUMBER': '1.1.1.1',      # same value as the key
#     },
# }

sorted_ecs = sorted(all_ecs)
previous_db = read_past_data(import_file)
db = create_brenda_db(sorted_ecs, previous_db, verbose=True)

Fetching data for 1.1.1.1 ...


----> Progress: ░░░░░░░░░░ 	0% done
Fetching data for 1.1.1.125 ...


----> Progress: ░░░░░░░░░░ 	0% done
Fetching data for 1.1.1.181 ...


----> Progress: ░░░░░░░░░░ 	0% done
Fetching data for 1.1.1.184 ...


----> Progress: ░░░░░░░░░░ 	0% done
Fetching data for 1.1.1.189 ...


----> Progress: ░░░░░░░░░ 	1% done
Fetching data for 1.1.1.193 ...


----> Progress: ░░░░░░░░░ 	1% done
Fetching data for 1.1.1.215 ...


----> Progress: ░░░░░░░░░ 	1% done
Fetching data for 1.1.1.216 ...


----> Progress: ░░░░░░░░░ 	1% done
Fetching data for 1.1.1.217 ...


----> Progress: ░░░░░░░░░ 	1% done
Fetching data for 1.1.1.229 ...


----> Progress: ░░░░░░░░░ 	1% done
Fetching data for 1.1.1.25 ...


----> Progress: ░░░░░░░░░ 	1% done
Fetching data for 1.1.1.267 ...


----> Progress: ░░░░░░░░░ 	2% done
Fetching data for 1.1.1.27 ...


----> Progress: ░░░░░░░░░ 	2% done
Fetching data for 1.1.1.28 ...


----> Progress: ░░░░░░░░░ 	2% done
Fetching data for 1.1.1.284 ...


-

# Exploration:

In [26]:
# Number of EC entries in the assembled database.
len(db)

781

In [50]:
# Inspect a single entry (EC 2.5.1.9) to check the stored fields.
db['2.5.1.9']

{'SYSNAME': '6,7-dimethyl-8-(1-D-ribityl)lumazine:6,7-dimethyl-8-(1-D-ribityl)lumazine 2,3-butanediyltransferase',
 'REACTIONS': ['2 6,7-dimethyl-8-(1-D-ribityl)lumazine = riboflavin + 4-(1-D-ribitylamino)-5-amino-2,6-dihydroxypyrimidine'],
 'SYNONYMS': ['RibD',
  'heavy riboflavin synthase',
  'light riboflavin synthase',
  'lumazine synthase/riboflavin synthase complex',
  'riboflavin synthase',
  'riboflavin synthetase',
  'riboflavine synthase',
  'riboflavine synthetase',
  'synthase, riboflavin'],
 'SUBSTRATE': ['2 6,7-dimethyl-8-ribityllumazine',
  '6,7-dimethyl-8-(1-D-ribityl)lumazine',
  "6,7-dimethyl-8-[1'-(5'-deoxy-D-ribityl)]lumazine",
  '6,7-dimethyl-8-ribityllumazine',
  'more'],
 'PRODUCT': ["4-(1'-D-ribitylamino)-5-amino-2,6-dihydroxypyrimidine",
  '4-(1-D-ribitylamino)-5-amino-2,6-dihydroxypyrimidine',
  "5'-deoxyriboflavin",
  '5-amino-6-ribitylamino-2,4(1H,3H)-pyrimidine-dione',
  '5-amino-6-ribitylamino-2,4-(1H,3H)-pyrimidinedione',
  '?',
  'a compound related to 4

In [39]:
# Tabular view of the database that is easier to work with:
# after the transpose there is one row per EC number and one column per entry field.
df = pd.DataFrame(db).T

#### Discuss: Enzymes that have the same sysname

In [43]:
# Print every EC number whose systematic name is shared with at least one other entry.
# Reads `db` (the dict returned by create_brenda_db) so the cell does not depend on
# `previous_db` having been extended in place.
list_of_sysnames = [db[ec]['SYSNAME'] for ec in sorted(db.keys())]

# Count each name once instead of calling list.count() per element (which is quadratic).
sysname_counts = {}
for sysname in list_of_sysnames:
    sysname_counts[sysname] = sysname_counts.get(sysname, 0) + 1
duplicates = [item for item in list_of_sysnames if sysname_counts[item] > 1]

# A plain loop replaces the list comprehension that was used only for its print side effect.
duplicate_names = set(duplicates)
for ec in db.keys():
    if db[ec]['SYSNAME'] in duplicate_names:
        print(ec + " \t|\t " + db[ec]['SYSNAME'])

1.14.13.B1 	|	 (-)-bornane-2,5-dione,FMNH2:oxygen oxidoreductase (1,2-lactonizing)
1.14.14.155 	|	 (-)-bornane-2,5-dione,FMNH2:oxygen oxidoreductase (1,2-lactonizing)
1.3.5.1 	|	 succinate:quinone oxidoreductase
1.3.5.4 	|	 succinate:quinone oxidoreductase
1.6.5.9 	|	 NADH:ubiquinone oxidoreductase
4.2.1.167 	|	 (R)-2-hydroxyglutaryl-CoA hydro-lyase ((E)-glutaconyl-CoA-forming)
4.2.1.B26 	|	 (R)-2-hydroxyglutaryl-CoA hydro-lyase ((E)-glutaconyl-CoA-forming)
7.1.1.2 	|	 NADH:ubiquinone oxidoreductase
