# Extract information from ENZYME - Enzyme nomenclature database

Note: Requires internet connection to download information from [ENZYME - Enzyme nomenclature database](https://enzyme.expasy.org/).

## Setup
### Import packages

In [1]:
import re
from warnings import warn

import matplotlib.pyplot as plt
import pandas as pd

from rbc_gem_utils import (
    COBRA_CONFIGURATION,
    ROOT_PATH,
    DATABASE_PATH,
    ANNOTATION_PATH,
    INTERIM_PATH,
    GEM_NAME,
    read_rbc_model,
    get_annotation_df,
    compare_tables,
    visualize_comparison,
    show_versions,
    check_version,
    split_string,
    build_string,
    check_database_version_online,
    explode_column,
)

from rbc_gem_utils.database.ec import (
    download_database_EC,
    get_version_EC,
    EC_VERSION_EXPECTED,
    EC_DB_TAG,
    EC_PATH,
    EC_URL,
)

# Display the package versions recorded the last time this notebook ran successfully
show_versions()


Package Information
-------------------
rbc-gem-utils 0.0.1

Dependency Information
----------------------
beautifulsoup4                       4.12.3
bio                                   1.6.2
cobra                                0.29.0
depinfo                               2.2.0
kaleido                               0.2.1
matplotlib                            3.8.2
memote                               0.17.0
networkx                              3.2.1
notebook                              7.0.7
openpyxl                              3.1.2
pandas                                2.2.0
pre-commit                            3.6.0
pyvis                                 0.3.2
rbc-gem-utils[database,network,vis] missing
requests                             2.31.0
scipy                                1.12.0
seaborn                              0.13.2

Build Tools Information
-----------------------
pip        23.3.1
setuptools 68.2.2
wheel      0.41.2

Platform Information
-------------------

## Check EC-ENZYME version
If the version does not match the expected version, it is because the database has been updated since the last time this code was used.

### Expected EC-ENZYME version: 27-Mar-2024
* Updates to the database are made every eight weeks (need confirmation)
* Last release utilized: **Wed Mar 27 2024**.

In [2]:
# Flag a newer online release before doing any work. `warn` comes from the
# standard-library `warnings` module imported in the first cell.
if not check_database_version_online("EC"):
    warn(
        "Online version of database has been updated since the last time notebook was used."
    )

# Write into the tracked database/annotation directories only when the local
# version is the expected one; otherwise send everything to the interim
# directory so an unexpected release cannot overwrite curated files.
version = get_version_EC()
if check_version(version, EC_VERSION_EXPECTED, verbose=True):
    database_dirpath = f"{ROOT_PATH}{DATABASE_PATH}{EC_PATH}"
    annotation_dirpath = f"{ROOT_PATH}{ANNOTATION_PATH}"
else:
    # Use different directory paths for unexpected behavior
    database_dirpath = annotation_dirpath = f"{ROOT_PATH}{INTERIM_PATH}"

Current and expected versions match.


#### Download new files and update database
If an argument is not provided (`arg=None`), its default value for the repository is used.

In [3]:
# Flip to True to refresh the local copies of the two ENZYME files.
download = False
if download:
    # NOTE(review): `database_dirpath` is built with EC_PATH already appended
    # when the versions match, so EC_PATH appears twice in this path — confirm
    # that this is the intended directory layout.
    for filename in ("enzyme.dat", "enzclass.txt"):
        download_database_EC(
            filename=filename, database_dirpath=f"{database_dirpath}{EC_PATH}"
        )

## Load RBC-GEM model

In [4]:
# Read the RBC-GEM model using the "xml" file type; the bare `model` expression
# on the last line renders the model summary as the cell output.
model = read_rbc_model(filetype="xml")
model

Set parameter Username
Academic license - for non-commercial use only - expires 2024-11-28


0,1
Name,RBC_GEM
Memory address,1501d11d0
Number of metabolites,2057
Number of reactions,3030
Number of genes,779
Number of groups,78
Objective expression,1.0*NaKt - 1.0*NaKt_reverse_db47e
Compartments,"cytosol, extracellular space"


In [5]:
# Settings shared with the cells below.
annotation_type, mapping_key = "genes", "ec-code"  # ec-code is the best mapping key
display_nunique, overwrite = True, True

# Collect the EC-code and UniProt annotations of the model genes, keeping only
# genes that actually carry an EC code.
df_model_mappings = get_annotation_df(model.genes, ["ec-code", "uniprot"])
df_model_mappings = df_model_mappings.rename(columns={"id": annotation_type})
df_model_mappings = df_model_mappings.dropna(subset=[mapping_key])

# Expand every column on ";" so multi-valued annotations get one row per value.
for col in list(df_model_mappings.columns):
    df_model_mappings = explode_column(df_model_mappings, name=col, sep=";")

df_model_mappings = df_model_mappings.sort_values(by=annotation_type)
df_model_mappings

Unnamed: 0,genes,ec-code,uniprot
629,A4GALT,2.4.1.228,Q9NPC4
296,AARS1,6.1.1.7,P49588
478,AASDHPPT,2.7.8.7,Q9NRN7
630,ABCA1,7.6.2.1,O95477
436,ABCA7,7.6.2.1,Q8IZY2
...,...,...,...
553,ZDHHC20,2.3.1.-,Q5W0Z9
553,ZDHHC20,2.3.1.225,Q5W0Z9
526,ZDHHC3,2.3.1.225,Q9NYG2
526,ZDHHC3,2.3.1.-,Q9NYG2


### Read data files 

In [6]:
with open(f"{database_dirpath}{EC_PATH}/ec_enzyme.dat") as file:
    lines = file.readlines()

# Parse the flat file into one dict per entry, keyed by a running entry index.
# The first two characters of a line are its line code; the value starts at the
# sixth character.
data = {}
idx = -1
for line in lines:
    line = line.strip()
    line_type = line[:2]
    line_value = line[2 + 3 :]
    # All entries start with ID and end with '//' for termination
    if line_type == "ID":
        data[idx] = {"ID": line_value.split(" ")[-1]}
        continue
    if line_type == "//":
        idx += 1
        continue
    if idx not in data:
        # Not inside an entry (such as CC lines outside any entry): there is
        # no record to attach the line to. Guarding every line type, not just
        # CC, avoids a KeyError on unexpected content between entries.
        continue
    if line_type == "DR":
        # Cross-references are ";"-separated "accession, entry name" pairs;
        # keep only accessions whose entry name ends in "_HUMAN".
        references = [x.strip() for x in line_value.split(";") if x.strip()]
        line_value = build_string(
            [
                x.split(", ")[0]
                for x in references
                if x.split(", ")[-1].endswith("_HUMAN")
            ]
        )
    if not line_value:
        continue

    # A line code may repeat within an entry: append to what is already stored.
    entry = data[idx]
    value = line_value.rstrip(".")
    previous = entry.get(line_type, "")
    entry[line_type] = build_string([previous, value]) if previous else value

df_ec_enzyme = pd.DataFrame.from_dict(data, orient="index")
# Keep only rows whose description does not mention "Transferred entry"
# (rows without a description are dropped by this comparison as well).
df_ec_enzyme = df_ec_enzyme[df_ec_enzyme["DE"].str.find("Transferred entry") == -1]
df_ec_enzyme = df_ec_enzyme.drop_duplicates()
df_ec_enzyme = df_ec_enzyme.rename(
    columns={
        "ID": "ec-code",
        "DE": "description",
        "AN": "alternate",
        "CA": "catalytic activity",
        "CC": "comments",
        "DR": "uniprot",
    }
)
# One row per (EC code, human UniProt accession); entries without a human
# accession keep a single row with a missing value.
df_ec_enzyme["uniprot"] = df_ec_enzyme["uniprot"].apply(
    lambda x: x.split(";") if isinstance(x, str) else x
)
df_ec_enzyme = df_ec_enzyme.explode("uniprot")
df_ec_enzyme

Unnamed: 0,ec-code,description,alternate,catalytic activity,comments,uniprot
0,1.1.1.1,alcohol dehydrogenase,aldehyde reductase,(1) a primary alcohol + NAD(+) = an aldehyde +...,-!- Acts on primary or secondary alcohols or h...,P07327
0,1.1.1.1,alcohol dehydrogenase,aldehyde reductase,(1) a primary alcohol + NAD(+) = an aldehyde +...,-!- Acts on primary or secondary alcohols or h...,P00326
0,1.1.1.1,alcohol dehydrogenase,aldehyde reductase,(1) a primary alcohol + NAD(+) = an aldehyde +...,-!- Acts on primary or secondary alcohols or h...,P28332
0,1.1.1.1,alcohol dehydrogenase,aldehyde reductase,(1) a primary alcohol + NAD(+) = an aldehyde +...,-!- Acts on primary or secondary alcohols or h...,P40394
0,1.1.1.1,alcohol dehydrogenase,aldehyde reductase,(1) a primary alcohol + NAD(+) = an aldehyde +...,-!- Acts on primary or secondary alcohols or h...,P11766
...,...,...,...,...,...,...
8231,7.6.2.12,ABC-type capsular-polysaccharide transporter,capsular-polysaccharide-transporting ATPase,ATP + H2O + capsular polysaccharide-[capsular ...,-!- ATP-binding cassette (ABC) type transporte...,
8232,7.6.2.13,ABC-type autoinducer-2 transporter,autoinducer-2 ABC transporter;autoinducer-2 tr...,"ATP + H2O + (2R,4S)-2-methyl-2,3,3,4-tetrahydr...",-!- ATP-binding cassette (ABC) type transporte...,
8233,7.6.2.14,ABC-type aliphatic sulfonate transporter,aliphatic sulfonate ABC transporter;aliphatic ...,ATP + H2O + aliphatic sulfonate-[sulfonate-bin...,-!- ATP-binding cassette (ABC) type transporte...,
8234,7.6.2.15,ABC-type thiamine transporter,thiamine ABC transporter;thiamine transporting...,ATP + H2O + thiamine(out) = ADP + H(+) + phosp...,-!- ATP-binding cassette (ABC) type transporte...,


In [7]:
# Number of lines trimmed from the start and end of enzclass.txt before parsing.
# NOTE(review): presumably the file's header and footer — confirm these counts
# whenever the database release changes.
n_header_lines = 11
n_footer_lines = 5

with open(f"{database_dirpath}{EC_PATH}/ec_enzclass.txt") as file:
    lines = file.readlines()

lines = lines[n_header_lines:-n_footer_lines]

# Each class line holds four "."-separated EC fields (digits or "-") followed
# by a free-text description, e.g. "1.1.-.-" -> "Acting on the CH-OH group of donors".
ec_enzclass_data = {}
for i, line in enumerate(lines):
    # Only lines that begin with an EC field are class definitions
    if not re.search(r"^(\d+|\-)\.", line):
        continue

    # Split off the first three fields on ".", then separate the fourth field
    # from the description on the first space.
    line_items = [
        substr.strip().rstrip(".")
        for string in line.split(".", maxsplit=3)
        for substr in string.split(" ", maxsplit=1)
        if substr.strip()
    ]
    if len(line_items) != 5:
        # `warn` comes from the standard-library `warnings` module imported in
        # the first cell.
        warn(f"Issue with parsing line {i+1}: {repr(line)}")
        continue
    ec_enzclass_data[i] = {
        "ec-code": ".".join(line_items[:4]).strip(),
        "description": line_items[4],
    }

# Combine the class/subclass/sub-subclass descriptions with the descriptions of
# the individual EC numbers so one lookup table covers every level.
df_ec_enzclass = pd.DataFrame.from_dict(ec_enzclass_data, orient="index")
df_ec_enzclass = pd.concat(
    (df_ec_enzclass, df_ec_enzyme[["ec-code", "description"]]), axis=0
)
df_ec_enzclass = (
    df_ec_enzclass.sort_values("ec-code")
    .reset_index(drop=True)
    .dropna()
    .drop_duplicates()
    .astype(str)
)
description_dict = df_ec_enzclass.set_index("ec-code").to_dict()["description"]
df_ec_enzclass

Unnamed: 0,ec-code,description
0,1.-.-.-,Oxidoreductases
1,1.1.-.-,Acting on the CH-OH group of donors
2,1.1.1.-,With NAD(+) or NADP(+) as acceptor
3,1.1.1.1,alcohol dehydrogenase
8,1.1.1.10,L-xylulose reductase
...,...,...
10142,7.6.2.5,ABC-type heme transporter
10143,7.6.2.6,ABC-type guanine transporter
10144,7.6.2.7,ABC-type taurine transporter
10145,7.6.2.8,ABC-type vitamin B12 transporter


### Map to EC Codes

In [8]:
# Attach the ENZYME record to every (gene, EC code, UniProt) row of the model.
mapping_key = "ec-code"
df_model_ec_enzyme = pd.merge(
    df_model_mappings[["genes", "ec-code", "uniprot"]],
    df_ec_enzyme,
    how="left",
    left_on=mapping_key,
    right_on=mapping_key,
    suffixes=("", "_drop"),
)

# Columns present on both sides come back with a "_drop" suffix; keep only the
# rows where the model value and the ENZYME value agree, then discard the copy.
for key in ("ec-code", "uniprot"):
    drop_key = f"{key}_drop"
    if drop_key not in df_model_ec_enzyme.columns:
        continue
    values_agree = df_model_ec_enzyme[key] == df_model_ec_enzyme[drop_key]
    df_model_ec_enzyme = df_model_ec_enzyme[values_agree].drop(columns=[drop_key])

df_model_ec_enzyme = df_model_ec_enzyme.reset_index(drop=True)
df_model_ec_enzyme

Unnamed: 0,genes,ec-code,uniprot,description,alternate,catalytic activity,comments
0,A4GALT,2.4.1.228,Q9NPC4,lactosylceramide 4-alpha-galactosyltransferase,"Galbeta1-4Glcbeta1-Cer alpha1,4-galactosyltran...",a beta-D-Gal-(1->4)-beta-D-Glc-(1<->1)-Cer(d18...,
1,AARS1,6.1.1.7,P49588,alanine--tRNA ligase,alanine translase;alanyl-tRNA synthetase,ATP + L-alanine + tRNA(Ala) = AMP + diphosphat...,
2,AASDHPPT,2.7.8.7,Q9NRN7,holo-[acyl-carrier-protein] synthase,4'-phosphopantetheinyl transferase;ACPS;acyl c...,"apo-[ACP] + CoA = adenosine 3',5'-bisphosphate...","-!- All polyketide synthases, fatty-acid synth..."
3,ABCA1,7.6.2.1,O95477,P-type phospholipid transporter,flippase;phospholipid-transporting ATPase,ATP + H2O + phospholipidSide 1 = ADP + phospha...,-!- A P-type ATPase that undergoes covalent ph...
4,ABCA7,7.6.2.1,Q8IZY2,P-type phospholipid transporter,flippase;phospholipid-transporting ATPase,ATP + H2O + phospholipidSide 1 = ADP + phospha...,-!- A P-type ATPase that undergoes covalent ph...
...,...,...,...,...,...,...,...
735,YES1,2.7.10.2,P07947,non-specific protein-tyrosine kinase,cytoplasmic protein tyrosine kinase,ATP + L-tyrosyl-[protein] = ADP + H(+) + O-pho...,"-!- Unlike EC 2.7.10.1, this protein-tyrosine ..."
736,ZDHHC2,2.3.1.225,Q9UIJ5,protein S-acyltransferase,DHHC palmitoyl transferase;G-protein palmitoyl...,hexadecanoyl-CoA + L-cysteinyl-[protein] = CoA...,-!- The enzyme catalyzes the post-translationa...
737,ZDHHC20,2.3.1.225,Q5W0Z9,protein S-acyltransferase,DHHC palmitoyl transferase;G-protein palmitoyl...,hexadecanoyl-CoA + L-cysteinyl-[protein] = CoA...,-!- The enzyme catalyzes the post-translationa...
738,ZDHHC3,2.3.1.225,Q9NYG2,protein S-acyltransferase,DHHC palmitoyl transferase;G-protein palmitoyl...,hexadecanoyl-CoA + L-cysteinyl-[protein] = CoA...,-!- The enzyme catalyzes the post-translationa...


In [9]:
# Rebuild the merged table from its inputs so that this cell gives the same
# result every time it is re-run, even though it adds columns afterwards.
mapping_key = "ec-code"
df_model_ec_enzyme = df_model_mappings[["genes", "ec-code", "uniprot"]].merge(
    df_ec_enzyme,
    left_on=mapping_key,
    right_on=mapping_key,
    how="left",
    suffixes=("", "_drop"),
)
# Keep only rows where the model value and the ENZYME value of a shared column
# agree, then discard the suffixed copy.
for key in ["ec-code", "uniprot"]:
    if f"{key}_drop" in df_model_ec_enzyme.columns:
        drop_key = f"{key}_drop"
        df_model_ec_enzyme = df_model_ec_enzyme[
            df_model_ec_enzyme[key] == df_model_ec_enzyme[drop_key]
        ].drop(labels=[drop_key], axis=1)
df_model_ec_enzyme = df_model_ec_enzyme.reset_index(drop=True)

# Derive the parent levels of each EC code by replacing its trailing fields
# with "-": 2.7.10.2 -> 2.7.10.- (sub-subclass), 2.7.-.- (subclass), 2.-.-.- (class).
for n_wildcards, level in enumerate(["subsubclass", "subclass", "class"], start=1):
    df_model_ec_enzyme[level] = df_model_ec_enzyme["ec-code"].apply(
        lambda x, n=n_wildcards: ".".join(x.rsplit(".", maxsplit=n)[:1] + n * ["-"])
    )

# Look up the description of every level. The first two lookups raise a
# KeyError for an unknown code; the last two leave an unknown code unchanged.
df_model_ec_enzyme["ec-code.description"] = df_model_ec_enzyme["ec-code"].apply(
    lambda x: description_dict[x]
)
df_model_ec_enzyme["subsubclass.description"] = df_model_ec_enzyme["subsubclass"].apply(
    lambda x: description_dict[x]
)
df_model_ec_enzyme["subclass.description"] = df_model_ec_enzyme["subclass"].replace(
    description_dict
)
df_model_ec_enzyme["class.description"] = df_model_ec_enzyme["class"].replace(
    description_dict
)

# Fix the column order of the exported table
df_model_ec_enzyme = df_model_ec_enzyme.loc[
    :,
    [
        "genes",
        "uniprot",
        "class",
        "class.description",
        "subclass",
        "subclass.description",
        "subsubclass",
        "subsubclass.description",
        "ec-code",
        "ec-code.description",
        "alternate",
        "catalytic activity",
        "comments",
    ],
]

# Collapse to one row per (gene, EC code), joining the distinct non-missing
# values of every other column into a single string, and normalise empty
# results to pd.NA.
df_model_ec_enzyme_final = df_model_ec_enzyme.groupby(
    ["genes", mapping_key], as_index=False
).agg(lambda x: build_string(x.dropna().unique()))
df_model_ec_enzyme_final = df_model_ec_enzyme_final.replace(
    float("nan"), pd.NA
).replace("", pd.NA)

# Report how many distinct values each column holds once joined strings are
# split apart again.
if display_nunique:
    for col in df_model_ec_enzyme_final.columns:
        column_values = explode_column(df_model_ec_enzyme_final, name=col, sep=";")
        column_values = column_values[col].drop_duplicates()
        print(f"{column_values.name}: {column_values.nunique()}")

# Write to the database directory when overwriting is allowed, otherwise to the
# interim directory; the file name is the same in both cases.
output_dirpath = database_dirpath if overwrite else f"{ROOT_PATH}{INTERIM_PATH}"
df_model_ec_enzyme_final.to_csv(
    f"{output_dirpath}/{EC_DB_TAG}_{GEM_NAME}.tsv",
    sep="\t",
    index=False,
)
df_model_ec_enzyme_final

genes: 570
ec-code: 485
uniprot: 570
class: 7
class.description: 7
subclass: 50
subclass.description: 50
subsubclass: 104
subsubclass.description: 95
ec-code.description: 485
alternate: 1453
catalytic activity: 780
comments: 1865


Unnamed: 0,genes,ec-code,uniprot,class,class.description,subclass,subclass.description,subsubclass,subsubclass.description,ec-code.description,alternate,catalytic activity,comments
0,A4GALT,2.4.1.228,Q9NPC4,2.-.-.-,Transferases,2.4.-.-,Glycosyltransferases,2.4.1.-,Hexosyltransferases,lactosylceramide 4-alpha-galactosyltransferase,"Galbeta1-4Glcbeta1-Cer alpha1,4-galactosyltran...",a beta-D-Gal-(1->4)-beta-D-Glc-(1<->1)-Cer(d18...,
1,AARS1,6.1.1.7,P49588,6.-.-.-,Ligases,6.1.-.-,Forming carbon-oxygen bonds,6.1.1.-,Ligases forming aminoacyl-tRNA and related com...,alanine--tRNA ligase,alanine translase;alanyl-tRNA synthetase,ATP + L-alanine + tRNA(Ala) = AMP + diphosphat...,
2,AASDHPPT,2.7.8.7,Q9NRN7,2.-.-.-,Transferases,2.7.-.-,Transferring phosphorus-containing groups,2.7.8.-,Transferases for other substituted phosphate g...,holo-[acyl-carrier-protein] synthase,4'-phosphopantetheinyl transferase;ACPS;acyl c...,"apo-[ACP] + CoA = adenosine 3',5'-bisphosphate...","-!- All polyketide synthases, fatty-acid synth..."
3,ABCA1,7.6.2.1,O95477,7.-.-.-,Translocases,7.6.-.-,Catalysing the translocation of other compounds,7.6.2.-,Linked to the hydrolysis of a nucleoside triph...,P-type phospholipid transporter,flippase;phospholipid-transporting ATPase,ATP + H2O + phospholipidSide 1 = ADP + phospha...,-!- A P-type ATPase that undergoes covalent ph...
4,ABCA7,7.6.2.1,Q8IZY2,7.-.-.-,Translocases,7.6.-.-,Catalysing the translocation of other compounds,7.6.2.-,Linked to the hydrolysis of a nucleoside triph...,P-type phospholipid transporter,flippase;phospholipid-transporting ATPase,ATP + H2O + phospholipidSide 1 = ADP + phospha...,-!- A P-type ATPase that undergoes covalent ph...
...,...,...,...,...,...,...,...,...,...,...,...,...,...
735,YES1,2.7.10.2,P07947,2.-.-.-,Transferases,2.7.-.-,Transferring phosphorus-containing groups,2.7.10.-,Protein-tyrosine kinases,non-specific protein-tyrosine kinase,cytoplasmic protein tyrosine kinase,ATP + L-tyrosyl-[protein] = ADP + H(+) + O-pho...,"-!- Unlike EC 2.7.10.1, this protein-tyrosine ..."
736,ZDHHC2,2.3.1.225,Q9UIJ5,2.-.-.-,Transferases,2.3.-.-,Acyltransferases,2.3.1.-,Transferring groups other than amino-acyl groups,protein S-acyltransferase,DHHC palmitoyl transferase;G-protein palmitoyl...,hexadecanoyl-CoA + L-cysteinyl-[protein] = CoA...,-!- The enzyme catalyzes the post-translationa...
737,ZDHHC20,2.3.1.225,Q5W0Z9,2.-.-.-,Transferases,2.3.-.-,Acyltransferases,2.3.1.-,Transferring groups other than amino-acyl groups,protein S-acyltransferase,DHHC palmitoyl transferase;G-protein palmitoyl...,hexadecanoyl-CoA + L-cysteinyl-[protein] = CoA...,-!- The enzyme catalyzes the post-translationa...
738,ZDHHC3,2.3.1.225,Q9NYG2,2.-.-.-,Transferases,2.3.-.-,Acyltransferases,2.3.1.-,Transferring groups other than amino-acyl groups,protein S-acyltransferase,DHHC palmitoyl transferase;G-protein palmitoyl...,hexadecanoyl-CoA + L-cysteinyl-[protein] = CoA...,-!- The enzyme catalyzes the post-translationa...
