# Extract relevant data from ENZYME - Enzyme nomenclature database

The purpose of this notebook is to extract relevant data about enzymes and the reactions they catalyze from the database.

## Notebook Requirements:
*  Model genes **must** have at least one of the following annotations stored in `object.annotation`. Values are expected to be separated by semicolons. Accepted keys currently include:
    * `"ec-code"`
    * `"uniprot"`
 

Note: Requires internet connection to download information from [ENZYME - Enzyme nomenclature database](https://enzyme.expasy.org/).

## Setup
### Import packages

In [None]:
import re
from warnings import warn

import pandas as pd
from rbc_gem_utils import (
    GEM_NAME,
    build_string,
    check_database_release_online,
    explode_column,
    get_annotation_df,
    get_dirpath,
    read_cobra_model,
    show_versions,
)
from rbc_gem_utils.database.ec import (
    EC_DB_TAG,
    EC_RELEASE_EXPECTED,
    download_database_EC,
)

# Display version information. Per the original note, the saved output of this
# cell records the versions with which the notebook last ran and worked.
show_versions()

## Set notebook options

In [None]:
# Database tag: passed to the online release check and used to build the database directory path
db_tag = EC_DB_TAG
# Expected database release (NOTE(review): not referenced again in the visible cells)
expected_release = EC_RELEASE_EXPECTED
# Whether to download the ENZYME files before they are read
download_database = True
# Whether to print the number of unique values per column of the final table
display_nunique = True
# Whether to write the final table to a TSV file
overwrite = True

# Best mapping key is ec-code or uniprot
# Used to drop genes lacking the annotation, to merge model annotations with
# the ENZYME data, and to group the final table.
mapping_key = "uniprot"

## Check EC-ENZYME version
If the version does not match the expected version, it is because the database has been updated since the last time this code was utilized.

### Expected EC-ENZYME version: 27-Nov-2024
* Updates to the database are made every eight weeks (need confirmation)
* Last release utilized: **27-Nov-2024**.

In [None]:
# Compare the online release against the expected one; a falsy result means
# the database changed since the notebook was last used.
release_is_current = check_database_release_online(db_tag, verbose=True)
use_interim = not release_is_current
# Use different directory paths for unexpected behavior
if use_interim:
    warn(
        "Online release of database has been updated since the last time notebook was used."
    )

# Both directories share the same temporary-location setting
temp_location = "interim" if use_interim else None
database_dirpath = get_dirpath("database", db_tag, use_temp=temp_location)
annotation_dirpath = get_dirpath("annotation", use_temp=temp_location)

# Ensure directories exist
for dirpath in (database_dirpath, annotation_dirpath):
    dirpath.mkdir(exist_ok=True, parents=True)

#### Download new files and update database
If an argument is not provided (`arg=None`), its default value for the repository is used.

In [None]:
# Fetch both ENZYME files into the database directory when requested
if download_database:
    for database_filename in ("enzyme.dat", "enzclass.txt"):
        download_database_EC(
            filename=database_filename, database_dirpath=database_dirpath
        )

## Read data files
### Enzymes

In [None]:
# Parse the ENZYME flat file into one record per entry, then tabulate the records.
with open(database_dirpath / "ec_enzyme.dat") as file:
    lines = file.readlines()

data = {}
idx = -1
for line in lines:
    line = line.strip()
    # Each line is a two-character line code followed by its value
    line_type = line[:2]
    line_value = line[2 + 3 :]
    # All entries start with ID and end with '//' for termination
    if line_type == "ID":
        data[idx] = {"ID": line_value.split(" ")[-1]}
        continue
    if line_type == "//":
        idx += 1
        continue
    # Skip every line that is not inside an entry (e.g., CC lines that precede
    # the first ID) instead of failing on a record that does not exist yet.
    if idx not in data:
        continue

    if line_type == "DR":
        # Keep only the first field of references whose last field ends in "_HUMAN"
        references = [x.strip() for x in line_value.split(";") if x.strip()]
        line_value = build_string(
            [
                x.split(", ")[0]
                for x in references
                if x.split(", ")[-1].endswith("_HUMAN")
            ]
        )

    if not line_value:
        continue
    # Repeated line codes within one entry are accumulated into a single string
    line_value = line_value.rstrip(".")
    current = data[idx].get(line_type, "")
    data[idx][line_type] = (
        build_string([current, line_value]) if current else line_value
    )

df_ec_enzyme = pd.DataFrame.from_dict(data, orient="index")
# Discard entries flagged as transferred (entries without a DE value are dropped as well)
df_ec_enzyme = df_ec_enzyme[df_ec_enzyme["DE"].str.find("Transferred entry") == -1]
df_ec_enzyme = df_ec_enzyme.drop_duplicates()
df_ec_enzyme = df_ec_enzyme.rename(
    columns={
        "ID": "ec-code",
        "DE": "description",
        "AN": "alternate",
        "CA": "catalytic activity",
        "CC": "comments",
        "DR": "uniprot",
    },
)
# One row per (entry, UniProt accession) pair
df_ec_enzyme["uniprot"] = df_ec_enzyme["uniprot"].apply(
    lambda x: x.split(";") if isinstance(x, str) else x
)
df_ec_enzyme = df_ec_enzyme.explode("uniprot")
df_ec_enzyme

### Enzyme classes

In [None]:
# Fields expected on every parsed line of the class file
items = [
    "class",
    "subclass",
    "subsubclass",
    "serial",
    "description",
]
# Lines skipped at the start and end of the file
# (presumably header and footer text; confirm against the downloaded file)
n_header_lines = 11
n_footer_lines = 5

with open(database_dirpath / "ec_enzclass.txt") as file:
    lines = file.readlines()

lines = lines[n_header_lines:-n_footer_lines]

ec_enzclass_data = {}
# Count from the first retained line so that warnings report the line number
# within the file rather than the position within the sliced list.
for i, line in enumerate(lines, start=n_header_lines + 1):
    if not re.search(r"^(\d+|\-)\.", line):
        continue

    # Split into the four EC code fields plus the trailing description
    line_items = [
        substr.strip().rstrip(".")
        for string in line.split(".", maxsplit=3)
        for substr in string.split(" ", maxsplit=1)
        if substr.strip()
    ]
    if len(line_items) != len(items):
        warn(f"Issue with parsing line {i}: {repr(line)}")
        continue
    ec_enzclass_data[i] = {
        "ec-code": ".".join(line_items[:4]).strip(),
        "description": line_items[4],
    }
df_ec_enzclass = pd.DataFrame.from_dict(ec_enzclass_data, orient="index")
# Combine class-level descriptions with the descriptions of individual enzymes
df_ec_enzclass = pd.concat(
    (df_ec_enzclass, df_ec_enzyme[["ec-code", "description"]]), axis=0
)
df_ec_enzclass = (
    df_ec_enzclass.sort_values("ec-code")
    .reset_index(drop=True)
    .dropna()
    .drop_duplicates()
    .astype(str)
)
# Lookup table from EC code (at any level) to its description
description_dict = df_ec_enzclass.set_index("ec-code")["description"].to_dict()
df_ec_enzclass

## Load RBC-GEM model

In [None]:
# Read the model from the XML file named after GEM_NAME in the model directory
model_dirpath = get_dirpath("model")
model_filepath = model_dirpath / f"{GEM_NAME}.xml"
model = read_cobra_model(filename=model_filepath)
model

### Extract current annotations from model

In [None]:
# Tabulate the gene annotations and keep only genes that carry the mapping key
annotation_type = "genes"
df_model_mappings = get_annotation_df(model.genes, ["ec-code", "uniprot"])
df_model_mappings = df_model_mappings.rename(columns={"id": annotation_type})
df_model_mappings = df_model_mappings.dropna(subset=[mapping_key])

# Split semicolon-separated values so that each row holds a single value per column
column_names = df_model_mappings.columns
for col in column_names:
    df_model_mappings = explode_column(df_model_mappings, name=col, sep=";")

df_model_mappings = df_model_mappings.sort_values(annotation_type)
df_model_mappings

### Map to EC Codes

In [None]:
# Attach the ENZYME records to the model annotations via the mapping key.
# ENZYME columns that clash with a model column receive the "_drop" suffix.
model_columns = ["genes", "ec-code", "uniprot"]
df_model_ec_enzyme = pd.merge(
    df_model_mappings[model_columns],
    df_ec_enzyme,
    how="left",
    on=mapping_key,
    suffixes=("", "_drop"),
)
# Keep only rows where the model annotation equals the ENZYME value, then
# discard the duplicated ENZYME column.
for key in ["ec-code", "uniprot"]:
    drop_key = f"{key}_drop"
    if drop_key not in df_model_ec_enzyme.columns:
        continue
    is_consistent = df_model_ec_enzyme[key] == df_model_ec_enzyme[drop_key]
    df_model_ec_enzyme = df_model_ec_enzyme[is_consistent].drop(columns=[drop_key])
df_model_ec_enzyme = df_model_ec_enzyme.reset_index(drop=True)


# Derive the parent levels of each EC code by replacing its trailing fields
# with "-" (e.g., "1.2.3.4" -> "1.2.3.-", "1.2.-.-", "1.-.-.-").
for n_wildcards, level in enumerate(["subsubclass", "subclass", "class"], start=1):
    df_model_ec_enzyme[level] = df_model_ec_enzyme["ec-code"].apply(
        lambda x, n=n_wildcards: ".".join(x.rsplit(".", maxsplit=n)[:1] + n * ["-"])
    )

# Look up descriptions the same way at every level. A code without a known
# description yields a missing value (and a warning) instead of either raising
# a KeyError or silently keeping the code itself as its description.
for level in ["ec-code", "subsubclass", "subclass", "class"]:
    descriptions = df_model_ec_enzyme[level].map(description_dict)
    undescribed = df_model_ec_enzyme.loc[descriptions.isna(), level].unique()
    if len(undescribed) > 0:
        warn(f"No description found for {level}: {list(undescribed)}")
    df_model_ec_enzyme[f"{level}.description"] = descriptions


# Select and order the columns of the final table
df_model_ec_enzyme = df_model_ec_enzyme.loc[
    :,
    [
        "genes",
        "uniprot",
        "class",
        "class.description",
        "subclass",
        "subclass.description",
        "subsubclass",
        "subsubclass.description",
        "ec-code",
        "ec-code.description",
        "alternate",
        "catalytic activity",
        "comments",
    ],
]
# Collapse to one row per gene and mapping key; the unique non-missing values
# of every other column are combined with build_string.
df_model_ec_enzyme = df_model_ec_enzyme.groupby(
    ["genes", mapping_key], as_index=False
).agg(lambda x: build_string(x.dropna().unique()))
# Represent both NaN and empty strings as missing values
df_model_ec_enzyme = df_model_ec_enzyme.replace(float("nan"), pd.NA).replace("", pd.NA)


# Optionally report the number of distinct values per column, with
# semicolon-separated values split before counting
if display_nunique:
    for col in df_model_ec_enzyme.columns:
        df = explode_column(df_model_ec_enzyme, name=col, sep=";")[
            col
        ].drop_duplicates()
        print(f"{df.name}: {df.nunique()}")

# Optionally save the table as a tab-separated file in the database directory
if overwrite:
    output_filepath = database_dirpath / f"{EC_DB_TAG}_{GEM_NAME}.tsv"
    df_model_ec_enzyme.to_csv(output_filepath, sep="\t", index=False)

df_model_ec_enzyme