In [4]:
from tqdm import tqdm
import pandas as pd
from ergochemics import mapping
from typing import List
import multiprocessing
import os
# Force the "fork" start method for the multiprocessing pool created later in this notebook
# (force=True overrides a start method that was already set in this kernel).
# NOTE(review): "fork" is not available on Windows; presumably it was chosen so that worker
# processes inherit the functions defined in notebook cells — confirm before changing.
multiprocessing.set_start_method("fork", force=True)

In [None]:
# where the raw reactions are read from and where the mapped reactions are written to
rxns_df_input_filepath = "../data/raw/enzymemap_v2_brenda2023.csv"
rxns_df_output_filepath = "../data/interim/enzymemap_v2_brenda2023_JN_mapped_unique_rxns.parquet"

# batching maps a large reaction set one slice at a time so memory does not run out
use_batching = True
batch_num = 3  # which batch to process in this run (0-based: batch 0 starts at row 0)
batch_size = 2500  # how many reactions each batch holds

# half-open row range [start_idx, end_idx) covered by the current batch
start_idx = batch_size * batch_num
end_idx = start_idx + batch_size

In [None]:
def make_rule_id(n: int, prefix: str = "rule", width: int = 4) -> str:
    """
    Build a rule identifier by zero-padding a positive integer.

    Args:
        n (int): Rule number to format; must be at least 1.
        prefix (str): Text placed in front of the padded number. Defaults to "rule".
        width (int): Minimum digit count; shorter numbers get leading zeros. Defaults to 4.

    Returns:
        str: Prefix followed by the padded number (e.g., 'rule0004' for n=4).

    Raises:
        ValueError: If n is smaller than 1.
    """
    if n < 1:
        raise ValueError("Input must be >= 1.")

    # numbers with more digits than `width` are kept whole, never truncated
    padded_number = format(n, f"0{width}d")
    return f"{prefix}{padded_number}"

# extract and create a list of all minimal operators' SMARTS strings
# (list order matters: an operator's 1-based position is what map_single_reaction turns into its rule ID)
gen_rxn_operators_df = pd.read_csv("../data/raw/JN1224MIN_rules.tsv", delimiter='\t')
gen_rxn_operators_list: List[str] = gen_rxn_operators_df["SMARTS"].to_list()

# load all reactions and keep only the first row for each distinct value of the 'mapped' column
enzymatic_rxns_df = pd.read_csv(rxns_df_input_filepath)
enzymatic_rxns_df = enzymatic_rxns_df[~enzymatic_rxns_df['mapped'].duplicated()]

# rewrite output filepath and restrict to the current batch if batching is used
if use_batching:
    batch_suffix = f"_batch{batch_num}.parquet"
    # only append the suffix once, so re-running this cell does not produce
    # "..._batch3_batch3.parquet"
    if not rxns_df_output_filepath.endswith(batch_suffix):
        rxns_df_output_filepath = rxns_df_output_filepath.replace(".parquet", batch_suffix)
    # positional slice: the index is no longer contiguous after de-duplication
    enzymatic_rxns_df = enzymatic_rxns_df.iloc[start_idx:end_idx]

# take an independent copy so that adding result columns later does not raise
# SettingWithCopyWarning on this filtered/sliced frame
enzymatic_rxns_df = enzymatic_rxns_df.copy()

unmapped_rxns_list: List[str] = enzymatic_rxns_df["unmapped"].to_list()

# remove all hydrogen ions from rxn strings so that they can be mapped by Stefan's ergochemics
# (both the ".[H+]" and "[H+]." forms are stripped so no dangling "." separator is left behind)
cleaned_rxns_list: List[str] = [
    rxn.replace(".[H+]", "").replace("[H+].", "") for rxn in unmapped_rxns_list
]

def map_single_reaction(args):
    """
    Pool worker: try every operator against a single reaction.

    Args:
        args: Tuple ``(index, rxn, operator_list)``. It is packed into one
            argument so the function can be handed to ``Pool.imap_unordered``.

    Returns:
        tuple: ``(index, mapped_ops)``. ``mapped_ops`` is the list of rule IDs
        (built from each operator's 1-based position in ``operator_list``) whose
        mapping result reported ``did_map``. If looping over the operators
        itself fails, the second element is instead a string that starts with
        ``"__WORKER_FAILED__: "``.
    """
    idx, rxn, gen_rxn_operators_list = args
    mapped_ops = []

    try:
        for position, operator in enumerate(gen_rxn_operators_list, start=1):
            try:
                outcome = mapping.operator_map_reaction(rxn=rxn, operator=operator)
                if not outcome.did_map:
                    continue
                mapped_ops.append(make_rule_id(position))
            except Exception:
                # best effort: an operator that errors on this reaction is skipped,
                # exactly as if it had not mapped
                continue
    except Exception as err:
        return idx, f"__WORKER_FAILED__: {repr(err)}"

    return idx, mapped_ops

# ---- MAIN ----
# Each task carries its own position so results can be put back in input order
# even though imap_unordered yields them in completion order.
rxns = cleaned_rxns_list
tasks = [
    (position, reaction, gen_rxn_operators_list)
    for position, reaction in enumerate(rxns)
]

n_tasks = len(tasks)
results = [None] * n_tasks

with multiprocessing.Pool(processes=os.cpu_count()) as pool:
    mapped_stream = pool.imap_unordered(map_single_reaction, tasks)
    progress = tqdm(mapped_stream, total=n_tasks, desc="Mapping reactions")
    for idx, mapped_ops in progress:
        results[idx] = mapped_ops

all_mapped_operators = results

# one entry per reaction row, in the same order as the rows of enzymatic_rxns_df
enzymatic_rxns_df["all_mapped_operators"] = all_mapped_operators

def get_top_operator(op_list):
    """
    Return the lowest-numbered rule ID from a collection of rule IDs.

    Given a list like ['rule0002', 'rule0754'], the result is the entry with
    the smallest integer value (e.g. 'rule0002'), re-formatted as 'rule' plus a
    4-digit zero-padded number.

    Args:
        op_list: Iterable of rule ID strings (list, tuple, numpy array, ...).
            None, numeric scalars such as NaN, and plain strings (for example
            the "__WORKER_FAILED__: ..." marker that map_single_reaction can
            return) are treated as "no operators".

    Returns:
        str or None: The lowest rule ID, or None when there is nothing to pick from.

    Raises:
        ValueError: If an element is not 'rule' followed by an integer.
    """
    # Scalars and strings are not collections of rule IDs; iterating them would
    # either fail or walk over single characters.
    if op_list is None or isinstance(op_list, (str, bytes, int, float)):
        return None

    # extract integer part: "rule0034" → 34
    # (emptiness is checked on the built list instead of `not op_list`, because
    # truth-testing a multi-element numpy array raises ValueError)
    nums = [int(op.replace("rule", "")) for op in op_list]
    if not nums:
        return None  # or np.nan if you prefer

    # lowest rule number, converted back to rule format
    return f"rule{min(nums):04d}"

# per reaction, keep the lowest-numbered rule among all operators that mapped
mapped_ops_column = enzymatic_rxns_df["all_mapped_operators"]
enzymatic_rxns_df["top_mapped_operator"] = mapped_ops_column.apply(get_top_operator)

Mapping reactions:   0%|          | 0/7500 [00:00<?, ?it/s][15:55:50] Initializing MetalDisconnector
[15:55:50] Initializing MetalDisconnector
[15:55:50] Running MetalDisconnector
[15:55:50] Running MetalDisconnector
[15:55:50] Initializing Normalizer
[15:55:50] Initializing Normalizer
[15:55:50] Running Normalizer
[15:55:50] Running Normalizer
[15:55:50] Initializing MetalDisconnector
[15:55:50] Initializing MetalDisconnector
[15:55:50] Initializing MetalDisconnector
[15:55:50] Running MetalDisconnector
[15:55:50] Running MetalDisconnector
[15:55:50] Running MetalDisconnector
[15:55:50] Initializing MetalDisconnector
[15:55:50] Initializing Normalizer
[15:55:50] Initializing Normalizer
[15:55:50] Initializing MetalDisconnector
[15:55:50] Initializing Normalizer
[15:55:50] Running MetalDisconnector
[15:55:50] Initializing MetalDisconnector
[15:55:50] Initializing MetalDisconnector
[15:55:50] Running Normalizer
[15:55:50] Running Normalizer
[15:55:50] Running MetalDisconnector
[15:55:50

In [None]:
# preview the first five rows, including the two newly added operator columns
enzymatic_rxns_df.head()

Unnamed: 0,rxn_idx,mapped,unmapped,orig_rxn_text,rule,rule_id,source,steps,quality,natural,organism,protein_refs,protein_db,ec_num,all_mapped_operators,top_mapped_operator
0,0,[CH3:1][CH:2]=[O:3].[H+].[NH2:4][C:5](=[O:6])[...,CC=O.NC(=O)C1=CN([C@@H]2O[C@H](COP(=O)(O)OP(=O...,acetaldehyde + NADH + H+ = ethanol + NAD+ {r},[#6:1]1=[#6:2]-[#7:3]-[#6:4]=[#6:5]-[#6:6]-1.[...,0,direct,single,0.991708,True,Saccharomyces cerevisiae,[],,1.1.1.1,"[rule0003, rule0753]",rule0003
1,0,[CH3:1][CH2:2][OH:3].[NH2:4][C:5](=[O:6])[c:7]...,CCO.NC(=O)c1ccc[n+]([C@@H]2O[C@H](COP(=O)(O)OP...,acetaldehyde + NADH + H+ = ethanol + NAD+ {r},[#6:1]1=[#6:2]-[#7:3]-[#6:4]=[#6:5]-[#6:6]-1.[...,0,direct reversed,single,0.991708,True,Saccharomyces cerevisiae,[],,1.1.1.1,"[rule0002, rule0754]",rule0002
25,15,[CH3:1][C:2](=[O:3])[CH:4]=[O:5].[H+].[NH2:6][...,CC(=O)C=O.NC(=O)C1=CN([C@@H]2O[C@H](COP(=O)(O)...,methylglyoxal + NADH + H+ = acetol + NAD+ {r},[#6:1]1=[#6:2]-[#7:3]-[#6:4]=[#6:5]-[#6:6]-1.[...,0,direct,single,0.991708,True,Candida albicans,['A0A1D8PP43'],uniprot,1.1.1.1,"[rule0003, rule0753]",rule0003
26,15,[CH3:1][C:2](=[O:3])[CH2:4][OH:5].[NH2:6][C:7]...,CC(=O)CO.NC(=O)c1ccc[n+]([C@@H]2O[C@H](COP(=O)...,methylglyoxal + NADH + H+ = acetol + NAD+ {r},[#6:1]1=[#6:2]-[#7:3]-[#6:4]=[#6:5]-[#6:6]-1.[...,0,direct reversed,single,0.991708,True,Candida albicans,['A0A1D8PP43'],uniprot,1.1.1.1,"[rule0002, rule0754]",rule0002
27,16,[NH2:1][C:2](=[O:3])[c:4]1[cH:5][cH:6][cH:7][n...,NC(=O)c1ccc[n+]([C@@H]2O[C@H](COP(=O)(O)OP(=O)...,2-dehydro-3-deoxy-D-gluconate + NADH + H+ = 4-...,[#6:1]-[#8:2].[#6:3]1:[#6:4]:[#6:5]:[#6:6]:[#7...,1,direct,single,0.991708,True,Sphingomonas sp. A1,['A0A075B5H4'],uniprot,1.1.1.1,"[rule0002, rule0347]",rule0002


In [None]:
# make sure the destination folder exists so the write does not fail on a missing directory
output_dir = os.path.dirname(rxns_df_output_filepath)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

# index=False: the DataFrame index is not written to the file
enzymatic_rxns_df.to_parquet(rxns_df_output_filepath, index=False)