In [1]:
from tqdm import tqdm
import pandas as pd
from ergochemics import mapping
from typing import List
import multiprocessing
import os
# Create Pool workers by forking the kernel process.
# NOTE(review): presumably chosen so workers inherit the functions and data
# defined in notebook cells without re-importing them — confirm.
# "fork" is only available on POSIX systems, so this line fails on Windows.
# force=True allows this cell to be re-run after a start method is already set.
multiprocessing.set_start_method("fork", force=True)

In [2]:
def make_rule_id(n: int, prefix: str = "rule", width: int = 4) -> str:
    """
    Build a rule identifier such as 'rule0001' from a positive integer.

    The number is left-padded with zeros up to ``width`` digits and appended
    to ``prefix``; numbers with more digits than ``width`` are not truncated.

    Args:
        n (int): Rule number, must be at least 1.
        prefix (str): Text placed in front of the padded number. Defaults to "rule".
        width (int): Minimum number of digits after padding. Defaults to 4.

    Returns:
        str: The formatted rule ID (e.g., 'rule0004').

    Raises:
        ValueError: If ``n`` is smaller than 1.
    """
    if n < 1:
        raise ValueError("Input must be >= 1.")

    # Integer format spec: zero fill, minimum field width `width`, decimal.
    number_spec = f"0{width}d"
    return "{}{}".format(prefix, format(n, number_spec))


In [3]:
# Load the minimal operators and keep their SMARTS strings as a plain list
gen_rxn_operators_df = pd.read_csv("../data/raw/JN1224MIN_rules.tsv", sep='\t')
gen_rxn_operators_list: List[str] = gen_rxn_operators_df["SMARTS"].tolist()

# Load the MetaCyc reactions and keep their unmapped reaction strings
EnzymeMap_MetaCyc_rxns_df = pd.read_csv("../data/raw/enzymemap_MetaCyc_processed.csv")
EnzymeMap_MetaCyc_rxns_unmapped: List[str] = EnzymeMap_MetaCyc_rxns_df["unmapped"].tolist()

# Strip hydrogen ions ("[H+]" joined to a neighbour by ".") from every reaction
# string so that the reactions can be mapped by ergochemics
EnzymeMap_MetaCyc_rxns_cleaned: List[str] = [
    unmapped_rxn.replace(".[H+]", "").replace("[H+].", "")
    for unmapped_rxn in EnzymeMap_MetaCyc_rxns_unmapped
]

In [None]:
def map_single_reaction(args):
    """
    Collect the rule IDs of all operators that map onto one reaction.

    The inputs arrive packed in a single tuple so the function can be handed
    directly to ``Pool.imap``, which passes exactly one argument per task.

    Args:
        args (tuple): ``(rxn, gen_rxn_operators_list)`` — the reaction string
            and the sequence of operator SMARTS strings to try against it.

    Returns:
        list[str]: Rule IDs, built by ``make_rule_id`` from each operator's
        1-based position in the list, for every operator whose mapping result
        has ``did_map`` set. Empty when no operator mapped.
    """
    rxn, gen_rxn_operators_list = args
    mapped_ops = []

    # Rule IDs are 1-based, so number the operators starting from 1.
    for rule_number, operator in enumerate(gen_rxn_operators_list, start=1):
        try:
            mapped_rxn = mapping.operator_map_reaction(rxn=rxn, operator=operator)
            if mapped_rxn.did_map:
                mapped_ops.append(make_rule_id(rule_number))
        except Exception:
            # Best effort: an operator that fails on this reaction is treated
            # as "did not map". Only Exception is caught so KeyboardInterrupt
            # and SystemExit still propagate and the worker can be stopped.
            continue

    return mapped_ops


# ---- MAIN ----
# One task per cleaned reaction; every task carries the full operator list.
rxns = EnzymeMap_MetaCyc_rxns_cleaned
tasks = [(rxn, gen_rxn_operators_list) for rxn in rxns]

n_workers = os.cpu_count()
with multiprocessing.Pool(n_workers) as pool:
    # imap yields results in task order as they become available,
    # which lets tqdm advance the progress bar while work is running.
    mapped_iter = pool.imap(map_single_reaction, tasks)
    progress = tqdm(mapped_iter, total=len(tasks), desc="Mapping reactions")
    results = list(progress)

all_mapped_operators = results

Mapping reactions:   0%|          | 0/4581 [00:00<?, ?it/s][15:52:38] Initializing MetalDisconnector
[15:52:38] Running MetalDisconnector
[15:52:38] Initializing Normalizer
[15:52:38] Initializing MetalDisconnector
[15:52:38] Running Normalizer
[15:52:38] Initializing MetalDisconnector
[15:52:38] Running MetalDisconnector
[15:52:38] Running MetalDisconnector
[15:52:38] Initializing Normalizer
[15:52:38] Initializing MetalDisconnector
[15:52:38] Initializing MetalDisconnector
[15:52:38] Initializing MetalDisconnector
[15:52:38] Initializing Normalizer
[15:52:38] Running MetalDisconnector
[15:52:38] Running Normalizer
[15:52:38] Running MetalDisconnector
[15:52:38] Running Normalizer
[15:52:38] Running MetalDisconnector
[15:52:38] Initializing Normalizer
[15:52:38] Initializing Normalizer
[15:52:38] Initializing Normalizer
[15:52:38] Initializing MetalDisconnector
[15:52:38] Initializing MetalDisconnector
[15:52:38] Running Normalizer
[15:52:38] Running Normalizer
[15:52:38] Running Norm

In [None]:
# Show the mapping results: one inner list of rule IDs per reaction,
# in the same order as the reactions were submitted to the pool.
all_mapped_operators

[['rule0002', 'rule0754'],
 ['rule0003', 'rule0753'],
 ['rule0003'],
 ['rule0003', 'rule0348'],
 ['rule0002', 'rule0347']]