In [2]:
import glob
import pandas as pd
from rdkit import Chem
from aizynthfinder.reactiontree import ReactionTree

# Use glob to get a list of all files in "NP_atlas_search" 
json_files = glob.glob("NP_atlas_search/output_*.json*")

all_rxn = []
count = 0

for json_file in json_files:
    print(f"Processing file: {json_file}")
    
    data = pd.read_json(json_file, orient="table")
    all_trees = data.trees.values

    for molecule in all_trees:
        count += 1
        for itree, tree in enumerate(molecule):
            reaction_tree = ReactionTree.from_dict(tree)
            rxn_generator = reaction_tree.reactions()

            for rxn in rxn_generator:
                try:
                    # 1) Convert to SmilesBasedRetroReaction object:
                    retro_reaction_obj = rxn.to_smiles_based_retroreaction()

                    # 2) Convert the object to a string, then split on ">>"
                    smiles_str = str(retro_reaction_obj)
                    split_smi = smiles_str.split(">>")

                    # 3) Create a "forward" SMILES if it splits correctly:
                    if len(split_smi) == 2:
                        product_side, reactant_side = split_smi
                        forward_smiles_str = f"{reactant_side}>>{product_side}"
                    else:
                        forward_smiles_str = smiles_str  # fallback

                    template = rxn.metadata.get("template", "")
                    template_hash = rxn.metadata.get("template_hash", "")

                    # 4) Store the info
                    reaction_info = {
                        'retro_smiles': smiles_str,
                        'forward_smiles': forward_smiles_str,
                        'template': template,
                        'template_hash': template_hash
                    }
                    all_rxn.append(reaction_info)

                except Exception as e:
                    print(f"Error: {e}")
                    print(f"Error in rxn.metadata: {rxn.metadata}")

df_all_rxn = pd.DataFrame(all_rxn)
df_all_rxn.to_csv("all_reactions_NP_atlas.csv", index=False)
print("Saved 'all_reactions_NP_atlas.csv' with retro and forward SMILES.")

Processing file: NP_atlas_search/output_858.json.gz
Processing file: NP_atlas_search/output_155.json.gz
Processing file: NP_atlas_search/output_116.json.gz
Processing file: NP_atlas_search/output_839.json.gz
Processing file: NP_atlas_search/output_784.json.gz
Processing file: NP_atlas_search/output_121.json.gz
Processing file: NP_atlas_search/output_818.json.gz
Processing file: NP_atlas_search/output_790.json.gz
Processing file: NP_atlas_search/output_93.json.gz
Processing file: NP_atlas_search/output_928.json.gz
Processing file: NP_atlas_search/output_98.json.gz
Processing file: NP_atlas_search/output_977.json.gz
Processing file: NP_atlas_search/output_44.json.gz
Processing file: NP_atlas_search/output_137.json.gz
Processing file: NP_atlas_search/output_501.json.gz
Processing file: NP_atlas_search/output_320.json.gz
Processing file: NP_atlas_search/output_622.json.gz
Processing file: NP_atlas_search/output_428.json.gz
Processing file: NP_atlas_search/output_1065.json.gz
Processing fil

In [2]:
import pandas as pd
from rxnrule.models.rxnrule import RxnRule

# 1. Read in the CSV with columns: "smiles_reaction", "template_hash"
df = pd.read_csv("all_reactions.csv")

# 2. Initialize the RxnRule classifier
classifier = RxnRule()

# 3. Define a function that applies the classifier
def savi_classify_reaction(rxn_smiles: str) -> str:
    """
    Passes reaction SMILES to the RxnRule classifier and returns the predicted label.
    """
    return classifier.predict(rxn_smiles)

# 4. Generate a new column with the classifier’s output
df["savi_classifier"] = df["smiles_reaction"].apply(savi_classify_reaction)

# 5. Rename the "smiles_reaction" column to "reactions_smiles" as requested
df.rename(columns={"smiles_reaction": "reactions_smiles"}, inplace=True)

# 6. Select only the three columns you want in the final CSV
df_final = df[["reactions_smiles", "template_hash", "savi_classifier"]].copy()

# 7. Save to CSV
df_final.to_csv("classified_reactions.csv", index=False)

print("Classification complete. Results saved in 'classified_reactions.csv'.")

KeyboardInterrupt: 