In [1]:
import os
import pickle
from tqdm import tqdm

from rmgpy.molecule.molecule import Molecule
from rmgpy.molecule.translator import to_smiles

In [2]:
import numpy as np

import pandas as pd

import matplotlib.pyplot as plt

In [3]:
path_prefix = "/rmg/RMG-database/input/kinetics/"

# process all molecule smiles

In [4]:
for mode in ["families", "libraries"]:
    # find all dictionary.txt files
    dict_file_list = []
    for root, dirs, files in os.walk(path_prefix + mode):
        for file in files:
            if file.lower().endswith('.txt') and file == "dictionary.txt":
                dict_file_list.append(os.path.join(root, file))
    
    data = [('formula', 'smiles', 'path')]
    
    # for each file, parse adjacency lists and convert them to smiles
    for path in tqdm(dict_file_list):
        print(path)
        f = open(path, "r")
        text = f.readlines()

        names = []
        adj_lists = []

        # split adjacency lists 
        begin = 0

        # handle the last compound in the file
        if len(text) > 0 and text[-1] != "\n":
            text.append("\n")

        for i in range(len(text)):
            if text[i] == "\n":
                # handle incorrect format for families
                if  text[begin] != "\n":
                    names.append(text[begin].replace("\n","").replace(" ",""))
                    adj_lists.append("".join(text[begin+1: i+1]))

                begin = i+1

        # convert to smiles and append to data
        for i in range(len(adj_lists)):
            cc = Molecule().from_adjacency_list(adj_lists[i])
            s = to_smiles(cc,backend="openbabel")
            
            data.append((names[i], s, path.replace(path_prefix, "RMG-database/input/kinetics/")))
            
    # store pickle
    with open('molecule_smiles_{}.pickle'.format(mode), 'wb') as data_file:
        pickle.dump(data, data_file)

  0%|                                                                                                                                         | 0/144 [00:00<?, ?it/s]

/rmg/RMG-database/input/kinetics/families/1,2_Insertion_carbene/NIST/dictionary.txt
/rmg/RMG-database/input/kinetics/families/1,2_Insertion_carbene/training/dictionary.txt
/rmg/RMG-database/input/kinetics/families/Peroxyl_Termination/training/dictionary.txt
/rmg/RMG-database/input/kinetics/families/Bimolec_Hydroperoxide_Decomposition/training/dictionary.txt
/rmg/RMG-database/input/kinetics/families/Surface_Adsorption_Dissociative_Double/training/dictionary.txt
/rmg/RMG-database/input/kinetics/families/Surface_Abstraction/training/dictionary.txt
/rmg/RMG-database/input/kinetics/families/halocarbene_recombination_double/training/dictionary.txt
/rmg/RMG-database/input/kinetics/families/1,2_Insertion_CO/NIST/dictionary.txt
/rmg/RMG-database/input/kinetics/families/1,2_Insertion_CO/training/dictionary.txt
/rmg/RMG-database/input/kinetics/families/Cyclic_Thioether_Formation/NIST/dictionary.txt
/rmg/RMG-database/input/kinetics/families/Cyclic_Thioether_Formation/training/dictionary.txt
/rmg/R

 12%|███████████████                                                                                                                 | 17/144 [00:00<00:02, 44.44it/s]

/rmg/RMG-database/input/kinetics/families/Surface_Adsorption_Dissociative/training/dictionary.txt
/rmg/RMG-database/input/kinetics/families/Cyclopentadiene_scission/training/dictionary.txt
/rmg/RMG-database/input/kinetics/families/Baeyer-Villiger_step1_cat/training/dictionary.txt
/rmg/RMG-database/input/kinetics/families/Surface_Addition_Single_vdW/training/dictionary.txt
/rmg/RMG-database/input/kinetics/families/Baeyer-Villiger_step2_cat/training/dictionary.txt
/rmg/RMG-database/input/kinetics/families/Intra_R_Add_Exocyclic/NIST/dictionary.txt
/rmg/RMG-database/input/kinetics/families/Intra_R_Add_Exocyclic/training/dictionary.txt


 29%|█████████████████████████████████████▎                                                                                          | 42/144 [00:00<00:01, 54.53it/s]

/rmg/RMG-database/input/kinetics/families/Surface_vdW_to_Bidentate/training/dictionary.txt
/rmg/RMG-database/input/kinetics/families/Diels_alder_addition/NIST/dictionary.txt
/rmg/RMG-database/input/kinetics/families/Diels_alder_addition/training/dictionary.txt
/rmg/RMG-database/input/kinetics/families/intra_substitutionCS_cyclization/NIST/dictionary.txt
/rmg/RMG-database/input/kinetics/families/intra_substitutionCS_cyclization/training/dictionary.txt
/rmg/RMG-database/input/kinetics/families/1,2_NH3_elimination/training/dictionary.txt
/rmg/RMG-database/input/kinetics/families/1,3_sigmatropic_rearrangement/NIST/dictionary.txt
/rmg/RMG-database/input/kinetics/families/1,3_sigmatropic_rearrangement/training/dictionary.txt
/rmg/RMG-database/input/kinetics/families/CO_Disproportionation/training/dictionary.txt
/rmg/RMG-database/input/kinetics/families/1,3_NH3_elimination/training/dictionary.txt
/rmg/RMG-database/input/kinetics/families/Surface_DoubleBond_to_Bidentate/training/dictionary.txt

 44%|████████████████████████████████████████████████████████▉                                                                       | 64/144 [00:01<00:01, 79.26it/s]

/rmg/RMG-database/input/kinetics/families/R_Addition_MultipleBond/training/dictionary.txt


 53%|███████████████████████████████████████████████████████████████████▌                                                            | 76/144 [00:03<00:04, 15.72it/s]

/rmg/RMG-database/input/kinetics/families/Surface_Adsorption_vdW/training/dictionary.txt
/rmg/RMG-database/input/kinetics/families/Surface_Dissociation/training/dictionary.txt
/rmg/RMG-database/input/kinetics/families/SubstitutionS/NIST/dictionary.txt
/rmg/RMG-database/input/kinetics/families/SubstitutionS/training/dictionary.txt
/rmg/RMG-database/input/kinetics/families/Retroene/training/dictionary.txt
/rmg/RMG-database/input/kinetics/families/XY_elimination_hydroxyl/training/dictionary.txt
/rmg/RMG-database/input/kinetics/families/2+2_cycloaddition/NIST/dictionary.txt
/rmg/RMG-database/input/kinetics/families/2+2_cycloaddition/training/dictionary.txt
/rmg/RMG-database/input/kinetics/families/Intra_Disproportionation/NIST/dictionary.txt
/rmg/RMG-database/input/kinetics/families/Intra_Disproportionation/training/dictionary.txt
/rmg/RMG-database/input/kinetics/families/R_Addition_COm/NIST/dictionary.txt
/rmg/RMG-database/input/kinetics/families/R_Addition_COm/training/dictionary.txt
/rm

 60%|████████████████████████████████████████████████████████████████████████████▍                                                   | 86/144 [00:03<00:03, 19.01it/s]

/rmg/RMG-database/input/kinetics/families/Surface_Dissociation_vdW/training/dictionary.txt
/rmg/RMG-database/input/kinetics/families/Surface_Dissociation_to_Bidentate/training/dictionary.txt
/rmg/RMG-database/input/kinetics/families/Diels_alder_addition_Aromatic/training/dictionary.txt
/rmg/RMG-database/input/kinetics/families/1,2-Birad_to_alkene/NIST/dictionary.txt
/rmg/RMG-database/input/kinetics/families/1,2-Birad_to_alkene/training/dictionary.txt
/rmg/RMG-database/input/kinetics/families/H_Abstraction/NIST/dictionary.txt
/rmg/RMG-database/input/kinetics/families/H_Abstraction/training/dictionary.txt


 65%|██████████████████████████████████████████████████████████████████████████████████▋                                             | 93/144 [00:03<00:02, 21.66it/s]

/rmg/RMG-database/input/kinetics/families/Surface_Adsorption_Bidentate/training/dictionary.txt
/rmg/RMG-database/input/kinetics/families/Intra_R_Add_Exo_scission/training/dictionary.txt
/rmg/RMG-database/input/kinetics/families/intra_NO2_ONO_conversion/NIST/dictionary.txt
/rmg/RMG-database/input/kinetics/families/intra_NO2_ONO_conversion/training/dictionary.txt
/rmg/RMG-database/input/kinetics/families/Intra_Retro_Diels_alder_bicyclic/training/dictionary.txt
/rmg/RMG-database/input/kinetics/families/1,4_Linear_birad_scission/training/dictionary.txt
/rmg/RMG-database/input/kinetics/families/Intra_R_Add_Endocyclic/NIST/dictionary.txt
/rmg/RMG-database/input/kinetics/families/Intra_R_Add_Endocyclic/training/dictionary.txt


 69%|████████████████████████████████████████████████████████████████████████████████████████                                        | 99/144 [00:04<00:03, 13.27it/s]

/rmg/RMG-database/input/kinetics/families/Intra_RH_Add_Endocyclic/NIST/dictionary.txt
/rmg/RMG-database/input/kinetics/families/Intra_RH_Add_Endocyclic/training/dictionary.txt
/rmg/RMG-database/input/kinetics/families/R_Recombination/NIST/dictionary.txt
/rmg/RMG-database/input/kinetics/families/R_Recombination/training/dictionary.txt


 88%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████▏               | 126/144 [00:04<00:00, 28.06it/s]

/rmg/RMG-database/input/kinetics/families/Surface_Abstraction_Single_vdW/training/dictionary.txt
/rmg/RMG-database/input/kinetics/families/intra_substitutionS_isomerization/NIST/dictionary.txt
/rmg/RMG-database/input/kinetics/families/intra_substitutionS_isomerization/training/dictionary.txt
/rmg/RMG-database/input/kinetics/families/H2_Loss/training/dictionary.txt
/rmg/RMG-database/input/kinetics/families/Surface_Dissociation_Beta/training/dictionary.txt
/rmg/RMG-database/input/kinetics/families/1,3_Insertion_RSR/NIST/dictionary.txt
/rmg/RMG-database/input/kinetics/families/1,3_Insertion_RSR/training/dictionary.txt
/rmg/RMG-database/input/kinetics/families/Peroxyl_Disproportionation/training/dictionary.txt
/rmg/RMG-database/input/kinetics/families/1,4_Cyclic_birad_scission/training/dictionary.txt
/rmg/RMG-database/input/kinetics/families/Singlet_Val6_to_triplet/NIST/dictionary.txt
/rmg/RMG-database/input/kinetics/families/Singlet_Val6_to_triplet/training/dictionary.txt
/rmg/RMG-databas

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 144/144 [00:05<00:00, 28.03it/s]


/rmg/RMG-database/input/kinetics/families/Surface_Abstraction_Beta_double_vdW/training/dictionary.txt
/rmg/RMG-database/input/kinetics/families/Intra_ene_reaction/training/dictionary.txt
/rmg/RMG-database/input/kinetics/families/Intra_RH_Add_Exocyclic/NIST/dictionary.txt
/rmg/RMG-database/input/kinetics/families/Intra_RH_Add_Exocyclic/training/dictionary.txt
/rmg/RMG-database/input/kinetics/families/1,2_shiftC/training/dictionary.txt
/rmg/RMG-database/input/kinetics/families/1,2_XY_interchange/training/dictionary.txt
/rmg/RMG-database/input/kinetics/families/Surface_Adsorption_Double/training/dictionary.txt
/rmg/RMG-database/input/kinetics/families/Surface_Adsorption_Abstraction_vdW/training/dictionary.txt
/rmg/RMG-database/input/kinetics/families/Intra_5_membered_conjugated_C=C_C=C_addition/training/dictionary.txt
/rmg/RMG-database/input/kinetics/families/Singlet_Carbene_Intra_Disproportionation/training/dictionary.txt
/rmg/RMG-database/input/kinetics/families/Birad_R_Recombination/NI

  3%|███▋                                                                                                                             | 5/174 [00:00<00:03, 47.60it/s]

/rmg/RMG-database/input/kinetics/libraries/Nitrogen_Glarborg_Lucassen_et_al/dictionary.txt
/rmg/RMG-database/input/kinetics/libraries/Nitrogen_Glarborg_Zhang_et_al/dictionary.txt
/rmg/RMG-database/input/kinetics/libraries/primaryNitrogenLibrary/dictionary.txt
/rmg/RMG-database/input/kinetics/libraries/primaryNitrogenLibrary/LowT/dictionary.txt
/rmg/RMG-database/input/kinetics/libraries/Chernov/dictionary.txt
/rmg/RMG-database/input/kinetics/libraries/CurranPentane/dictionary.txt


  8%|██████████▎                                                                                                                     | 14/174 [00:00<00:05, 28.02it/s]

/rmg/RMG-database/input/kinetics/libraries/2001_Tokmakov_H_Toluene_to_CH3_Benzene/dictionary.txt
/rmg/RMG-database/input/kinetics/libraries/Ethylamine/dictionary.txt
/rmg/RMG-database/input/kinetics/libraries/Fulvene_H/dictionary.txt
/rmg/RMG-database/input/kinetics/libraries/2006_Joshi_OH_CO/dictionary.txt
/rmg/RMG-database/input/kinetics/libraries/naphthalene_H/dictionary.txt
/rmg/RMG-database/input/kinetics/libraries/TEOS/dictionary.txt
/rmg/RMG-database/input/kinetics/libraries/DTU_mech_CH3Cl/dictionary.txt
/rmg/RMG-database/input/kinetics/libraries/Dooley/methylformate_all_ARHEbathgas/dictionary.txt
/rmg/RMG-database/input/kinetics/libraries/Dooley/methylformate_all_N2bathgas/dictionary.txt
/rmg/RMG-database/input/kinetics/libraries/Dooley/C1/dictionary.txt
/rmg/RMG-database/input/kinetics/libraries/Dooley/methylformate_2/dictionary.txt


 32%|████████████████████████████████████████▊                                                                                      | 56/174 [00:00<00:00, 124.84it/s]

/rmg/RMG-database/input/kinetics/libraries/Dooley/methylformate/dictionary.txt
/rmg/RMG-database/input/kinetics/libraries/primaryH2O2/dictionary.txt
/rmg/RMG-database/input/kinetics/libraries/2009_Sharma_C5H5_CH3_highP/dictionary.txt
/rmg/RMG-database/input/kinetics/libraries/Surface/DOC/Mhadeshwar_Pt111/dictionary.txt
/rmg/RMG-database/input/kinetics/libraries/Surface/DOC/Nitrogen/dictionary.txt
/rmg/RMG-database/input/kinetics/libraries/Surface/DOC/Ishikawa_Rh111/dictionary.txt
/rmg/RMG-database/input/kinetics/libraries/Surface/DOC/Arevalo_Pt111/dictionary.txt
/rmg/RMG-database/input/kinetics/libraries/Surface/Ammonia/Schneider_Rh211/dictionary.txt
/rmg/RMG-database/input/kinetics/libraries/Surface/Ammonia/Offermans_Pt111/dictionary.txt
/rmg/RMG-database/input/kinetics/libraries/Surface/Ammonia/Schneider_Pt111/dictionary.txt
/rmg/RMG-database/input/kinetics/libraries/Surface/Ammonia/Novell_Rh111/dictionary.txt
/rmg/RMG-database/input/kinetics/libraries/Surface/Ammonia/Scheuer_Pt/dict

 41%|████████████████████████████████████████████████████▌                                                                          | 72/174 [00:00<00:00, 102.33it/s]

/rmg/RMG-database/input/kinetics/libraries/Lai_Hexylbenzene/dictionary.txt
/rmg/RMG-database/input/kinetics/libraries/BurkeH2O2inArHe/dictionary.txt
/rmg/RMG-database/input/kinetics/libraries/Glarborg/highP/dictionary.txt
/rmg/RMG-database/input/kinetics/libraries/Glarborg/C0/dictionary.txt
/rmg/RMG-database/input/kinetics/libraries/Glarborg/C1/dictionary.txt
/rmg/RMG-database/input/kinetics/libraries/Glarborg/C2/dictionary.txt
/rmg/RMG-database/input/kinetics/libraries/Glarborg/C3/dictionary.txt
/rmg/RMG-database/input/kinetics/libraries/1989_Stewart_2CH3_to_C2H5_H/dictionary.txt
/rmg/RMG-database/input/kinetics/libraries/First_to_Second_Aromatic_Ring/2017_Mebel_C6H5C2H2_C2H2_highP/dictionary.txt
/rmg/RMG-database/input/kinetics/libraries/First_to_Second_Aromatic_Ring/2016_Mebel_C10H9_highP/dictionary.txt
/rmg/RMG-database/input/kinetics/libraries/First_to_Second_Aromatic_Ring/2012_Matsugi_C3H3_C7H7_highP/dictionary.txt
/rmg/RMG-database/input/kinetics/libraries/First_to_Second_Aromat

 49%|██████████████████████████████████████████████████████████████▌                                                                 | 85/174 [00:01<00:00, 93.04it/s]

/rmg/RMG-database/input/kinetics/libraries/Nitrogen_Dean_and_Bozzelli/dictionary.txt
/rmg/RMG-database/input/kinetics/libraries/NOx2018/dictionary.txt
/rmg/RMG-database/input/kinetics/libraries/2015_Buras_C2H3_C4H6_highP/dictionary.txt
/rmg/RMG-database/input/kinetics/libraries/combustion_core/version4/dictionary.txt
/rmg/RMG-database/input/kinetics/libraries/combustion_core/version3/dictionary.txt
/rmg/RMG-database/input/kinetics/libraries/combustion_core/version2/dictionary.txt
/rmg/RMG-database/input/kinetics/libraries/combustion_core/version5/dictionary.txt
/rmg/RMG-database/input/kinetics/libraries/GRI-Mech3.0-N/dictionary.txt
/rmg/RMG-database/input/kinetics/libraries/N-S_interactions/dictionary.txt
/rmg/RMG-database/input/kinetics/libraries/YF/seed/dictionary.txt
/rmg/RMG-database/input/kinetics/libraries/YF/full/dictionary.txt
/rmg/RMG-database/input/kinetics/libraries/HydrazinePDep/dictionary.txt
/rmg/RMG-database/input/kinetics/libraries/GRI-Mech3.0/dictionary.txt
/rmg/RMG-da

 62%|██████████████████████████████████████████████████████████████████████████████▊                                                | 108/174 [00:01<00:00, 84.49it/s]

/rmg/RMG-database/input/kinetics/libraries/BurkeH2O2inN2/dictionary.txt
/rmg/RMG-database/input/kinetics/libraries/2-BTP/seed/dictionary.txt
/rmg/RMG-database/input/kinetics/libraries/2-BTP/full/dictionary.txt
/rmg/RMG-database/input/kinetics/libraries/Iodine-R_recombination/dictionary.txt
/rmg/RMG-database/input/kinetics/libraries/primarySulfurLibrary/dictionary.txt
/rmg/RMG-database/input/kinetics/libraries/Butadiene_Dimerization/dictionary.txt
/rmg/RMG-database/input/kinetics/libraries/2005_Senosiain_OH_C2H2/dictionary.txt
/rmg/RMG-database/input/kinetics/libraries/Mebel_Naphthyl/dictionary.txt
/rmg/RMG-database/input/kinetics/libraries/C2H4+O_Klipp2017/dictionary.txt
/rmg/RMG-database/input/kinetics/libraries/Narayanaswamy/dictionary.txt
/rmg/RMG-database/input/kinetics/libraries/C12H11_pdep/dictionary.txt
/rmg/RMG-database/input/kinetics/libraries/fascella/dictionary.txt
/rmg/RMG-database/input/kinetics/libraries/GRI-HCO/dictionary.txt
/rmg/RMG-database/input/kinetics/libraries/Su

 72%|███████████████████████████████████████████████████████████████████████████████████████████▏                                   | 125/174 [00:01<00:00, 75.96it/s]

/rmg/RMG-database/input/kinetics/libraries/JetSurF2.0/dictionary.txt
/rmg/RMG-database/input/kinetics/libraries/C10H11/dictionary.txt
/rmg/RMG-database/input/kinetics/libraries/C3/dictionary.txt


 83%|█████████████████████████████████████████████████████████████████████████████████████████████████████████▊                     | 145/174 [00:01<00:00, 78.26it/s]


/rmg/RMG-database/input/kinetics/libraries/Aromatics_high_pressure/C9H8_2/dictionary.txt
/rmg/RMG-database/input/kinetics/libraries/Aromatics_high_pressure/C9H9_2/dictionary.txt
/rmg/RMG-database/input/kinetics/libraries/Aromatics_high_pressure/C10H11_4/dictionary.txt
/rmg/RMG-database/input/kinetics/libraries/Aromatics_high_pressure/C10H9_2/dictionary.txt
/rmg/RMG-database/input/kinetics/libraries/Aromatics_high_pressure/C10H7/dictionary.txt
/rmg/RMG-database/input/kinetics/libraries/Aromatics_high_pressure/C10H9_4/dictionary.txt
/rmg/RMG-database/input/kinetics/libraries/Aromatics_high_pressure/C14H11_3/dictionary.txt
/rmg/RMG-database/input/kinetics/libraries/Aromatics_high_pressure/C10H9_1/dictionary.txt
/rmg/RMG-database/input/kinetics/libraries/Aromatics_high_pressure/C9H9_1/dictionary.txt
/rmg/RMG-database/input/kinetics/libraries/Aromatics_high_pressure/C8H7/dictionary.txt
/rmg/RMG-database/input/kinetics/libraries/Aromatics_high_pressure/C9H11/dictionary.txt
/rmg/RMG-database/

ValueError: invalid literal for int() with base 10: 'CH4'

# process all reaction smiles

In [5]:
import codecs
import logging
import os.path
import re
from collections import OrderedDict

import numpy as np

from rmgpy.data.base import DatabaseError, Database, Entry
from rmgpy.data.kinetics.common import save_entry
from rmgpy.data.kinetics.family import TemplateReaction
from rmgpy.kinetics import Arrhenius, ThirdBody, Lindemann, Troe, \
                           PDepArrhenius, MultiArrhenius, MultiPDepArrhenius, Chebyshev 
from rmgpy.kinetics.surface import StickingCoefficient
from rmgpy.molecule import Molecule
from rmgpy.reaction import Reaction
from rmgpy.species import Species

from rmgpy.data.kinetics import *
from rmgpy.data.reference import *

from rmgpy.kinetics import Arrhenius, ArrheniusEP, ThirdBody, Lindemann, Troe, \
                           PDepArrhenius, MultiArrhenius, MultiPDepArrhenius, \
                           Chebyshev, KineticsData, StickingCoefficient, \
                           StickingCoefficientBEP, SurfaceArrhenius, SurfaceArrheniusBEP, ArrheniusBM

In [6]:
# changed definition 
# https://github.com/ReactionMechanismGenerator/RMG-Py/blob/300c78290fdb1e6c928068c0049c7f73093d373d/rmgpy/data/kinetics/library.py#L525
def load_entry(
                   index,
                   label,
                   kinetics,
                   rank = None,
                   degeneracy=1,
                   duplicate=False,
                   reversible=True,
                   reference=None,
                   referenceType='',
                   shortDesc='',
                   longDesc='',
                   allow_pdep_route=False,
                   elementary_high_p=False,
                   allow_max_rate_violation=False,
                   metal=None,
                   site=None,
                   facet=None,
                   ):
        """
        Method for parsing entries in database files.
        Note that these argument names are retained for backward compatibility.
        """

        # reactants = [Species(label=reactant1.strip().splitlines()[0].strip(), molecule=[Molecule().from_adjacency_list(reactant1)])]
        # if reactant2 is not None: reactants.append(Species(label=reactant2.strip().splitlines()[0].strip(), molecule=[Molecule().from_adjacency_list(reactant2)]))
        # if reactant3 is not None: reactants.append(Species(label=reactant3.strip().splitlines()[0].strip(), molecule=[Molecule().from_adjacency_list(reactant3)]))
        #
        # products = [Species(label=product1.strip().splitlines()[0].strip(), molecule=[Molecule().from_adjacency_list(product1)])]
        # if product2 is not None: products.append(Species(label=product2.strip().splitlines()[0].strip(), molecule=[Molecule().from_adjacency_list(product2)]))
        # if product3 is not None: products.append(Species(label=product3.strip().splitlines()[0].strip(), molecule=[Molecule().from_adjacency_list(product3)]))

        # Make a blank reaction
        rxn = Reaction(reactants=[], products=[], degeneracy=degeneracy, duplicate=duplicate, reversible=reversible,
                       allow_pdep_route=allow_pdep_route, elementary_high_p=elementary_high_p,
                       allow_max_rate_violation=allow_max_rate_violation)
        # if not rxn.is_balanced():
        #    raise DatabaseError('Reaction {0} in kinetics library {1} was not balanced! Please reformulate.'.format(rxn, self.label))
        # label = str(rxn)
        return Entry(
            index=index,
            label=label,
            item=rxn,
            data=kinetics,
            reference=reference,
            reference_type=referenceType,
            short_desc=shortDesc,
            long_desc=longDesc.strip(),
            metal=metal,
            site=site,
            facet=facet,
        )

In [8]:
for mode in ["families", "libraries"]:
    # find all reactions.py files first
    dict_file_list = []
    for root, dirs, files in os.walk(path_prefix + mode):
        for file in files:
            if file.lower().endswith('.py') and file == "reactions.py":
                dict_file_list.append(os.path.join(root, file))
    
    # load molecule smiles
    data_file = open('molecule_smiles_{}.pickle'.format(mode), 'rb')
    molecule_smiles = pickle.load(data_file)
    
    # create empty dataframe
    data = [('label', 'smiles', 'path', "entry")]

    # for each file, parse entries and convert them to smiles
    for path in tqdm(dict_file_list):
        
        print(path)
        f = open(path, "r")
        text = f.readlines()

        # remove first several lines of comments at the beginning of the .py file
        origin_length = len(text)
        for i in range(len(text)):
            if text[i] == 'entry(\n':
                text = text[i:]
                break

        entry_list = []

        # split entries
        begin = 0

        # empty file
        if len(text) == origin_length or len(text) == 0:
            continue

        for i in range(len(text)):
            if text[i] == ")\n":
                # handle incorrect format for families
                if  text[begin] != "\n" and text[begin][:6] == "entry(":
#                     text[begin] = "Entry(" + text[begin][6:]
                    text[begin] = "load_entry(" + text[begin][6:]
                    entry_list.append(eval("".join(text[begin: i+1])))

                begin = i+2
 
        # convert to smiles and append to data 
        for i in range(len(entry_list)):

            # found related molecule smiles, e.g., under the same folder
            abs_path = path.replace(path_prefix, "RMG-database/input/kinetics/")
            
            mapping = dict()
            for elem in molecule_smiles:
                  if elem[2] == abs_path.replace("reactions.py", "dictionary.txt"):
                        mapping[elem[0]] = elem[1]
        
            # parse smiles
            s = ""
            parts = entry_list[i].label.split(" ")
            for p in parts:
                if p == "+":
                    s = s + "."
                elif p == " " or p == "":
                    continue
                elif p == "<=>" or p == "=>" or p == "=":
                    s = s + ">>"
                else:
                    # formal error
                    if p not in mapping:
                        s = None
                        print(p)
                        break
                    else:
                        s = s + mapping[p]

            if s is None:
                print(entry_list[i].label, "is invalid" )
                continue

            # store Arrhenius
            data.append((entry_list[i].label,
                        s,
                        abs_path,
                        entry_list[i]))
            f.close()
    
    # store pickle
    with open('reaction_smiles_{}.pickle'.format(mode), 'wb') as data_file:
        pickle.dump(data, data_file)


 11%|███████████▋                                                                                             | 16/144 [00:00<00:01, 96.16it/s]

/rmg/RMG-database/input/kinetics/families/1,2_Insertion_carbene/NIST/reactions.py
/rmg/RMG-database/input/kinetics/families/1,2_Insertion_carbene/training/reactions.py
/rmg/RMG-database/input/kinetics/families/Peroxyl_Termination/training/reactions.py
/rmg/RMG-database/input/kinetics/families/Bimolec_Hydroperoxide_Decomposition/training/reactions.py
/rmg/RMG-database/input/kinetics/families/Surface_Adsorption_Dissociative_Double/training/reactions.py
/rmg/RMG-database/input/kinetics/families/Surface_Abstraction/training/reactions.py
/rmg/RMG-database/input/kinetics/families/halocarbene_recombination_double/training/reactions.py
/rmg/RMG-database/input/kinetics/families/1,2_Insertion_CO/NIST/reactions.py
/rmg/RMG-database/input/kinetics/families/1,2_Insertion_CO/training/reactions.py
/rmg/RMG-database/input/kinetics/families/Cyclic_Thioether_Formation/NIST/reactions.py
/rmg/RMG-database/input/kinetics/families/Cyclic_Thioether_Formation/training/reactions.py
/rmg/RMG-database/input/kine

 26%|██████████████████████████▉                                                                              | 37/144 [00:01<00:04, 23.39it/s]

/rmg/RMG-database/input/kinetics/families/Surface_vdW_to_Bidentate/training/reactions.py
/rmg/RMG-database/input/kinetics/families/Diels_alder_addition/NIST/reactions.py
/rmg/RMG-database/input/kinetics/families/Diels_alder_addition/training/reactions.py
/rmg/RMG-database/input/kinetics/families/intra_substitutionCS_cyclization/NIST/reactions.py
/rmg/RMG-database/input/kinetics/families/intra_substitutionCS_cyclization/training/reactions.py
/rmg/RMG-database/input/kinetics/families/1,2_NH3_elimination/training/reactions.py
/rmg/RMG-database/input/kinetics/families/1,3_sigmatropic_rearrangement/NIST/reactions.py
/rmg/RMG-database/input/kinetics/families/1,3_sigmatropic_rearrangement/training/reactions.py
/rmg/RMG-database/input/kinetics/families/CO_Disproportionation/training/reactions.py
/rmg/RMG-database/input/kinetics/families/1,3_NH3_elimination/training/reactions.py
/rmg/RMG-database/input/kinetics/families/Surface_DoubleBond_to_Bidentate/training/reactions.py
/rmg/RMG-database/inp

 31%|████████████████████████████████▊                                                                        | 45/144 [00:01<00:03, 24.90it/s]

/rmg/RMG-database/input/kinetics/families/Intra_Diels_alder_monocyclic/training/reactions.py
/rmg/RMG-database/input/kinetics/families/Cyclic_Ether_Formation/NIST/reactions.py
/rmg/RMG-database/input/kinetics/families/Cyclic_Ether_Formation/training/reactions.py
/rmg/RMG-database/input/kinetics/families/1,2_shiftS/NIST/reactions.py
/rmg/RMG-database/input/kinetics/families/1,2_shiftS/training/reactions.py
/rmg/RMG-database/input/kinetics/families/1,3_Insertion_CO2/NIST/reactions.py
/rmg/RMG-database/input/kinetics/families/1,3_Insertion_CO2/training/reactions.py
/rmg/RMG-database/input/kinetics/families/Disproportionation-Y/training/reactions.py
/rmg/RMG-database/input/kinetics/families/Surface_Migration/training/reactions.py
/rmg/RMG-database/input/kinetics/families/Baeyer-Villiger_step2/training/reactions.py
/rmg/RMG-database/input/kinetics/families/R_Addition_CSm/NIST/reactions.py
/rmg/RMG-database/input/kinetics/families/R_Addition_CSm/training/reactions.py
/rmg/RMG-database/input/

 36%|█████████████████████████████████████▉                                                                   | 52/144 [00:01<00:03, 30.30it/s]

/rmg/RMG-database/input/kinetics/families/HO2_Elimination_from_PeroxyRadical/NIST/reactions.py
/rmg/RMG-database/input/kinetics/families/HO2_Elimination_from_PeroxyRadical/training/reactions.py
/rmg/RMG-database/input/kinetics/families/intra_substitutionS_cyclization/NIST/reactions.py
/rmg/RMG-database/input/kinetics/families/intra_substitutionS_cyclization/training/reactions.py
/rmg/RMG-database/input/kinetics/families/Surface_Dissociation_Double_vdW/training/reactions.py
/rmg/RMG-database/input/kinetics/families/Korcek_step1/NIST/reactions.py
/rmg/RMG-database/input/kinetics/families/Korcek_step1/training/reactions.py
/rmg/RMG-database/input/kinetics/families/Disproportionation/NIST/reactions.py


 42%|███████████████████████████████████████████▊                                                             | 60/144 [00:02<00:02, 31.51it/s]

/rmg/RMG-database/input/kinetics/families/Disproportionation/training/reactions.py
/rmg/RMG-database/input/kinetics/families/intra_substitutionCS_isomerization/NIST/reactions.py
/rmg/RMG-database/input/kinetics/families/intra_substitutionCS_isomerization/training/reactions.py
/rmg/RMG-database/input/kinetics/families/R_Addition_MultipleBond/NIST/reactions.py
/rmg/RMG-database/input/kinetics/families/R_Addition_MultipleBond/training/reactions.py


 46%|████████████████████████████████████████████████▏                                                        | 66/144 [00:08<00:22,  3.53it/s]

/rmg/RMG-database/input/kinetics/families/Surface_Adsorption_vdW/training/reactions.py
/rmg/RMG-database/input/kinetics/families/Surface_Dissociation/training/reactions.py
/rmg/RMG-database/input/kinetics/families/SubstitutionS/NIST/reactions.py
/rmg/RMG-database/input/kinetics/families/SubstitutionS/training/reactions.py


 49%|███████████████████████████████████████████████████                                                      | 70/144 [00:08<00:18,  4.09it/s]

/rmg/RMG-database/input/kinetics/families/Retroene/training/reactions.py
/rmg/RMG-database/input/kinetics/families/XY_elimination_hydroxyl/training/reactions.py
/rmg/RMG-database/input/kinetics/families/2+2_cycloaddition/NIST/reactions.py


 51%|█████████████████████████████████████████████████████▏                                                   | 73/144 [00:08<00:14,  4.75it/s]

/rmg/RMG-database/input/kinetics/families/2+2_cycloaddition/training/reactions.py
C2H4<=>
CH2O + C2H4<=> C3H6O is invalid
/rmg/RMG-database/input/kinetics/families/Intra_Disproportionation/NIST/reactions.py
/rmg/RMG-database/input/kinetics/families/Intra_Disproportionation/training/reactions.py
/rmg/RMG-database/input/kinetics/families/R_Addition_COm/NIST/reactions.py
/rmg/RMG-database/input/kinetics/families/R_Addition_COm/training/reactions.py
/rmg/RMG-database/input/kinetics/families/Surface_Abstraction_vdW/training/reactions.py
/rmg/RMG-database/input/kinetics/families/Surface_Adsorption_Single/training/reactions.py
/rmg/RMG-database/input/kinetics/families/Cl_Abstraction/training/reactions.py


 56%|██████████████████████████████████████████████████████████▎                                              | 80/144 [00:09<00:09,  6.43it/s]

/rmg/RMG-database/input/kinetics/families/Surface_Dissociation_vdW/training/reactions.py
/rmg/RMG-database/input/kinetics/families/Surface_Dissociation_to_Bidentate/training/reactions.py
/rmg/RMG-database/input/kinetics/families/Diels_alder_addition_Aromatic/training/reactions.py
/rmg/RMG-database/input/kinetics/families/1,2-Birad_to_alkene/NIST/reactions.py
/rmg/RMG-database/input/kinetics/families/1,2-Birad_to_alkene/training/reactions.py
/rmg/RMG-database/input/kinetics/families/H_Abstraction/NIST/reactions.py


 60%|██████████████████████████████████████████████████████████████▋                                          | 86/144 [00:11<00:12,  4.69it/s]

/rmg/RMG-database/input/kinetics/families/H_Abstraction/training/reactions.py


 61%|████████████████████████████████████████████████████████████████▏                                        | 88/144 [00:16<00:28,  1.97it/s]

/rmg/RMG-database/input/kinetics/families/Surface_Adsorption_Bidentate/training/reactions.py
/rmg/RMG-database/input/kinetics/families/Intra_R_Add_Exo_scission/training/reactions.py
/rmg/RMG-database/input/kinetics/families/intra_NO2_ONO_conversion/NIST/reactions.py
/rmg/RMG-database/input/kinetics/families/intra_NO2_ONO_conversion/training/reactions.py
/rmg/RMG-database/input/kinetics/families/Intra_Retro_Diels_alder_bicyclic/training/reactions.py
/rmg/RMG-database/input/kinetics/families/1,4_Linear_birad_scission/training/reactions.py
/rmg/RMG-database/input/kinetics/families/Intra_R_Add_Endocyclic/NIST/reactions.py
/rmg/RMG-database/input/kinetics/families/Intra_R_Add_Endocyclic/training/reactions.py


 66%|█████████████████████████████████████████████████████████████████████▎                                   | 95/144 [00:17<00:18,  2.58it/s]

/rmg/RMG-database/input/kinetics/families/Intra_RH_Add_Endocyclic/NIST/reactions.py
/rmg/RMG-database/input/kinetics/families/Intra_RH_Add_Endocyclic/training/reactions.py
/rmg/RMG-database/input/kinetics/families/R_Recombination/NIST/reactions.py


 68%|███████████████████████████████████████████████████████████████████████▍                                 | 98/144 [00:18<00:16,  2.76it/s]

/rmg/RMG-database/input/kinetics/families/R_Recombination/training/reactions.py


 69%|████████████████████████████████████████████████████████████████████████▏                                | 99/144 [00:18<00:15,  2.83it/s]

/rmg/RMG-database/input/kinetics/families/Surface_Abstraction_Single_vdW/training/reactions.py
/rmg/RMG-database/input/kinetics/families/intra_substitutionS_isomerization/NIST/reactions.py
/rmg/RMG-database/input/kinetics/families/intra_substitutionS_isomerization/training/reactions.py
/rmg/RMG-database/input/kinetics/families/H2_Loss/training/reactions.py
/rmg/RMG-database/input/kinetics/families/Surface_Dissociation_Beta/training/reactions.py
/rmg/RMG-database/input/kinetics/families/1,3_Insertion_RSR/NIST/reactions.py
/rmg/RMG-database/input/kinetics/families/1,3_Insertion_RSR/training/reactions.py
/rmg/RMG-database/input/kinetics/families/Peroxyl_Disproportionation/training/reactions.py
/rmg/RMG-database/input/kinetics/families/1,4_Cyclic_birad_scission/training/reactions.py
/rmg/RMG-database/input/kinetics/families/Singlet_Val6_to_triplet/NIST/reactions.py
/rmg/RMG-database/input/kinetics/families/Singlet_Val6_to_triplet/training/reactions.py
/rmg/RMG-database/input/kinetics/famil

 82%|█████████████████████████████████████████████████████████████████████████████████████▏                  | 118/144 [00:19<00:03,  8.63it/s]

/rmg/RMG-database/input/kinetics/families/Surface_EleyRideal_Addition_Multiple_Bond/training/reactions.py
/rmg/RMG-database/input/kinetics/families/Intra_2+2_cycloaddition_Cd/training/reactions.py
/rmg/RMG-database/input/kinetics/families/1+2_Cycloaddition/NIST/reactions.py
/rmg/RMG-database/input/kinetics/families/1+2_Cycloaddition/training/reactions.py
/rmg/RMG-database/input/kinetics/families/Surface_Abstraction_Beta/training/reactions.py
/rmg/RMG-database/input/kinetics/families/6_membered_central_C-C_shift/training/reactions.py
/rmg/RMG-database/input/kinetics/families/Substitution_O/NIST/reactions.py
/rmg/RMG-database/input/kinetics/families/Substitution_O/training/reactions.py


 88%|███████████████████████████████████████████████████████████████████████████████████████████             | 126/144 [00:19<00:01, 11.13it/s]

/rmg/RMG-database/input/kinetics/families/halocarbene_recombination/training/reactions.py
/rmg/RMG-database/input/kinetics/families/Ketoenol/NIST/reactions.py
/rmg/RMG-database/input/kinetics/families/Ketoenol/training/reactions.py
/rmg/RMG-database/input/kinetics/families/XY_Addition_MultipleBond/training/reactions.py


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████| 144/144 [00:19<00:00,  7.33it/s]


/rmg/RMG-database/input/kinetics/families/1,3_Insertion_ROR/NIST/reactions.py
/rmg/RMG-database/input/kinetics/families/1,3_Insertion_ROR/training/reactions.py
/rmg/RMG-database/input/kinetics/families/Surface_Abstraction_Beta_double_vdW/training/reactions.py
/rmg/RMG-database/input/kinetics/families/Intra_ene_reaction/training/reactions.py
/rmg/RMG-database/input/kinetics/families/Intra_RH_Add_Exocyclic/NIST/reactions.py
/rmg/RMG-database/input/kinetics/families/Intra_RH_Add_Exocyclic/training/reactions.py
/rmg/RMG-database/input/kinetics/families/1,2_shiftC/training/reactions.py
/rmg/RMG-database/input/kinetics/families/1,2_XY_interchange/training/reactions.py
/rmg/RMG-database/input/kinetics/families/Surface_Adsorption_Double/training/reactions.py
/rmg/RMG-database/input/kinetics/families/Surface_Adsorption_Abstraction_vdW/training/reactions.py
/rmg/RMG-database/input/kinetics/families/Intra_5_membered_conjugated_C=C_C=C_addition/training/reactions.py
/rmg/RMG-database/input/kinetic

  0%|                                                                                                                  | 0/174 [00:00<?, ?it/s]

/rmg/RMG-database/input/kinetics/libraries/Nitrogen_Glarborg_Lucassen_et_al/reactions.py


  1%|▌                                                                                                         | 1/174 [00:00<00:51,  3.33it/s]

/rmg/RMG-database/input/kinetics/libraries/Nitrogen_Glarborg_Zhang_et_al/reactions.py


  1%|█▏                                                                                                        | 2/174 [00:01<01:37,  1.76it/s]

/rmg/RMG-database/input/kinetics/libraries/primaryNitrogenLibrary/reactions.py


  2%|█▊                                                                                                        | 3/174 [00:01<01:19,  2.14it/s]

/rmg/RMG-database/input/kinetics/libraries/primaryNitrogenLibrary/LowT/reactions.py
/rmg/RMG-database/input/kinetics/libraries/Chernov/reactions.py


  3%|███                                                                                                       | 5/174 [00:01<00:51,  3.31it/s]

/rmg/RMG-database/input/kinetics/libraries/CurranPentane/reactions.py


  3%|███▋                                                                                                      | 6/174 [00:05<03:21,  1.20s/it]

/rmg/RMG-database/input/kinetics/libraries/2001_Tokmakov_H_Toluene_to_CH3_Benzene/reactions.py
/rmg/RMG-database/input/kinetics/libraries/Ethylamine/reactions.py
/rmg/RMG-database/input/kinetics/libraries/Fulvene_H/reactions.py
/rmg/RMG-database/input/kinetics/libraries/2006_Joshi_OH_CO/reactions.py
/rmg/RMG-database/input/kinetics/libraries/naphthalene_H/reactions.py
/rmg/RMG-database/input/kinetics/libraries/TEOS/reactions.py
/rmg/RMG-database/input/kinetics/libraries/DTU_mech_CH3Cl/reactions.py


  7%|███████▊                                                                                                 | 13/174 [00:05<00:53,  3.00it/s]

/rmg/RMG-database/input/kinetics/libraries/Dooley/methylformate_all_ARHEbathgas/reactions.py
/rmg/RMG-database/input/kinetics/libraries/Dooley/methylformate_all_N2bathgas/reactions.py


  9%|█████████▋                                                                                               | 16/174 [00:09<01:33,  1.69it/s]

/rmg/RMG-database/input/kinetics/libraries/Dooley/C1/reactions.py
/rmg/RMG-database/input/kinetics/libraries/Dooley/methylformate_2/reactions.py
/rmg/RMG-database/input/kinetics/libraries/Dooley/methylformate/reactions.py


 14%|███████████████                                                                                          | 25/174 [00:10<00:34,  4.28it/s]

/rmg/RMG-database/input/kinetics/libraries/primaryH2O2/reactions.py
/rmg/RMG-database/input/kinetics/libraries/2009_Sharma_C5H5_CH3_highP/reactions.py
/rmg/RMG-database/input/kinetics/libraries/Surface/DOC/Mhadeshwar_Pt111/reactions.py
/rmg/RMG-database/input/kinetics/libraries/Surface/DOC/Nitrogen/reactions.py
/rmg/RMG-database/input/kinetics/libraries/Surface/DOC/Ishikawa_Rh111/reactions.py
/rmg/RMG-database/input/kinetics/libraries/Surface/DOC/Arevalo_Pt111/reactions.py
/rmg/RMG-database/input/kinetics/libraries/Surface/Ammonia/Schneider_Rh211/reactions.py
+O_X
NH3_X +O_X <=> NH2_X + OH_X is invalid
+O_X
NH2_X +O_X <=> NH_X + OH_X is invalid
/rmg/RMG-database/input/kinetics/libraries/Surface/Ammonia/Offermans_Pt111/reactions.py
+O_X
NH3_X +O_X <=> NH2_X + OH_X is invalid
+O_X
NH2_X +O_X <=> NH_X + OH_X is invalid
/rmg/RMG-database/input/kinetics/libraries/Surface/Ammonia/Schneider_Pt111/reactions.py
+O_X
NH3_X +O_X <=> NH2_X + OH_X is invalid
+O_X
NH2_X +O_X <=> NH_X + OH_X is inval

 30%|███████████████████████████████▍                                                                         | 52/174 [00:10<00:06, 18.83it/s]

/rmg/RMG-database/input/kinetics/libraries/Surface/Ammonia/Schneider_Pd111/reactions.py
+O_X
NH3_X +O_X <=> NH2_X + OH_X is invalid
+O_X
NH2_X +O_X <=> NH_X + OH_X is invalid
/rmg/RMG-database/input/kinetics/libraries/Surface/Ammonia/Schneider_Pd211/reactions.py
+O_X
NH3_X +O_X <=> NH2_X + OH_X is invalid
+O_X
NH2_X +O_X <=> NH_X + OH_X is invalid
/rmg/RMG-database/input/kinetics/libraries/Surface/Ammonia/Kraehnert_Pt111/reactions.py
/rmg/RMG-database/input/kinetics/libraries/Surface/Ammonia/Novell_Pd111/reactions.py
/rmg/RMG-database/input/kinetics/libraries/Surface/Ammonia/Vlachos_Ru0001/reactions.py
/rmg/RMG-database/input/kinetics/libraries/Surface/Ammonia/Duan_Ni211/reactions.py
/rmg/RMG-database/input/kinetics/libraries/Surface/Hydrazine/Roldan_Ir111/reactions.py
/rmg/RMG-database/input/kinetics/libraries/Surface/Hydrazine/Roldan_Cu111/reactions.py
/rmg/RMG-database/input/kinetics/libraries/Surface/Methane/Deutschmann_Pt/reactions.py
/rmg/RMG-database/input/kinetics/libraries/Sur

 34%|████████████████████████████████████▏                                                                    | 60/174 [00:12<00:13,  8.18it/s]

/rmg/RMG-database/input/kinetics/libraries/Lai_Hexylbenzene/reactions.py
/rmg/RMG-database/input/kinetics/libraries/BurkeH2O2inArHe/reactions.py
/rmg/RMG-database/input/kinetics/libraries/Glarborg/highP/reactions.py
/rmg/RMG-database/input/kinetics/libraries/Glarborg/C0/reactions.py
/rmg/RMG-database/input/kinetics/libraries/Glarborg/C1/reactions.py
/rmg/RMG-database/input/kinetics/libraries/Glarborg/C2/reactions.py
/rmg/RMG-database/input/kinetics/libraries/Glarborg/C3/reactions.py


 41%|███████████████████████████████████████████▍                                                             | 72/174 [00:14<00:12,  8.28it/s]

/rmg/RMG-database/input/kinetics/libraries/1989_Stewart_2CH3_to_C2H5_H/reactions.py
/rmg/RMG-database/input/kinetics/libraries/First_to_Second_Aromatic_Ring/2017_Mebel_C6H5C2H2_C2H2_highP/reactions.py
/rmg/RMG-database/input/kinetics/libraries/First_to_Second_Aromatic_Ring/2016_Mebel_C10H9_highP/reactions.py
/rmg/RMG-database/input/kinetics/libraries/First_to_Second_Aromatic_Ring/2012_Matsugi_C3H3_C7H7_highP/reactions.py
/rmg/RMG-database/input/kinetics/libraries/First_to_Second_Aromatic_Ring/2016_Mebel_Indene_CH3_highP/reactions.py
/rmg/RMG-database/input/kinetics/libraries/First_to_Second_Aromatic_Ring/2017_Buras_C6H5_C3H6_highP/reactions.py
/rmg/RMG-database/input/kinetics/libraries/First_to_Second_Aromatic_Ring/2005_Ismail_C6H5_C4H6_highP/reactions.py
/rmg/RMG-database/input/kinetics/libraries/First_to_Second_Aromatic_Ring/2017_Mebel_C6H4C2H_C2H2_highP/reactions.py
/rmg/RMG-database/input/kinetics/libraries/First_to_Second_Aromatic_Ring/phenyl_diacetylene_effective/reactions.py
/rm

 44%|██████████████████████████████████████████████▍                                                          | 77/174 [00:14<00:09, 10.22it/s]

/rmg/RMG-database/input/kinetics/libraries/2003_Miller_Propargyl_Recomb_High_P/reactions.py
/rmg/RMG-database/input/kinetics/libraries/CF2BrCl/reactions.py
(+AR)
H + O2 (+AR) <=> HO2 (+AR) is invalid
(+HE)
H + O2 (+HE) <=> HO2 (+HE) is invalid
(+H2O)
H2O2 (+H2O) <=> OH + OH (+H2O) is invalid
/rmg/RMG-database/input/kinetics/libraries/Nitrogen_Dean_and_Bozzelli/reactions.py
/rmg/RMG-database/input/kinetics/libraries/NOx2018/reactions.py


 47%|█████████████████████████████████████████████████▍                                                       | 82/174 [00:18<00:26,  3.48it/s]

/rmg/RMG-database/input/kinetics/libraries/2015_Buras_C2H3_C4H6_highP/reactions.py
/rmg/RMG-database/input/kinetics/libraries/combustion_core/version4/reactions.py
/rmg/RMG-database/input/kinetics/libraries/combustion_core/version3/reactions.py
/rmg/RMG-database/input/kinetics/libraries/combustion_core/version2/reactions.py
/rmg/RMG-database/input/kinetics/libraries/combustion_core/version5/reactions.py


 49%|███████████████████████████████████████████████████▉                                                     | 86/174 [00:19<00:23,  3.79it/s]

/rmg/RMG-database/input/kinetics/libraries/GRI-Mech3.0-N/reactions.py
/rmg/RMG-database/input/kinetics/libraries/N-S_interactions/reactions.py
/rmg/RMG-database/input/kinetics/libraries/YF/seed/reactions.py


 51%|█████████████████████████████████████████████████████▋                                                   | 89/174 [00:20<00:24,  3.43it/s]

/rmg/RMG-database/input/kinetics/libraries/YF/full/reactions.py


 52%|██████████████████████████████████████████████████████▉                                                  | 91/174 [00:22<00:31,  2.65it/s]

/rmg/RMG-database/input/kinetics/libraries/HydrazinePDep/reactions.py
/rmg/RMG-database/input/kinetics/libraries/GRI-Mech3.0/reactions.py


 53%|████████████████████████████████████████████████████████                                                 | 93/174 [00:22<00:26,  3.03it/s]

/rmg/RMG-database/input/kinetics/libraries/c-C5H5_CH3_Sharma/reactions.py
/rmg/RMG-database/input/kinetics/libraries/Mebel_C6H5_C2H2/reactions.py
/rmg/RMG-database/input/kinetics/libraries/Nitrogen_Glarborg_Gimenez_et_al/reactions.py


 55%|█████████████████████████████████████████████████████████▎                                               | 95/174 [00:23<00:29,  2.70it/s]

/rmg/RMG-database/input/kinetics/libraries/Klippenstein_Glarborg2016/reactions.py


 55%|█████████████████████████████████████████████████████████▉                                               | 96/174 [00:24<00:32,  2.43it/s]

/rmg/RMG-database/input/kinetics/libraries/biCPD_H_shift/reactions.py
/rmg/RMG-database/input/kinetics/libraries/BurkeH2O2inN2/reactions.py
/rmg/RMG-database/input/kinetics/libraries/2-BTP/seed/reactions.py


 57%|███████████████████████████████████████████████████████████▋                                             | 99/174 [00:25<00:28,  2.67it/s]

/rmg/RMG-database/input/kinetics/libraries/2-BTP/full/reactions.py


 58%|████████████████████████████████████████████████████████████▎                                           | 101/174 [00:27<00:19,  3.73it/s]

/rmg/RMG-database/input/kinetics/libraries/Iodine-R_recombination/reactions.py
/rmg/RMG-database/input/kinetics/libraries/primarySulfurLibrary/reactions.py





SyntaxError: EOF while scanning triple-quoted string literal (<string>, line 31)

In [None]:
with open('reaction_smiles_families.pickle', 'rb') as f:
    reaction_family = pickle.load(f)

In [None]:
len(reaction_family)

In [None]:
instance = []

for l, s, p, e in reaction_family[1:]:
    instance.append( str(type(e.data)) )



In [None]:
from collections import Counter

In [None]:
Counter(instance)

In [None]:
with open('reaction_smiles_libraries.pickle', 'rb') as f:
    reaction_libraries = pickle.load(f)

In [None]:
instance = []

non_arr_count = 0
p_dep_count = 0

for l, s, p, e in reaction_libraries[1:]:
    instance.append( str(type(e.data)) )
    if not isinstance(e.data, Arrhenius):
        non_arr_count += 1
    if e.data.is_pressure_dependent():
        p_dep_count += 1

    
c = Counter(instance)
print(c, non_arr_count, p_dep_count, len(reaction_libraries)-1)

In [None]:
len(reaction_libraries)-1

In [None]:
reaction_libraries[128][-1].data

# Output the family to CSV

In [None]:
reaction_family[1][3].data

In [None]:
from collections import defaultdict

In [None]:
data = defaultdict(list)

for l, s, p, e in reaction_family:
    if isinstance(e, str):
        continue

    if not isinstance(e.data, Arrhenius):
        continue

    data['label'].append(l)
    data['smiles'].append(s)
    data['path'].append(p)
    data['A_value_si'].append(np.log10(e.data.A.value_si))
    data['n_value_si'].append(e.data.n.value_si)
    data['Ea_value_si'].append(e.data.Ea.value_si/1000)


In [None]:
family_df = pd.DataFrame(data)

In [None]:
family_df

In [None]:
family_df.to_csv("family.csv", index=False)

# Output the library to CSV

## Temperature dependent

### identify the temperature gap

In [None]:
t_min_list = []
t_max_list = []

for l, s, p, e in tqdm(reaction_libraries[1:]):
    if e.data.is_pressure_dependent():
        continue
    k = e.data
    if k.Tmax:
        t_max_list.append(k.Tmax.value_si)
    if k.Tmin:
        t_min_list.append(k.Tmin.value_si)

print(len(t_min_list), len(t_max_list))
print(np.mean(t_min_list), np.mean(t_max_list))

print(np.quantile(t_min_list, q=np.arange(0, 1, 0.01)))
print(np.quantile(t_max_list, q=np.arange(0, 1, 0.01)))

In [None]:
data = defaultdict(list)

reaction_count = 0

print(len(reaction_libraries[1:]))

for l, s, p, e in tqdm(reaction_libraries[1:]):
    if e.data.is_pressure_dependent():
        continue

    k = e.data

    try:
        k.get_rate_coefficient(300)
    except:
        print(type(k), l, s, p, e)
        continue

    tmax, tmin = k.Tmax, k.Tmin
    if tmax is None:
        tmax = 3000
    else:
        tmax = tmax.value_si

    if tmin is None:
        tmin = 300
    else:
        tmin = tmin.value_si
        
    reaction_count += 1
    
    for T in np.arange(tmin, tmax, 1000).tolist():
        
        if not k.is_temperature_valid(T):
            continue

        data['label'].append(l)
        data['smiles'].append(s)
        data['path'].append(p)
        data['T'].append(T)
        data['log10rate'].append(np.log10(k.get_rate_coefficient(T)))

df = pd.DataFrame(data)
print(reaction_count, len(df))
df.to_csv("library_T_dependent_90k.csv", index=False)

## Pressure dependent data sampling

In [None]:
t_min_list = []
t_max_list = []

p_min_list = []
p_max_list = []

for l, s, p, e in tqdm(reaction_libraries[1:]):
    if not e.data.is_pressure_dependent():
        continue
    k = e.data

    if k.Tmax:
        t_max_list.append(k.Tmax.value_si)
    if k.Tmin:
        t_min_list.append(k.Tmin.value_si)
        
    if k.Pmax:
        p_max_list.append(k.Pmax.value_si)
    if k.Pmin:
        p_min_list.append(k.Pmin.value_si)

print(len(t_min_list), len(t_max_list))
print(np.mean(t_min_list), np.mean(t_max_list))

print(np.quantile(t_min_list, q=np.arange(0, 1, 0.1)))
print(np.quantile(t_max_list, q=np.arange(0, 1, 0.1)))


print(len(p_min_list), len(p_max_list))
print(np.mean(p_min_list), np.mean(p_max_list))

print(np.quantile(p_min_list, q=np.arange(0, 1, 0.01)))
print(np.quantile(p_max_list, q=np.arange(0, 1, 0.01)))

In [None]:
data = defaultdict(list)

reaction_count = 0

print(len(reaction_libraries[1:]))

for l, s, p, e in tqdm(reaction_libraries[1:]):
    if not e.data.is_pressure_dependent():
        continue

    k = e.data

    try:
        k.get_rate_coefficient(300, 100000)
    except:
        print(type(k), l, s, p, e)
        continue

    tmax, tmin = k.Tmax, k.Tmin
    if tmax is None:
        tmax = 3000
    else:
        tmax = tmax.value_si

    if tmin is None:
        tmin = 300
    else:
        tmin = tmin.value_si
        
        
    pmax, pmin = k.Pmax, k.Pmin
    if pmax is None:
        pmax = 10132500
    else:
        pmax = pmax.value_si

    if pmin is None:
        pmin = 114.116268
    else:
        pmin = pmin.value_si

    reaction_count += 1
        
    for T in np.arange(tmin, tmax, 1000).tolist():
        for P in np.arange(pmin, pmax, 1000000).tolist():
        
            if not (k.is_temperature_valid(T) and k.is_pressure_valid(P)):
                continue
            data['label'].append(l)
            data['smiles'].append(s)
            data['path'].append(p)
            data['T'].append(T)
            data['P'].append(P)
            data['log10rate'].append(np.log10(k.get_rate_coefficient(T, P)))

df = pd.DataFrame(data)
print(reaction_count, len(df))
df.to_csv("library_TP_dependent_120k.csv", index=False)