In [9]:
import os
import pickle

from rmgpy.molecule.molecule import Molecule
from rmgpy.molecule.translator import to_smiles

In [None]:
# Show the current working directory of the running process.
os.getcwd()

In [10]:
# Root directory holding the "families" and "libraries" sub-folders that are
# scanned below; the generated pickle files are written here as well.
# The location can be overridden with the RMG_MYFILES_DIR environment variable
# so the notebook is not tied to a single machine layout.  The cells below
# build paths by plain string concatenation, so os.path.join(..., "") is used
# to guarantee that the prefix always ends with a path separator.
path_prefix = os.path.join(os.environ.get("RMG_MYFILES_DIR", "/rmg/RMG-Py/myrmgfiles/"), "")

# process all molecule smiles

In [None]:
def _split_adjacency_lists(lines):
    """Split the lines of a ``dictionary.txt`` file into names and adjacency lists.

    Records are separated by lines that consist of a single newline.  The first
    line of each record is taken as its name (with newlines and spaces
    removed); the remaining lines, up to and including the terminating blank
    line, are joined into the adjacency-list text.

    Args:
        lines: list of lines as returned by ``file.readlines()``.

    Returns:
        A ``(names, adj_lists)`` pair of equally long lists of strings.
    """
    lines = list(lines)

    # Make sure the last record is terminated so it is not dropped.
    if len(lines) > 0 and lines[-1] != "\n":
        lines.append("\n")

    names = []
    adj_lists = []
    begin = 0
    for i, line in enumerate(lines):
        if line == "\n":
            # Consecutive blank lines produce an empty record; skip those
            # (handles the irregular formatting of some family files).
            if lines[begin] != "\n":
                names.append(lines[begin].replace("\n", "").replace(" ", ""))
                adj_lists.append("".join(lines[begin + 1: i + 1]))
            begin = i + 1
    return names, adj_lists


for mode in ["families", "libraries"]:
    # Find all dictionary.txt files below <path_prefix>/<mode>.
    dict_file_list = [
        os.path.join(root, file)
        for root, dirs, files in os.walk(path_prefix + mode)
        for file in files
        if file == "dictionary.txt"
    ]

    # The first row is a header describing the columns of the following rows.
    data = [('formula', 'smiles', 'path')]

    # For each file, parse the adjacency lists and convert them to SMILES.
    for path in dict_file_list:
        # Read inside a context manager so the handle is always closed.
        with open(path, "r") as f:
            text = f.readlines()

        names, adj_lists = _split_adjacency_lists(text)

        # The stored path has path_prefix replaced by a fixed database prefix;
        # the reaction cell matches dictionary entries on this rewritten path.
        stored_path = path.replace(path_prefix, "RMG-database/input/kinetics/")

        for name, adj_list in zip(names, adj_lists):
            cc = Molecule().from_adjacency_list(adj_list)
            s = to_smiles(cc, backend="openbabel")
            data.append((name, s, stored_path))

    # Store the pickle; the context manager flushes and closes the file.
    with open(path_prefix + 'molecule_smiles_{}.pickle'.format(mode), 'wb') as data_file:
        pickle.dump(data, data_file)

# process all reaction smiles

In [20]:
import codecs
import logging
import os.path
import re
from collections import OrderedDict

import numpy as np

from rmgpy.data.base import DatabaseError, Database, Entry
from rmgpy.data.kinetics.common import save_entry
from rmgpy.data.kinetics.family import TemplateReaction
from rmgpy.kinetics import Arrhenius, ThirdBody, Lindemann, Troe, \
                           PDepArrhenius, MultiArrhenius, MultiPDepArrhenius, Chebyshev 
from rmgpy.kinetics.surface import StickingCoefficient
from rmgpy.molecule import Molecule
from rmgpy.reaction import Reaction
from rmgpy.species import Species

from rmgpy.data.kinetics import *
from rmgpy.data.reference import *

# import rmgpy
# import rmgpy.data.base
# from rmgpy.data.base import Entry
# from rmgpy.data.kinetics.library import KineticsLibrary
# from rmgpy.data.thermo import ThermoLibrary
# from rmgpy.species import Species

from rmgpy.kinetics import Arrhenius, ArrheniusEP, ThirdBody, Lindemann, Troe, \
                           PDepArrhenius, MultiArrhenius, MultiPDepArrhenius, \
                           Chebyshev, KineticsData, StickingCoefficient, \
                           StickingCoefficientBEP, SurfaceArrhenius, SurfaceArrheniusBEP, ArrheniusBM

# from rmgpy import settings

# from rmgpy.data.rmg import RMGDatabase
# from rmgpy.data.kinetics.library import LibraryReaction
# from rmgpy.chemkin import save_chemkin_file, save_species_dictionary
# from rmgpy.rmg.model import Species

# from rmgpy.kinetics.arrhenius import Arrhenius

# from rmgpy.kinetics.model import KineticsModel, PDepKineticsModel, TunnelingModel, \
#                    get_rate_coefficient_units_from_reaction_order, get_reaction_order_from_rate_coefficient_units
# from rmgpy.kinetics.arrhenius import Arrhenius, ArrheniusEP, PDepArrhenius, MultiArrhenius, MultiPDepArrhenius, ArrheniusBM
# from rmgpy.kinetics.chebyshev import Chebyshev
# from rmgpy.kinetics.falloff import ThirdBody, Lindemann, Troe
# from rmgpy.kinetics.kineticsdata import KineticsData, PDepKineticsData
# from rmgpy.kinetics.tunneling import Wigner, Eckart
# from rmgpy.kinetics.surface import SurfaceArrhenius, SurfaceArrheniusBEP, \
#                      StickingCoefficient, StickingCoefficientBEP

# from rmgpy.data.reference import Reference, Article, Book, Thesis
# from rmgpy.exceptions import DatabaseError, InvalidAdjacencyListError
# from rmgpy.kinetics.uncertainties import RateUncertainty
# from rmgpy.molecule import Molecule, Group

In [21]:
# changed definition 
# https://github.com/ReactionMechanismGenerator/RMG-Py/blob/300c78290fdb1e6c928068c0049c7f73093d373d/rmgpy/data/kinetics/library.py#L525
def load_entry(
        index,
        label,
        kinetics,
        rank=None,
        degeneracy=1,
        duplicate=False,
        reversible=True,
        reference=None,
        referenceType='',
        shortDesc='',
        longDesc='',
        allow_pdep_route=False,
        elementary_high_p=False,
        allow_max_rate_violation=False,
        metal=None,
        site=None,
        facet=None,
):
    """
    Build an ``Entry`` from the keyword arguments of an ``entry(...)`` call
    found in a database ``reactions.py`` file.

    Note that the camelCase argument names are retained for backward
    compatibility with the database files.

    The reaction stored in the entry is deliberately blank (no reactants or
    products): species are not parsed here and no balance check is performed.
    The reaction label is kept verbatim in ``Entry.label``.

    Args:
        index: entry index, passed through to ``Entry``.
        label: reaction label string, passed through to ``Entry``.
        kinetics: kinetics object, stored as ``Entry.data``.
        rank: entry rank, passed through to ``Entry``.
        degeneracy, duplicate, reversible, allow_pdep_route,
        elementary_high_p, allow_max_rate_violation: forwarded to ``Reaction``.
        reference, referenceType, shortDesc, longDesc, metal, site, facet:
            forwarded to ``Entry`` (``longDesc`` is stripped of surrounding
            whitespace first).

    Returns:
        The newly created ``Entry``.
    """
    # Make a blank reaction that only carries the reaction-level flags.
    rxn = Reaction(
        reactants=[],
        products=[],
        degeneracy=degeneracy,
        duplicate=duplicate,
        reversible=reversible,
        allow_pdep_route=allow_pdep_route,
        elementary_high_p=elementary_high_p,
        allow_max_rate_violation=allow_max_rate_violation,
    )

    return Entry(
        index=index,
        label=label,
        item=rxn,
        data=kinetics,
        reference=reference,
        reference_type=referenceType,
        short_desc=shortDesc,
        long_desc=longDesc.strip(),
        # Forward the rank instead of silently discarding it.
        rank=rank,
        metal=metal,
        site=site,
        facet=facet,
    )

In [None]:
def _label_to_smiles(label, mapping):
    """Translate a reaction label into a reaction SMILES string.

    The label is split on single spaces.  ``+`` becomes ``.``, the arrows
    ``<=>``, ``=>`` and ``=`` become ``>>``, empty tokens are ignored and every
    other token is looked up in ``mapping`` (species name -> SMILES).

    Args:
        label: reaction label, e.g. ``"A + B <=> C"``.
        mapping: dict from species name to SMILES string.

    Returns:
        The reaction SMILES, or ``None`` if a species name is not in ``mapping``.
    """
    pieces = []
    for token in label.split(" "):
        if token == "":
            continue
        if token == "+":
            pieces.append(".")
        elif token in ("<=>", "=>", "="):
            pieces.append(">>")
        elif token in mapping:
            pieces.append(mapping[token])
        else:
            # Unknown species name: the label cannot be translated.
            return None
    return "".join(pieces)


for mode in ["families", "libraries"]:
    # Find all reactions.py files below <path_prefix>/<mode>.
    dict_file_list = [
        os.path.join(root, file)
        for root, dirs, files in os.walk(path_prefix + mode)
        for file in files
        if file == "reactions.py"
    ]

    # Load the molecule SMILES written by the molecule cell.
    # NOTE: pickle.load must only be used on files produced by this notebook.
    with open(path_prefix + 'molecule_smiles_{}.pickle'.format(mode), 'rb') as data_file:
        molecule_smiles = pickle.load(data_file)

    # Group the molecule rows once per mode: dictionary path -> {name: smiles}.
    # Later rows overwrite earlier ones with the same name, as a linear scan
    # in file order would.  The header row ends up under the key 'path' and is
    # never looked up.
    smiles_by_dictionary = {}
    for name, smiles, dict_path in molecule_smiles:
        smiles_by_dictionary.setdefault(dict_path, {})[name] = smiles

    # The first row is a header describing the columns of the following rows.
    data = [('label', 'smiles', 'path', "entry")]

    # For each file, parse the entries and convert their labels to SMILES.
    for path in dict_file_list:

        print(path)
        with open(path, "r") as f:
            text = f.readlines()

        # Skip the comment/header lines before the first entry.  A file with
        # no 'entry(' line at all has nothing to parse.
        first_entry = next((i for i, line in enumerate(text) if line == 'entry(\n'), None)
        if first_entry is None:
            continue
        text = text[first_entry:]

        entry_list = []

        # Split entries: an entry runs from a line starting with 'entry(' to
        # the next line that is exactly ')'.
        begin = 0
        for i, line in enumerate(text):
            # rstrip so a final ')' without a trailing newline still closes
            # the last entry.
            if line.rstrip("\n") == ")":
                # Only evaluate well-formed entries (handles the irregular
                # formatting of some family files).  'begin <= i' guards
                # against two adjacent ')' lines.
                if begin <= i and text[begin].startswith("entry("):
                    source = "load_entry(" + text[begin][6:] + "".join(text[begin + 1: i + 1])
                    # NOTE(security): eval executes arbitrary Python taken from
                    # the database file; only run this on a trusted database.
                    # The text is evaluated in the notebook namespace, so any
                    # names it uses must already be in scope there.
                    entry_list.append(eval(source))

                # Entries are assumed to be separated by exactly one blank line.
                begin = i + 2

        # Species names are resolved against the dictionary.txt that lives in
        # the same folder as this reactions.py.
        abs_path = path.replace(path_prefix, "RMG-database/input/kinetics/")
        mapping = smiles_by_dictionary.get(abs_path.replace("reactions.py", "dictionary.txt"), {})

        # Convert to SMILES and append to data; the entry itself is stored too
        # so its kinetics stay available.
        for entry in entry_list:
            data.append((entry.label,
                         _label_to_smiles(entry.label, mapping),
                         abs_path,
                         entry))

    # Store the pickle; the context manager flushes and closes the file.
    with open(path_prefix + 'reaction_smiles_{}.pickle'.format(mode), 'wb') as data_file:
        pickle.dump(data, data_file)
            
            