# The ATG tree fitting notebook for RMG kinetics database
This is an ATG tree fitting notebook used to create an isomorphic tree for rate estimation based on the training reactions stored in the RMG database. The notebook was originally written by Matthew Johnson and has been slightly modified here.

In [None]:
# Wildcard imports come first so that the explicit imports below always win on a name clash.
from rmgpy.molecule.molecule import *
from rmgpy.species import *
from rmgpy.chemkin import *

import os
import time

import matplotlib
import matplotlib.pyplot as plt
import numpy as np

from rmgpy import settings
from rmgpy.data.rmg import RMGDatabase
from rmgpy.species import Species
matplotlib.use('TkAgg')
%matplotlib inline

Load RMG database

In [None]:
# Thermo libraries handed to the database loader.
thermo_libraries = [
    'primaryThermoLibrary',
    'primaryNS',
    'thermo_DFT_CCSDTF12_BAC',
    'DFT_QCI_thermo',
    'SABIC_aromatics',
    'Klippenstein_Glarborg2016',
    'BurkeH2O2',
    'NitrogenCurran',
    'NOx2018',
    'FFCM1(-)',
    'SulfurLibrary',
    'SulfurGlarborgH2S',
]

# Load every kinetics family together with its 'training' depository;
# no transport libraries, reaction libraries or seed mechanisms are requested.
database = RMGDatabase()
database.load(
    path=settings['database.directory'],
    thermo_libraries=thermo_libraries,
    transport_libraries=[],
    reaction_libraries=[],
    seed_mechanisms=[],
    kinetics_families='all',
    kinetics_depositories=['training'],
    depository=False,  # the depository information is not used in this notebook, so skip loading it
)

Enter the family name of the tree to be created

In [None]:
# Name of the kinetics family whose tree is generated below; edit this to target another family.
family_name = "1,3_sigmatropic_rearrangement"

# Fail early with an actionable message if the name is misspelled or the family was not loaded.
if family_name not in database.kinetics.families:
    raise KeyError(
        f'Kinetics family {family_name!r} was not loaded. '
        f'Available families: {sorted(database.kinetics.families)}'
    )
family = database.kinetics.families[family_name]

Training

In [None]:
# Generate the tree for the selected family and report how long it took.
# perf_counter() is a monotonic clock, so the elapsed time is not distorted
# by system clock adjustments during a long run.
start = time.perf_counter()
family.clean_tree()  # NOTE(review): presumably discards the existing tree before regeneration - confirm
family.generate_tree(
    thermo_database=database.thermo,
    nprocs=1,  # number of processes
    new_fraction_threshold_to_reopt_node=0.25,
    max_batch_size=800,
    extension_iter_max=2,
    extension_iter_item_cap=100,
)
family.check_tree()
end = time.perf_counter()
print(f'Training time: {end - start:.3f} s')

In [None]:
# Report the size of the tree, then show its entries as the cell output.
tree_entries = family.groups.entries
print(f'Number of nodes: {len(tree_entries)}')
print('Tree nodes:')
tree_entries

Tree regularization

In [None]:
# Regularize the generated tree and report the elapsed time.
# perf_counter() is monotonic, which makes it the appropriate clock for measuring durations.
start = time.perf_counter()
family.regularize(thermo_database=database.thermo)
end = time.perf_counter()
print(f'Tree regularization: {end - start:.3f} s')

Match training reactions for each node

In [None]:
# Match the training reactions against the tree and keep the result for the rule-building step.
# perf_counter() is monotonic, which makes it the appropriate clock for measuring durations.
start = time.perf_counter()
templateRxnMap = family.get_reaction_matches(
    thermo_database=database.thermo,
    remove_degeneracy=True,
    get_reverse=True,
    exact_matches_only=False,
    fix_labels=True,
)
end = time.perf_counter()
print(f'Training reaction match: {end - start:.3f} s')

Clean rate rules and make new rate rules based on ATG

In [None]:
# Drop the existing rate rules, rebuild them from the matched training reactions,
# then re-run the tree check and report the elapsed time.
# perf_counter() is monotonic, which makes it the appropriate clock for measuring durations.
start = time.perf_counter()
family.clean_tree_rules()
family.make_bm_rules_from_template_rxn_map(templateRxnMap)
family.check_tree()
end = time.perf_counter()
print(f'Build rate rules: {end - start:.3f} s')

Estimate the uncertainty of the ATG rate rules by cross-validation

In [None]:
# Cross-validate the rate rules; `errors` feeds the histogram drawn afterwards.
# perf_counter() is monotonic, which makes it the appropriate clock for measuring durations.
start = time.perf_counter()
errors, uncertainties = family.cross_validate(
    iters=0,
    random_state=5,
    folds=0,  # 0 for leave-one-out
    ascend=False,
)
end = time.perf_counter()
print(f'Cross validation: {end - start:.3f} s')

Visualize rate estimation error

In [None]:
# Normalised histogram of the absolute cross-validation errors.
# The explicit Axes interface avoids relying on pyplot's implicit "current figure" state.
fig, ax = plt.subplots(figsize=(10, 8))
ax.hist(np.abs(list(errors.values())), bins=30, density=True)
ax.set_title('Decision Tree Estimator', fontsize=18)
ax.set_xlim(0, 20)  # values above 20 fall outside the visible range
ax.set_ylabel('Probability density', fontsize=18)
# \ln renders as an upright function name; a bare "Ln" in math mode is typeset as
# the product of two italic variables. The trailing semicolon hides the Text(...) repr.
ax.set_xlabel(r'$|\ln(k_{est}/k_{rxn})|$', fontsize=18);

Save the ATG rate rules to the RMG database

In [None]:
# Destination is this family's folder inside the RMG database directory.
# NOTE(review): this writes into the database itself - presumably replacing the family's existing files; confirm before running.
family_dir = os.path.join(settings['database.directory'], 'kinetics', 'families', family_name)
family.save(family_dir)