In [1]:
from collections import defaultdict
import pickle
from dataclasses import dataclass
from typing import List

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm

from rmgpy.data.kinetics import KineticsDatabase
from rmgpy.kinetics import (Arrhenius, ArrheniusBM, ArrheniusEP, Chebyshev,
                            KineticsData, Lindemann, MultiArrhenius,
                            MultiPDepArrhenius, PDepArrhenius,
                            StickingCoefficient, StickingCoefficientBEP,
                            SurfaceArrhenius, SurfaceArrheniusBEP, ThirdBody,
                            Troe)
from rmgpy.kinetics.model import KineticsModel
from rmgpy.reaction import Reaction

In [2]:
@dataclass
class Reaction:
    """One kinetics-depository entry flattened into a plain record.

    NOTE(review): this class reuses the name ``Reaction`` that the import cell
    brings in from ``rmgpy.reaction``. The ``reaction`` annotation below is
    evaluated while the class body runs, i.e. before this class is bound to
    the name, so it still refers to the rmgpy class. After this cell every
    later use of ``Reaction`` in the notebook refers to this dataclass.
    Renaming would be clearer but would change the interface used by the
    later cells and by any pickles of these records.
    """

    # Key of the family in ``kdb.families`` the entry was read from.
    family_name: str
    # ``label`` of the depository that holds the entry.
    depo_label: str
    # ``index`` attribute of the depository entry.
    indepo_index: int
    # ``item`` of the depository entry (annotated with the rmgpy Reaction type).
    reaction: Reaction
    # ``data`` of the depository entry, i.e. its kinetics model.
    kinetics_property: KineticsModel
    # ``rank`` attribute of the depository entry.
    rank: int

# Load reactions from the RMG kinetics database

In [3]:
# Load all kinetics families, with all of their depositories, from the path
# below; a message is printed before and after the load.
print("loading rmg kinetic database ....")
kdb = KineticsDatabase()
kdb.load_families(
    "/rmg/RMG-database/input/kinetics/families",
    families='all',
    depositories='all',
)
print("done")

loading rmg kinetic database ....
done


In [4]:
# Flatten every entry of every depository of every family into one list of
# Reaction records, cache that list on disk, and report how many were found.
family_reaction_list = []
for family_name, family in tqdm(kdb.families.items(), desc="family"):
    for depository in family.depositories:
        for entry in depository.entries.values():
            family_reaction_list.append(
                Reaction(
                    family_name=family_name,
                    depo_label=depository.label,
                    indepo_index=entry.index,
                    reaction=entry.item,
                    kinetics_property=entry.data,
                    rank=entry.rank,
                )
            )

# Persist the collected records so later sections can reload them.
with open("family_reactions.pickle", 'wb') as pickle_file:
    pickle.dump(family_reaction_list, pickle_file)
print(len(family_reaction_list), " reactions collected")

family:   0%|          | 0/102 [00:00<?, ?it/s]

12166  reactions collected


# Reaction to Input and Output

In [5]:
def input2reaction_smiles(sample: Reaction):
    """Return the reaction SMILES ``reactants>>products`` for ``sample``.

    The ``smiles`` of each species in ``sample.reaction.reactants`` are joined
    with ``.``, followed by ``>>`` and the ``smiles`` of each species in
    ``sample.reaction.products`` joined the same way.
    """
    rxn = sample.reaction
    reactant_side = ".".join([species.smiles for species in rxn.reactants])
    product_side = ".".join([species.smiles for species in rxn.products])
    return reactant_side + ">>" + product_side

In [6]:
def input2reaction_label(sample: Reaction):
    """Return the labeled string form of the reaction held by ``sample``.

    Delegates to ``to_labeled_str()`` of ``sample.reaction``.
    """
    rxn = sample.reaction
    return rxn.to_labeled_str()

In [7]:
def input2Arrenius_coefs(sample: Reaction):
    """Extract the three Arrhenius parameters of ``sample`` as a dict.

    Returns a dict with the keys ``'A'``, ``'n'`` and ``'Ea'``, each read from
    the ``value_si`` of the matching attribute of ``sample.kinetics_property``.
    ``Ea`` is additionally divided by 1000.
    """
    kinetics = sample.kinetics_property
    coefs = {}
    coefs['A'] = kinetics.A.value_si
    coefs['n'] = kinetics.n.value_si
    # presumably converts J/mol to kJ/mol — TODO confirm the SI unit of Ea
    coefs['Ea'] = kinetics.Ea.value_si / 1000
    return coefs

# RMG-Family for mechanism OOD

In [8]:
# Reload the reaction records cached in "family_reactions.pickle".
# NOTE: unpickling can execute arbitrary code; only load a pickle file that
# this notebook produced itself.
with open("family_reactions.pickle", 'rb') as pickle_file:
    reaction_list = pickle.load(pickle_file)

In [9]:
# Build the column-wise table for the family-level dataset: one row per
# reaction record whose kinetics are an ``Arrhenius`` instance.
data = defaultdict(list)

# Iterate over ``reaction_list`` (reloaded from "family_reactions.pickle" in
# the preceding cell) rather than the in-memory ``family_reaction_list`` from
# the collection cell. Otherwise the reloaded list is never used, and this
# section cannot run from the cached pickle without repeating the database
# load.
for sample_id, sample in enumerate(reaction_list):
    # Skip every record whose kinetics are not an Arrhenius instance; the
    # coefficient extraction below reads its A / n / Ea attributes.
    if not isinstance(sample.kinetics_property, Arrhenius):
        continue

    # sample_id is the position in the full list, so skipped records leave gaps.
    data['sample_id'].append(sample_id)
    data['smiles'].append(input2reaction_smiles(sample))
    data['label'].append(input2reaction_label(sample))
    # The reaction family name is stored in the 'domain_index' column.
    data['domain_index'].append(sample.family_name)
    data['indepo_index'].append(sample.indepo_index)
    data['depository'].append(sample.depo_label)
    data['rank'].append(sample.rank)
    # The target is the 'Ea' entry returned by input2Arrenius_coefs.
    data['target'].append(input2Arrenius_coefs(sample)['Ea'])

In [10]:
# Turn the column-wise lists into a DataFrame (one column per key of `data`).
family_MOOD_df = pd.DataFrame(data=data)

In [11]:
# Write the full table to "family_MOOD.csv" without the DataFrame index column.
family_MOOD_df.to_csv("family_MOOD.csv", index=False)

In [12]:
# Number of rows in the full table.
len(family_MOOD_df.index)

12153

In [13]:
# Number of distinct reaction SMILES in the full table.
len(family_MOOD_df['smiles'].unique())

9905

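## Handle duplicates

Reaction SMILES that occur more than once are dropped entirely (no copy is kept); only reactions whose SMILES is unique remain.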

In [14]:
# Keep only the groups holding exactly one row, i.e. reactions whose SMILES
# appears once in the table; every SMILES with two or more rows is discarded.
reduced_subdfs = [
    group
    for _smiles_key, group in family_MOOD_df.groupby('smiles')
    if group.shape[0] == 1
]

In [15]:
# Stack the single-row groups back into one DataFrame (in group order).
family_MOOD_df_reduced = pd.concat(objs=reduced_subdfs)

In [16]:
# Report how many rows remain after removing duplicated SMILES.
print(len(family_MOOD_df_reduced.index))

9276


In [17]:
# Write the de-duplicated table to "family_MOOD_reduced.csv" without the
# DataFrame index column.
family_MOOD_df_reduced.to_csv("family_MOOD_reduced.csv", index=False)