# Perera Multitask Model

In [1]:
# Enable IPython's autoreload extension; mode 2 picks up edits made to
# imported modules without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
from summit import *
from multitask import *


from botorch.models import MultiTaskGP
from botorch.fit import fit_gpytorch_model
from botorch.optim import optimize_acqf, optimize_acqf_mixed
from gpytorch.mlls.exact_marginal_log_likelihood import (
    ExactMarginalLogLikelihood,
)


import ord_schema
from ord_schema import message_helpers, validations
from ord_schema.proto.dataset_pb2 import *
from ord_schema.proto.reaction_pb2 import *
from ord_schema.message_helpers import find_submessages, get_reaction_smiles
from ord_schema import units

from rdkit import Chem
import rdkit.Chem.rdChemReactions as react
from pint import UnitRegistry

from pathlib import Path
from typing import Iterable
import pandas as pd
from copy import deepcopy

## Load Data

In [3]:
# Data locations, relative to the directory the notebook is run from.
data_path = Path("../data")
perera_suzuki_path = data_path.joinpath("perera_suzuki", "ord")

In [4]:
# Load every ORD dataset message (*.pb) found in the Perera Suzuki directory.
# The paths are sorted because Path.glob yields entries in arbitrary,
# filesystem-dependent order, and later cells pick a dataset by list position
# (perera_ds_list[0]); sorting keeps that choice reproducible across machines.
perera_datasets = [
    message_helpers.load_message(
        str(path), 
        dataset_pb2.Dataset
    )
    for path in sorted(perera_suzuki_path.glob("*.pb"))
]

In [5]:
def change_pd_acetate_representation(dataset: Dataset, copy=True):
    """Rewrite the SMILES used for palladium acetate catalysts in a dataset.

    Walks every component of every reaction input. For components whose
    reaction role is CATALYST, any SMILES identifier whose value contains
    ``CC(=O)[O-].CC(=O)[O-].[Pd+2]`` has its whole value replaced with
    ``CC(=O)O[Pd]OC(=O)C``.

    Parameters
    ----------
    dataset : Dataset
        Dataset whose reactions are scanned.
    copy : bool, optional
        When truthy (the default) the work is done on a deep copy and the
        argument is left untouched; otherwise ``dataset`` is modified in place.

    Returns
    -------
    Dataset
        The dataset that was modified (the deep copy, or ``dataset`` itself).
    """
    old_smiles = "CC(=O)[O-].CC(=O)[O-].[Pd+2]"
    new_smiles = "CC(=O)O[Pd]OC(=O)C"

    target = deepcopy(dataset) if copy else dataset
    for reaction in target.reactions:
        reaction_inputs = reaction.inputs
        for input_key in reaction_inputs:
            for component in reaction_inputs[input_key].components:
                if component is None:
                    continue
                # Only catalyst components are rewritten.
                if component.reaction_role != ReactionRole.CATALYST:
                    continue
                for identifier in component.identifiers:
                    is_smiles = identifier.type == CompoundIdentifier.SMILES
                    # Substring match, but the entire value is overwritten.
                    if is_smiles and old_smiles in identifier.value:
                        identifier.value = new_smiles
    return target

In [6]:
# Correct Pd acetate representation.
# Each dataset is processed with the default copy=True, so the list is rebuilt
# from modified deep copies rather than mutating the loaded messages.
perera_datasets = list(map(change_pd_acetate_representation, perera_datasets))

## Create Summit DataSets

In [15]:
# Convert the reactions of each ORD dataset with suzuki_reaction_to_dataframe;
# the result keeps one entry per dataset, in the same order as perera_datasets.
perera_ds_list = [
    suzuki_reaction_to_dataframe(ord_dataset.reactions)
    for ord_dataset in perera_datasets
]

In [31]:
# Remove non-phosphorus containing ligands,
# which are just encoded as empty strings.
# Slice assignment replaces the entries of the existing list in place.
perera_ds_list[:] = [
    frame[frame["ligand_smiles"] != ""]
    for frame in perera_ds_list
]

In [16]:
# SMILES strings of the ligands that are compared against the dataset below.
# NOTE(review): the common names in the trailing comments are the reviewer's
# reading of the SMILES and should be confirmed before being relied upon.
approved_ligands = [
    "CC(C)Oc1cccc(OC(C)C)c1-c1ccccc1P(C1CCCCC1)C1CCCCC1",  # RuPhos?
    "COc1cccc(OC)c1-c1ccccc1P(C1CCCCC1)C1CCCCC1",  # SPhos?
    "c1ccc(P(c2ccccc2)c2ccccc2)cc1",  # triphenylphosphine?
    "CC(C)(C)P(C(C)(C)C)C(C)(C)C",  # tri-tert-butylphosphine?
    "CC1(C)c2cccc(P(c3ccccc3)c3ccccc3)c2Oc2c(P(c3ccccc3)c3ccccc3)cccc21",  # Xantphos?
    "C1CCC(P(C2CCCCC2)C2CCCCC2)CC1",  # tricyclohexylphosphine?
    "CC(C)c1cc(C(C)C)c(-c2ccccc2P(C2CCCCC2)C2CCCCC2)c(C(C)C)c1",  # XPhos?
]

In [17]:
# Inspect the first dataset: how many distinct ligands does it contain, and
# how many of them appear in approved_ligands?
df = perera_ds_list[0]
ligands = df["ligand_smiles"].unique()
print(f"Number of unique ligands in DataSet: {len(ligands)}")
count = sum(1 for ligand in ligands if ligand in approved_ligands)
print(f"Matching approved ligands: {count}")

Number of unique ligands in DataSet: 12
Matching approved ligands: 6


In [18]:
# Number of rows per distinct ligand SMILES in the dataset selected above.
df["ligand_smiles"].value_counts()

                                                                                 32
c1ccc(P(c2ccccc2)[c-]2cccc2)cc1                                                  32
CC(C)(C)P(C(C)(C)C)C(C)(C)C                                                      32
CC(C)(C)P([C]1[CH][CH][CH][CH]1)C(C)(C)C                                         32
c1ccc(P(c2ccccc2)c2ccccc2)cc1                                                    32
Cc1ccccc1P(c1ccccc1C)c1ccccc1C                                                   32
CN(C)c1ccc(P(C(C)(C)C)C(C)(C)C)cc1                                               32
CCCCP([C@]12C[C@H]3C[C@H](C[C@H](C3)C1)C2)[C@]12C[C@H]3C[C@H](C[C@H](C3)C1)C2    32
CC(C)c1cc(C(C)C)c(-c2ccccc2P(C2CCCCC2)C2CCCCC2)c(C(C)C)c1                        32
COc1cccc(OC)c1-c1ccccc1P(C1CCCCC1)C1CCCCC1                                       32
C1CCC(P(C2CCCCC2)C2CCCCC2)CC1                                                    32
CC1(C)c2cccc(P(c3ccccc3)c3ccccc3)c2Oc2c(P(c3ccccc3)c3ccccc3)cccc21          

## Train Single Models

## Train Multitask Model