## Accelerating Metal-Organic Framework Discovery via Synthesisability Prediction: The MFD Evaluation Method for One-Class Classification Models

Chi Zhang, Dmytro Antypov, Matthew J. Rosseinsky, and Matthew S. Dyer*<br/>
Email: M.S.Dyer@liverpool.ac.uk <br/>

Please cite the corresponding paper if you use the Maximum Fractional Difference (MFD) method or these trained machine learning (ML) models in your work.

### This Jupyter Notebook presents the code to predict the synthesisability of a [metal, linker] combination given by the user

This notebook requires the following input file from the `best_vs_poor_model` folder:<br/>
`deep_model.tar` - the saved best-performing DeepSVDD model.<br/>

and the following input file from the current folder:<br/>
`deep_scaler.joblib` - the saved DeepSVDD model scaler used to normalise the prediction score for the input.<br/>

### Please input the [metal, linker] combination you would like to predict before calculation. Enter the metal as an element symbol and the organic linker as a SMILES string.

Here we use ['Zr', 'O=C(O)CCC(=O)Nc1ccc(C(=O)O)cc1'] as an example.

### Import training score and model

In [1]:
#import the basic libraries
import pandas as pd
import numpy as np
from numpy import nan as NaN

from sklearn import preprocessing
from sklearn.preprocessing import MinMaxScaler
from sklearn import datasets, metrics

In [2]:
#import the trained DeepSVDD model
import sys
# Add the bundled Deep-SVDD-PyTorch and set_transformer source folders to the
# import path. The entries are relative, so they are resolved against the
# current working directory when the imports below run.
paths = ['model_architecture/Deep-SVDD-PyTorch/', 'model_architecture/Deep-SVDD-PyTorch/src', 'model_architecture/set_transformer/']
sys.path.extend(paths)

import torch
import torch.nn as nn
# The three imports below are not installed packages; presumably they are
# found through the sys.path entries added above (verify the folder layout).
from modules import SAB, PMA, ISAB
import deepSVDD
from base.base_net import BaseNet

def init_weights(m):
    """Initialise a linear layer in place; meant to be passed to ``Module.apply``.

    Weights are drawn from a Xavier/Glorot uniform distribution and every bias
    entry is set to 0.01. Modules that are not linear layers are left untouched.

    Args:
        m: Any ``torch.nn.Module``. Only ``nn.Linear`` instances (subclasses
            included) are modified.
    """
    if isinstance(m, nn.Linear):
        # ``xavier_uniform_`` is the in-place, non-deprecated spelling of the
        # former ``xavier_uniform``.
        nn.init.xavier_uniform_(m.weight)
        # Layers built with ``bias=False`` have ``bias is None``.
        if m.bias is not None:
            m.bias.data.fill_(0.01)
        
def build_autoencoder(net_name):
    """Return a newly constructed ``PairsAutoEncoder``.

    Args:
        net_name: Accepted but not used; the same architecture is built for
            every name.
    """
    autoencoder = PairsAutoEncoder()
    return autoencoder

def build_network(net_name):
    """Return a newly constructed ``PairsEncoder``.

    Args:
        net_name: Accepted but not used; the same architecture is built for
            every name.
    """
    encoder = PairsEncoder()
    return encoder

# Length of one feature vector fed to the encoder.
# Presumably 205 scaled metal features + 2048 fingerprint bits -- confirm
# against the featurisation code.
INPUT_DIM = 2253


class PairsEncoder(BaseNet):
    """Encoder that maps a flattened set of feature vectors to one embedding.

    The network is three ``SAB`` blocks (INPUT_DIM -> 1000 -> 500 -> 100)
    followed by a ``PMA`` block with a single seed.
    """

    def __init__(self):
        super().__init__()
        # Size of the embedding produced by ``forward``.
        self.rep_dim = 100
        # The attribute name ``seq`` and the layer order determine the
        # parameter names in the state dict, so they are kept as they were.
        self.seq = nn.Sequential(
            SAB(dim_in=INPUT_DIM, dim_out=1000, num_heads=10),
            SAB(dim_in=1000, dim_out=500, num_heads=10),
            SAB(dim_in=500, dim_out=100, num_heads=10),
            PMA(dim=100, num_heads=5, num_seeds=1),
        )

    def forward(self, x):
        """Encode a batch of flattened feature sets.

        Args:
            x: 2-D tensor whose second axis holds one or more concatenated
                feature vectors of length ``INPUT_DIM`` each.

        Returns:
            The output of ``self.seq`` with axis 1 removed.
        """
        # (batch, k * INPUT_DIM) -> k chunks of (batch, INPUT_DIM)
        chunks = torch.split(x, INPUT_DIM, dim=1)
        # stack gives (k, batch, INPUT_DIM); transpose to (batch, k, INPUT_DIM)
        x = torch.stack(chunks).transpose(0, 1)
        # Drop only axis 1. A bare ``squeeze()`` would also drop the batch axis
        # for a batch of one, leaving a 1-D tensor that breaks ``dim=1``
        # reductions downstream.
        # Assumes self.seq returns (batch, 1, rep_dim), as PMA with
        # num_seeds=1 suggests -- TODO confirm.
        return self.seq(x).squeeze(1)

class PairsAutoEncoder(BaseNet):
    """Autoencoder pairing a ``PairsEncoder`` with a linear + sigmoid decoder.

    The decoder maps the 100-dimensional encoder output to 2253 values in
    (0, 1). Both halves are initialised with ``init_weights``.
    """

    def __init__(self):
        super().__init__()

        # Encoder first, then decoder: this keeps the module registration
        # order and the order in which random initial values are drawn.
        encoder = PairsEncoder()
        encoder.apply(init_weights)
        self.encoder = encoder

        decoder = nn.Sequential(
            nn.Linear(in_features=100, out_features=2253),
            nn.Sigmoid(),
        )
        decoder.apply(init_weights)
        self.decoder = decoder

    def forward(self, x):
        """Encode ``x`` and return its reconstruction from the decoder."""
        latent = self.encoder(x)
        return self.decoder(latent)

#load deep model
# The encoder and autoencoder are attached to the DeepSVDD object before
# load_model is called. Presumably load_model restores the saved weights into
# these networks -- confirm against deepSVDD.load_model.
net_name = 'mof_Net'
clf_deep = deepSVDD.DeepSVDD()
clf_deep.net = build_network(net_name)
clf_deep.ae_net = build_autoencoder(net_name)
clf_deep.net_name = net_name
# Path is relative to the current working directory.
clf_deep.load_model(model_path='best_vs_poor_model/deep_model.tar')

  


# Get the input and generate metal + linker features for input

In [3]:
#get the input: a single metal (element symbol) and a single linker (SMILES string)
metal = 'Zr' # example value, replace with your own metal
linker = 'O=C(O)CCC(=O)Nc1ccc(C(=O)O)cc1' # example value, replace with your own linker

In [4]:
#generate metal features for the input
# The table is indexed by its first column; `metal` is looked up as a row label.
metal_scaled = pd.read_csv('dataset_generation_and_featurization/metal_scaled_205.csv', index_col=0)
if metal not in metal_scaled.index:
    # Fail early with a readable message instead of a bare KeyError from .loc.
    raise KeyError(
        'Metal {!r} is not present in the metal feature table; '
        'please enter a supported element symbol.'.format(metal)
    )
metal_df = metal_scaled.loc[metal,:]

#generate linker features for the input
import rdkit
from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem

mol = Chem.MolFromSmiles(linker)
if mol is None:
    # MolFromSmiles returns None (rather than raising) when the SMILES string
    # cannot be parsed; without this check the failure surfaces later as an
    # opaque argument error.
    raise ValueError(
        'Could not parse the linker SMILES string {!r}; '
        'please check the input.'.format(linker)
    )
# SMILES string regenerated by RDKit from the parsed molecule.
linker_modified = Chem.MolToSmiles(mol)
# Morgan fingerprint with radius 3 folded into 2048 bits.
fpts_dl = AllChem.GetMorganFingerprintAsBitVect(mol,3,2048)
linker_df_dl = np.array(fpts_dl) # linker features to be used in deep model

#concatenate metal features & linker features (metal first, then linker)
df_dl = np.concatenate((metal_df.to_numpy(), linker_df_dl)) # to be used in deep model

In [7]:
from joblib import dump, load

# The triple-quoted string below is disabled code, kept for reference only and
# never executed. It fits a MinMaxScaler on the concatenated DeepSVDD train and
# test scores and saves it as deep_scaler.joblib -- presumably this is how the
# shipped scaler file was produced.
'''
#DeepSVDD model scaler
deep_train = pd.read_csv('one_class_classification_models/DeepSVDD_train.csv', index_col=0)
deep_test = pd.read_csv('one_class_classification_models/DeepSVDD_test.csv', index_col=0)
deep_range = np.concatenate((deep_train,deep_test)).reshape(-1,1)

MinMax_scaler = preprocessing.MinMaxScaler()
deep_scaler = MinMax_scaler.fit(np.array(deep_range).reshape(-1,1))

#save deep_scaler
dump(deep_scaler, 'deep_scaler.joblib')
'''

#load deep_scaler
# NOTE: joblib.load unpickles the file, so only load a scaler file that comes
# from a trusted source.
deep_scaler = load('deep_scaler.joblib')

device = 'cpu'


def score(deep_SVDD, X):
    """Return the Deep SVDD anomaly score of every row of ``X``.

    The score is the squared distance between the network output and the
    centre ``deep_SVDD.c``. When ``deep_SVDD.objective`` is
    ``'soft-boundary'`` the squared radius ``deep_SVDD.R ** 2`` is subtracted.

    Args:
        deep_SVDD: Object exposing ``net``, ``c``, ``R`` and ``objective``.
        X: Input accepted by ``torch.FloatTensor``, one sample per row.

    Returns:
        A tensor of scores computed without gradient tracking.
    """
    with torch.no_grad():
        model = deep_SVDD.net.to(device)
        inputs = torch.FloatTensor(X).to(device)
        embedding = model(inputs)

        # Wrapping c in a list gives the centre a leading axis of length 1,
        # so it broadcasts against the batch of embeddings.
        centre = torch.FloatTensor([deep_SVDD.c]).to(device)
        radius = torch.FloatTensor([deep_SVDD.R]).to(device)

        sq_dist = ((embedding - centre) ** 2).sum(dim=1)
        if deep_SVDD.objective != 'soft-boundary':
            return sq_dist
        return sq_dist - radius ** 2

# Negate the DeepSVDD score so that inputs closer to the centre get a higher
# value, then map it through the saved scaler.
raw_score = -score(clf_deep, df_dl.reshape(1, -1)).cpu().detach().numpy()
scaled_score = deep_scaler.transform(raw_score.reshape(-1, 1))
# Single input, so the scaled result holds exactly one value.
output_deep = np.round(scaled_score[0][0], 3)
# NOTE(review): the 0.92 threshold is applied to the value already rounded to
# 3 decimals, so a score just above 0.92 that rounds to 0.92 is labelled
# False -- confirm this is intended.
output_deep_predict = output_deep > 0.92

In [9]:
#output the normalised score (rounded to 3 decimals)
print('The normalised prediction score for your input is: ', output_deep)
#output the prediction: True means the model predicts the combination is synthesisable, False means it does not
print('Our model predicts the synthesisability of the input combination is: ', output_deep_predict)

The normalised prediction score for your input is:  0.618
Our model predicts the synthesisability of the input combination is:  False
