# Input data and parameters

In [1]:
from rdkit.Chem import rdFingerprintGenerator
from rdkit import Chem
import warnings
warnings.filterwarnings('ignore')
import math
import re
import numpy as np
import time
import pygad  #version==3.3.1
import csv
import pickle 

#SMILES-functional groups 
fgps=['*', '*C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)F', '*C=C', '*C', '*CC=C', '*OC(C)=O', '*OC', '*OCOC','*OC(=O)OC', '*C(=O)OC', '*C#N', 
      '*N=C=O', '*F', '*Cl', '*Br', '*c1ccccc1', '*c1cccs1', '*C(F)(F)F', '*[Si](C)(C)C','*CCC','*NC(C)=O','*C#C','*C=CC(=O)OC']

#SMILES-skeletons 
skeletons=['*COC(*)=O','*C1OC(=O)OC1*','*C1OC(=O)C(*)OC1=O','*C1OCC2(CO1)COC(*)OC2','*C(*)(OC)OC','*COC(=O)CCC(=O)OC*', '*COC(=O)CCCC(=O)OC*',
           '*COC(=O)OC*', '*C1COC(=O)C1*', '*C1CC(=O)OC1*', '*C1CC(*)C(=O)O1', '*C1OS(=O)OC1*', '*C1CCOS(=O)C1*', '*C1COS(=O)C(*)C1', 
           '*C1CCC(*)S(=O)O1','*C1COS(=O)CC1*','*C1CCS(=O)OC1*', '*C1CCS(=O)(=O)OC1*', '*C1CC(*)OS(=O)(=O)C1', '*C1CCC(*)S(=O)(=O)O1', 
           '*C1COS(=O)(=O)CC1*', '*C1COS(=O)(=O)C(*)C1', '*C1CCOS(=O)(=O)C1*', '*C(*)C', '*CC*','*C=C*','*C(*)=C','*C1C(=O)OC(=O)C1*',
           '*C#C*','*C1CC(=O)OC(=O)C1*','*C1CC(*)C(=O)OC1=O','*C1OS(=O)(=O)C(*)S(=O)(=O)O1']

#-----set of parameters for LUMO or chemical hardeness  -------#
#clustering by lUMO value:

chainsk=[[0, 25, 26, 28], [4, 5, 6, 7, 23, 24]]
cycsk=[[2, 29,27, 30,31],[1, 3, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22]]
task='LUMO' 

#clustering by chemical hardeness:

#cycsk=[[12, 13, 14, 15, 16],[1, 2, 3, 8, 9, 10, 11, 17, 18, 19, 20, 21, 22, 27, 29, 30, 31]]
#chainsk=[[0, 4, 5, 6, 7, 23, 24], [25, 26, 28]]
#task='chemical_hardness'

#-----------------------------#



path = 'morgen_rad2_1024bits_'+task+'_coeffi.pkl'

with open(path, 'rb') as f:
    loaded_dict = pickle.load(f)
regT=[[],[]]
for i in range(2):
    for j in range(2):
        reg={}
        reg["coeff"]=loaded_dict["reg"+str(i)+str(j)+"_Co_In"][0]
        reg["intercept"]=loaded_dict["reg"+str(i)+str(j)+"_Co_In"][1]
        regT[i].append(reg)



#morgen fingerprint as ECFP 4
bits=1024
mfpgen = rdFingerprintGenerator.GetMorganGenerator(radius=2,fpSize=bits)

print("---finish loading data and parameters---")

---finish loading data and parameters---


# Functions 

In [2]:
#generate defined features 
def code_Generator_defined(sklt,fg1,fg2,bits):
  
    #establish morgen fingerorint
   
    fg1=np.array( mfpgen.GetFingerprint (sk_fg_glue(sklt,fg1,"*"))) 
    fg2=np.array( mfpgen.GetFingerprint (sk_fg_glue(sklt,"*",fg2))) 
    sk=np.array( mfpgen.GetFingerprint (sk_fg_glue(sklt,"*","*"))) 

    #establish defined features
    code=(fg1-fg1*sk)+(fg2-fg2*sk)+sk
        
  
    x=[]

    x.append(code)
 
    return x
    
 #calculate the prediction results yred:prediction  
def ypred1(test,Reg_model):
    ypred=0
    
    ypred=np.matmul(Reg_model["coeff"],test) +Reg_model["intercept"]

    return ypred

  # code to glue skeleton and functional group together
def sk_fg_glue(i_skeleton,func_group1,func_group2):
    skeleton = Chem.MolFromSmiles(i_skeleton)
    func_group1 = Chem.MolFromSmiles(func_group1)
    func_group2 = Chem.MolFromSmiles(func_group2)
    combination = Chem.CombineMols(skeleton, func_group1)
    combination = Chem.CombineMols(combination, func_group2)

    r_loc = []
    for k in range(combination.GetNumAtoms()):
        atom = combination.GetAtoms()[k].GetSymbol()
        if atom == '*':
            r_loc.append(k)
            # connect between the skeleton and functional group
    edit_mol = Chem.EditableMol(combination)
    edit_mol.AddBond(r_loc[0], r_loc[2], order=Chem.rdchem.BondType.SINGLE)
    edit_mol.AddBond(r_loc[1], r_loc[3], order=Chem.rdchem.BondType.SINGLE)
    combination = edit_mol.GetMol()
    combination_smiles = Chem.MolToSmiles(combination)
            # correct the SMLIES afther gluing
    combination_smiles = combination_smiles.replace('**', '')
    combination_smiles = combination_smiles.replace('()', '')
    if combination_smiles.count('*(*') > 0:    
      
        combination_smiles = smiles_fix(combination_smiles)
    mol = Chem.MolFromSmiles(combination_smiles)
    return  mol

def smiles_fix(smiles):
    smiles = smiles.replace('*(*', '(', 1)
    branches = []
    # find (..), (..(..)..), and (..(..)..(..)..), use & to represent a branch
    num_of_parens = len(re.findall('\(\w+\)|\(\w*\(\w+\)\w*\)|\(\w*\(\w+\)\w*\(\w+\)\w*\)', smiles))
    for i in range(num_of_parens):
        branches.append(re.search('\(\w+\)|\(\w*\(\w+\)\w*\)|\(\w*\(\w+\)\w*\(\w+\)\w*\)', smiles).group(0))
        smiles = smiles.replace(branches[i], '&', 1)
    # deal with smiles start from branch
    if smiles[0] == '&':
        smiles = re.sub('(&)([A-Za-z]\d?)', r'\2\1', smiles, 1)

    for i in range(num_of_parens):
        branches[i] = branches[i].replace('1', f'{i*2+4}')
        branches[i] = branches[i].replace('2', f'{i*2+5}')
        smiles = smiles.replace('&', branches[i], 1)
    return smiles


# cost function for GA
def energy(solution):
    code11=[[],[],[]]
    code12=solution
   # print(code12)

    # convert solution to code11=[[skeleton],[function group at site 1],[function group at site 2]]
    for i in range(len(code12)):
        if i <len(skeletons) and code12[i]==1:
            code11[0].append(i)
        
        if i >=len(skeletons) and i <len(skeletons)+len(fgps) and code12[i]==1:
            code11[1].append(i-len(skeletons))

        if i >=len(skeletons)+len(fgps) and code12[i]==1:
            code11[2].append(i-len(skeletons)-len(fgps))
   

    
    E=0
    con=[0,0,0] #establish penalty [penalty for skeleton,penalty for site 1,penalty for site 2]
    penal=0
    for j in  range(len(code11)):
        if len(code11[j])==1 :
            pass
        else:
            con[j]=abs(len(code11[j])-1)

    # when the sum of penalties equals to zero, calculate prediction of chemical property
    if sum(con)==0:
        for i in range(len(cycsk)):
            #identify skeleton belongs to which ring-tpye cluster
            thing_index = cycsk[i].index(code11[0][0]) if code11[0][0] in cycsk[i] else -1 
            if thing_index!=-1:      
                
                #calculte prediction value
                reg=regT[0][i]
                linear_reg=reg
                atest=code_Generator_defined(skeletons[code11[0][0]],fgps[code11[1][0]],fgps[code11[2][0]],bits)
                E=ypred1(atest[0],linear_reg) #calculte prediction value
     
        

        for i in range(len(chainsk)):
            thing_index = chainsk[i].index(code11[0][0]) if code11[0][0] in chainsk[i] else -1
            if thing_index!=-1:
                reg=regT[1][i]
                linear_reg=reg
                atest=code_Generator_defined(skeletons[code11[0][0]],fgps[code11[1][0]],fgps[code11[2][0]],bits)
                E=ypred1(atest[0],linear_reg) 
                
        penal=0
    else:
        # when the sum of penalties equals to non-zero, penalize the solution
        penal=0.2
    E=E+ penal
     
    return E
        
        
#fitness function for GA
def fitness_func(ga_instance, solution, solution_idx):
   
    output1 =energy(solution)

    fitness1 =1/abs((-desired_output1+output1 )*1.5+1 ) 

    return fitness1    

# Solve by GA

In [80]:

    
# -----------Running the GA to optimize the parameters of the function----------
#desired_output1 = -0.10  #possible minimum of LUMO
desired_output1 = 0.1   #possible minimum of chemical hardness

num_generations = 100
num_parents_mating =400
sol_per_pop =440
num_genes = 78


start_time = time.time()
ga_instance = pygad.GA(num_generations=num_generations,
                       num_parents_mating=num_parents_mating,
                       sol_per_pop=sol_per_pop,
                       num_genes=num_genes,
                       fitness_func=fitness_func,
                       gene_type=int,
                       init_range_low=0,
                       init_range_high=2,
                       crossover_probability=0.5,
                       crossover_type="two_points"
                       ) 

ga_instance.run()


#ga_instance.run()
print("Running time :"+" %s seconds " % (time.time() - start_time))

#ga_instance.plot_fitness()

# Returning the details of the best solution.
solution, solution_fitness, solution_idx = ga_instance.best_solution(ga_instance.last_generation_fitness)
print(f"Parameters of the best solution : {solution}")
print(f"Fitness value of the best solution = {solution_fitness}")
print(f"Index of the best solution : {solution_idx}")

prediction =energy(solution)
print(f"Predicted output based on the best solution : {prediction}")

if ga_instance.best_solution_generation != -1:
    print(f"Best fitness value reached after {ga_instance.best_solution_generation} generations.")



Running time : 8.771552562713623 seconds 
Parameters of the best solution : [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 1]
Fitness value of the best solution = 1.4177715404307882
Index of the best solution : 0
Predicted output based on the best solution : -0.09644516224565058
Best fitness value reached after 64 generations.


# Notes

In [None]:
#optimal solution for samples: morgen_rad2_1024bits+" lumo" or "chemical_hardness"+'_coeffi.pkl'
# lUMO optimal solution -0.09645715875236538 
#optimal config =[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1]

#chemical hardness optimal solution 0.10340270447059448
#optimal config= [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0]