In [1]:
'''
用于离子液体的定向生成
在指定的T (K),P (bar)条件下，寻找高的CO2_solubility对应的ILs

通过指定性质，筛选分子，再进行拆分，寻找相似结构（扩充），合成新分子,实现分子自动进化寻优
优化目标为 分子具备较好的CO2_solubility, 其次考虑可合成性sa score，用区间限定

sa score:越靠近1表明越容易合成，越靠近10表明合成越困难。介于1和10之间
CO2_solubility:二氧化碳溶解度越高越好。介于0,1之间

    # 容忍范围 {"SAscore":(1,3),target_property:(0.3,1)}
    # 理想目标{"SAscore":1,target_property:1}
    # 温压条件：25 ℃ (298.15 K), 10bar
'''
import random
import numpy as np
import pandas as pd
import os
os.chdir(r'E:\Pycharm projects')

from calc_logP_QED_sa import mol_properties
from test_fp_similarity import *

from littlecode.tools.chouyang import chouyang
from littlecode.tools.mkdir import mkdir

from IL_Generator.ions_tools import split_Ions,clean_ion_smiles
from IL_Generator.similar_mol_search import *
from IL_Generator.ion_mutate import ion_mutate
from rdkit import Chem
from icecream.icecream import ic
from rdkit import RDLogger
RDLogger.DisableLog('rdApp.*')

root = "F:\\manuscript4\\rundata_for_manuscript04\\"
target_property = "CO2_solubility"

temperature_input = 298.15  # 温度，单位为K
pressure_input = 10  # 压力，单位为bar


In [2]:
run_init = False
# 初始种群生成
if run_init:
    cleaned_all_ILs = pd.DataFrame()

    all_ILs = pd.read_csv(r"E:\Pycharm projects\IL_Generator\CO2_solubility_prediction\dataset\数据库所有ILs.csv")
    all_ILs_smiles = all_ILs['SMILES'].tolist()
    all_ILs_smiles = clean_ion_smiles(all_ILs_smiles) # 清理IL中的重复离子
    cal_prop = mol_properties(all_ILs_smiles)

    cleaned_all_ILs["SMILES"] = all_ILs_smiles
    cleaned_all_ILs[target_property] = cal_prop.cal_co2_solubility(temperature_input,pressure_input)
    cleaned_all_ILs['SAscore'] = cal_prop.cal_sa()

    gen0_ils  = cleaned_all_ILs[(cleaned_all_ILs['SAscore']<3) & (cleaned_all_ILs[target_property]>0.3)]

    gen0_ils.to_csv(root+"generation\\gen0\\selected_mols.csv")
    cleaned_all_ILs.to_csv(root+"generation\\gen0\\数据库ILs总结果.csv")

In [3]:
run_init = False
if run_init:
    # 总阴阳离子数据库生成
    init_mols = pd.read_csv(root+"generation\\gen0\\数据库ILs总结果.csv")
    init_mols_groups = init_mols["SMILES"].tolist()

    cation_list,anion_list,_ = split_Ions(init_mols_groups)

    # 计算Morgan指纹预备相似性比对
    cation_fp_database = GetMorganFingerprint(cation_list)
    anion_fp_database = GetMorganFingerprint(anion_list)
    cation_fp_database.to_csv(root+"generation\\gen0\\cation_fp_database.csv", index = False)
    anion_fp_database.to_csv(root+"generation\\gen0\\anion_fp_database.csv", index = False)

    # 备份一个
    cation_fp_database.to_csv(root+"generation\\gen0\\cation_fp_database_origin.csv")
    anion_fp_database.to_csv(root+"generation\\gen0\\anion_fp_database_origin.csv")



In [4]:
# 根据smiles计算properties：logP_QED_sa，再计算new_mertric
# 最好再排序筛选
class screen_smi:
    def __init__(self, smi_list):
        '''
        :param smi_list: list of smiles,或者已经计算了props的dataframe
        '''

        if type(smi_list) == list:
            self.smi_list = smi_list
            mols = {"SMILES":self.smi_list}
            a = mol_properties(mol_list=mols["SMILES"])
            mols[target_property], mols["SAscore"] = a.cal_co2_solubility(temperature_input,pressure_input), a.cal_sa()
            self.mols = pd.DataFrame(mols)
            self.mols_prop = self.mols.sort_values(by=target_property, ascending=False) # 降序排列
        else:
            self.mols = smi_list[['SMILES',target_property,'SAscore']]
            self.mols_prop = self.mols.sort_values(by=target_property, ascending=False)

        # 重新设置每一列的数据类型
        self.mols_prop = self.mols_prop.astype({'SMILES': str,target_property: float,'SAscore': float})

        # 删除索引并创建一个新的 DataFrame
        self.mols_prop = self.mols_prop.reset_index(drop=True)

    def get_top_smi(self, select_n = 10):
        # 取前几名
        if select_n == None:
            self.selected_mols = self.mols_prop
        else:
            self.selected_mols = self.mols_prop[:select_n]
        return self.selected_mols["SMILES"].tolist()

    def get_screened_smi(self,target_requirement=0.3):
        # # 满足范围约束

        condition1 = (self.mols_prop["SAscore"]<3)
        condition2 = (self.mols_prop[target_property]>target_requirement)

        # 控制结构长度，非氢原子数量
        b = mol_properties(self.mols_prop["SMILES"].tolist())
        self.mols_prop["num_atoms"] = b.calc_num_atoms()
        condition3 = (self.mols_prop["num_atoms"]<35)

        self.selected_mols = self.mols_prop[condition1 & condition2 & condition3]
        return self.selected_mols["SMILES"].tolist()

    def save_prop(self, root = r"F:\manuscript4\rundata_for_manuscript04\generation\gen0"):
        # 保存
        path = mkdir(root)

        self.selected_mols.to_csv(path+"\\selected_mols.csv", index=False)

        self.mols_prop.to_csv(path+"\\all_mol_props.csv", index=False)


# 离子库查询相似结构
def add_similar_frags(cation=[], anion=[], select_n=5,root=r'F:\manuscript4\rundata_for_manuscript04\generation\gen0'):
    '''
    相似性补加+结构变异
    :param cation: smiles list 搜寻离子
    :param anion: smiles list 搜寻离子
    :param select_n: 补加相似分子个数
    :param: root: 保存相似性结果的根目录
    :return: 补加后的cation, anion
    '''

    # 1.相似性补加
    ## 加载总数据集，该数据集会持续更新
    database_path = r'F:\manuscript4\rundata_for_manuscript04\generation\gen0'
    cation_fp_database = pd.read_csv(database_path+'\\cation_fp_database_origin.csv')
    anion_fp_database = pd.read_csv(database_path+'\\anion_fp_database_origin.csv')

    # 创建一个空的 DataFrame 作为初始值
    result_add_cation,result_add_anion = pd.DataFrame(),pd.DataFrame()
    ## 阳离子
    add_cation1 = []
    for ion in cation:
        result1 = get_smilarity_mols(ion,cation_fp_database,top_n=select_n)
        result1.reset_index(drop=True)
        add_ion = result1['SMILES'].tolist()
        add_cation1.extend(add_ion)
        # 添加一列，所有值为 ion的smiles
        result1['root_cation'] = ion

        result_add_cation = pd.concat([result_add_cation,result1], ignore_index=True)

    ## 阴离子
    add_anion1 = []
    for ion in anion:
        result2 = get_smilarity_mols(ion,anion_fp_database,top_n=select_n)
        result2.reset_index(drop=True)
        add_ion = result2['SMILES'].tolist()
        add_anion1.extend(add_ion)
        # 添加一列，所有值为 ion的smiles
        result2['root_anion'] = ion


        result_add_anion = pd.concat([result_add_anion,result2], ignore_index=True)

    # 保存相似性搜索结果
    result_add_cation.to_csv(root+'\\similar_cation.csv')
    result_add_anion.to_csv(root+'\\similar_anion.csv')

    # 2.结构变异补加

    # 创建一个空的 DataFrame 作为初始值
    result_add_cation2,result_add_anion2 = pd.DataFrame(),pd.DataFrame()
    ## 阳离子
    add_cation2 = []
    for ion in cation:
        result1 = pd.DataFrame()
        add_ion = ion_mutate([ion],max_size=2,select_N=select_n)
        result1['SMILES'] = add_ion
        add_cation2.extend(add_ion)
        # 添加一列，所有值为 ion的smiles
        result1['root_anion'] = ion


        result_add_cation2 = pd.concat([result_add_cation2,result1], ignore_index=True)

    ## 阴离子
    add_anion2 = []
    for ion in anion:
        result2 = pd.DataFrame()
        add_ion = ion_mutate([ion],max_size=2,select_N=select_n)
        result2['SMILES'] = add_ion
        add_anion2.extend(add_ion)
        # 添加一列，所有值为 ion的smiles
        result2['root_anion'] = ion


        result_add_anion2 = pd.concat([result_add_anion2,result2], ignore_index=True)

    #
    # add_cation2 = ion_mutate(cation,max_size=2,select_N=select_n)
    # add_anion2 = ion_mutate(anion,max_size=2,select_N=select_n)

    # 保存变异结果
    result_add_cation2.to_csv(root+'\\mutation_cation.csv')
    result_add_anion2.to_csv(root+'\\mutation_anion.csv')

    # 3.集合化去重
    add_cation1.extend(add_cation2)
    add_anion1.extend(add_anion2)

    add_cation, add_anion = set(add_cation1), set(add_anion1)

    return list(add_cation), list(add_anion)
    '''
    add_cation, add_anion = add_similar_frags(['CCS(=O)(=O)[O-]','CCC(=O)[O-]'],
                  ['COC(=O)[C@H]1[NH2+]CCC1','OC1=CN([NH2+]O1)C'])
    '''

# 种群获得
# 将备选的精英IL分别拆分、去重得到对应的阳离子和阴离子集，
# 在进行变异扩充，得到初始种群
def get_init_group(init_mols=None,path = r"F:\manuscript4\rundata_for_manuscript04\generation\gen0",
                   mutation=True,select_n=2,file='selected_mols.csv'):

    '''

    :param init_mols: 原始分子
    :param path: 当代计算基础文件目录
    :param mutation: 如果False，关闭变异和相似性搜索功能
    :return: 列表阴阳离子
    '''

    if init_mols == None:
        init_mols = pd.read_csv(path+"\\"+file)

    init_mols_groups = init_mols["SMILES"].tolist()

    cation_list0,anion_list0,_ = split_Ions(init_mols_groups)
    print('------------------------------\n cation_list0 = %d \n anion_list0= %d \n------------------------------\n'%(len(cation_list0),len(anion_list0)))    # cation_groups = chouyang(cation_list, 3)
    # anion_groups = chouyang(anion_list, 3)
    # save_cation_groups_info = {i:[cation_groups[i]] for i in range(len(cation_groups))}
    # save_anion_groups_info = {i:[anion_groups[i]] for i in range(len(anion_groups))}

    if mutation:
    # 补充变异结构
        cation_list1,anion_list1 = add_similar_frags(cation_list0,anion_list0,select_n=select_n,root=path)
        print('------------------------------\n cation_list1 = %d \n anion_list1= %d \n------------------------------\n'%(len(cation_list1),len(anion_list1)))

        # 再把原始的补充上
        cation_list1.extend(cation_list0)
        anion_list1.extend(anion_list0)

        cation_list,anion_list = set(cation_list1),set(anion_list1)
    else:
        cation_list,anion_list = cation_list0,anion_list0

    save_cation_groups_info = {'SMILES':list(cation_list)}
    save_anion_groups_info = {'SMILES':list(anion_list)}

    save_cation_groups_info = pd.DataFrame(save_cation_groups_info)
    save_anion_groups_info = pd.DataFrame(save_anion_groups_info)
    save_cation_groups_info.to_csv(path+"\\init_cation_groups.csv")
    save_anion_groups_info.to_csv(path+"\\init_anion_groups.csv")
    return cation_list,anion_list


# ILs由阴阳离子生成
def generate_ILs(cation_list, anion_list):

    new_ILs = []
    for cation in cation_list:
        # if charge_state(cation) < 1:  # 保障为阳离子
        #     continue
        for anion in anion_list:
            # if charge_state(anion) > -1: # 保障为阴离子
            #     continue
            new_ILs.append(cation+'.'+anion)

    return new_ILs


In [7]:
# 进化框架
# 通过上一代总结的cation, anion，
# 1.生成新的IL，去重
# 2.评价、筛选，得到优异ILs，保存
# 3.获得优异ILs的阴阳离子集合，并通过相似性、变异进行拓展,
# 得到下一代的生成基础cation, anion，保存


class run_evolution:
    def __init__(self, cation_list = None, anion_list = None, gen = 0,
                 root = r'F:\manuscript4\rundata_for_manuscript04\generation\gen',
                 test_run = False):
        '''

        :param anion_list,anion_list: (阳离子列表，阴离子列表)，如果为None则去文件读取
        :param gen: 代数
        :param root: 根目录
        :param test_run: 测试的话则抽小部分来运行
        '''

        if test_run:
            root = r'F:\manuscript4\rundata_for_manuscript04\generation_test\gen'

        self.cation_list ,self.anion_list = cation_list,anion_list
        self.from_path = root+str(gen)
        self.to_path = root+str(gen+1)
        self.gen = gen
        self.root = root

    def step1(self):
        # 获得离子作为进化基础
        if self.anion_list == None:
            if self.gen == 0: # 0 初次运行要改为0
                cation_list,anion_list = get_init_group(path = self.from_path,mutation=True,select_n=1)
            else:
                cation_table = pd.read_csv(self.from_path+'\\init_cation_groups.csv')
                anion_table = pd.read_csv(self.from_path+'\\init_anion_groups.csv')
                cation_list,anion_list = cation_table['SMILES'].to_list(),anion_table['SMILES'].to_list()
        else:
            cation_list,anion_list = self.cation_list ,self.anion_list

        if self.gen != 0:
            ## 优异ILs对应的离子更新到数据库
            ## 加载总离子数据集，数据集持续更新
            database_path = self.root+'0'
            cation_fp_database = pd.read_csv(database_path+'\\cation_fp_database.csv')
            anion_fp_database = pd.read_csv(database_path+'\\anion_fp_database.csv')
            all_cation_smiles = set(cation_fp_database["SMILES"].tolist())
            all_anion_smiles = set(anion_fp_database["SMILES"].tolist())

            ## 查找新产生的离子
            unique_cation = set(cation_list) - all_cation_smiles
            unique_anion = set(anion_list) - all_anion_smiles
            self.unique_cation, self.unique_anion = list(unique_cation), list(unique_anion)

            ## 保存新离子
            ## 计算Morgan指纹
            if len(self.unique_cation) != 0:
                unique_cation_fp = GetMorganFingerprint(self.unique_cation)
                unique_cation_fp.to_csv(database_path+'\\cation_fp_database.csv', mode='a', header=False, index = False)
            if len(self.unique_anion) != 0:
                unique_anion_fp = GetMorganFingerprint(self.unique_anion)
                unique_anion_fp.to_csv(database_path+'\\anion_fp_database.csv', mode='a', header=False, index = False)


        # 生成新ILs
        all_ILs = []
        # 如果例子太多进行随机分组,避免组合爆炸
        if max(len(cation_list),len(anion_list))>100:
            cation_chouyang = chouyang(cation_list,100,2024)
            cation_list_lines = cation_chouyang[0]
            anion_chouyang = chouyang(anion_list,100,2024)
            anion_list_lines = anion_chouyang[0]
            new_ILs = generate_ILs(cation_list_lines,anion_list_lines)
            all_ILs.extend(new_ILs)
        else:
            new_ILs = generate_ILs(cation_list,anion_list)
            all_ILs.extend(new_ILs)

        # 去重
        self.all_ILs = list(set(all_ILs))

    def step2(self, target_requirement=0.3):

        # 评价、筛选
        screen_ILs = screen_smi(self.all_ILs)
        screen_ILs.get_screened_smi(target_requirement=target_requirement)

        # 保存到生成结果到下一代中
        screen_ILs.save_prop(root=self.to_path)

        # 下一代ILs基础
        self.selected_ILs = screen_ILs.selected_mols

        print('------------------------------\n No of all_ILs = %d \n No of selected_ILs= %d \n------------------------------\n'%(len(self.all_ILs),len(self.selected_ILs)))


    def step3(self):
        # 相对前N代selected_mols进行去重保存

        all_old_selected_ILs = []
        for i in range(0,self.gen+1):
            old_selected_ILs = pd.read_csv(self.root+str(i)+'\\selected_mols.csv')
            all_old_selected_ILs.extend(old_selected_ILs['SMILES'].tolist())


        # 去重旧的ILs
        all_old_selected_ILs_smiles = set(all_old_selected_ILs)

        # 抽取唯一ILs
        unique_selected_ILs_smiles = set(self.selected_ILs['SMILES'].tolist()) - all_old_selected_ILs_smiles
        unique_selected_ILs_smiles = list(unique_selected_ILs_smiles)

        try:
            self.unique_selected_ILs = self.selected_ILs[self.selected_ILs['SMILES'].isin(unique_selected_ILs_smiles)]
        except:
            self.unique_selected_ILs = pd.DataFrame(columns=["SMILES","CO2_solubility","SAscore"])

        self.unique_selected_ILs.to_csv(self.to_path+'\\unique_selected_ILs.csv')

    def step4(self,mutation=True,select_n=2):

        # 生成下一代离子基础
        self.next_cation_list,self.next_anion_list = get_init_group(path=self.to_path,mutation=mutation,
                                                                    select_n=select_n,file="unique_selected_ILs.csv")

'''
run1 = run_evolution()
run1.step1()
run1.step2()
run1.step3()
run1.step4()
D = run1.unique_selected_ILs
print(D.head())
'''


'\nrun1 = run_evolution()\nrun1.step1()\nrun1.step2()\nrun1.step3()\nrun1.step4()\nD = run1.unique_selected_ILs\nprint(D.head())\n'

In [15]:
'''
前30代宽容约束0.3，以保障足够多样的结构被学到
30代-60代，缩紧约束0.8，以细致挖掘高性能结构
60代以后，再控制结构长度，即非氢原子数量<35
'''

import warnings
warnings.filterwarnings("ignore")
# 正式运行 生成迭代
gen_min, gen_max = 83,90

## 初始状态
test_run = False
mutation_state = False
mutation_select_n = 5

target_requirement = 0.9 #30代以前0.3，以后0.6
for i in range(gen_min,gen_max):

    print('--------------%d-----------------\n'%i)

    run1 = run_evolution(gen=i,test_run=test_run)
    run1.step1()

    run1.step2(target_requirement=target_requirement)


    # 控制变异，如果上一代给的新离子基础太多，则不变异暂时
    if i > 0:
        len_unique_cation, len_unique_anion = len(run1.unique_cation), len(run1.unique_anion)
        max_unique_ion = max(len_unique_cation, len_unique_anion)
        min_unique_ion = min(len_unique_cation, len_unique_anion)
        print('--------------unique_cation=%d, unique_anion=%d-----------------\n'%(len_unique_cation, len_unique_anion))

        # if  min_unique_ion < 10 :
        #     mutation_state = True
        #     mutation_select_n = 5 # 变异数量增大
        # elif min_unique_ion >= 10 & max_unique_ion < 50 :
        #     mutation_state = True
        #     mutation_select_n = 2 # 变异数量缩小
        # else:
        #     mutation_state = False


    run1.mutation = mutation_state
    run1.step3()
    run1.step4(select_n=mutation_select_n)

    final_num_smiles = len(run1.unique_selected_ILs['SMILES'].tolist())

    # 缩小约束，以求更好表现的结构


    # if final_num_smiles <= 0.8:
    #     target_requirement = 0.5+i*0.05
    # else:
    #     target_requirement = 0.8

    if final_num_smiles <= 0.9:
        target_requirement = target_requirement+0.05
    else:
        target_requirement = 0.9

--------------83-----------------

---  There is this folder!  ---
------------------------------
 No of all_ILs = 1188 
 No of selected_ILs= 29 
------------------------------

--------------unique_cation=30, unique_anion=14-----------------

------------------------------
 cation_list0 = 0 
 anion_list0= 0 
------------------------------

------------------------------
 cation_list1 = 0 
 anion_list1= 0 
------------------------------

--------------84-----------------



IndexError: list index out of range

------------------------------
 cation_list0 = 10 
 anion_list0= 9 
------------------------------

------------------------------
 cation_list1 = 80 
 anion_list1= 44 
------------------------------



({'C=Cn1cc[n+](Cc2ccccc2)c1',
  'CC(=NOCCCCCC(=O)NNC(=O)CNCCC[n+]1ccccc1)c1ccccc1',
  'CC(=O)N(CCCN(C)CC(=O)NNC(=O)NCCCc1ccccc1)[n+]1ccccc1',
  'CC(CNCC(=O)NNC(=O)CCN(c1ccccc1)S(C)(=O)=O)O[n+]1ccccc1',
  'CC(NCC(=O)NCC(=O)NNC(=O)CCCOc1ccccc1)[n+]1ccccc1',
  'CC(OC(=O)CNC(=O)NNC(=O)CN(C)CCC[n+]1ccccc1)c1ccccc1',
  'CCCCCCCCCCCCCCCC[n+]1ccccc1',
  'CCCCCCCCCCCC[n+]1ccccc1',
  'CCCCCCCCCC[n+]1ccccc1',
  'CCCCCCCC[n+]1ccccc1',
  'CCCCCCC[n+]1ccccc1',
  'CCCCC[n+]1ccccc1',
  'CCCCNC(=O)Cn1cc[n+](CC)c1',
  'CCCC[N+]1=CC=CC=C1',
  'CCCC[n+]1ccccc1',
  'CCOC(=O)C[n+]1ccccc1',
  'CN(C)c1cc[n+](CCCOc2ccccc2)cc1',
  'CN(C)c1cc[n+](CCOc2ccccc2)cc1',
  'CN(CCC(=O)NNC(=O)CCCNC(=O)CC[n+]1ccccc1)Cc1ccccc1',
  'CN(CCCC#CCCCC[n+]1ccccc1)CC(=O)NNC(=S)Nc1ccccc1',
  'CN(CCCC(=O)CCC[n+]1ccccc1)CC(=O)NNC(=S)NN=Cc1ccccc1',
  'CN(CCCC(=O)CCC[n+]1ccccc1)CC(=O)NNC(=S)Nc1ccccc1',
  'CN(CCCC(=O)[n+]1ccccc1)CC(=O)NNC(=O)NCC(=O)NCc1ccccc1',
  'CN(CCCC(=O)[n+]1ccccc1)CC(=O)NNC(=O)NCCCc1ccccc1',
  'CN(CCCCC(=O)NNC(=O)