In [None]:
'''
通过指定性质，筛选分子，再进行拆分，寻找相似结构（扩充），合成新分子
实现分子自动进化寻优
优化目标暂定为 分子具备较好的SLOGP, 其次考虑可合成性sa score、类药性qed合适，用区间限定
类药性qed:越大越好。将药物相似性量化为介于0和1之间的数值
sa score:越靠近1表明越容易合成，越靠近10表明合成越困难。介于1和10之间
SLOGP:越靠近1.5的越安全，风险越小。（logP 0-3范围是最佳，高logP化合物水溶性差，低logP化合物脂渗透性差）

    # 容忍范围 {"QED":(0.8,1),"SAscore":(1,3),"SLOGP":(0,3)}
    # 理想目标{"QED":1,"SAscore":1,"SLOGP":1.5}

** 后期引入概率、权重等更新进化思想
'''
# NOTE: the original import order is kept on purpose - the wildcard imports below
# can shadow earlier names, so reordering could change which objects are in scope.
import ast
import random
import numpy as np
import pandas as pd
from calc_logP_QED_sa import mol_properties
from fragments2mol import *
from test_fp_similarity import *
import time
from littlecode.tools.chouyang import chouyang
from littlecode.tools.mkdir import mkdir
from codes_for_manuscript_02.my_som_cluster import SOM_learner
# Base directory of the project data. It is a Windows absolute path that already ends
# with a backslash, so sub-paths are appended to it directly (e.g. root + "QSAR\\...").
root = "F:\\WORK\\和外合作\\天宇媛媛毕设\\"

In [2]:
# Normalise SMILES strings
def smiles_rewrite(smiles_list):
    """Pass ``smiles_list`` through ``read_mol`` twice and return the final result.

    The first pass calls ``read_mol`` with flag ``1``; its output is fed back into
    ``read_mol`` with flag ``0``.
    NOTE(review): presumably flag 1 parses the SMILES into molecule objects and
    flag 0 writes them back as SMILES, which would normalise the strings -
    confirm against ``read_mol``.

    :param smiles_list: collection of SMILES strings
    :return: whatever the second ``read_mol`` call returns
    """
    first_pass = read_mol(smiles_list, 1)
    return read_mol(first_pass, 0)

In [3]:
# De-duplicate SMILES strings (exact, case-sensitive comparison)
def quchong_smiles(smiles):
    """Return the distinct entries of ``smiles`` in first-seen order.

    Entries are compared by exact equality, i.e. case-sensitively. That is the
    right behaviour for SMILES, where lower-case letters denote aromatic atoms
    ("c1ccccc1" and "C1CCCCC1" are different molecules).

    :param smiles: iterable of SMILES strings (entries must be hashable)
    :return: list with duplicates removed, original order preserved
    """
    seen = set()
    unique_lst = []
    for s in smiles:
        # Set lookup keeps this linear instead of re-scanning the result list.
        if s not in seen:
            seen.add(s)
            unique_lst.append(s)
    return unique_lst

# NOTE: the original author marked this cell as unfinished ("to be continued").

In [4]:
# Compute the properties (SLOGP, QED, SA score) of the whole molecule library.
# The computation is expensive, so it is run once and then switched off.
run_cal_mol_properties = 0
if run_cal_mol_properties:
    mols_file = root+"初始数据\\清洗后分子总样本smiles.csv"
    mols = pd.read_csv(mols_file)

    # One calculator object serves all three property columns.
    a = mol_properties(mol_list=mols["SMILES"].tolist())
    mols = mols.assign(
        SLOGP=a.cal_SLOGP(),
        QED=a.cal_qed(),
        SAscore=a.cal_sa(),
    )

    mols.to_csv(root+"QSAR\\总样本properties.csv", index=False)

In [5]:
# Composite optimisation target (smaller is better, ideally < 1):
# a scaled Euclidean distance from the ideal property values.
def new_mertric(df):
    """Add a ``"new_mertric"`` column to ``df`` and return ``df``.

    The value is
    ``sqrt(((QED-1)/0.2)**2 + ((SAscore-1)/2)**2 + ((SLOGP-1.5)/3)**2)``,
    i.e. the distance from the ideal point (QED=1, SAscore=1, SLOGP=1.5) with
    every axis divided by a fixed scale.

    NOTE(review): the SLOGP scale (3) is the full width of the 0-3 tolerance
    range while the ideal 1.5 sits at its centre, so the range edges contribute
    0.5 rather than 1 as for the other two axes - confirm this is intended.

    :param df: DataFrame with numeric columns "QED", "SAscore" and "SLOGP"
    :return: the same DataFrame object, with the new column written in place
    """
    qed_term = ((df["QED"].values - 1) / 0.2) ** 2
    sa_term = ((df["SAscore"].values - 1) / 2) ** 2
    slogp_term = ((df["SLOGP"].values - 1.5) / 3) ** 2
    df["new_mertric"] = np.sqrt(qed_term + sa_term + slogp_term)
    return df

In [6]:
# Select the drug-like molecules from the initial molecule set.
run_init_prop = 0

if run_init_prop:

    # Accepted (inclusive) interval for every property.
    target_range = {"QED":(0.8,1),"SAscore":(1,3),"SLOGP":(0,3)}

    # Paths are built from `root` with escaped backslashes: the former literals
    # contained invalid escape sequences such as "\W" and "\Q".
    c = pd.read_csv(root + "QSAR\\总样本properties.csv")
    for item, (lower, upper) in target_range.items():
        print(item)
        c = c[(c[item] >= lower) & (c[item] <= upper)]

    # Work on an explicit copy so the metric column is not written onto a slice.
    c = new_mertric(c.copy()) # compute the composite metric
    c = c.sort_values(by="new_mertric") # best (smallest) first
    c.to_csv(root + "QSAR\\类药样本.csv", index=False)


In [7]:
# Compute SLOGP / QED / SA score for a set of SMILES, derive new_mertric from them,
# and keep the table sorted so the best molecules can be picked.
class screen_smi:
    """Property table of a molecule set, sorted by ascending ``new_mertric``."""

    def __init__(self, smi_list):
        '''
        :param smi_list: either a list of SMILES strings, or a DataFrame that
            already holds the columns SMILES, SLOGP, QED and SAscore
            (new_mertric is always recomputed, so it need not be present)
        '''
        # No selection has been made yet; save_prop() checks for this.
        self.selected_mols = None

        if isinstance(smi_list, list):
            self.smi_list = smi_list
            a = mol_properties(mol_list=self.smi_list)
            mols = pd.DataFrame({
                "SMILES": self.smi_list,
                "SLOGP": a.cal_SLOGP(),
                "QED": a.cal_qed(),
                "SAscore": a.cal_sa(),
            })
        else:
            # Copy the needed columns so the caller's frame is left untouched
            # and the metric column is not written onto a slice.
            mols = smi_list[['SMILES','SLOGP','QED','SAscore']].copy()

        self.mols_prop = new_mertric(mols).sort_values(by="new_mertric")

    def get_top_smi(self, select_n = 10):
        """Select the ``select_n`` best rows and return their SMILES as a list."""
        self.selected_mols = self.mols_prop[:select_n]
        return self.selected_mols["SMILES"].tolist()

    def get_screened_smi(self, max_new_mertric = 0.8):
        """Select every row with new_mertric < ``max_new_mertric``; return their SMILES."""
        self.selected_mols = self.mols_prop[self.mols_prop["new_mertric"] < max_new_mertric]
        return self.selected_mols["SMILES"].tolist()

    def save_prop(self, root = "F:\\WORK\\和外合作\\天宇媛媛毕设\\QSAR\\gen0"):
        """Write the current selection to ``selected_mols.csv`` under ``root``.

        When no selection has been made yet, the full property table is written
        to ``all_mol_props.csv`` instead. Write errors are no longer swallowed.
        """
        path = mkdir(root)
        selected = getattr(self, "selected_mols", None)
        if selected is not None:
            selected.to_csv(path+"\\selected_mols.csv", index=False)
        else:
            self.mols_prop.to_csv(path+"\\all_mol_props.csv", index=False)


In [8]:
# Pick the elite drug-like molecules: new_mertric < 0.8
run_get_best_mol = 0
if run_get_best_mol:
    # Built from `root` with escaped backslashes; the former literal contained
    # invalid escape sequences such as "\W" and "\Q".
    smi_list = pd.read_csv(root + "QSAR\\类药样本.csv")
    a = screen_smi(smi_list)
    a.get_screened_smi(0.8)
    a.save_prop(root = root + "QSAR\\gen0")

In [9]:
# Format repair 1: some "frag_SMILES" cells hold the text of a list (e.g. "['C[Cs]']")
# instead of a plain SMILES string; replace such cells by the first list element.
run_str2list = 0
if run_str2list:
    # Each table is cleaned from its OWN column. Previously the scaffold table
    # was filled with values read from the decorations table.
    for csv_name in ("decorations_unique.csv", "Scaffold_unique.csv"):
        csv_path = root + "初始数据\\" + csv_name
        table = pd.read_csv(csv_path, index_col=0)

        res = []
        for x in table["frag_SMILES"]:
            # literal_eval only parses Python literals, so a SMILES string can
            # never be resolved against notebook variables as with eval().
            try:
                parsed = ast.literal_eval(x)
            except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
                parsed = None

            # Only a non-empty list/tuple is unwrapped; anything else is kept as read.
            if isinstance(parsed, (list, tuple)) and len(parsed) > 0:
                x = parsed[0]
            res.append(x)

        table["frag_SMILES"] = res
        table.to_csv(csv_path)

In [10]:
# Format repair 2: same clean-up as above, applied to the SOM chemical-space tables.
# "frag_SMILES" cells holding the text of a list are replaced by the first list element.
run_str2list2 = 0
if run_str2list2:
    for csv_path in (
        root + "som化学空间\\Scaffold\\Scaffold_space.csv",
        root + "som化学空间\\decorations\\decorations_space.csv",
    ):
        table = pd.read_csv(csv_path, index_col=0)

        res = []
        for x in table["frag_SMILES"]:
            # literal_eval only parses Python literals, so a SMILES string can
            # never be resolved against notebook variables as with eval().
            try:
                parsed = ast.literal_eval(x)
            except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
                parsed = None

            # Only a non-empty list/tuple is unwrapped; anything else is kept as read.
            if isinstance(parsed, (list, tuple)) and len(parsed) > 0:
                x = parsed[0]
            res.append(x)

        table["frag_SMILES"] = res
        table.to_csv(csv_path)

In [11]:
def yichangchuli(smi_list):
    """Unwrap entries that are the text of a list, e.g. "['C[Cs]']" -> "C[Cs]".

    Each entry is parsed with ``ast.literal_eval``. When it parses to a
    non-empty list or tuple, its first element is used; every other entry
    (plain SMILES, unparsable text, empty list, non-string value) is returned
    unchanged.

    Unlike the former ``eval``-based version, an entry is never evaluated
    against notebook variables, and a value that parses but is not a list
    (e.g. "12") no longer leaks through as the parsed object.

    :param smi_list: iterable of SMILES strings, some possibly list-formatted
    :return: list of the same length with list-formatted entries unwrapped
    """
    result = []
    for item in smi_list:
        try:
            parsed = ast.literal_eval(item)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            parsed = None

        if isinstance(parsed, (list, tuple)) and len(parsed) > 0:
            result.append(parsed[0])
        else:
            result.append(item)

    return result

In [12]:
# Build the initial populations.
# The selected elite molecules are split into scaffolds and decorations, which are
# then grouped and saved as the starting fragment groups.
run_init_group = 0
def get_init_group(path = "F:\\WORK\\和外合作\\天宇媛媛毕设\\generate\\gen0\\"):
    """Split the selected molecules of one generation into fragment groups.

    Reads ``selected_mols.csv`` from ``path``, splits the molecules with
    ``split_mol`` (scaffolds marked with "[Fr]", decorations with "[Cs]"),
    groups each fragment set with ``chouyang(..., 5)`` and writes
    ``init_scaffords_groups.csv`` / ``init_decorations_groups.csv`` to ``path``.
    Each output file has a single row with one column per group.

    NOTE(review): the numeric ``split_mol`` arguments (10, 10000 and 2, 10)
    presumably bound the fragment size, and the meaning of the 5 passed to
    ``chouyang`` is defined in that helper - confirm.

    :param path: generation directory holding ``selected_mols.csv``
    """
    init_mols = pd.read_csv(path+"\\selected_mols.csv")
    init_mols_groups = init_mols["SMILES"].tolist()

    # Only the second element of split_mol's 4-tuple is used here.
    _, scaffords, _, _ = split_mol(init_mols_groups,'smiles',10,10000,"[Fr]")
    _, decorations, _, _ = split_mol(init_mols_groups,'smiles',2,10,"[Cs]")

    scaffords_groups = chouyang(scaffords, 5)
    decorations_groups = chouyang(decorations, 5)

    # One column per group, the whole group stored in a single cell of row 0.
    save_scaffords_groups_info = pd.DataFrame(
        {i:[scaffords_groups[i]] for i in range(len(scaffords_groups))})
    save_decorations_groups_info = pd.DataFrame(
        {i:[decorations_groups[i]] for i in range(len(decorations_groups))})

    save_scaffords_groups_info.to_csv(path+"\\init_scaffords_groups.csv")
    save_decorations_groups_info.to_csv(path+"\\init_decorations_groups.csv")

# Run only when the switch is on, like every other run_* flag in this notebook.
# The former test `== 0` executed the step while the switch was off.
if run_init_group:
    get_init_group()

In [13]:
# Look up structurally similar fragments in the SOM chemical space
def _add_cluster_neighbours(frags, marker, model_path, som_size, space_csv, select_n):
    """Return ``set(frags)`` enlarged by fragments that share a SOM cluster with them.

    :param frags: fragment SMILES carrying the attachment marker
    :param marker: attachment-point element symbol ("Fr" or "Cs") replaced by "H"
    :param model_path: file of the trained SOM model
    :param som_size: ``size`` argument passed to ``SOM_learner``
    :param space_csv: CSV with columns "cluster_index" and "frag_SMILES"
    :param select_n: maximum number of neighbours added per input fragment
    :return: set of the input fragments plus the sampled neighbours
    """
    if frags == []:
        return set([])

    # Replace the attachment marker by hydrogen so every fragment is a complete molecule.
    frags_addH = [item.replace(marker, "H") for item in frags]

    # Topological fingerprints (1024 passed to RDKitTopological) for the fragments.
    fp_cal = cal_fingerprint(frags_addH)
    fp_cal.RDKitTopological(1024)
    frags_fp = fp_cal.fp_TopoFingerprint.values

    # Project the fragments onto the trained SOM and read their cluster indices.
    model = SOM_learner(dataset=frags_fp, trained_som=model_path, size=som_size)
    model.cluster_results()
    cluster_ids = model.cluster_results_table["cluster_index"].tolist()

    space = pd.read_csv(space_csv)

    expanded = set(frags)
    for i in range(len(frags)):
        similar = space[space["cluster_index"] == cluster_ids[i]]["frag_SMILES"].tolist()
        # Sample when the cluster is large enough, otherwise take the whole cluster.
        # This is the explicit form of the former bare try/except around random.sample.
        if 0 <= select_n <= len(similar):
            picked = random.sample(similar, select_n)
        else:
            picked = similar
        expanded.update(picked)
    return expanded


def add_similar_frags(scaffords=[], decorations=[], select_n=5, root="F:\\WORK\\和外合作\\天宇媛媛毕设"):
    '''
    Enrich scaffolds and decorations with similar fragments from the SOM chemical space.

    :param scaffords: list of scaffold SMILES containing Fr; [] skips the scaffold lookup
    :param decorations: list of decoration SMILES containing Cs; [] skips the decoration lookup
    :param select_n: number of similar fragments added per input fragment
    :param root: project base directory (without trailing backslash)
    :return: (scaffords, decorations) as de-duplicated lists including the additions
    '''
    # Scaffolds are processed before decorations, so the random draws keep their order.
    new_scaffords = _add_cluster_neighbours(
        scaffords, "Fr",
        root + "\\som化学空间\\Scaffold\\size20_sigma3_random_seed2023_model.p", 20,
        root + "\\som化学空间\\Scaffold\\Scaffold_space.csv",
        select_n)

    new_decorations = _add_cluster_neighbours(
        decorations, "Cs",
        root + "\\som化学空间\\decorations\\size10_sigma3_random_seed2023_model.p", 10,
        root + "\\som化学空间\\decorations\\decorations_space.csv",
        select_n)

    return list(new_scaffords), list(new_decorations)

In [14]:
# Evolution framework (planned loop, starting from the previous generation's
# scaffords / decorations):
# 1. generate new molecules, compute the metric, pre-screen;
# 2. generate again from the screened fragments with fully_decorated=1, compute
#    the metric, screen, and keep the final molecules;
# 3. split the kept molecules into next-generation scaffords / decorations;
# 4. query the chemical space to add some similar scaffords / decorations;
# 5. go back to 1.
# The constructor below currently performs a single generate-and-screen pass,
# with fully_decorated=1 and a new_mertric cutoff of 0.8.
class run_evolution:
    """One generate-and-screen pass over a scaffold set and a decoration set.

    After construction:
      * ``set_new_mols_1`` is the second value returned by ``frag2mol_2``
        (converted to a list of SMILES for screening);
      * ``set_screened_mols_1`` is the set of those SMILES with new_mertric < 0.8.
    """

    def __init__(self, scaffords, decorations):
        self.scaffords = scaffords
        self.decorations = decorations

        # Step 1: assemble candidates; the first return value is not used here.
        _unused_first, self.set_new_mols_1 = frag2mol_2(
            self.scaffords, self.decorations, fully_decorated=1)

        # Score the candidates and keep those below the 0.8 cutoff.
        candidate_smiles = list(self.set_new_mols_1)
        screener = screen_smi(candidate_smiles)
        self.set_screened_mols_1 = set(screener.get_screened_smi(0.8))


In [23]:
# Run the generation step over every scaffold-group / decoration-group pair of
# generation `gen` and store the result as generation gen+1.
# Generation 0 reads the initial fragment groups; later generations read the
# similarity-expanded groups.
gen = 1

# 1. Enumerate the pairs and keep the molecules with new_mertric < 0.8

read_path = mkdir(root + "generate\\gen" + str(gen))
save_path = mkdir(root + "generate\\gen" + str(gen+1))
group_prefix = "init" if gen == 0 else "similar"
init_scaffords = pd.read_csv(read_path + "\\" + group_prefix + "_scaffords_groups.csv", index_col=0)
init_decorations = pd.read_csv(read_path + "\\" + group_prefix + "_decorations_groups.csv", index_col=0)

all_new_mols, all_screened_mols = set(),set()

len_decorations = init_decorations.shape[1]
len_scaffords = init_scaffords.shape[1]

# Only the scaffold groups with column index 60..109 are processed in this run.
# NOTE(review): results are written only after the whole loop has finished, so an
# interrupted run keeps nothing on disk.
for i in range(60,110):
    print(i)
    for j in range(len_decorations):

        # Each cell stores the text of a Python list.
        # NOTE(review): eval() executes whatever the CSV cell contains -
        # only use this on files produced by this pipeline.
        scaffords1 = eval(init_scaffords.iloc[0,i])
        decorations1 = eval(init_decorations.iloc[0,j])
        decorations1 = yichangchuli(decorations1)

        # De-duplicate the decorations before generating.
        decorations = list(set(decorations1))
        a = run_evolution(scaffords=scaffords1,decorations=decorations)

        all_screened_mols.update(a.set_screened_mols_1)
        all_new_mols.update(a.set_new_mols_1)

all_screened_mols, all_new_mols = list(all_screened_mols), list(all_new_mols)
# Save the results of this generation
save_table1 = pd.DataFrame({"SMILES": all_screened_mols})
save_table2 = pd.DataFrame({"all_new_mols": all_new_mols})
save_table1.to_csv(save_path + "\\" + "selected_mols.csv")
save_table2.to_csv(save_path + "\\" + "all_new_mols.csv")

# Split the kept molecules into fragments: the starting point of the next generation
get_init_group(save_path)

---  There is this folder!  ---
---  There is this folder!  ---
60
61
62


RDKit ERROR: Failed Expression: !(boost::edge(atomIdx1, atomIdx2, d_graph).second)
RDKit ERROR: ****
RDKit ERROR: 
RDKit ERROR: [18:32:36] 
RDKit ERROR: 
RDKit ERROR: ****
RDKit ERROR: Pre-condition Violation
RDKit ERROR: bond already exists
RDKit ERROR: Violation occurred on line 311 in file C:\Users\glandrum\Anaconda3\conda-bld\rdkit_1588910360783\work\Code\GraphMol\RWMol.cpp
RDKit ERROR: Failed Expression: !(boost::edge(atomIdx1, atomIdx2, d_graph).second)
RDKit ERROR: ****
RDKit ERROR: 
RDKit ERROR: [18:32:36] 
RDKit ERROR: 
RDKit ERROR: ****
RDKit ERROR: Pre-condition Violation
RDKit ERROR: bond already exists
RDKit ERROR: Violation occurred on line 311 in file C:\Users\glandrum\Anaconda3\conda-bld\rdkit_1588910360783\work\Code\GraphMol\RWMol.cpp
RDKit ERROR: Failed Expression: !(boost::edge(atomIdx1, atomIdx2, d_graph).second)
RDKit ERROR: ****
RDKit ERROR: 
RDKit ERROR: [18:32:36] 
RDKit ERROR: 
RDKit ERROR: ****
RDKit ERROR: Pre-condition Violation
RDKit ERROR: bond already exi

63
64
65
66


RDKit ERROR: Violation occurred on line 311 in file C:\Users\glandrum\Anaconda3\conda-bld\rdkit_1588910360783\work\Code\GraphMol\RWMol.cpp
RDKit ERROR: Failed Expression: !(boost::edge(atomIdx1, atomIdx2, d_graph).second)
RDKit ERROR: ****
RDKit ERROR: 
RDKit ERROR: [21:32:53] 
RDKit ERROR: 
RDKit ERROR: ****
RDKit ERROR: Pre-condition Violation
RDKit ERROR: bond already exists
RDKit ERROR: Violation occurred on line 311 in file C:\Users\glandrum\Anaconda3\conda-bld\rdkit_1588910360783\work\Code\GraphMol\RWMol.cpp
RDKit ERROR: Failed Expression: !(boost::edge(atomIdx1, atomIdx2, d_graph).second)
RDKit ERROR: ****
RDKit ERROR: 
RDKit ERROR: [21:32:53] 
RDKit ERROR: 
RDKit ERROR: ****
RDKit ERROR: Pre-condition Violation
RDKit ERROR: bond already exists
RDKit ERROR: Violation occurred on line 311 in file C:\Users\glandrum\Anaconda3\conda-bld\rdkit_1588910360783\work\Code\GraphMol\RWMol.cpp
RDKit ERROR: Failed Expression: !(boost::edge(atomIdx1, atomIdx2, d_graph).second)
RDKit ERROR: ***

67
68


RDKit ERROR: Failed Expression: !(boost::edge(atomIdx1, atomIdx2, d_graph).second)
RDKit ERROR: ****
RDKit ERROR: 
RDKit ERROR: [21:39:50] 
RDKit ERROR: 
RDKit ERROR: ****
RDKit ERROR: Pre-condition Violation
RDKit ERROR: bond already exists
RDKit ERROR: Violation occurred on line 311 in file C:\Users\glandrum\Anaconda3\conda-bld\rdkit_1588910360783\work\Code\GraphMol\RWMol.cpp
RDKit ERROR: Failed Expression: !(boost::edge(atomIdx1, atomIdx2, d_graph).second)
RDKit ERROR: ****
RDKit ERROR: 
RDKit ERROR: [21:39:50] 
RDKit ERROR: 
RDKit ERROR: ****
RDKit ERROR: Pre-condition Violation
RDKit ERROR: bond already exists
RDKit ERROR: Violation occurred on line 311 in file C:\Users\glandrum\Anaconda3\conda-bld\rdkit_1588910360783\work\Code\GraphMol\RWMol.cpp
RDKit ERROR: Failed Expression: !(boost::edge(atomIdx1, atomIdx2, d_graph).second)
RDKit ERROR: ****
RDKit ERROR: 
RDKit ERROR: [21:39:50] 
RDKit ERROR: 
RDKit ERROR: ****
RDKit ERROR: Pre-condition Violation
RDKit ERROR: bond already exi

69
70
71
72
73
74
75
76
77
78


RDKit ERROR: Violation occurred on line 311 in file C:\Users\glandrum\Anaconda3\conda-bld\rdkit_1588910360783\work\Code\GraphMol\RWMol.cpp
RDKit ERROR: Failed Expression: !(boost::edge(atomIdx1, atomIdx2, d_graph).second)
RDKit ERROR: ****
RDKit ERROR: 
RDKit ERROR: [21:40:36] 
RDKit ERROR: 
RDKit ERROR: ****
RDKit ERROR: Pre-condition Violation
RDKit ERROR: bond already exists
RDKit ERROR: Violation occurred on line 311 in file C:\Users\glandrum\Anaconda3\conda-bld\rdkit_1588910360783\work\Code\GraphMol\RWMol.cpp
RDKit ERROR: Failed Expression: !(boost::edge(atomIdx1, atomIdx2, d_graph).second)
RDKit ERROR: ****
RDKit ERROR: 
RDKit ERROR: [21:40:36] 
RDKit ERROR: 
RDKit ERROR: ****
RDKit ERROR: Pre-condition Violation
RDKit ERROR: bond already exists
RDKit ERROR: Violation occurred on line 311 in file C:\Users\glandrum\Anaconda3\conda-bld\rdkit_1588910360783\work\Code\GraphMol\RWMol.cpp
RDKit ERROR: Failed Expression: !(boost::edge(atomIdx1, atomIdx2, d_graph).second)
RDKit ERROR: ***

79


RDKit ERROR: Failed Expression: !(boost::edge(atomIdx1, atomIdx2, d_graph).second)
RDKit ERROR: ****
RDKit ERROR: 
RDKit ERROR: [22:17:08] 
RDKit ERROR: 
RDKit ERROR: ****
RDKit ERROR: Pre-condition Violation
RDKit ERROR: bond already exists
RDKit ERROR: Violation occurred on line 311 in file C:\Users\glandrum\Anaconda3\conda-bld\rdkit_1588910360783\work\Code\GraphMol\RWMol.cpp
RDKit ERROR: Failed Expression: !(boost::edge(atomIdx1, atomIdx2, d_graph).second)
RDKit ERROR: ****
RDKit ERROR: 
RDKit ERROR: [22:17:12] 
RDKit ERROR: 
RDKit ERROR: ****
RDKit ERROR: Pre-condition Violation
RDKit ERROR: bond already exists
RDKit ERROR: Violation occurred on line 311 in file C:\Users\glandrum\Anaconda3\conda-bld\rdkit_1588910360783\work\Code\GraphMol\RWMol.cpp
RDKit ERROR: Failed Expression: !(boost::edge(atomIdx1, atomIdx2, d_graph).second)
RDKit ERROR: ****
RDKit ERROR: 
RDKit ERROR: [22:17:12] 
RDKit ERROR: 
RDKit ERROR: ****
RDKit ERROR: Pre-condition Violation
RDKit ERROR: bond already exi

80
81


RDKit ERROR: Violation occurred on line 311 in file C:\Users\glandrum\Anaconda3\conda-bld\rdkit_1588910360783\work\Code\GraphMol\RWMol.cpp
RDKit ERROR: Failed Expression: !(boost::edge(atomIdx1, atomIdx2, d_graph).second)
RDKit ERROR: ****
RDKit ERROR: 
RDKit ERROR: [22:17:44] 
RDKit ERROR: 
RDKit ERROR: ****
RDKit ERROR: Pre-condition Violation
RDKit ERROR: bond already exists
RDKit ERROR: Violation occurred on line 311 in file C:\Users\glandrum\Anaconda3\conda-bld\rdkit_1588910360783\work\Code\GraphMol\RWMol.cpp
RDKit ERROR: Failed Expression: !(boost::edge(atomIdx1, atomIdx2, d_graph).second)
RDKit ERROR: ****
RDKit ERROR: 
RDKit ERROR: [22:17:44] 
RDKit ERROR: 
RDKit ERROR: ****
RDKit ERROR: Pre-condition Violation
RDKit ERROR: bond already exists
RDKit ERROR: Violation occurred on line 311 in file C:\Users\glandrum\Anaconda3\conda-bld\rdkit_1588910360783\work\Code\GraphMol\RWMol.cpp
RDKit ERROR: Failed Expression: !(boost::edge(atomIdx1, atomIdx2, d_graph).second)
RDKit ERROR: ***

82


ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "d:\tristan\anaconda3\envs\awen\lib\site-packages\IPython\core\interactiveshell.py", line 2882, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-23-de509a6b7d83>", line 32, in <module>
    a = run_evolution(scaffords=scaffords1,decorations=decorations)
  File "<ipython-input-14-a5562a2bc21a>", line 17, in __init__
    a1 = screen_smi(list_new_mols_1)
  File "<ipython-input-7-fa5fc06c0b89>", line 14, in __init__
    mols["SLOGP"], mols["QED"], mols["SAscore"] = a.cal_SLOGP(), a.cal_qed(), a.cal_sa()
  File "E:\Pycharm projects\codes_for_manuscript_03\calc_logP_QED_sa.py", line 31, in cal_SLOGP
    SLOGP_list = [QED.properties(mol).ALOGP for mol in self.mol_list]
  File "E:\Pycharm projects\codes_for_manuscript_03\calc_logP_QED_sa.py", line 31, in <listcomp>
    SLOGP_list = [QED.properties(mol).ALOGP for mol in self.mol_list]
  File "d:\tristan\anaconda3\envs\awen\lib\site-packages\rdkit\Chem\QED.py", line

KeyboardInterrupt: 

In [28]:
# Count the truly new molecules (unique values) of one generation

# Newly generated molecule set
gen = 2
new_mols = pd.read_csv(root + "generate\\gen" + str(gen) + "\\selected_mols.csv")
new_mols_set = set(smiles_rewrite(set(new_mols["SMILES"].tolist())))

# Molecules of all earlier generations: the reference set used to filter the new ones
old_mols_set_all = set()
for i in range(gen):
    old_mols = pd.read_csv(root + "generate\\gen" + str(i) + "\\selected_mols.csv")
    old_mols_set = set(smiles_rewrite(set(old_mols["SMILES"].tolist())))
    old_mols_set_all.update(old_mols_set)

# Molecules present in the new set but in none of the earlier generations
novel_smiles = list(new_mols_set - old_mols_set_all)
calculator = mol_properties(novel_smiles) # provides SLOGP, QED, SAscore
novel_table = pd.DataFrame({
    "SMILES": novel_smiles,
    "SLOGP": calculator.cal_SLOGP(),
    "QED": calculator.cal_qed(),
    "SAscore": calculator.cal_sa(),
})
novel_table = new_mertric(novel_table)
novel_table.to_csv(root + "generate\\gen" + str(gen) + "\\final_new_mols.csv")

In [30]:
# 2. Enrich the fragments of the good molecules (new_mertric < 0.8) with similar
#    structures, then regroup them for the next generation run.

# Generation whose fragments are expanded
gen = 1
# Load the fragment groups of that generation
gen_dir = root + "generate\\gen" + str(gen)
file1 = gen_dir + "\\init_scaffords_groups.csv"
file2 = gen_dir + "\\init_decorations_groups.csv"
init_scaffords = pd.read_csv(file1,index_col=0)
init_decorations = pd.read_csv(file2,index_col=0)
len_decorations = init_decorations.shape[1]
len_scaffords = init_scaffords.shape[1]

# NOTE(review): eval() below executes whatever the CSV cells contain -
# only use this on files produced by this pipeline.

# Expand the scaffolds, one stored group at a time
all_scaffords = set()
for i in range(len_scaffords):
    scaffords1 = eval(init_scaffords.iloc[0,i])

    new_scaffords,_ = add_similar_frags(scaffords=scaffords1,decorations=[],select_n=5)
    all_scaffords.update(new_scaffords)

# Expand the decorations, one stored group at a time
all_decorations = set()
for j in range(len_decorations):
    decorations1 = yichangchuli(eval(init_decorations.iloc[0,j]))

    _, new_decorations1 = add_similar_frags(scaffords=[],decorations=decorations1,select_n=5)
    all_decorations.update(new_decorations1)

all_scaffords, all_decorations = list(all_scaffords), list(all_decorations)

# Regroup the enlarged fragment pools
scaffords_groups = chouyang(all_scaffords, 5)
decorations_groups = chouyang(all_decorations, 5)

# One column per group, the whole group stored in a single cell of row 0
save_scaffords_groups_info = pd.DataFrame(
    {i:[scaffords_groups[i]] for i in range(len(scaffords_groups))})
save_decorations_groups_info = pd.DataFrame(
    {i:[decorations_groups[i]] for i in range(len(decorations_groups))})

save_scaffords_groups_info.to_csv(gen_dir + "\\similar_scaffords_groups.csv")
save_decorations_groups_info.to_csv(gen_dir + "\\similar_decorations_groups.csv")


In [27]:
# Merge the molecules of several part files and de-duplicate them with a set:
# selected_mols1.csv, selected_mols2.csv, ... are combined into one selected_mols.csv
run_hebing = 1
if run_hebing:
    gen = 2
    all_smiles_set = set()

    # Part files are numbered from 1; the first missing number ends the merge.
    i = 1
    while True:
        part_file = root + "generate\\gen" + str(gen) + "\\selected_mols" + str(i) + ".csv"
        try:
            table = pd.read_csv(part_file)
        except FileNotFoundError:
            # Only a missing file stops the loop. Other failures (e.g. a corrupt
            # part file) now surface instead of silently truncating the merge.
            break

        all_smiles_set.update(table["SMILES"].tolist())
        i += 1

    all_smiles_list = list(all_smiles_set)
    output_table = pd.DataFrame({"SMILES":all_smiles_list})
    output_table.to_csv(root + "generate\\gen" + str(gen) + "\\selected_mols.csv")


In [75]:
# Split the molecules into fragments and count how often each fragment occurs
run_frags_tongji = 1
if run_frags_tongji:
    from collections import Counter
    file = "F:\\WORK\\和外合作\\天宇媛媛毕设\\generate\\result2\\"

    for sheet in ['gen0', 'gen1', 'gen2']:
        table = pd.read_excel(file+"汇总IDP.xlsx", sheet_name = sheet)
        smiles_list = table["SMILES"].tolist()

        # Fragments of every molecule on the sheet, molecule by molecule;
        # element 1 of split_mol's result is cleaned with the "[Cs]" marker.
        all_smiles_list = [
            frag
            for smiles in smiles_list
            for frag in frag_smi_clean(split_mol([smiles],"smiles",2,1000)[1], "[Cs]")
        ]

        # One-row table: a column per fragment, the cell holds its count.
        frags_tongji = dict(Counter(all_smiles_list))
        frags_tongji_table = pd.DataFrame(frags_tongji,index=[0])
        frags_tongji_table.to_csv(file+sheet+"_碎片统计.csv",index=False)
# Make sure these results are saved!

In [None]:
# Draw every molecule of the summary workbook and save it as an individual PNG
# named "<sheet>_<row index>.png".
# NOTE(review): the stored output of this cell records "NameError: name 'fig' is
# not defined". `fig` is assigned right before it is used here, so the error
# presumably comes from inside draw_multi_mol or from an earlier version of this
# cell - confirm.
file = "F:\\WORK\\和外合作\\天宇媛媛毕设\\generate\\result2\\"

for sheet in ['gen0', 'gen1', 'gen2']:
    table = pd.read_excel(file+"汇总IDP.xlsx", sheet_name = sheet)
    smiles_list = table["SMILES"].tolist()

    for i, single_smiles in enumerate(smiles_list):
        fig = draw_multi_mol([single_smiles],row_num=1,Size=500)
        fig.save(file+ "分子图\\" + sheet + "_" + str(i)+".png")


NameError: name 'fig' is not defined